1 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
2 // Test target codegen - host bc file has to be created first.
3 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
4 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK1
5 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
6 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK2
7 // RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -fopenmp-cuda-teams-reduction-recs-num=2048 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix=CHECK3
8 // expected-no-diagnostics
9 #ifndef HEADER
10 #define HEADER
11 
12 template<typename tx>  // Codegen fixture: three `target` regions, each containing a `teams` construct with reduction clauses.
13 tx ftemplate(int n) {  // `n` is never read in the body; the result is the sum of the five locals below.
14   int a;     // bitwise-OR reduction variable (third region)
15   short b;   // max reduction variable (third region)
16   tx c;      // XOR reduction variable (second region)
17   float d;   // multiplication reduction variable (second region)
18   double e;  // addition reduction variable (first region)
19 // The locals above have no initializers; the RUN lines at the top of this file only emit LLVM IR or bitcode.
20   #pragma omp target
21   #pragma omp teams reduction(+: e)
22   {  // Region 1: a single sum reduction over teams on the double `e`.
23     e += 5;
24   }
25 // NOTE(review): the CHECK lines match kernel names ending in _l<N>; N presumably tracks a source line of the region, so adding or removing lines here may require regenerating the assertions - confirm.
26   #pragma omp target
27   #pragma omp teams reduction(^: c) reduction(*: d)
28   {  // Region 2: one teams construct carrying two reductions, XOR on `c` and product on `d`.
29     c ^= 2;
30     d *= 33;
31   }
32 // Region 3: the same OR-on-`a` / max-on-`b` reduction clauses appear on the teams construct and on the nested parallel construct.
33   #pragma omp target
34   #pragma omp teams reduction(|: a) reduction(max: b)
35   #pragma omp parallel reduction(|: a) reduction(max: b)
36   {
37     a |= 1;
38     b = 99 > b ? 99 : b;  // hand-written max(b, 99), matching the reduction(max: b) clause
39   }
40 // The operands below have mixed arithmetic types; the sum is implicitly converted to `tx` on return.
41   return a+b+c+d+e;
42 }
43 
44 int bar(int n){  // Instantiates ftemplate with tx = char, forwarding `n`, and returns the result as an int.
45   int a = 0;
46 
47   a += ftemplate<char>(n);  // the char result is added to the zero-initialized accumulator
48 
49   return a;
50 }
51 
52 #endif
53 // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker
54 // CHECK4-SAME: () #[[ATTR0:[0-9]+]] {
55 // CHECK4-NEXT:  entry:
56 // CHECK4-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
57 // CHECK4-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
58 // CHECK4-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
59 // CHECK4-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
60 // CHECK4-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
61 // CHECK4:       .await.work:
62 // CHECK4-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
63 // CHECK4-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
64 // CHECK4-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
65 // CHECK4-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
66 // CHECK4-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
67 // CHECK4-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
68 // CHECK4-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
69 // CHECK4:       .select.workers:
70 // CHECK4-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
71 // CHECK4-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
72 // CHECK4-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
73 // CHECK4:       .execute.parallel:
74 // CHECK4-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
75 // CHECK4-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
76 // CHECK4-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
77 // CHECK4-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
78 // CHECK4:       .terminate.parallel:
79 // CHECK4-NEXT:    call void @__kmpc_kernel_end_parallel()
80 // CHECK4-NEXT:    br label [[DOTBARRIER_PARALLEL]]
81 // CHECK4:       .barrier.parallel:
82 // CHECK4-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
83 // CHECK4-NEXT:    br label [[DOTAWAIT_WORK]]
84 // CHECK4:       .exit:
85 // CHECK4-NEXT:    ret void
86 // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23
87 // CHECK4-SAME: (double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1:[0-9]+]] {
88 // CHECK4-NEXT:  entry:
89 // CHECK4-NEXT:    [[E_ADDR:%.*]] = alloca double*, align 4
90 // CHECK4-NEXT:    [[E7:%.*]] = alloca double, align 8
91 // CHECK4-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
92 // CHECK4-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
93 // CHECK4-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
94 // CHECK4-NEXT:    store double* [[E]], double** [[E_ADDR]], align 4
95 // CHECK4-NEXT:    [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
96 // CHECK4-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
97 // CHECK4-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
98 // CHECK4-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
99 // CHECK4-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
100 // CHECK4-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
101 // CHECK4-NEXT:    br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
102 // CHECK4:       .worker:
103 // CHECK4-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker() #[[ATTR3:[0-9]+]]
104 // CHECK4-NEXT:    br label [[DOTEXIT:%.*]]
105 // CHECK4:       .mastercheck:
106 // CHECK4-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
107 // CHECK4-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
108 // CHECK4-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
109 // CHECK4-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
110 // CHECK4-NEXT:    [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
111 // CHECK4-NEXT:    [[TMP4:%.*]] = xor i32 [[TMP2]], -1
112 // CHECK4-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
113 // CHECK4-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
114 // CHECK4-NEXT:    br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
115 // CHECK4:       .master:
116 // CHECK4-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
117 // CHECK4-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
118 // CHECK4-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
119 // CHECK4-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
120 // CHECK4-NEXT:    call void @__kmpc_data_sharing_init_stack()
121 // CHECK4-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
122 // CHECK4-NEXT:    [[TMP7:%.*]] = load double, double* [[TMP0]], align 8
123 // CHECK4-NEXT:    store double [[TMP7]], double* [[E7]], align 8
124 // CHECK4-NEXT:    store i32 [[TMP6]], i32* [[DOTTHREADID_TEMP_]], align 4
125 // CHECK4-NEXT:    call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E7]]) #[[ATTR3]]
126 // CHECK4-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
127 // CHECK4:       .termination.notifier:
128 // CHECK4-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
129 // CHECK4-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
130 // CHECK4-NEXT:    br label [[DOTEXIT]]
131 // CHECK4:       .exit:
132 // CHECK4-NEXT:    ret void
133 // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__
134 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] {
135 // CHECK4-NEXT:  entry:
136 // CHECK4-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
137 // CHECK4-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
138 // CHECK4-NEXT:    [[E_ADDR:%.*]] = alloca double*, align 4
139 // CHECK4-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
140 // CHECK4-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
141 // CHECK4-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
142 // CHECK4-NEXT:    store double* [[E]], double** [[E_ADDR]], align 4
143 // CHECK4-NEXT:    [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
144 // CHECK4-NEXT:    [[TMP1:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
145 // CHECK4-NEXT:    [[TMP2:%.*]] = load i32, i32* @"_openmp_static_kernel$size", align 4
146 // CHECK4-NEXT:    call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP2]], i16 [[TMP1]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
147 // CHECK4-NEXT:    [[TMP3:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4
148 // CHECK4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP3]], i32 0
149 // CHECK4-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to %struct._globalized_locals_ty*
150 // CHECK4-NEXT:    [[E1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP5]], i32 0, i32 0
151 // CHECK4-NEXT:    store double 0.000000e+00, double* [[E1]], align 8
152 // CHECK4-NEXT:    [[TMP6:%.*]] = load double, double* [[E1]], align 8
153 // CHECK4-NEXT:    [[ADD:%.*]] = fadd double [[TMP6]], 5.000000e+00
154 // CHECK4-NEXT:    store double [[ADD]], double* [[E1]], align 8
155 // CHECK4-NEXT:    [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
156 // CHECK4-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
157 // CHECK4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
158 // CHECK4-NEXT:    [[TMP10:%.*]] = bitcast double* [[E1]] to i8*
159 // CHECK4-NEXT:    store i8* [[TMP10]], i8** [[TMP9]], align 4
160 // CHECK4-NEXT:    [[TMP11:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
161 // CHECK4-NEXT:    [[TMP12:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
162 // CHECK4-NEXT:    [[TMP13:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i8* [[TMP12]], i32 2048, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func)
163 // CHECK4-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 1
164 // CHECK4-NEXT:    br i1 [[TMP14]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
165 // CHECK4:       .omp.reduction.then:
166 // CHECK4-NEXT:    [[TMP15:%.*]] = load double, double* [[TMP0]], align 8
167 // CHECK4-NEXT:    [[TMP16:%.*]] = load double, double* [[E1]], align 8
168 // CHECK4-NEXT:    [[ADD2:%.*]] = fadd double [[TMP15]], [[TMP16]]
169 // CHECK4-NEXT:    store double [[ADD2]], double* [[TMP0]], align 8
170 // CHECK4-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP8]])
171 // CHECK4-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
172 // CHECK4:       .omp.reduction.done:
173 // CHECK4-NEXT:    [[TMP17:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared", align 2
174 // CHECK4-NEXT:    call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP17]])
175 // CHECK4-NEXT:    ret void
176 // CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
177 // CHECK4-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
178 // CHECK4-NEXT:  entry:
179 // CHECK4-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
180 // CHECK4-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
181 // CHECK4-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2
182 // CHECK4-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2
183 // CHECK4-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4
184 // CHECK4-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8
185 // CHECK4-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
186 // CHECK4-NEXT:    store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
187 // CHECK4-NEXT:    store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
188 // CHECK4-NEXT:    store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
189 // CHECK4-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
190 // CHECK4-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]*
191 // CHECK4-NEXT:    [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
192 // CHECK4-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
193 // CHECK4-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
194 // CHECK4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
195 // CHECK4-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
196 // CHECK4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
197 // CHECK4-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double*
198 // CHECK4-NEXT:    [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1
199 // CHECK4-NEXT:    [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8*
200 // CHECK4-NEXT:    [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64*
201 // CHECK4-NEXT:    [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
202 // CHECK4-NEXT:    [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8
203 // CHECK4-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
204 // CHECK4-NEXT:    [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
205 // CHECK4-NEXT:    [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]])
206 // CHECK4-NEXT:    store i64 [[TMP19]], i64* [[TMP16]], align 8
207 // CHECK4-NEXT:    [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1
208 // CHECK4-NEXT:    [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1
209 // CHECK4-NEXT:    [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
210 // CHECK4-NEXT:    store i8* [[TMP22]], i8** [[TMP11]], align 4
211 // CHECK4-NEXT:    [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0
212 // CHECK4-NEXT:    [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1
213 // CHECK4-NEXT:    [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
214 // CHECK4-NEXT:    [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]]
215 // CHECK4-NEXT:    [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2
216 // CHECK4-NEXT:    [[TMP28:%.*]] = and i16 [[TMP6]], 1
217 // CHECK4-NEXT:    [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0
218 // CHECK4-NEXT:    [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]]
219 // CHECK4-NEXT:    [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0
220 // CHECK4-NEXT:    [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]]
221 // CHECK4-NEXT:    [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]]
222 // CHECK4-NEXT:    [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]]
223 // CHECK4-NEXT:    br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]]
224 // CHECK4:       then:
225 // CHECK4-NEXT:    [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8*
226 // CHECK4-NEXT:    [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
227 // CHECK4-NEXT:    call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR3]]
228 // CHECK4-NEXT:    br label [[IFCONT:%.*]]
229 // CHECK4:       else:
230 // CHECK4-NEXT:    br label [[IFCONT]]
231 // CHECK4:       ifcont:
232 // CHECK4-NEXT:    [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1
233 // CHECK4-NEXT:    [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
234 // CHECK4-NEXT:    [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
235 // CHECK4-NEXT:    br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
236 // CHECK4:       then4:
237 // CHECK4-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
238 // CHECK4-NEXT:    [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 4
239 // CHECK4-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
240 // CHECK4-NEXT:    [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 4
241 // CHECK4-NEXT:    [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double*
242 // CHECK4-NEXT:    [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double*
243 // CHECK4-NEXT:    [[TMP46:%.*]] = load double, double* [[TMP44]], align 8
244 // CHECK4-NEXT:    store double [[TMP46]], double* [[TMP45]], align 8
245 // CHECK4-NEXT:    br label [[IFCONT6:%.*]]
246 // CHECK4:       else5:
247 // CHECK4-NEXT:    br label [[IFCONT6]]
248 // CHECK4:       ifcont6:
249 // CHECK4-NEXT:    ret void
250 // CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
251 // CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
252 // CHECK4-NEXT:  entry:
253 // CHECK4-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
254 // CHECK4-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
255 // CHECK4-NEXT:    [[DOTCNT_ADDR:%.*]] = alloca i32, align 4
256 // CHECK4-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
257 // CHECK4-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
258 // CHECK4-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
259 // CHECK4-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
260 // CHECK4-NEXT:    [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
261 // CHECK4-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
262 // CHECK4-NEXT:    [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
263 // CHECK4-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
264 // CHECK4-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
265 // CHECK4-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
266 // CHECK4-NEXT:    store i32 0, i32* [[DOTCNT_ADDR]], align 4
267 // CHECK4-NEXT:    br label [[PRECOND:%.*]]
268 // CHECK4:       precond:
269 // CHECK4-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4
270 // CHECK4-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2
271 // CHECK4-NEXT:    br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]]
272 // CHECK4:       body:
273 // CHECK4-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]])
274 // CHECK4-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
275 // CHECK4-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
276 // CHECK4:       then:
277 // CHECK4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
278 // CHECK4-NEXT:    [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 4
279 // CHECK4-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32*
280 // CHECK4-NEXT:    [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]]
281 // CHECK4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
282 // CHECK4-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
283 // CHECK4-NEXT:    store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4
284 // CHECK4-NEXT:    br label [[IFCONT:%.*]]
285 // CHECK4:       else:
286 // CHECK4-NEXT:    br label [[IFCONT]]
287 // CHECK4:       ifcont:
288 // CHECK4-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
289 // CHECK4-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
290 // CHECK4-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]]
291 // CHECK4-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
292 // CHECK4:       then4:
293 // CHECK4-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
294 // CHECK4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
295 // CHECK4-NEXT:    [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4
296 // CHECK4-NEXT:    [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32*
297 // CHECK4-NEXT:    [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]]
298 // CHECK4-NEXT:    [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4
299 // CHECK4-NEXT:    store i32 [[TMP19]], i32* [[TMP18]], align 4
300 // CHECK4-NEXT:    br label [[IFCONT6:%.*]]
301 // CHECK4:       else5:
302 // CHECK4-NEXT:    br label [[IFCONT6]]
303 // CHECK4:       ifcont6:
304 // CHECK4-NEXT:    [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1
305 // CHECK4-NEXT:    store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4
306 // CHECK4-NEXT:    br label [[PRECOND]]
307 // CHECK4:       exit:
308 // CHECK4-NEXT:    ret void
309 // CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func
310 // CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
311 // CHECK4-NEXT:  entry:
312 // CHECK4-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
313 // CHECK4-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
314 // CHECK4-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
315 // CHECK4-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
316 // CHECK4-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
317 // CHECK4-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
318 // CHECK4-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
319 // CHECK4-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
320 // CHECK4-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
321 // CHECK4-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
322 // CHECK4-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
323 // CHECK4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
324 // CHECK4-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
325 // CHECK4-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
326 // CHECK4-NEXT:    [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
327 // CHECK4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP7]]
328 // CHECK4-NEXT:    [[TMP12:%.*]] = load double, double* [[TMP10]], align 8
329 // CHECK4-NEXT:    store double [[TMP12]], double* [[TMP11]], align 128
330 // CHECK4-NEXT:    ret void
331 // CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func
332 // CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
333 // CHECK4-NEXT:  entry:
334 // CHECK4-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
335 // CHECK4-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
336 // CHECK4-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
337 // CHECK4-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
338 // CHECK4-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
339 // CHECK4-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
340 // CHECK4-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
341 // CHECK4-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
342 // CHECK4-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
343 // CHECK4-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
344 // CHECK4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
345 // CHECK4-NEXT:    [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
346 // CHECK4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP5]]
347 // CHECK4-NEXT:    [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
348 // CHECK4-NEXT:    store i8* [[TMP8]], i8** [[TMP6]], align 4
349 // CHECK4-NEXT:    [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
350 // CHECK4-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
351 // CHECK4-NEXT:    call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]]
352 // CHECK4-NEXT:    ret void
353 // CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func
354 // CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
355 // CHECK4-NEXT:  entry:
356 // CHECK4-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
357 // CHECK4-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
358 // CHECK4-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
359 // CHECK4-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
360 // CHECK4-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
361 // CHECK4-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
362 // CHECK4-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
363 // CHECK4-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
364 // CHECK4-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
365 // CHECK4-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
366 // CHECK4-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
367 // CHECK4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
368 // CHECK4-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
369 // CHECK4-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
370 // CHECK4-NEXT:    [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
371 // CHECK4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP7]]
372 // CHECK4-NEXT:    [[TMP12:%.*]] = load double, double* [[TMP11]], align 128
373 // CHECK4-NEXT:    store double [[TMP12]], double* [[TMP10]], align 8
374 // CHECK4-NEXT:    ret void
375 // CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func
376 // CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
377 // CHECK4-NEXT:  entry:
378 // CHECK4-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
379 // CHECK4-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
380 // CHECK4-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
381 // CHECK4-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
382 // CHECK4-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
383 // CHECK4-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
384 // CHECK4-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
385 // CHECK4-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
386 // CHECK4-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
387 // CHECK4-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
388 // CHECK4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
389 // CHECK4-NEXT:    [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
390 // CHECK4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP5]]
391 // CHECK4-NEXT:    [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
392 // CHECK4-NEXT:    store i8* [[TMP8]], i8** [[TMP6]], align 4
393 // CHECK4-NEXT:    [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
394 // CHECK4-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
395 // CHECK4-NEXT:    call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]]
396 // CHECK4-NEXT:    ret void
397 // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker
398 // CHECK4-SAME: () #[[ATTR0]] {
399 // CHECK4-NEXT:  entry:
400 // CHECK4-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
401 // CHECK4-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
402 // CHECK4-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
403 // CHECK4-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
404 // CHECK4-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
405 // CHECK4:       .await.work:
406 // CHECK4-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
407 // CHECK4-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
408 // CHECK4-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
409 // CHECK4-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
410 // CHECK4-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
411 // CHECK4-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
412 // CHECK4-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
413 // CHECK4:       .select.workers:
414 // CHECK4-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
415 // CHECK4-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
416 // CHECK4-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
417 // CHECK4:       .execute.parallel:
418 // CHECK4-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
419 // CHECK4-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
420 // CHECK4-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
421 // CHECK4-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
422 // CHECK4:       .terminate.parallel:
423 // CHECK4-NEXT:    call void @__kmpc_kernel_end_parallel()
424 // CHECK4-NEXT:    br label [[DOTBARRIER_PARALLEL]]
425 // CHECK4:       .barrier.parallel:
426 // CHECK4-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
427 // CHECK4-NEXT:    br label [[DOTAWAIT_WORK]]
428 // CHECK4:       .exit:
429 // CHECK4-NEXT:    ret void
430 // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29
431 // CHECK4-SAME: (i32 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR1]] {
432 // CHECK4-NEXT:  entry:
433 // CHECK4-NEXT:    [[C_ADDR:%.*]] = alloca i32, align 4
434 // CHECK4-NEXT:    [[D_ADDR:%.*]] = alloca i32, align 4
435 // CHECK4-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
436 // CHECK4-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
437 // CHECK4-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
438 // CHECK4-NEXT:    store i32 [[C]], i32* [[C_ADDR]], align 4
439 // CHECK4-NEXT:    store i32 [[D]], i32* [[D_ADDR]], align 4
440 // CHECK4-NEXT:    [[CONV:%.*]] = bitcast i32* [[C_ADDR]] to i8*
441 // CHECK4-NEXT:    [[CONV1:%.*]] = bitcast i32* [[D_ADDR]] to float*
442 // CHECK4-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
443 // CHECK4-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
444 // CHECK4-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
445 // CHECK4-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
446 // CHECK4-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
447 // CHECK4-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
448 // CHECK4:       .worker:
449 // CHECK4-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker() #[[ATTR3]]
450 // CHECK4-NEXT:    br label [[DOTEXIT:%.*]]
451 // CHECK4:       .mastercheck:
452 // CHECK4-NEXT:    [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
453 // CHECK4-NEXT:    [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
454 // CHECK4-NEXT:    [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
455 // CHECK4-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1
456 // CHECK4-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1
457 // CHECK4-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
458 // CHECK4-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
459 // CHECK4-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]]
460 // CHECK4-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
461 // CHECK4:       .master:
462 // CHECK4-NEXT:    [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
463 // CHECK4-NEXT:    [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
464 // CHECK4-NEXT:    [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]]
465 // CHECK4-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1)
466 // CHECK4-NEXT:    call void @__kmpc_data_sharing_init_stack()
467 // CHECK4-NEXT:    [[TMP5:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2
468 // CHECK4-NEXT:    [[TMP6:%.*]] = load i32, i32* @"_openmp_static_kernel$size2", align 4
469 // CHECK4-NEXT:    call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i32 [[TMP6]], i16 [[TMP5]], i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**))
470 // CHECK4-NEXT:    [[TMP7:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4
471 // CHECK4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i32 0
472 // CHECK4-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to %struct._globalized_locals_ty.1*
473 // CHECK4-NEXT:    [[TMP10:%.*]] = load i8, i8* [[CONV]], align 4
474 // CHECK4-NEXT:    [[C8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP9]], i32 0, i32 1
475 // CHECK4-NEXT:    store i8 [[TMP10]], i8* [[C8]], align 4
476 // CHECK4-NEXT:    [[TMP11:%.*]] = load float, float* [[CONV1]], align 4
477 // CHECK4-NEXT:    [[D9:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP9]], i32 0, i32 0
478 // CHECK4-NEXT:    store float [[TMP11]], float* [[D9]], align 4
479 // CHECK4-NEXT:    [[TMP12:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
480 // CHECK4-NEXT:    store i32 [[TMP12]], i32* [[DOTTHREADID_TEMP_]], align 4
481 // CHECK4-NEXT:    call void @__omp_outlined__3(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C8]], float* [[D9]]) #[[ATTR3]]
482 // CHECK4-NEXT:    [[TMP13:%.*]] = load i16, i16* @"_openmp_static_kernel$is_shared1", align 2
483 // CHECK4-NEXT:    call void @__kmpc_restore_team_static_memory(i16 0, i16 [[TMP13]])
484 // CHECK4-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
485 // CHECK4:       .termination.notifier:
486 // CHECK4-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
487 // CHECK4-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
488 // CHECK4-NEXT:    br label [[DOTEXIT]]
489 // CHECK4:       .exit:
490 // CHECK4-NEXT:    ret void
491 // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__3
492 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] {
493 // CHECK4-NEXT:  entry:
494 // CHECK4-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
495 // CHECK4-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
496 // CHECK4-NEXT:    [[C_ADDR:%.*]] = alloca i8*, align 4
497 // CHECK4-NEXT:    [[D_ADDR:%.*]] = alloca float*, align 4
498 // CHECK4-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
499 // CHECK4-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
500 // CHECK4-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
501 // CHECK4-NEXT:    store i8* [[C]], i8** [[C_ADDR]], align 4
502 // CHECK4-NEXT:    store float* [[D]], float** [[D_ADDR]], align 4
503 // CHECK4-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4
504 // CHECK4-NEXT:    [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4
505 // CHECK4-NEXT:    [[TMP2:%.*]] = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 4
506 // CHECK4-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 8
507 // CHECK4-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.2*
508 // CHECK4-NEXT:    [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], %struct._globalized_locals_ty.2* [[TMP4]], i32 0, i32 1
509 // CHECK4-NEXT:    [[D2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], %struct._globalized_locals_ty.2* [[TMP4]], i32 0, i32 0
510 // CHECK4-NEXT:    store i8 0, i8* [[C1]], align 4
511 // CHECK4-NEXT:    store float 1.000000e+00, float* [[D2]], align 4
512 // CHECK4-NEXT:    [[TMP5:%.*]] = load i8, i8* [[C1]], align 4
513 // CHECK4-NEXT:    [[CONV:%.*]] = sext i8 [[TMP5]] to i32
514 // CHECK4-NEXT:    [[XOR:%.*]] = xor i32 [[CONV]], 2
515 // CHECK4-NEXT:    [[CONV3:%.*]] = trunc i32 [[XOR]] to i8
516 // CHECK4-NEXT:    store i8 [[CONV3]], i8* [[C1]], align 4
517 // CHECK4-NEXT:    [[TMP6:%.*]] = load float, float* [[D2]], align 4
518 // CHECK4-NEXT:    [[MUL:%.*]] = fmul float [[TMP6]], 3.300000e+01
519 // CHECK4-NEXT:    store float [[MUL]], float* [[D2]], align 4
520 // CHECK4-NEXT:    [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
521 // CHECK4-NEXT:    [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
522 // CHECK4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
523 // CHECK4-NEXT:    store i8* [[C1]], i8** [[TMP9]], align 4
524 // CHECK4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
525 // CHECK4-NEXT:    [[TMP11:%.*]] = bitcast float* [[D2]] to i8*
526 // CHECK4-NEXT:    store i8* [[TMP11]], i8** [[TMP10]], align 4
527 // CHECK4-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
528 // CHECK4-NEXT:    [[TMP13:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
529 // CHECK4-NEXT:    [[TMP14:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], i8* [[TMP13]], i32 2048, i8* [[TMP12]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func5, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func6, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func8, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func9, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func10)
530 // CHECK4-NEXT:    [[TMP15:%.*]] = icmp eq i32 [[TMP14]], 1
531 // CHECK4-NEXT:    br i1 [[TMP15]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
532 // CHECK4:       .omp.reduction.then:
533 // CHECK4-NEXT:    [[TMP16:%.*]] = load i8, i8* [[TMP0]], align 1
534 // CHECK4-NEXT:    [[CONV4:%.*]] = sext i8 [[TMP16]] to i32
535 // CHECK4-NEXT:    [[TMP17:%.*]] = load i8, i8* [[C1]], align 4
536 // CHECK4-NEXT:    [[CONV5:%.*]] = sext i8 [[TMP17]] to i32
537 // CHECK4-NEXT:    [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]]
538 // CHECK4-NEXT:    [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8
539 // CHECK4-NEXT:    store i8 [[CONV7]], i8* [[TMP0]], align 1
540 // CHECK4-NEXT:    [[TMP18:%.*]] = load float, float* [[TMP1]], align 4
541 // CHECK4-NEXT:    [[TMP19:%.*]] = load float, float* [[D2]], align 4
542 // CHECK4-NEXT:    [[MUL8:%.*]] = fmul float [[TMP18]], [[TMP19]]
543 // CHECK4-NEXT:    store float [[MUL8]], float* [[TMP1]], align 4
544 // CHECK4-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP8]])
545 // CHECK4-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
546 // CHECK4:       .omp.reduction.done:
547 // CHECK4-NEXT:    ret void
548 // CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func5
549 // CHECK4-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
550 // CHECK4-NEXT:  entry:
551 // CHECK4-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
552 // CHECK4-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
553 // CHECK4-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2
554 // CHECK4-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2
555 // CHECK4-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
556 // CHECK4-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1
557 // CHECK4-NEXT:    [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4
558 // CHECK4-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
559 // CHECK4-NEXT:    store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
560 // CHECK4-NEXT:    store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
561 // CHECK4-NEXT:    store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
562 // CHECK4-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
563 // CHECK4-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
564 // CHECK4-NEXT:    [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
565 // CHECK4-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
566 // CHECK4-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
567 // CHECK4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
568 // CHECK4-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
569 // CHECK4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
570 // CHECK4-NEXT:    [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1
571 // CHECK4-NEXT:    [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1
572 // CHECK4-NEXT:    [[TMP14:%.*]] = sext i8 [[TMP13]] to i32
573 // CHECK4-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
574 // CHECK4-NEXT:    [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
575 // CHECK4-NEXT:    [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]])
576 // CHECK4-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
577 // CHECK4-NEXT:    store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1
578 // CHECK4-NEXT:    [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1
579 // CHECK4-NEXT:    [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
580 // CHECK4-NEXT:    store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4
581 // CHECK4-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
582 // CHECK4-NEXT:    [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 4
583 // CHECK4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
584 // CHECK4-NEXT:    [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float*
585 // CHECK4-NEXT:    [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i32 1
586 // CHECK4-NEXT:    [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8*
587 // CHECK4-NEXT:    [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32*
588 // CHECK4-NEXT:    [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32*
589 // CHECK4-NEXT:    [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4
590 // CHECK4-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
591 // CHECK4-NEXT:    [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
592 // CHECK4-NEXT:    [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
593 // CHECK4-NEXT:    store i32 [[TMP30]], i32* [[TMP27]], align 4
594 // CHECK4-NEXT:    [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i32 1
595 // CHECK4-NEXT:    [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1
596 // CHECK4-NEXT:    [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
597 // CHECK4-NEXT:    store i8* [[TMP33]], i8** [[TMP22]], align 4
598 // CHECK4-NEXT:    [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0
599 // CHECK4-NEXT:    [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1
600 // CHECK4-NEXT:    [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
601 // CHECK4-NEXT:    [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]]
602 // CHECK4-NEXT:    [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2
603 // CHECK4-NEXT:    [[TMP39:%.*]] = and i16 [[TMP6]], 1
604 // CHECK4-NEXT:    [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0
605 // CHECK4-NEXT:    [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]]
606 // CHECK4-NEXT:    [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0
607 // CHECK4-NEXT:    [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]]
608 // CHECK4-NEXT:    [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]]
609 // CHECK4-NEXT:    [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]]
610 // CHECK4-NEXT:    br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]]
611 // CHECK4:       then:
612 // CHECK4-NEXT:    [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
613 // CHECK4-NEXT:    [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
614 // CHECK4-NEXT:    call void @"_omp$reduction$reduction_func4"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR3]]
615 // CHECK4-NEXT:    br label [[IFCONT:%.*]]
616 // CHECK4:       else:
617 // CHECK4-NEXT:    br label [[IFCONT]]
618 // CHECK4:       ifcont:
619 // CHECK4-NEXT:    [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1
620 // CHECK4-NEXT:    [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
621 // CHECK4-NEXT:    [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]]
622 // CHECK4-NEXT:    br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
623 // CHECK4:       then6:
624 // CHECK4-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
625 // CHECK4-NEXT:    [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 4
626 // CHECK4-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
627 // CHECK4-NEXT:    [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4
628 // CHECK4-NEXT:    [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1
629 // CHECK4-NEXT:    store i8 [[TMP55]], i8* [[TMP54]], align 1
630 // CHECK4-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
631 // CHECK4-NEXT:    [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4
632 // CHECK4-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
633 // CHECK4-NEXT:    [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4
634 // CHECK4-NEXT:    [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float*
635 // CHECK4-NEXT:    [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float*
636 // CHECK4-NEXT:    [[TMP62:%.*]] = load float, float* [[TMP60]], align 4
637 // CHECK4-NEXT:    store float [[TMP62]], float* [[TMP61]], align 4
638 // CHECK4-NEXT:    br label [[IFCONT8:%.*]]
639 // CHECK4:       else7:
640 // CHECK4-NEXT:    br label [[IFCONT8]]
641 // CHECK4:       ifcont8:
642 // CHECK4-NEXT:    ret void
643 // CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func6
644 // CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
645 // CHECK4-NEXT:  entry:
646 // CHECK4-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
647 // CHECK4-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
648 // CHECK4-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
649 // CHECK4-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
650 // CHECK4-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
651 // CHECK4-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
652 // CHECK4-NEXT:    [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
653 // CHECK4-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
654 // CHECK4-NEXT:    [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
655 // CHECK4-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
656 // CHECK4-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
657 // CHECK4-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
658 // CHECK4-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
659 // CHECK4-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
660 // CHECK4-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
661 // CHECK4:       then:
662 // CHECK4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
663 // CHECK4-NEXT:    [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4
664 // CHECK4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
665 // CHECK4-NEXT:    [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)*
666 // CHECK4-NEXT:    [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1
667 // CHECK4-NEXT:    store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1
668 // CHECK4-NEXT:    br label [[IFCONT:%.*]]
669 // CHECK4:       else:
670 // CHECK4-NEXT:    br label [[IFCONT]]
671 // CHECK4:       ifcont:
672 // CHECK4-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
673 // CHECK4-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
674 // CHECK4-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
675 // CHECK4-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
676 // CHECK4:       then4:
677 // CHECK4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
678 // CHECK4-NEXT:    [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)*
679 // CHECK4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
680 // CHECK4-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
681 // CHECK4-NEXT:    [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1
682 // CHECK4-NEXT:    store i8 [[TMP15]], i8* [[TMP14]], align 1
683 // CHECK4-NEXT:    br label [[IFCONT6:%.*]]
684 // CHECK4:       else5:
685 // CHECK4-NEXT:    br label [[IFCONT6]]
686 // CHECK4:       ifcont6:
687 // CHECK4-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
688 // CHECK4-NEXT:    [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
689 // CHECK4-NEXT:    br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
690 // CHECK4:       then8:
691 // CHECK4-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
692 // CHECK4-NEXT:    [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
693 // CHECK4-NEXT:    [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32*
694 // CHECK4-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
695 // CHECK4-NEXT:    [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4
696 // CHECK4-NEXT:    store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4
697 // CHECK4-NEXT:    br label [[IFCONT10:%.*]]
698 // CHECK4:       else9:
699 // CHECK4-NEXT:    br label [[IFCONT10]]
700 // CHECK4:       ifcont10:
701 // CHECK4-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
702 // CHECK4-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4
703 // CHECK4-NEXT:    [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]]
704 // CHECK4-NEXT:    br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
705 // CHECK4:       then12:
706 // CHECK4-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
707 // CHECK4-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
708 // CHECK4-NEXT:    [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 4
709 // CHECK4-NEXT:    [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32*
710 // CHECK4-NEXT:    [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4
711 // CHECK4-NEXT:    store i32 [[TMP26]], i32* [[TMP25]], align 4
712 // CHECK4-NEXT:    br label [[IFCONT14:%.*]]
713 // CHECK4:       else13:
714 // CHECK4-NEXT:    br label [[IFCONT14]]
715 // CHECK4:       ifcont14:
716 // CHECK4-NEXT:    ret void
717 // CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func7
718 // CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
719 // CHECK4-NEXT:  entry:
720 // CHECK4-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
721 // CHECK4-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
722 // CHECK4-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
723 // CHECK4-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
724 // CHECK4-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
725 // CHECK4-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
726 // CHECK4-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
727 // CHECK4-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
728 // CHECK4-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
729 // CHECK4-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3*
730 // CHECK4-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
731 // CHECK4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
732 // CHECK4-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
733 // CHECK4-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0
734 // CHECK4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP7]]
735 // CHECK4-NEXT:    [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1
736 // CHECK4-NEXT:    store i8 [[TMP11]], i8* [[TMP10]], align 128
737 // CHECK4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
738 // CHECK4-NEXT:    [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
739 // CHECK4-NEXT:    [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
740 // CHECK4-NEXT:    [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1
741 // CHECK4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP7]]
742 // CHECK4-NEXT:    [[TMP16:%.*]] = load float, float* [[TMP14]], align 4
743 // CHECK4-NEXT:    store float [[TMP16]], float* [[TMP15]], align 128
744 // CHECK4-NEXT:    ret void
745 // CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func8
746 // CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
747 // CHECK4-NEXT:  entry:
748 // CHECK4-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
749 // CHECK4-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
750 // CHECK4-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
751 // CHECK4-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
752 // CHECK4-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
753 // CHECK4-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
754 // CHECK4-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
755 // CHECK4-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
756 // CHECK4-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3*
757 // CHECK4-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
758 // CHECK4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
759 // CHECK4-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0
760 // CHECK4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP5]]
761 // CHECK4-NEXT:    store i8* [[TMP7]], i8** [[TMP6]], align 4
762 // CHECK4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
763 // CHECK4-NEXT:    [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1
764 // CHECK4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP5]]
765 // CHECK4-NEXT:    [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
766 // CHECK4-NEXT:    store i8* [[TMP10]], i8** [[TMP8]], align 4
767 // CHECK4-NEXT:    [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
768 // CHECK4-NEXT:    [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
769 // CHECK4-NEXT:    call void @"_omp$reduction$reduction_func4"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]]
770 // CHECK4-NEXT:    ret void
771 // CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func9
772 // CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
773 // CHECK4-NEXT:  entry:
774 // CHECK4-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
775 // CHECK4-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
776 // CHECK4-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
777 // CHECK4-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
778 // CHECK4-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
779 // CHECK4-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
780 // CHECK4-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
781 // CHECK4-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
782 // CHECK4-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
783 // CHECK4-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3*
784 // CHECK4-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
785 // CHECK4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
786 // CHECK4-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
787 // CHECK4-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0
788 // CHECK4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP7]]
789 // CHECK4-NEXT:    [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128
790 // CHECK4-NEXT:    store i8 [[TMP11]], i8* [[TMP9]], align 1
791 // CHECK4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
792 // CHECK4-NEXT:    [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
793 // CHECK4-NEXT:    [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
794 // CHECK4-NEXT:    [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1
795 // CHECK4-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP7]]
796 // CHECK4-NEXT:    [[TMP16:%.*]] = load float, float* [[TMP15]], align 128
797 // CHECK4-NEXT:    store float [[TMP16]], float* [[TMP14]], align 4
798 // CHECK4-NEXT:    ret void
799 // CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func10
800 // CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
801 // CHECK4-NEXT:  entry:
802 // CHECK4-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
803 // CHECK4-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
804 // CHECK4-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
805 // CHECK4-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
806 // CHECK4-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
807 // CHECK4-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
808 // CHECK4-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
809 // CHECK4-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
810 // CHECK4-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3*
811 // CHECK4-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
812 // CHECK4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
813 // CHECK4-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0
814 // CHECK4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP5]]
815 // CHECK4-NEXT:    store i8* [[TMP7]], i8** [[TMP6]], align 4
816 // CHECK4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
817 // CHECK4-NEXT:    [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1
818 // CHECK4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP5]]
819 // CHECK4-NEXT:    [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
820 // CHECK4-NEXT:    store i8* [[TMP10]], i8** [[TMP8]], align 4
821 // CHECK4-NEXT:    [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
822 // CHECK4-NEXT:    [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
823 // CHECK4-NEXT:    call void @"_omp$reduction$reduction_func4"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]]
824 // CHECK4-NEXT:    ret void
825 // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l36
826 // CHECK4-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] {
827 // CHECK4-NEXT:  entry:
828 // CHECK4-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
829 // CHECK4-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
830 // CHECK4-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
831 // CHECK4-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
832 // CHECK4-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
833 // CHECK4-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
834 // CHECK4-NEXT:    store i32 [[B]], i32* [[B_ADDR]], align 4
835 // CHECK4-NEXT:    [[CONV:%.*]] = bitcast i32* [[B_ADDR]] to i16*
836 // CHECK4-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
837 // CHECK4-NEXT:    call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
838 // CHECK4-NEXT:    call void @__kmpc_data_sharing_init_stack_spmd()
839 // CHECK4-NEXT:    br label [[DOTEXECUTE:%.*]]
840 // CHECK4:       .execute:
841 // CHECK4-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
842 // CHECK4-NEXT:    store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4
843 // CHECK4-NEXT:    call void @__omp_outlined__11(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR3]]
844 // CHECK4-NEXT:    br label [[DOTOMP_DEINIT:%.*]]
845 // CHECK4:       .omp.deinit:
846 // CHECK4-NEXT:    call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
847 // CHECK4-NEXT:    br label [[DOTEXIT:%.*]]
848 // CHECK4:       .exit:
849 // CHECK4-NEXT:    ret void
850 // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__11
851 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] {
852 // CHECK4-NEXT:  entry:
853 // CHECK4-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
854 // CHECK4-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
855 // CHECK4-NEXT:    [[A_ADDR:%.*]] = alloca i32*, align 4
856 // CHECK4-NEXT:    [[B_ADDR:%.*]] = alloca i16*, align 4
857 // CHECK4-NEXT:    [[A1:%.*]] = alloca i32, align 4
858 // CHECK4-NEXT:    [[B2:%.*]] = alloca i16, align 2
859 // CHECK4-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
860 // CHECK4-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
861 // CHECK4-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
862 // CHECK4-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
863 // CHECK4-NEXT:    store i32* [[A]], i32** [[A_ADDR]], align 4
864 // CHECK4-NEXT:    store i16* [[B]], i16** [[B_ADDR]], align 4
865 // CHECK4-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
866 // CHECK4-NEXT:    [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
867 // CHECK4-NEXT:    store i32 0, i32* [[A1]], align 4
868 // CHECK4-NEXT:    store i16 -32768, i16* [[B2]], align 2
869 // CHECK4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
870 // CHECK4-NEXT:    [[TMP3:%.*]] = bitcast i32* [[A1]] to i8*
871 // CHECK4-NEXT:    store i8* [[TMP3]], i8** [[TMP2]], align 4
872 // CHECK4-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
873 // CHECK4-NEXT:    [[TMP5:%.*]] = bitcast i16* [[B2]] to i8*
874 // CHECK4-NEXT:    store i8* [[TMP5]], i8** [[TMP4]], align 4
875 // CHECK4-NEXT:    [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
876 // CHECK4-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
877 // CHECK4-NEXT:    [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
878 // CHECK4-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__12 to i8*), i8* null, i8** [[TMP8]], i32 2)
879 // CHECK4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
880 // CHECK4-NEXT:    [[TMP10:%.*]] = bitcast i32* [[A1]] to i8*
881 // CHECK4-NEXT:    store i8* [[TMP10]], i8** [[TMP9]], align 4
882 // CHECK4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
883 // CHECK4-NEXT:    [[TMP12:%.*]] = bitcast i16* [[B2]] to i8*
884 // CHECK4-NEXT:    store i8* [[TMP12]], i8** [[TMP11]], align 4
885 // CHECK4-NEXT:    [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
886 // CHECK4-NEXT:    [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
887 // CHECK4-NEXT:    [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 2048, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func17, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func18, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func20, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func21, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func22)
888 // CHECK4-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1
889 // CHECK4-NEXT:    br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
890 // CHECK4:       .omp.reduction.then:
891 // CHECK4-NEXT:    [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4
892 // CHECK4-NEXT:    [[TMP18:%.*]] = load i32, i32* [[A1]], align 4
893 // CHECK4-NEXT:    [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]]
894 // CHECK4-NEXT:    store i32 [[OR]], i32* [[TMP0]], align 4
895 // CHECK4-NEXT:    [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2
896 // CHECK4-NEXT:    [[CONV:%.*]] = sext i16 [[TMP19]] to i32
897 // CHECK4-NEXT:    [[TMP20:%.*]] = load i16, i16* [[B2]], align 2
898 // CHECK4-NEXT:    [[CONV3:%.*]] = sext i16 [[TMP20]] to i32
899 // CHECK4-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]]
900 // CHECK4-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
901 // CHECK4:       cond.true:
902 // CHECK4-NEXT:    [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2
903 // CHECK4-NEXT:    br label [[COND_END:%.*]]
904 // CHECK4:       cond.false:
905 // CHECK4-NEXT:    [[TMP22:%.*]] = load i16, i16* [[B2]], align 2
906 // CHECK4-NEXT:    br label [[COND_END]]
907 // CHECK4:       cond.end:
908 // CHECK4-NEXT:    [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ]
909 // CHECK4-NEXT:    store i16 [[COND]], i16* [[TMP1]], align 2
910 // CHECK4-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]])
911 // CHECK4-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
912 // CHECK4:       .omp.reduction.done:
913 // CHECK4-NEXT:    ret void
914 // CHECK4-LABEL: define {{[^@]+}}@__omp_outlined__12
915 // CHECK4-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] {
916 // CHECK4-NEXT:  entry:
917 // CHECK4-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
918 // CHECK4-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
919 // CHECK4-NEXT:    [[A_ADDR:%.*]] = alloca i32*, align 4
920 // CHECK4-NEXT:    [[B_ADDR:%.*]] = alloca i16*, align 4
921 // CHECK4-NEXT:    [[A1:%.*]] = alloca i32, align 4
922 // CHECK4-NEXT:    [[B2:%.*]] = alloca i16, align 2
923 // CHECK4-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
924 // CHECK4-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
925 // CHECK4-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
926 // CHECK4-NEXT:    store i32* [[A]], i32** [[A_ADDR]], align 4
927 // CHECK4-NEXT:    store i16* [[B]], i16** [[B_ADDR]], align 4
928 // CHECK4-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
929 // CHECK4-NEXT:    [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
930 // CHECK4-NEXT:    store i32 0, i32* [[A1]], align 4
931 // CHECK4-NEXT:    store i16 -32768, i16* [[B2]], align 2
932 // CHECK4-NEXT:    [[TMP2:%.*]] = load i32, i32* [[A1]], align 4
933 // CHECK4-NEXT:    [[OR:%.*]] = or i32 [[TMP2]], 1
934 // CHECK4-NEXT:    store i32 [[OR]], i32* [[A1]], align 4
935 // CHECK4-NEXT:    [[TMP3:%.*]] = load i16, i16* [[B2]], align 2
936 // CHECK4-NEXT:    [[CONV:%.*]] = sext i16 [[TMP3]] to i32
937 // CHECK4-NEXT:    [[CMP:%.*]] = icmp sgt i32 99, [[CONV]]
938 // CHECK4-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
939 // CHECK4:       cond.true:
940 // CHECK4-NEXT:    br label [[COND_END:%.*]]
941 // CHECK4:       cond.false:
942 // CHECK4-NEXT:    [[TMP4:%.*]] = load i16, i16* [[B2]], align 2
943 // CHECK4-NEXT:    [[CONV3:%.*]] = sext i16 [[TMP4]] to i32
944 // CHECK4-NEXT:    br label [[COND_END]]
945 // CHECK4:       cond.end:
946 // CHECK4-NEXT:    [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ]
947 // CHECK4-NEXT:    [[CONV4:%.*]] = trunc i32 [[COND]] to i16
948 // CHECK4-NEXT:    store i16 [[CONV4]], i16* [[B2]], align 2
949 // CHECK4-NEXT:    [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
950 // CHECK4-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
951 // CHECK4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
952 // CHECK4-NEXT:    [[TMP8:%.*]] = bitcast i32* [[A1]] to i8*
953 // CHECK4-NEXT:    store i8* [[TMP8]], i8** [[TMP7]], align 4
954 // CHECK4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
955 // CHECK4-NEXT:    [[TMP10:%.*]] = bitcast i16* [[B2]] to i8*
956 // CHECK4-NEXT:    store i8* [[TMP10]], i8** [[TMP9]], align 4
957 // CHECK4-NEXT:    [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
958 // CHECK4-NEXT:    [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func14, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func15)
959 // CHECK4-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1
960 // CHECK4-NEXT:    br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
961 // CHECK4:       .omp.reduction.then:
962 // CHECK4-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4
963 // CHECK4-NEXT:    [[TMP15:%.*]] = load i32, i32* [[A1]], align 4
964 // CHECK4-NEXT:    [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]]
965 // CHECK4-NEXT:    store i32 [[OR5]], i32* [[TMP0]], align 4
966 // CHECK4-NEXT:    [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2
967 // CHECK4-NEXT:    [[CONV6:%.*]] = sext i16 [[TMP16]] to i32
968 // CHECK4-NEXT:    [[TMP17:%.*]] = load i16, i16* [[B2]], align 2
969 // CHECK4-NEXT:    [[CONV7:%.*]] = sext i16 [[TMP17]] to i32
970 // CHECK4-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]]
971 // CHECK4-NEXT:    br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]]
972 // CHECK4:       cond.true9:
973 // CHECK4-NEXT:    [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2
974 // CHECK4-NEXT:    br label [[COND_END11:%.*]]
975 // CHECK4:       cond.false10:
976 // CHECK4-NEXT:    [[TMP19:%.*]] = load i16, i16* [[B2]], align 2
977 // CHECK4-NEXT:    br label [[COND_END11]]
978 // CHECK4:       cond.end11:
979 // CHECK4-NEXT:    [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ]
980 // CHECK4-NEXT:    store i16 [[COND12]], i16* [[TMP1]], align 2
981 // CHECK4-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]])
982 // CHECK4-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
983 // CHECK4:       .omp.reduction.done:
984 // CHECK4-NEXT:    ret void
985 // CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func14
986 // CHECK4-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
987 // CHECK4-NEXT:  entry:
988 // CHECK4-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
989 // CHECK4-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
990 // CHECK4-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2
991 // CHECK4-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2
992 // CHECK4-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
993 // CHECK4-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
994 // CHECK4-NEXT:    [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
995 // CHECK4-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
996 // CHECK4-NEXT:    store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
997 // CHECK4-NEXT:    store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
998 // CHECK4-NEXT:    store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
999 // CHECK4-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
1000 // CHECK4-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
1001 // CHECK4-NEXT:    [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
1002 // CHECK4-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
1003 // CHECK4-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
1004 // CHECK4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
1005 // CHECK4-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
1006 // CHECK4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
1007 // CHECK4-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
1008 // CHECK4-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
1009 // CHECK4-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
1010 // CHECK4-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
1011 // CHECK4-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1012 // CHECK4-NEXT:    [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
1013 // CHECK4-NEXT:    [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]])
1014 // CHECK4-NEXT:    store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
1015 // CHECK4-NEXT:    [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
1016 // CHECK4-NEXT:    [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
1017 // CHECK4-NEXT:    [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
1018 // CHECK4-NEXT:    store i8* [[TMP20]], i8** [[TMP11]], align 4
1019 // CHECK4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
1020 // CHECK4-NEXT:    [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4
1021 // CHECK4-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
1022 // CHECK4-NEXT:    [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16*
1023 // CHECK4-NEXT:    [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
1024 // CHECK4-NEXT:    [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8*
1025 // CHECK4-NEXT:    [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2
1026 // CHECK4-NEXT:    [[TMP28:%.*]] = sext i16 [[TMP27]] to i32
1027 // CHECK4-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1028 // CHECK4-NEXT:    [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
1029 // CHECK4-NEXT:    [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
1030 // CHECK4-NEXT:    [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
1031 // CHECK4-NEXT:    store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
1032 // CHECK4-NEXT:    [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
1033 // CHECK4-NEXT:    [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
1034 // CHECK4-NEXT:    [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
1035 // CHECK4-NEXT:    store i8* [[TMP34]], i8** [[TMP23]], align 4
1036 // CHECK4-NEXT:    [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0
1037 // CHECK4-NEXT:    [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1
1038 // CHECK4-NEXT:    [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
1039 // CHECK4-NEXT:    [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]]
1040 // CHECK4-NEXT:    [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2
1041 // CHECK4-NEXT:    [[TMP40:%.*]] = and i16 [[TMP6]], 1
1042 // CHECK4-NEXT:    [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0
1043 // CHECK4-NEXT:    [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]]
1044 // CHECK4-NEXT:    [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0
1045 // CHECK4-NEXT:    [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]]
1046 // CHECK4-NEXT:    [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]]
1047 // CHECK4-NEXT:    [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]]
1048 // CHECK4-NEXT:    br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]]
1049 // CHECK4:       then:
1050 // CHECK4-NEXT:    [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
1051 // CHECK4-NEXT:    [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
1052 // CHECK4-NEXT:    call void @"_omp$reduction$reduction_func13"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]]
1053 // CHECK4-NEXT:    br label [[IFCONT:%.*]]
1054 // CHECK4:       else:
1055 // CHECK4-NEXT:    br label [[IFCONT]]
1056 // CHECK4:       ifcont:
1057 // CHECK4-NEXT:    [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1
1058 // CHECK4-NEXT:    [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
1059 // CHECK4-NEXT:    [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]]
1060 // CHECK4-NEXT:    br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
1061 // CHECK4:       then6:
1062 // CHECK4-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
1063 // CHECK4-NEXT:    [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4
1064 // CHECK4-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
1065 // CHECK4-NEXT:    [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4
1066 // CHECK4-NEXT:    [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32*
1067 // CHECK4-NEXT:    [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32*
1068 // CHECK4-NEXT:    [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4
1069 // CHECK4-NEXT:    store i32 [[TMP58]], i32* [[TMP57]], align 4
1070 // CHECK4-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
1071 // CHECK4-NEXT:    [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4
1072 // CHECK4-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
1073 // CHECK4-NEXT:    [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4
1074 // CHECK4-NEXT:    [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16*
1075 // CHECK4-NEXT:    [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16*
1076 // CHECK4-NEXT:    [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2
1077 // CHECK4-NEXT:    store i16 [[TMP65]], i16* [[TMP64]], align 2
1078 // CHECK4-NEXT:    br label [[IFCONT8:%.*]]
1079 // CHECK4:       else7:
1080 // CHECK4-NEXT:    br label [[IFCONT8]]
1081 // CHECK4:       ifcont8:
1082 // CHECK4-NEXT:    ret void
1083 // CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func15
1084 // CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
1085 // CHECK4-NEXT:  entry:
1086 // CHECK4-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
1087 // CHECK4-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
1088 // CHECK4-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
1089 // CHECK4-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
1090 // CHECK4-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
1091 // CHECK4-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1092 // CHECK4-NEXT:    [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1093 // CHECK4-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
1094 // CHECK4-NEXT:    [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1095 // CHECK4-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
1096 // CHECK4-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
1097 // CHECK4-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
1098 // CHECK4-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]])
1099 // CHECK4-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
1100 // CHECK4-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
1101 // CHECK4:       then:
1102 // CHECK4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
1103 // CHECK4-NEXT:    [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4
1104 // CHECK4-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
1105 // CHECK4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
1106 // CHECK4-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4
1107 // CHECK4-NEXT:    store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4
1108 // CHECK4-NEXT:    br label [[IFCONT:%.*]]
1109 // CHECK4:       else:
1110 // CHECK4-NEXT:    br label [[IFCONT]]
1111 // CHECK4:       ifcont:
1112 // CHECK4-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
1113 // CHECK4-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
1114 // CHECK4-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
1115 // CHECK4-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
1116 // CHECK4:       then4:
1117 // CHECK4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
1118 // CHECK4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
1119 // CHECK4-NEXT:    [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
1120 // CHECK4-NEXT:    [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32*
1121 // CHECK4-NEXT:    [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4
1122 // CHECK4-NEXT:    store i32 [[TMP15]], i32* [[TMP14]], align 4
1123 // CHECK4-NEXT:    br label [[IFCONT6:%.*]]
1124 // CHECK4:       else5:
1125 // CHECK4-NEXT:    br label [[IFCONT6]]
1126 // CHECK4:       ifcont6:
1127 // CHECK4-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
1128 // CHECK4-NEXT:    [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
1129 // CHECK4-NEXT:    br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
1130 // CHECK4:       then8:
1131 // CHECK4-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
1132 // CHECK4-NEXT:    [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
1133 // CHECK4-NEXT:    [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16*
1134 // CHECK4-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
1135 // CHECK4-NEXT:    [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)*
1136 // CHECK4-NEXT:    [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2
1137 // CHECK4-NEXT:    store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2
1138 // CHECK4-NEXT:    br label [[IFCONT10:%.*]]
1139 // CHECK4:       else9:
1140 // CHECK4-NEXT:    br label [[IFCONT10]]
1141 // CHECK4:       ifcont10:
1142 // CHECK4-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
1143 // CHECK4-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4
1144 // CHECK4-NEXT:    [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]]
1145 // CHECK4-NEXT:    br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
1146 // CHECK4:       then12:
1147 // CHECK4-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
1148 // CHECK4-NEXT:    [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)*
1149 // CHECK4-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
1150 // CHECK4-NEXT:    [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4
1151 // CHECK4-NEXT:    [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16*
1152 // CHECK4-NEXT:    [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2
1153 // CHECK4-NEXT:    store i16 [[TMP28]], i16* [[TMP27]], align 2
1154 // CHECK4-NEXT:    br label [[IFCONT14:%.*]]
1155 // CHECK4:       else13:
1156 // CHECK4-NEXT:    br label [[IFCONT14]]
1157 // CHECK4:       ifcont14:
1158 // CHECK4-NEXT:    ret void
1159 // CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func17
1160 // CHECK4-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
1161 // CHECK4-NEXT:  entry:
1162 // CHECK4-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
1163 // CHECK4-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
1164 // CHECK4-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2
1165 // CHECK4-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2
1166 // CHECK4-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
1167 // CHECK4-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
1168 // CHECK4-NEXT:    [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
1169 // CHECK4-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
1170 // CHECK4-NEXT:    store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
1171 // CHECK4-NEXT:    store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
1172 // CHECK4-NEXT:    store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
1173 // CHECK4-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
1174 // CHECK4-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
1175 // CHECK4-NEXT:    [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
1176 // CHECK4-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
1177 // CHECK4-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
1178 // CHECK4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
1179 // CHECK4-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
1180 // CHECK4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
1181 // CHECK4-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
1182 // CHECK4-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
1183 // CHECK4-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
1184 // CHECK4-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
1185 // CHECK4-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1186 // CHECK4-NEXT:    [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
1187 // CHECK4-NEXT:    [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]])
1188 // CHECK4-NEXT:    store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
1189 // CHECK4-NEXT:    [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
1190 // CHECK4-NEXT:    [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
1191 // CHECK4-NEXT:    [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
1192 // CHECK4-NEXT:    store i8* [[TMP20]], i8** [[TMP11]], align 4
1193 // CHECK4-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
1194 // CHECK4-NEXT:    [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4
1195 // CHECK4-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
1196 // CHECK4-NEXT:    [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16*
1197 // CHECK4-NEXT:    [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
1198 // CHECK4-NEXT:    [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8*
1199 // CHECK4-NEXT:    [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2
1200 // CHECK4-NEXT:    [[TMP28:%.*]] = sext i16 [[TMP27]] to i32
1201 // CHECK4-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1202 // CHECK4-NEXT:    [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
1203 // CHECK4-NEXT:    [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
1204 // CHECK4-NEXT:    [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
1205 // CHECK4-NEXT:    store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
1206 // CHECK4-NEXT:    [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
1207 // CHECK4-NEXT:    [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
1208 // CHECK4-NEXT:    [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
1209 // CHECK4-NEXT:    store i8* [[TMP34]], i8** [[TMP23]], align 4
1210 // CHECK4-NEXT:    [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0
1211 // CHECK4-NEXT:    [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1
1212 // CHECK4-NEXT:    [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
1213 // CHECK4-NEXT:    [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]]
1214 // CHECK4-NEXT:    [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2
1215 // CHECK4-NEXT:    [[TMP40:%.*]] = and i16 [[TMP6]], 1
1216 // CHECK4-NEXT:    [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0
1217 // CHECK4-NEXT:    [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]]
1218 // CHECK4-NEXT:    [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0
1219 // CHECK4-NEXT:    [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]]
1220 // CHECK4-NEXT:    [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]]
1221 // CHECK4-NEXT:    [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]]
1222 // CHECK4-NEXT:    br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]]
1223 // CHECK4:       then:
1224 // CHECK4-NEXT:    [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
1225 // CHECK4-NEXT:    [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
1226 // CHECK4-NEXT:    call void @"_omp$reduction$reduction_func16"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]]
1227 // CHECK4-NEXT:    br label [[IFCONT:%.*]]
1228 // CHECK4:       else:
1229 // CHECK4-NEXT:    br label [[IFCONT]]
1230 // CHECK4:       ifcont:
1231 // CHECK4-NEXT:    [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1
1232 // CHECK4-NEXT:    [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
1233 // CHECK4-NEXT:    [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]]
1234 // CHECK4-NEXT:    br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
1235 // CHECK4:       then6:
1236 // CHECK4-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
1237 // CHECK4-NEXT:    [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4
1238 // CHECK4-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
1239 // CHECK4-NEXT:    [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4
1240 // CHECK4-NEXT:    [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32*
1241 // CHECK4-NEXT:    [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32*
1242 // CHECK4-NEXT:    [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4
1243 // CHECK4-NEXT:    store i32 [[TMP58]], i32* [[TMP57]], align 4
1244 // CHECK4-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
1245 // CHECK4-NEXT:    [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4
1246 // CHECK4-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
1247 // CHECK4-NEXT:    [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4
1248 // CHECK4-NEXT:    [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16*
1249 // CHECK4-NEXT:    [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16*
1250 // CHECK4-NEXT:    [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2
1251 // CHECK4-NEXT:    store i16 [[TMP65]], i16* [[TMP64]], align 2
1252 // CHECK4-NEXT:    br label [[IFCONT8:%.*]]
1253 // CHECK4:       else7:
1254 // CHECK4-NEXT:    br label [[IFCONT8]]
1255 // CHECK4:       ifcont8:
1256 // CHECK4-NEXT:    ret void
1257 // CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func18
1258 // CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
1259 // CHECK4-NEXT:  entry:
1260 // CHECK4-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
1261 // CHECK4-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
1262 // CHECK4-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
1263 // CHECK4-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
1264 // CHECK4-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
1265 // CHECK4-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1266 // CHECK4-NEXT:    [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1267 // CHECK4-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
1268 // CHECK4-NEXT:    [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1269 // CHECK4-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
1270 // CHECK4-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
1271 // CHECK4-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
1272 // CHECK4-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
1273 // CHECK4-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
1274 // CHECK4-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
1275 // CHECK4:       then:
1276 // CHECK4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
1277 // CHECK4-NEXT:    [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4
1278 // CHECK4-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
1279 // CHECK4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
1280 // CHECK4-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4
1281 // CHECK4-NEXT:    store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4
1282 // CHECK4-NEXT:    br label [[IFCONT:%.*]]
1283 // CHECK4:       else:
1284 // CHECK4-NEXT:    br label [[IFCONT]]
1285 // CHECK4:       ifcont:
1286 // CHECK4-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
1287 // CHECK4-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
1288 // CHECK4-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
1289 // CHECK4-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
1290 // CHECK4:       then4:
1291 // CHECK4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
1292 // CHECK4-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
1293 // CHECK4-NEXT:    [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
1294 // CHECK4-NEXT:    [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32*
1295 // CHECK4-NEXT:    [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4
1296 // CHECK4-NEXT:    store i32 [[TMP15]], i32* [[TMP14]], align 4
1297 // CHECK4-NEXT:    br label [[IFCONT6:%.*]]
1298 // CHECK4:       else5:
1299 // CHECK4-NEXT:    br label [[IFCONT6]]
1300 // CHECK4:       ifcont6:
1301 // CHECK4-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
1302 // CHECK4-NEXT:    [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
1303 // CHECK4-NEXT:    br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
1304 // CHECK4:       then8:
1305 // CHECK4-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
1306 // CHECK4-NEXT:    [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
1307 // CHECK4-NEXT:    [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16*
1308 // CHECK4-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
1309 // CHECK4-NEXT:    [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)*
1310 // CHECK4-NEXT:    [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2
1311 // CHECK4-NEXT:    store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2
1312 // CHECK4-NEXT:    br label [[IFCONT10:%.*]]
1313 // CHECK4:       else9:
1314 // CHECK4-NEXT:    br label [[IFCONT10]]
1315 // CHECK4:       ifcont10:
1316 // CHECK4-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
1317 // CHECK4-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4
1318 // CHECK4-NEXT:    [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]]
1319 // CHECK4-NEXT:    br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
1320 // CHECK4:       then12:
1321 // CHECK4-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
1322 // CHECK4-NEXT:    [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)*
1323 // CHECK4-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
1324 // CHECK4-NEXT:    [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4
1325 // CHECK4-NEXT:    [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16*
1326 // CHECK4-NEXT:    [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2
1327 // CHECK4-NEXT:    store i16 [[TMP28]], i16* [[TMP27]], align 2
1328 // CHECK4-NEXT:    br label [[IFCONT14:%.*]]
1329 // CHECK4:       else13:
1330 // CHECK4-NEXT:    br label [[IFCONT14]]
1331 // CHECK4:       ifcont14:
1332 // CHECK4-NEXT:    ret void
1333 // CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func19
1334 // CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
1335 // CHECK4-NEXT:  entry:
1336 // CHECK4-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
1337 // CHECK4-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
1338 // CHECK4-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
1339 // CHECK4-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
1340 // CHECK4-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
1341 // CHECK4-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
1342 // CHECK4-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
1343 // CHECK4-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
1344 // CHECK4-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
1345 // CHECK4-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4*
1346 // CHECK4-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
1347 // CHECK4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
1348 // CHECK4-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
1349 // CHECK4-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
1350 // CHECK4-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0
1351 // CHECK4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]]
1352 // CHECK4-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
1353 // CHECK4-NEXT:    store i32 [[TMP12]], i32* [[TMP11]], align 128
1354 // CHECK4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
1355 // CHECK4-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
1356 // CHECK4-NEXT:    [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
1357 // CHECK4-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1
1358 // CHECK4-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]]
1359 // CHECK4-NEXT:    [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2
1360 // CHECK4-NEXT:    store i16 [[TMP17]], i16* [[TMP16]], align 128
1361 // CHECK4-NEXT:    ret void
1362 // CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func20
1363 // CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
1364 // CHECK4-NEXT:  entry:
1365 // CHECK4-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
1366 // CHECK4-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
1367 // CHECK4-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
1368 // CHECK4-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
1369 // CHECK4-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
1370 // CHECK4-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
1371 // CHECK4-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
1372 // CHECK4-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
1373 // CHECK4-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4*
1374 // CHECK4-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
1375 // CHECK4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
1376 // CHECK4-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0
1377 // CHECK4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]]
1378 // CHECK4-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
1379 // CHECK4-NEXT:    store i8* [[TMP8]], i8** [[TMP6]], align 4
1380 // CHECK4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
1381 // CHECK4-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1
1382 // CHECK4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]]
1383 // CHECK4-NEXT:    [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
1384 // CHECK4-NEXT:    store i8* [[TMP11]], i8** [[TMP9]], align 4
1385 // CHECK4-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
1386 // CHECK4-NEXT:    [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
1387 // CHECK4-NEXT:    call void @"_omp$reduction$reduction_func16"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]]
1388 // CHECK4-NEXT:    ret void
1389 // CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func21
1390 // CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
1391 // CHECK4-NEXT:  entry:
1392 // CHECK4-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
1393 // CHECK4-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
1394 // CHECK4-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
1395 // CHECK4-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
1396 // CHECK4-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
1397 // CHECK4-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
1398 // CHECK4-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
1399 // CHECK4-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
1400 // CHECK4-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
1401 // CHECK4-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4*
1402 // CHECK4-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
1403 // CHECK4-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
1404 // CHECK4-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
1405 // CHECK4-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
1406 // CHECK4-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0
1407 // CHECK4-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]]
1408 // CHECK4-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128
1409 // CHECK4-NEXT:    store i32 [[TMP12]], i32* [[TMP10]], align 4
1410 // CHECK4-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
1411 // CHECK4-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
1412 // CHECK4-NEXT:    [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
1413 // CHECK4-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1
1414 // CHECK4-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]]
1415 // CHECK4-NEXT:    [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128
1416 // CHECK4-NEXT:    store i16 [[TMP17]], i16* [[TMP15]], align 2
1417 // CHECK4-NEXT:    ret void
1418 // CHECK4-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func22
1419 // CHECK4-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
1420 // CHECK4-NEXT:  entry:
1421 // CHECK4-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
1422 // CHECK4-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
1423 // CHECK4-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
1424 // CHECK4-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
1425 // CHECK4-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
1426 // CHECK4-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
1427 // CHECK4-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
1428 // CHECK4-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
1429 // CHECK4-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4*
1430 // CHECK4-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
1431 // CHECK4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
1432 // CHECK4-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0
1433 // CHECK4-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]]
1434 // CHECK4-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
1435 // CHECK4-NEXT:    store i8* [[TMP8]], i8** [[TMP6]], align 4
1436 // CHECK4-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
1437 // CHECK4-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1
1438 // CHECK4-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]]
1439 // CHECK4-NEXT:    [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
1440 // CHECK4-NEXT:    store i8* [[TMP11]], i8** [[TMP9]], align 4
1441 // CHECK4-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
1442 // CHECK4-NEXT:    [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
1443 // CHECK4-NEXT:    call void @"_omp$reduction$reduction_func16"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]]
1444 // CHECK4-NEXT:    ret void
1445 // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker
1446 // CHECK5-SAME: () #[[ATTR0:[0-9]+]] {
1447 // CHECK5-NEXT:  entry:
1448 // CHECK5-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
1449 // CHECK5-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
1450 // CHECK5-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
1451 // CHECK5-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
1452 // CHECK5-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
1453 // CHECK5:       .await.work:
1454 // CHECK5-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1455 // CHECK5-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
1456 // CHECK5-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
1457 // CHECK5-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
1458 // CHECK5-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
1459 // CHECK5-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
1460 // CHECK5-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
1461 // CHECK5:       .select.workers:
1462 // CHECK5-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
1463 // CHECK5-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
1464 // CHECK5-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
1465 // CHECK5:       .execute.parallel:
1466 // CHECK5-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
1467 // CHECK5-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
1468 // CHECK5-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
1469 // CHECK5-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
1470 // CHECK5:       .terminate.parallel:
1471 // CHECK5-NEXT:    call void @__kmpc_kernel_end_parallel()
1472 // CHECK5-NEXT:    br label [[DOTBARRIER_PARALLEL]]
1473 // CHECK5:       .barrier.parallel:
1474 // CHECK5-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1475 // CHECK5-NEXT:    br label [[DOTAWAIT_WORK]]
1476 // CHECK5:       .exit:
1477 // CHECK5-NEXT:    ret void
1478 // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23
1479 // CHECK5-SAME: (double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1:[0-9]+]] {
1480 // CHECK5-NEXT:  entry:
1481 // CHECK5-NEXT:    [[E_ADDR:%.*]] = alloca double*, align 4
1482 // CHECK5-NEXT:    [[E7:%.*]] = alloca double, align 8
1483 // CHECK5-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1484 // CHECK5-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
1485 // CHECK5-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
1486 // CHECK5-NEXT:    store double* [[E]], double** [[E_ADDR]], align 4
1487 // CHECK5-NEXT:    [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
1488 // CHECK5-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1489 // CHECK5-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1490 // CHECK5-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1491 // CHECK5-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
1492 // CHECK5-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
1493 // CHECK5-NEXT:    br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
1494 // CHECK5:       .worker:
1495 // CHECK5-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker() #[[ATTR3:[0-9]+]]
1496 // CHECK5-NEXT:    br label [[DOTEXIT:%.*]]
1497 // CHECK5:       .mastercheck:
1498 // CHECK5-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1499 // CHECK5-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1500 // CHECK5-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1501 // CHECK5-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
1502 // CHECK5-NEXT:    [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
1503 // CHECK5-NEXT:    [[TMP4:%.*]] = xor i32 [[TMP2]], -1
1504 // CHECK5-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
1505 // CHECK5-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
1506 // CHECK5-NEXT:    br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
1507 // CHECK5:       .master:
1508 // CHECK5-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1509 // CHECK5-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1510 // CHECK5-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
1511 // CHECK5-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
1512 // CHECK5-NEXT:    call void @__kmpc_data_sharing_init_stack()
1513 // CHECK5-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
1514 // CHECK5-NEXT:    [[TMP7:%.*]] = load double, double* [[TMP0]], align 8
1515 // CHECK5-NEXT:    store double [[TMP7]], double* [[E7]], align 8
1516 // CHECK5-NEXT:    store i32 [[TMP6]], i32* [[DOTTHREADID_TEMP_]], align 4
1517 // CHECK5-NEXT:    call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E7]]) #[[ATTR3]]
1518 // CHECK5-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
1519 // CHECK5:       .termination.notifier:
1520 // CHECK5-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
1521 // CHECK5-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1522 // CHECK5-NEXT:    br label [[DOTEXIT]]
1523 // CHECK5:       .exit:
1524 // CHECK5-NEXT:    ret void
1525 // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__
1526 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] {
1527 // CHECK5-NEXT:  entry:
1528 // CHECK5-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
1529 // CHECK5-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
1530 // CHECK5-NEXT:    [[E_ADDR:%.*]] = alloca double*, align 4
1531 // CHECK5-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
1532 // CHECK5-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
1533 // CHECK5-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
1534 // CHECK5-NEXT:    store double* [[E]], double** [[E_ADDR]], align 4
1535 // CHECK5-NEXT:    [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
1536 // CHECK5-NEXT:    [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1)
1537 // CHECK5-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty*
1538 // CHECK5-NEXT:    [[E1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0
1539 // CHECK5-NEXT:    store double 0.000000e+00, double* [[E1]], align 8
1540 // CHECK5-NEXT:    [[TMP3:%.*]] = load double, double* [[E1]], align 8
1541 // CHECK5-NEXT:    [[ADD:%.*]] = fadd double [[TMP3]], 5.000000e+00
1542 // CHECK5-NEXT:    store double [[ADD]], double* [[E1]], align 8
1543 // CHECK5-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
1544 // CHECK5-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
1545 // CHECK5-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
1546 // CHECK5-NEXT:    [[TMP7:%.*]] = bitcast double* [[E1]] to i8*
1547 // CHECK5-NEXT:    store i8* [[TMP7]], i8** [[TMP6]], align 4
1548 // CHECK5-NEXT:    [[TMP8:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
1549 // CHECK5-NEXT:    [[TMP9:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
1550 // CHECK5-NEXT:    [[TMP10:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i8* [[TMP9]], i32 1024, i8* [[TMP8]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func)
1551 // CHECK5-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 1
1552 // CHECK5-NEXT:    br i1 [[TMP11]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
1553 // CHECK5:       .omp.reduction.then:
1554 // CHECK5-NEXT:    [[TMP12:%.*]] = load double, double* [[TMP0]], align 8
1555 // CHECK5-NEXT:    [[TMP13:%.*]] = load double, double* [[E1]], align 8
1556 // CHECK5-NEXT:    [[ADD2:%.*]] = fadd double [[TMP12]], [[TMP13]]
1557 // CHECK5-NEXT:    store double [[ADD2]], double* [[TMP0]], align 8
1558 // CHECK5-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
1559 // CHECK5-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
1560 // CHECK5:       .omp.reduction.done:
1561 // CHECK5-NEXT:    call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]])
1562 // CHECK5-NEXT:    ret void
1563 // CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
1564 // CHECK5-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
1565 // CHECK5-NEXT:  entry:
1566 // CHECK5-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
1567 // CHECK5-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
1568 // CHECK5-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2
1569 // CHECK5-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2
1570 // CHECK5-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4
1571 // CHECK5-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8
1572 // CHECK5-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
1573 // CHECK5-NEXT:    store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
1574 // CHECK5-NEXT:    store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
1575 // CHECK5-NEXT:    store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
1576 // CHECK5-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
1577 // CHECK5-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]*
1578 // CHECK5-NEXT:    [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
1579 // CHECK5-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
1580 // CHECK5-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
1581 // CHECK5-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
1582 // CHECK5-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
1583 // CHECK5-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
1584 // CHECK5-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double*
1585 // CHECK5-NEXT:    [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1
1586 // CHECK5-NEXT:    [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8*
1587 // CHECK5-NEXT:    [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64*
1588 // CHECK5-NEXT:    [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
1589 // CHECK5-NEXT:    [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8
1590 // CHECK5-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1591 // CHECK5-NEXT:    [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
1592 // CHECK5-NEXT:    [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]])
1593 // CHECK5-NEXT:    store i64 [[TMP19]], i64* [[TMP16]], align 8
1594 // CHECK5-NEXT:    [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1
1595 // CHECK5-NEXT:    [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1
1596 // CHECK5-NEXT:    [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
1597 // CHECK5-NEXT:    store i8* [[TMP22]], i8** [[TMP11]], align 4
1598 // CHECK5-NEXT:    [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0
1599 // CHECK5-NEXT:    [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1
1600 // CHECK5-NEXT:    [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
1601 // CHECK5-NEXT:    [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]]
1602 // CHECK5-NEXT:    [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2
1603 // CHECK5-NEXT:    [[TMP28:%.*]] = and i16 [[TMP6]], 1
1604 // CHECK5-NEXT:    [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0
1605 // CHECK5-NEXT:    [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]]
1606 // CHECK5-NEXT:    [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0
1607 // CHECK5-NEXT:    [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]]
1608 // CHECK5-NEXT:    [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]]
1609 // CHECK5-NEXT:    [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]]
1610 // CHECK5-NEXT:    br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]]
1611 // CHECK5:       then:
1612 // CHECK5-NEXT:    [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8*
1613 // CHECK5-NEXT:    [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
1614 // CHECK5-NEXT:    call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR3]]
1615 // CHECK5-NEXT:    br label [[IFCONT:%.*]]
1616 // CHECK5:       else:
1617 // CHECK5-NEXT:    br label [[IFCONT]]
1618 // CHECK5:       ifcont:
1619 // CHECK5-NEXT:    [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1
1620 // CHECK5-NEXT:    [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
1621 // CHECK5-NEXT:    [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
1622 // CHECK5-NEXT:    br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
1623 // CHECK5:       then4:
1624 // CHECK5-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
1625 // CHECK5-NEXT:    [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 4
1626 // CHECK5-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
1627 // CHECK5-NEXT:    [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 4
1628 // CHECK5-NEXT:    [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double*
1629 // CHECK5-NEXT:    [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double*
1630 // CHECK5-NEXT:    [[TMP46:%.*]] = load double, double* [[TMP44]], align 8
1631 // CHECK5-NEXT:    store double [[TMP46]], double* [[TMP45]], align 8
1632 // CHECK5-NEXT:    br label [[IFCONT6:%.*]]
1633 // CHECK5:       else5:
1634 // CHECK5-NEXT:    br label [[IFCONT6]]
1635 // CHECK5:       ifcont6:
1636 // CHECK5-NEXT:    ret void
1637 // CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
1638 // CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
1639 // CHECK5-NEXT:  entry:
1640 // CHECK5-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
1641 // CHECK5-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
1642 // CHECK5-NEXT:    [[DOTCNT_ADDR:%.*]] = alloca i32, align 4
1643 // CHECK5-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
1644 // CHECK5-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
1645 // CHECK5-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
1646 // CHECK5-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1647 // CHECK5-NEXT:    [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1648 // CHECK5-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
1649 // CHECK5-NEXT:    [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1650 // CHECK5-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
1651 // CHECK5-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
1652 // CHECK5-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
1653 // CHECK5-NEXT:    store i32 0, i32* [[DOTCNT_ADDR]], align 4
1654 // CHECK5-NEXT:    br label [[PRECOND:%.*]]
1655 // CHECK5:       precond:
1656 // CHECK5-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4
1657 // CHECK5-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2
1658 // CHECK5-NEXT:    br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]]
1659 // CHECK5:       body:
1660 // CHECK5-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]])
1661 // CHECK5-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
1662 // CHECK5-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
1663 // CHECK5:       then:
1664 // CHECK5-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
1665 // CHECK5-NEXT:    [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 4
1666 // CHECK5-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32*
1667 // CHECK5-NEXT:    [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]]
1668 // CHECK5-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
1669 // CHECK5-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
1670 // CHECK5-NEXT:    store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4
1671 // CHECK5-NEXT:    br label [[IFCONT:%.*]]
1672 // CHECK5:       else:
1673 // CHECK5-NEXT:    br label [[IFCONT]]
1674 // CHECK5:       ifcont:
1675 // CHECK5-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
1676 // CHECK5-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
1677 // CHECK5-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]]
1678 // CHECK5-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
1679 // CHECK5:       then4:
1680 // CHECK5-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
1681 // CHECK5-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
1682 // CHECK5-NEXT:    [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4
1683 // CHECK5-NEXT:    [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32*
1684 // CHECK5-NEXT:    [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]]
1685 // CHECK5-NEXT:    [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4
1686 // CHECK5-NEXT:    store i32 [[TMP19]], i32* [[TMP18]], align 4
1687 // CHECK5-NEXT:    br label [[IFCONT6:%.*]]
1688 // CHECK5:       else5:
1689 // CHECK5-NEXT:    br label [[IFCONT6]]
1690 // CHECK5:       ifcont6:
1691 // CHECK5-NEXT:    [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1
1692 // CHECK5-NEXT:    store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4
1693 // CHECK5-NEXT:    br label [[PRECOND]]
1694 // CHECK5:       exit:
1695 // CHECK5-NEXT:    ret void
1696 // CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func
1697 // CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
1698 // CHECK5-NEXT:  entry:
1699 // CHECK5-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
1700 // CHECK5-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
1701 // CHECK5-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
1702 // CHECK5-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
1703 // CHECK5-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
1704 // CHECK5-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
1705 // CHECK5-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
1706 // CHECK5-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
1707 // CHECK5-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
1708 // CHECK5-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
1709 // CHECK5-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
1710 // CHECK5-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
1711 // CHECK5-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
1712 // CHECK5-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
1713 // CHECK5-NEXT:    [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
1714 // CHECK5-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]]
1715 // CHECK5-NEXT:    [[TMP12:%.*]] = load double, double* [[TMP10]], align 8
1716 // CHECK5-NEXT:    store double [[TMP12]], double* [[TMP11]], align 128
1717 // CHECK5-NEXT:    ret void
1718 // CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func
1719 // CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
1720 // CHECK5-NEXT:  entry:
1721 // CHECK5-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
1722 // CHECK5-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
1723 // CHECK5-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
1724 // CHECK5-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
1725 // CHECK5-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
1726 // CHECK5-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
1727 // CHECK5-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
1728 // CHECK5-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
1729 // CHECK5-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
1730 // CHECK5-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
1731 // CHECK5-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
1732 // CHECK5-NEXT:    [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
1733 // CHECK5-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]]
1734 // CHECK5-NEXT:    [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
1735 // CHECK5-NEXT:    store i8* [[TMP8]], i8** [[TMP6]], align 4
1736 // CHECK5-NEXT:    [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
1737 // CHECK5-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
1738 // CHECK5-NEXT:    call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]]
1739 // CHECK5-NEXT:    ret void
1740 // CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func
1741 // CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
1742 // CHECK5-NEXT:  entry:
1743 // CHECK5-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
1744 // CHECK5-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
1745 // CHECK5-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
1746 // CHECK5-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
1747 // CHECK5-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
1748 // CHECK5-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
1749 // CHECK5-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
1750 // CHECK5-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
1751 // CHECK5-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
1752 // CHECK5-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
1753 // CHECK5-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
1754 // CHECK5-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
1755 // CHECK5-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
1756 // CHECK5-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
1757 // CHECK5-NEXT:    [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
1758 // CHECK5-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]]
1759 // CHECK5-NEXT:    [[TMP12:%.*]] = load double, double* [[TMP11]], align 128
1760 // CHECK5-NEXT:    store double [[TMP12]], double* [[TMP10]], align 8
1761 // CHECK5-NEXT:    ret void
1762 // CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func
1763 // CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
1764 // CHECK5-NEXT:  entry:
1765 // CHECK5-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
1766 // CHECK5-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
1767 // CHECK5-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
1768 // CHECK5-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
1769 // CHECK5-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
1770 // CHECK5-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
1771 // CHECK5-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
1772 // CHECK5-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
1773 // CHECK5-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
1774 // CHECK5-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
1775 // CHECK5-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
1776 // CHECK5-NEXT:    [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
1777 // CHECK5-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]]
1778 // CHECK5-NEXT:    [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
1779 // CHECK5-NEXT:    store i8* [[TMP8]], i8** [[TMP6]], align 4
1780 // CHECK5-NEXT:    [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
1781 // CHECK5-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
1782 // CHECK5-NEXT:    call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]]
1783 // CHECK5-NEXT:    ret void
1784 // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker
1785 // CHECK5-SAME: () #[[ATTR0]] {
1786 // CHECK5-NEXT:  entry:
1787 // CHECK5-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
1788 // CHECK5-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
1789 // CHECK5-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
1790 // CHECK5-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
1791 // CHECK5-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
1792 // CHECK5:       .await.work:
1793 // CHECK5-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1794 // CHECK5-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
1795 // CHECK5-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
1796 // CHECK5-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
1797 // CHECK5-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
1798 // CHECK5-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
1799 // CHECK5-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
1800 // CHECK5:       .select.workers:
1801 // CHECK5-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
1802 // CHECK5-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
1803 // CHECK5-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
1804 // CHECK5:       .execute.parallel:
1805 // CHECK5-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
1806 // CHECK5-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
1807 // CHECK5-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
1808 // CHECK5-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
1809 // CHECK5:       .terminate.parallel:
1810 // CHECK5-NEXT:    call void @__kmpc_kernel_end_parallel()
1811 // CHECK5-NEXT:    br label [[DOTBARRIER_PARALLEL]]
1812 // CHECK5:       .barrier.parallel:
1813 // CHECK5-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1814 // CHECK5-NEXT:    br label [[DOTAWAIT_WORK]]
1815 // CHECK5:       .exit:
1816 // CHECK5-NEXT:    ret void
1817 // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29
1818 // CHECK5-SAME: (i32 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR1]] {
1819 // CHECK5-NEXT:  entry:
1820 // CHECK5-NEXT:    [[C_ADDR:%.*]] = alloca i32, align 4
1821 // CHECK5-NEXT:    [[D_ADDR:%.*]] = alloca i32, align 4
1822 // CHECK5-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
1823 // CHECK5-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
1824 // CHECK5-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
1825 // CHECK5-NEXT:    store i32 [[C]], i32* [[C_ADDR]], align 4
1826 // CHECK5-NEXT:    store i32 [[D]], i32* [[D_ADDR]], align 4
1827 // CHECK5-NEXT:    [[CONV:%.*]] = bitcast i32* [[C_ADDR]] to i8*
1828 // CHECK5-NEXT:    [[CONV1:%.*]] = bitcast i32* [[D_ADDR]] to float*
1829 // CHECK5-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1830 // CHECK5-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1831 // CHECK5-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1832 // CHECK5-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
1833 // CHECK5-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
1834 // CHECK5-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
1835 // CHECK5:       .worker:
1836 // CHECK5-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker() #[[ATTR3]]
1837 // CHECK5-NEXT:    br label [[DOTEXIT:%.*]]
1838 // CHECK5:       .mastercheck:
1839 // CHECK5-NEXT:    [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
1840 // CHECK5-NEXT:    [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1841 // CHECK5-NEXT:    [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1842 // CHECK5-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1
1843 // CHECK5-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1
1844 // CHECK5-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
1845 // CHECK5-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
1846 // CHECK5-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]]
1847 // CHECK5-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
1848 // CHECK5:       .master:
1849 // CHECK5-NEXT:    [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
1850 // CHECK5-NEXT:    [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1851 // CHECK5-NEXT:    [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]]
1852 // CHECK5-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1)
1853 // CHECK5-NEXT:    call void @__kmpc_data_sharing_init_stack()
1854 // CHECK5-NEXT:    [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1)
1855 // CHECK5-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1*
1856 // CHECK5-NEXT:    [[TMP7:%.*]] = load i8, i8* [[CONV]], align 4
1857 // CHECK5-NEXT:    [[C8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1
1858 // CHECK5-NEXT:    store i8 [[TMP7]], i8* [[C8]], align 4
1859 // CHECK5-NEXT:    [[TMP8:%.*]] = load float, float* [[CONV1]], align 4
1860 // CHECK5-NEXT:    [[D9:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0
1861 // CHECK5-NEXT:    store float [[TMP8]], float* [[D9]], align 4
1862 // CHECK5-NEXT:    [[TMP9:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
1863 // CHECK5-NEXT:    store i32 [[TMP9]], i32* [[DOTTHREADID_TEMP_]], align 4
1864 // CHECK5-NEXT:    call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C8]], float* [[D9]]) #[[ATTR3]]
1865 // CHECK5-NEXT:    call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]])
1866 // CHECK5-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
1867 // CHECK5:       .termination.notifier:
1868 // CHECK5-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
1869 // CHECK5-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
1870 // CHECK5-NEXT:    br label [[DOTEXIT]]
1871 // CHECK5:       .exit:
1872 // CHECK5-NEXT:    ret void
1873 // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__1
1874 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] {
1875 // CHECK5-NEXT:  entry:
1876 // CHECK5-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
1877 // CHECK5-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
1878 // CHECK5-NEXT:    [[C_ADDR:%.*]] = alloca i8*, align 4
1879 // CHECK5-NEXT:    [[D_ADDR:%.*]] = alloca float*, align 4
1880 // CHECK5-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
1881 // CHECK5-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
1882 // CHECK5-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
1883 // CHECK5-NEXT:    store i8* [[C]], i8** [[C_ADDR]], align 4
1884 // CHECK5-NEXT:    store float* [[D]], float** [[D_ADDR]], align 4
1885 // CHECK5-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4
1886 // CHECK5-NEXT:    [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4
1887 // CHECK5-NEXT:    [[TMP2:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1)
1888 // CHECK5-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to %struct._globalized_locals_ty.2*
1889 // CHECK5-NEXT:    [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], %struct._globalized_locals_ty.2* [[TMP3]], i32 0, i32 1
1890 // CHECK5-NEXT:    [[D2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], %struct._globalized_locals_ty.2* [[TMP3]], i32 0, i32 0
1891 // CHECK5-NEXT:    store i8 0, i8* [[C1]], align 4
1892 // CHECK5-NEXT:    store float 1.000000e+00, float* [[D2]], align 4
1893 // CHECK5-NEXT:    [[TMP4:%.*]] = load i8, i8* [[C1]], align 4
1894 // CHECK5-NEXT:    [[CONV:%.*]] = sext i8 [[TMP4]] to i32
1895 // CHECK5-NEXT:    [[XOR:%.*]] = xor i32 [[CONV]], 2
1896 // CHECK5-NEXT:    [[CONV3:%.*]] = trunc i32 [[XOR]] to i8
1897 // CHECK5-NEXT:    store i8 [[CONV3]], i8* [[C1]], align 4
1898 // CHECK5-NEXT:    [[TMP5:%.*]] = load float, float* [[D2]], align 4
1899 // CHECK5-NEXT:    [[MUL:%.*]] = fmul float [[TMP5]], 3.300000e+01
1900 // CHECK5-NEXT:    store float [[MUL]], float* [[D2]], align 4
1901 // CHECK5-NEXT:    [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
1902 // CHECK5-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
1903 // CHECK5-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
1904 // CHECK5-NEXT:    store i8* [[C1]], i8** [[TMP8]], align 4
1905 // CHECK5-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
1906 // CHECK5-NEXT:    [[TMP10:%.*]] = bitcast float* [[D2]] to i8*
1907 // CHECK5-NEXT:    store i8* [[TMP10]], i8** [[TMP9]], align 4
1908 // CHECK5-NEXT:    [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
1909 // CHECK5-NEXT:    [[TMP12:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
1910 // CHECK5-NEXT:    [[TMP13:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], i8* [[TMP12]], i32 1024, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func5, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func6, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func8)
1911 // CHECK5-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 1
1912 // CHECK5-NEXT:    br i1 [[TMP14]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
1913 // CHECK5:       .omp.reduction.then:
1914 // CHECK5-NEXT:    [[TMP15:%.*]] = load i8, i8* [[TMP0]], align 1
1915 // CHECK5-NEXT:    [[CONV4:%.*]] = sext i8 [[TMP15]] to i32
1916 // CHECK5-NEXT:    [[TMP16:%.*]] = load i8, i8* [[C1]], align 4
1917 // CHECK5-NEXT:    [[CONV5:%.*]] = sext i8 [[TMP16]] to i32
1918 // CHECK5-NEXT:    [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]]
1919 // CHECK5-NEXT:    [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8
1920 // CHECK5-NEXT:    store i8 [[CONV7]], i8* [[TMP0]], align 1
1921 // CHECK5-NEXT:    [[TMP17:%.*]] = load float, float* [[TMP1]], align 4
1922 // CHECK5-NEXT:    [[TMP18:%.*]] = load float, float* [[D2]], align 4
1923 // CHECK5-NEXT:    [[MUL8:%.*]] = fmul float [[TMP17]], [[TMP18]]
1924 // CHECK5-NEXT:    store float [[MUL8]], float* [[TMP1]], align 4
1925 // CHECK5-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]])
1926 // CHECK5-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
1927 // CHECK5:       .omp.reduction.done:
1928 // CHECK5-NEXT:    call void @__kmpc_data_sharing_pop_stack(i8* [[TMP2]])
1929 // CHECK5-NEXT:    ret void
1930 // CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3
1931 // CHECK5-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
1932 // CHECK5-NEXT:  entry:
1933 // CHECK5-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
1934 // CHECK5-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
1935 // CHECK5-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2
1936 // CHECK5-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2
1937 // CHECK5-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
1938 // CHECK5-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1
1939 // CHECK5-NEXT:    [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4
1940 // CHECK5-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
1941 // CHECK5-NEXT:    store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
1942 // CHECK5-NEXT:    store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
1943 // CHECK5-NEXT:    store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
1944 // CHECK5-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
1945 // CHECK5-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
1946 // CHECK5-NEXT:    [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
1947 // CHECK5-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
1948 // CHECK5-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
1949 // CHECK5-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
1950 // CHECK5-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
1951 // CHECK5-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
1952 // CHECK5-NEXT:    [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1
1953 // CHECK5-NEXT:    [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1
1954 // CHECK5-NEXT:    [[TMP14:%.*]] = sext i8 [[TMP13]] to i32
1955 // CHECK5-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1956 // CHECK5-NEXT:    [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
1957 // CHECK5-NEXT:    [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]])
1958 // CHECK5-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
1959 // CHECK5-NEXT:    store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1
1960 // CHECK5-NEXT:    [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1
1961 // CHECK5-NEXT:    [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
1962 // CHECK5-NEXT:    store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4
1963 // CHECK5-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
1964 // CHECK5-NEXT:    [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 4
1965 // CHECK5-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
1966 // CHECK5-NEXT:    [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float*
1967 // CHECK5-NEXT:    [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i32 1
1968 // CHECK5-NEXT:    [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8*
1969 // CHECK5-NEXT:    [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32*
1970 // CHECK5-NEXT:    [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32*
1971 // CHECK5-NEXT:    [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4
1972 // CHECK5-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
1973 // CHECK5-NEXT:    [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
1974 // CHECK5-NEXT:    [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
1975 // CHECK5-NEXT:    store i32 [[TMP30]], i32* [[TMP27]], align 4
1976 // CHECK5-NEXT:    [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i32 1
1977 // CHECK5-NEXT:    [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1
1978 // CHECK5-NEXT:    [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
1979 // CHECK5-NEXT:    store i8* [[TMP33]], i8** [[TMP22]], align 4
1980 // CHECK5-NEXT:    [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0
1981 // CHECK5-NEXT:    [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1
1982 // CHECK5-NEXT:    [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
1983 // CHECK5-NEXT:    [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]]
1984 // CHECK5-NEXT:    [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2
1985 // CHECK5-NEXT:    [[TMP39:%.*]] = and i16 [[TMP6]], 1
1986 // CHECK5-NEXT:    [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0
1987 // CHECK5-NEXT:    [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]]
1988 // CHECK5-NEXT:    [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0
1989 // CHECK5-NEXT:    [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]]
1990 // CHECK5-NEXT:    [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]]
1991 // CHECK5-NEXT:    [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]]
1992 // CHECK5-NEXT:    br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]]
1993 // CHECK5:       then:
1994 // CHECK5-NEXT:    [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
1995 // CHECK5-NEXT:    [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
1996 // CHECK5-NEXT:    call void @"_omp$reduction$reduction_func2"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR3]]
1997 // CHECK5-NEXT:    br label [[IFCONT:%.*]]
1998 // CHECK5:       else:
1999 // CHECK5-NEXT:    br label [[IFCONT]]
2000 // CHECK5:       ifcont:
2001 // CHECK5-NEXT:    [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1
2002 // CHECK5-NEXT:    [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
2003 // CHECK5-NEXT:    [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]]
2004 // CHECK5-NEXT:    br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
2005 // CHECK5:       then6:
2006 // CHECK5-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
2007 // CHECK5-NEXT:    [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 4
2008 // CHECK5-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
2009 // CHECK5-NEXT:    [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4
2010 // CHECK5-NEXT:    [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1
2011 // CHECK5-NEXT:    store i8 [[TMP55]], i8* [[TMP54]], align 1
2012 // CHECK5-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
2013 // CHECK5-NEXT:    [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4
2014 // CHECK5-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
2015 // CHECK5-NEXT:    [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4
2016 // CHECK5-NEXT:    [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float*
2017 // CHECK5-NEXT:    [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float*
2018 // CHECK5-NEXT:    [[TMP62:%.*]] = load float, float* [[TMP60]], align 4
2019 // CHECK5-NEXT:    store float [[TMP62]], float* [[TMP61]], align 4
2020 // CHECK5-NEXT:    br label [[IFCONT8:%.*]]
2021 // CHECK5:       else7:
2022 // CHECK5-NEXT:    br label [[IFCONT8]]
2023 // CHECK5:       ifcont8:
2024 // CHECK5-NEXT:    ret void
2025 // CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4
2026 // CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
2027 // CHECK5-NEXT:  entry:
2028 // CHECK5-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
2029 // CHECK5-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
2030 // CHECK5-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
2031 // CHECK5-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
2032 // CHECK5-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
2033 // CHECK5-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
2034 // CHECK5-NEXT:    [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
2035 // CHECK5-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
2036 // CHECK5-NEXT:    [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
2037 // CHECK5-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
2038 // CHECK5-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
2039 // CHECK5-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
2040 // CHECK5-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
2041 // CHECK5-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
2042 // CHECK5-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
2043 // CHECK5:       then:
2044 // CHECK5-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
2045 // CHECK5-NEXT:    [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4
2046 // CHECK5-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
2047 // CHECK5-NEXT:    [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)*
2048 // CHECK5-NEXT:    [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1
2049 // CHECK5-NEXT:    store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1
2050 // CHECK5-NEXT:    br label [[IFCONT:%.*]]
2051 // CHECK5:       else:
2052 // CHECK5-NEXT:    br label [[IFCONT]]
2053 // CHECK5:       ifcont:
2054 // CHECK5-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
2055 // CHECK5-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
2056 // CHECK5-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
2057 // CHECK5-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
2058 // CHECK5:       then4:
2059 // CHECK5-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
2060 // CHECK5-NEXT:    [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)*
2061 // CHECK5-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
2062 // CHECK5-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
2063 // CHECK5-NEXT:    [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1
2064 // CHECK5-NEXT:    store i8 [[TMP15]], i8* [[TMP14]], align 1
2065 // CHECK5-NEXT:    br label [[IFCONT6:%.*]]
2066 // CHECK5:       else5:
2067 // CHECK5-NEXT:    br label [[IFCONT6]]
2068 // CHECK5:       ifcont6:
2069 // CHECK5-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
2070 // CHECK5-NEXT:    [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
2071 // CHECK5-NEXT:    br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
2072 // CHECK5:       then8:
2073 // CHECK5-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
2074 // CHECK5-NEXT:    [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
2075 // CHECK5-NEXT:    [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32*
2076 // CHECK5-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
2077 // CHECK5-NEXT:    [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4
2078 // CHECK5-NEXT:    store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4
2079 // CHECK5-NEXT:    br label [[IFCONT10:%.*]]
2080 // CHECK5:       else9:
2081 // CHECK5-NEXT:    br label [[IFCONT10]]
2082 // CHECK5:       ifcont10:
2083 // CHECK5-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
2084 // CHECK5-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4
2085 // CHECK5-NEXT:    [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]]
2086 // CHECK5-NEXT:    br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
2087 // CHECK5:       then12:
2088 // CHECK5-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
2089 // CHECK5-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
2090 // CHECK5-NEXT:    [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 4
2091 // CHECK5-NEXT:    [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32*
2092 // CHECK5-NEXT:    [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4
2093 // CHECK5-NEXT:    store i32 [[TMP26]], i32* [[TMP25]], align 4
2094 // CHECK5-NEXT:    br label [[IFCONT14:%.*]]
2095 // CHECK5:       else13:
2096 // CHECK5-NEXT:    br label [[IFCONT14]]
2097 // CHECK5:       ifcont14:
2098 // CHECK5-NEXT:    ret void
2099 // CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func5
2100 // CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
2101 // CHECK5-NEXT:  entry:
2102 // CHECK5-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
2103 // CHECK5-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
2104 // CHECK5-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
2105 // CHECK5-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
2106 // CHECK5-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
2107 // CHECK5-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
2108 // CHECK5-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
2109 // CHECK5-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
2110 // CHECK5-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
2111 // CHECK5-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3*
2112 // CHECK5-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
2113 // CHECK5-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
2114 // CHECK5-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
2115 // CHECK5-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0
2116 // CHECK5-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]]
2117 // CHECK5-NEXT:    [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1
2118 // CHECK5-NEXT:    store i8 [[TMP11]], i8* [[TMP10]], align 128
2119 // CHECK5-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
2120 // CHECK5-NEXT:    [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
2121 // CHECK5-NEXT:    [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
2122 // CHECK5-NEXT:    [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1
2123 // CHECK5-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]]
2124 // CHECK5-NEXT:    [[TMP16:%.*]] = load float, float* [[TMP14]], align 4
2125 // CHECK5-NEXT:    store float [[TMP16]], float* [[TMP15]], align 128
2126 // CHECK5-NEXT:    ret void
2127 // CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func6
2128 // CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
2129 // CHECK5-NEXT:  entry:
2130 // CHECK5-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
2131 // CHECK5-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
2132 // CHECK5-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
2133 // CHECK5-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
2134 // CHECK5-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
2135 // CHECK5-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
2136 // CHECK5-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
2137 // CHECK5-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
2138 // CHECK5-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3*
2139 // CHECK5-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
2140 // CHECK5-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
2141 // CHECK5-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0
2142 // CHECK5-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]]
2143 // CHECK5-NEXT:    store i8* [[TMP7]], i8** [[TMP6]], align 4
2144 // CHECK5-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
2145 // CHECK5-NEXT:    [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1
2146 // CHECK5-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]]
2147 // CHECK5-NEXT:    [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
2148 // CHECK5-NEXT:    store i8* [[TMP10]], i8** [[TMP8]], align 4
2149 // CHECK5-NEXT:    [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
2150 // CHECK5-NEXT:    [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
2151 // CHECK5-NEXT:    call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]]
2152 // CHECK5-NEXT:    ret void
2153 // CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func7
2154 // CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
2155 // CHECK5-NEXT:  entry:
2156 // CHECK5-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
2157 // CHECK5-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
2158 // CHECK5-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
2159 // CHECK5-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
2160 // CHECK5-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
2161 // CHECK5-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
2162 // CHECK5-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
2163 // CHECK5-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
2164 // CHECK5-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
2165 // CHECK5-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3*
2166 // CHECK5-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
2167 // CHECK5-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
2168 // CHECK5-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
2169 // CHECK5-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0
2170 // CHECK5-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]]
2171 // CHECK5-NEXT:    [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128
2172 // CHECK5-NEXT:    store i8 [[TMP11]], i8* [[TMP9]], align 1
2173 // CHECK5-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
2174 // CHECK5-NEXT:    [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
2175 // CHECK5-NEXT:    [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
2176 // CHECK5-NEXT:    [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1
2177 // CHECK5-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]]
2178 // CHECK5-NEXT:    [[TMP16:%.*]] = load float, float* [[TMP15]], align 128
2179 // CHECK5-NEXT:    store float [[TMP16]], float* [[TMP14]], align 4
2180 // CHECK5-NEXT:    ret void
2181 // CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func8
2182 // CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
2183 // CHECK5-NEXT:  entry:
2184 // CHECK5-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
2185 // CHECK5-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
2186 // CHECK5-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
2187 // CHECK5-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
2188 // CHECK5-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
2189 // CHECK5-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
2190 // CHECK5-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
2191 // CHECK5-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
2192 // CHECK5-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3*
2193 // CHECK5-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
2194 // CHECK5-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
2195 // CHECK5-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0
2196 // CHECK5-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]]
2197 // CHECK5-NEXT:    store i8* [[TMP7]], i8** [[TMP6]], align 4
2198 // CHECK5-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
2199 // CHECK5-NEXT:    [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1
2200 // CHECK5-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]]
2201 // CHECK5-NEXT:    [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
2202 // CHECK5-NEXT:    store i8* [[TMP10]], i8** [[TMP8]], align 4
2203 // CHECK5-NEXT:    [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
2204 // CHECK5-NEXT:    [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
2205 // CHECK5-NEXT:    call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]]
2206 // CHECK5-NEXT:    ret void
2207 // CHECK5-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l36
2208 // CHECK5-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] {
2209 // CHECK5-NEXT:  entry:
2210 // CHECK5-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
2211 // CHECK5-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
2212 // CHECK5-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
2213 // CHECK5-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
2214 // CHECK5-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
2215 // CHECK5-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
2216 // CHECK5-NEXT:    store i32 [[B]], i32* [[B_ADDR]], align 4
2217 // CHECK5-NEXT:    [[CONV:%.*]] = bitcast i32* [[B_ADDR]] to i16*
2218 // CHECK5-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
2219 // CHECK5-NEXT:    call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
2220 // CHECK5-NEXT:    call void @__kmpc_data_sharing_init_stack_spmd()
2221 // CHECK5-NEXT:    br label [[DOTEXECUTE:%.*]]
2222 // CHECK5:       .execute:
2223 // CHECK5-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
2224 // CHECK5-NEXT:    store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4
2225 // CHECK5-NEXT:    call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR3]]
2226 // CHECK5-NEXT:    br label [[DOTOMP_DEINIT:%.*]]
2227 // CHECK5:       .omp.deinit:
2228 // CHECK5-NEXT:    call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
2229 // CHECK5-NEXT:    br label [[DOTEXIT:%.*]]
2230 // CHECK5:       .exit:
2231 // CHECK5-NEXT:    ret void
2232 // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__9
2233 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] {
2234 // CHECK5-NEXT:  entry:
2235 // CHECK5-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
2236 // CHECK5-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
2237 // CHECK5-NEXT:    [[A_ADDR:%.*]] = alloca i32*, align 4
2238 // CHECK5-NEXT:    [[B_ADDR:%.*]] = alloca i16*, align 4
2239 // CHECK5-NEXT:    [[A1:%.*]] = alloca i32, align 4
2240 // CHECK5-NEXT:    [[B2:%.*]] = alloca i16, align 2
2241 // CHECK5-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
2242 // CHECK5-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
2243 // CHECK5-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
2244 // CHECK5-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
2245 // CHECK5-NEXT:    store i32* [[A]], i32** [[A_ADDR]], align 4
2246 // CHECK5-NEXT:    store i16* [[B]], i16** [[B_ADDR]], align 4
2247 // CHECK5-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
2248 // CHECK5-NEXT:    [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
2249 // CHECK5-NEXT:    store i32 0, i32* [[A1]], align 4
2250 // CHECK5-NEXT:    store i16 -32768, i16* [[B2]], align 2
2251 // CHECK5-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
2252 // CHECK5-NEXT:    [[TMP3:%.*]] = bitcast i32* [[A1]] to i8*
2253 // CHECK5-NEXT:    store i8* [[TMP3]], i8** [[TMP2]], align 4
2254 // CHECK5-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
2255 // CHECK5-NEXT:    [[TMP5:%.*]] = bitcast i16* [[B2]] to i8*
2256 // CHECK5-NEXT:    store i8* [[TMP5]], i8** [[TMP4]], align 4
2257 // CHECK5-NEXT:    [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
2258 // CHECK5-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
2259 // CHECK5-NEXT:    [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
2260 // CHECK5-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__10 to i8*), i8* null, i8** [[TMP8]], i32 2)
2261 // CHECK5-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
2262 // CHECK5-NEXT:    [[TMP10:%.*]] = bitcast i32* [[A1]] to i8*
2263 // CHECK5-NEXT:    store i8* [[TMP10]], i8** [[TMP9]], align 4
2264 // CHECK5-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
2265 // CHECK5-NEXT:    [[TMP12:%.*]] = bitcast i16* [[B2]] to i8*
2266 // CHECK5-NEXT:    store i8* [[TMP12]], i8** [[TMP11]], align 4
2267 // CHECK5-NEXT:    [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
2268 // CHECK5-NEXT:    [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
2269 // CHECK5-NEXT:    [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 1024, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func15, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func16, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func17, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func18, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func20)
2270 // CHECK5-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1
2271 // CHECK5-NEXT:    br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
2272 // CHECK5:       .omp.reduction.then:
2273 // CHECK5-NEXT:    [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4
2274 // CHECK5-NEXT:    [[TMP18:%.*]] = load i32, i32* [[A1]], align 4
2275 // CHECK5-NEXT:    [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]]
2276 // CHECK5-NEXT:    store i32 [[OR]], i32* [[TMP0]], align 4
2277 // CHECK5-NEXT:    [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2
2278 // CHECK5-NEXT:    [[CONV:%.*]] = sext i16 [[TMP19]] to i32
2279 // CHECK5-NEXT:    [[TMP20:%.*]] = load i16, i16* [[B2]], align 2
2280 // CHECK5-NEXT:    [[CONV3:%.*]] = sext i16 [[TMP20]] to i32
2281 // CHECK5-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]]
2282 // CHECK5-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
2283 // CHECK5:       cond.true:
2284 // CHECK5-NEXT:    [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2
2285 // CHECK5-NEXT:    br label [[COND_END:%.*]]
2286 // CHECK5:       cond.false:
2287 // CHECK5-NEXT:    [[TMP22:%.*]] = load i16, i16* [[B2]], align 2
2288 // CHECK5-NEXT:    br label [[COND_END]]
2289 // CHECK5:       cond.end:
2290 // CHECK5-NEXT:    [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ]
2291 // CHECK5-NEXT:    store i16 [[COND]], i16* [[TMP1]], align 2
2292 // CHECK5-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]])
2293 // CHECK5-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
2294 // CHECK5:       .omp.reduction.done:
2295 // CHECK5-NEXT:    ret void
2296 // CHECK5-LABEL: define {{[^@]+}}@__omp_outlined__10
2297 // CHECK5-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] {
2298 // CHECK5-NEXT:  entry:
2299 // CHECK5-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
2300 // CHECK5-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
2301 // CHECK5-NEXT:    [[A_ADDR:%.*]] = alloca i32*, align 4
2302 // CHECK5-NEXT:    [[B_ADDR:%.*]] = alloca i16*, align 4
2303 // CHECK5-NEXT:    [[A1:%.*]] = alloca i32, align 4
2304 // CHECK5-NEXT:    [[B2:%.*]] = alloca i16, align 2
2305 // CHECK5-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
2306 // CHECK5-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
2307 // CHECK5-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
2308 // CHECK5-NEXT:    store i32* [[A]], i32** [[A_ADDR]], align 4
2309 // CHECK5-NEXT:    store i16* [[B]], i16** [[B_ADDR]], align 4
2310 // CHECK5-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
2311 // CHECK5-NEXT:    [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
2312 // CHECK5-NEXT:    store i32 0, i32* [[A1]], align 4
2313 // CHECK5-NEXT:    store i16 -32768, i16* [[B2]], align 2
2314 // CHECK5-NEXT:    [[TMP2:%.*]] = load i32, i32* [[A1]], align 4
2315 // CHECK5-NEXT:    [[OR:%.*]] = or i32 [[TMP2]], 1
2316 // CHECK5-NEXT:    store i32 [[OR]], i32* [[A1]], align 4
2317 // CHECK5-NEXT:    [[TMP3:%.*]] = load i16, i16* [[B2]], align 2
2318 // CHECK5-NEXT:    [[CONV:%.*]] = sext i16 [[TMP3]] to i32
2319 // CHECK5-NEXT:    [[CMP:%.*]] = icmp sgt i32 99, [[CONV]]
2320 // CHECK5-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
2321 // CHECK5:       cond.true:
2322 // CHECK5-NEXT:    br label [[COND_END:%.*]]
2323 // CHECK5:       cond.false:
2324 // CHECK5-NEXT:    [[TMP4:%.*]] = load i16, i16* [[B2]], align 2
2325 // CHECK5-NEXT:    [[CONV3:%.*]] = sext i16 [[TMP4]] to i32
2326 // CHECK5-NEXT:    br label [[COND_END]]
2327 // CHECK5:       cond.end:
2328 // CHECK5-NEXT:    [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ]
2329 // CHECK5-NEXT:    [[CONV4:%.*]] = trunc i32 [[COND]] to i16
2330 // CHECK5-NEXT:    store i16 [[CONV4]], i16* [[B2]], align 2
2331 // CHECK5-NEXT:    [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
2332 // CHECK5-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
2333 // CHECK5-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
2334 // CHECK5-NEXT:    [[TMP8:%.*]] = bitcast i32* [[A1]] to i8*
2335 // CHECK5-NEXT:    store i8* [[TMP8]], i8** [[TMP7]], align 4
2336 // CHECK5-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
2337 // CHECK5-NEXT:    [[TMP10:%.*]] = bitcast i16* [[B2]] to i8*
2338 // CHECK5-NEXT:    store i8* [[TMP10]], i8** [[TMP9]], align 4
2339 // CHECK5-NEXT:    [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
2340 // CHECK5-NEXT:    [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func12, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func13)
2341 // CHECK5-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1
2342 // CHECK5-NEXT:    br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
2343 // CHECK5:       .omp.reduction.then:
2344 // CHECK5-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4
2345 // CHECK5-NEXT:    [[TMP15:%.*]] = load i32, i32* [[A1]], align 4
2346 // CHECK5-NEXT:    [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]]
2347 // CHECK5-NEXT:    store i32 [[OR5]], i32* [[TMP0]], align 4
2348 // CHECK5-NEXT:    [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2
2349 // CHECK5-NEXT:    [[CONV6:%.*]] = sext i16 [[TMP16]] to i32
2350 // CHECK5-NEXT:    [[TMP17:%.*]] = load i16, i16* [[B2]], align 2
2351 // CHECK5-NEXT:    [[CONV7:%.*]] = sext i16 [[TMP17]] to i32
2352 // CHECK5-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]]
2353 // CHECK5-NEXT:    br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]]
2354 // CHECK5:       cond.true9:
2355 // CHECK5-NEXT:    [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2
2356 // CHECK5-NEXT:    br label [[COND_END11:%.*]]
2357 // CHECK5:       cond.false10:
2358 // CHECK5-NEXT:    [[TMP19:%.*]] = load i16, i16* [[B2]], align 2
2359 // CHECK5-NEXT:    br label [[COND_END11]]
2360 // CHECK5:       cond.end11:
2361 // CHECK5-NEXT:    [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ]
2362 // CHECK5-NEXT:    store i16 [[COND12]], i16* [[TMP1]], align 2
2363 // CHECK5-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]])
2364 // CHECK5-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
2365 // CHECK5:       .omp.reduction.done:
2366 // CHECK5-NEXT:    ret void
2367 // CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func12
2368 // CHECK5-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
2369 // CHECK5-NEXT:  entry:
2370 // CHECK5-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
2371 // CHECK5-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
2372 // CHECK5-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2
2373 // CHECK5-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2
2374 // CHECK5-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
2375 // CHECK5-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
2376 // CHECK5-NEXT:    [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
2377 // CHECK5-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
2378 // CHECK5-NEXT:    store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
2379 // CHECK5-NEXT:    store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
2380 // CHECK5-NEXT:    store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
2381 // CHECK5-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
2382 // CHECK5-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
2383 // CHECK5-NEXT:    [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
2384 // CHECK5-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
2385 // CHECK5-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
2386 // CHECK5-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
2387 // CHECK5-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
2388 // CHECK5-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
2389 // CHECK5-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
2390 // CHECK5-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
2391 // CHECK5-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
2392 // CHECK5-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
2393 // CHECK5-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
2394 // CHECK5-NEXT:    [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
2395 // CHECK5-NEXT:    [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]])
2396 // CHECK5-NEXT:    store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
2397 // CHECK5-NEXT:    [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
2398 // CHECK5-NEXT:    [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
2399 // CHECK5-NEXT:    [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
2400 // CHECK5-NEXT:    store i8* [[TMP20]], i8** [[TMP11]], align 4
2401 // CHECK5-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
2402 // CHECK5-NEXT:    [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4
2403 // CHECK5-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
2404 // CHECK5-NEXT:    [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16*
2405 // CHECK5-NEXT:    [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
2406 // CHECK5-NEXT:    [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8*
2407 // CHECK5-NEXT:    [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2
2408 // CHECK5-NEXT:    [[TMP28:%.*]] = sext i16 [[TMP27]] to i32
2409 // CHECK5-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
2410 // CHECK5-NEXT:    [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
2411 // CHECK5-NEXT:    [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
2412 // CHECK5-NEXT:    [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
2413 // CHECK5-NEXT:    store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
2414 // CHECK5-NEXT:    [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
2415 // CHECK5-NEXT:    [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
2416 // CHECK5-NEXT:    [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
2417 // CHECK5-NEXT:    store i8* [[TMP34]], i8** [[TMP23]], align 4
2418 // CHECK5-NEXT:    [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0
2419 // CHECK5-NEXT:    [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1
2420 // CHECK5-NEXT:    [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
2421 // CHECK5-NEXT:    [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]]
2422 // CHECK5-NEXT:    [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2
2423 // CHECK5-NEXT:    [[TMP40:%.*]] = and i16 [[TMP6]], 1
2424 // CHECK5-NEXT:    [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0
2425 // CHECK5-NEXT:    [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]]
2426 // CHECK5-NEXT:    [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0
2427 // CHECK5-NEXT:    [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]]
2428 // CHECK5-NEXT:    [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]]
2429 // CHECK5-NEXT:    [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]]
2430 // CHECK5-NEXT:    br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]]
2431 // CHECK5:       then:
2432 // CHECK5-NEXT:    [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
2433 // CHECK5-NEXT:    [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
2434 // CHECK5-NEXT:    call void @"_omp$reduction$reduction_func11"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]]
2435 // CHECK5-NEXT:    br label [[IFCONT:%.*]]
2436 // CHECK5:       else:
2437 // CHECK5-NEXT:    br label [[IFCONT]]
2438 // CHECK5:       ifcont:
2439 // CHECK5-NEXT:    [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1
2440 // CHECK5-NEXT:    [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
2441 // CHECK5-NEXT:    [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]]
2442 // CHECK5-NEXT:    br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
2443 // CHECK5:       then6:
2444 // CHECK5-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
2445 // CHECK5-NEXT:    [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4
2446 // CHECK5-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
2447 // CHECK5-NEXT:    [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4
2448 // CHECK5-NEXT:    [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32*
2449 // CHECK5-NEXT:    [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32*
2450 // CHECK5-NEXT:    [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4
2451 // CHECK5-NEXT:    store i32 [[TMP58]], i32* [[TMP57]], align 4
2452 // CHECK5-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
2453 // CHECK5-NEXT:    [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4
2454 // CHECK5-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
2455 // CHECK5-NEXT:    [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4
2456 // CHECK5-NEXT:    [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16*
2457 // CHECK5-NEXT:    [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16*
2458 // CHECK5-NEXT:    [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2
2459 // CHECK5-NEXT:    store i16 [[TMP65]], i16* [[TMP64]], align 2
2460 // CHECK5-NEXT:    br label [[IFCONT8:%.*]]
2461 // CHECK5:       else7:
2462 // CHECK5-NEXT:    br label [[IFCONT8]]
2463 // CHECK5:       ifcont8:
2464 // CHECK5-NEXT:    ret void
2465 // CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func13
2466 // CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
2467 // CHECK5-NEXT:  entry:
2468 // CHECK5-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
2469 // CHECK5-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
2470 // CHECK5-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
2471 // CHECK5-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
2472 // CHECK5-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
2473 // CHECK5-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
2474 // CHECK5-NEXT:    [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
2475 // CHECK5-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
2476 // CHECK5-NEXT:    [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
2477 // CHECK5-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
2478 // CHECK5-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
2479 // CHECK5-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
2480 // CHECK5-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]])
2481 // CHECK5-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
2482 // CHECK5-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
2483 // CHECK5:       then:
2484 // CHECK5-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
2485 // CHECK5-NEXT:    [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4
2486 // CHECK5-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
2487 // CHECK5-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
2488 // CHECK5-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4
2489 // CHECK5-NEXT:    store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4
2490 // CHECK5-NEXT:    br label [[IFCONT:%.*]]
2491 // CHECK5:       else:
2492 // CHECK5-NEXT:    br label [[IFCONT]]
2493 // CHECK5:       ifcont:
2494 // CHECK5-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
2495 // CHECK5-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
2496 // CHECK5-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
2497 // CHECK5-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
2498 // CHECK5:       then4:
2499 // CHECK5-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
2500 // CHECK5-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
2501 // CHECK5-NEXT:    [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
2502 // CHECK5-NEXT:    [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32*
2503 // CHECK5-NEXT:    [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4
2504 // CHECK5-NEXT:    store i32 [[TMP15]], i32* [[TMP14]], align 4
2505 // CHECK5-NEXT:    br label [[IFCONT6:%.*]]
2506 // CHECK5:       else5:
2507 // CHECK5-NEXT:    br label [[IFCONT6]]
2508 // CHECK5:       ifcont6:
2509 // CHECK5-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
2510 // CHECK5-NEXT:    [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
2511 // CHECK5-NEXT:    br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
2512 // CHECK5:       then8:
2513 // CHECK5-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
2514 // CHECK5-NEXT:    [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
2515 // CHECK5-NEXT:    [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16*
2516 // CHECK5-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
2517 // CHECK5-NEXT:    [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)*
2518 // CHECK5-NEXT:    [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2
2519 // CHECK5-NEXT:    store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2
2520 // CHECK5-NEXT:    br label [[IFCONT10:%.*]]
2521 // CHECK5:       else9:
2522 // CHECK5-NEXT:    br label [[IFCONT10]]
2523 // CHECK5:       ifcont10:
2524 // CHECK5-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
2525 // CHECK5-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4
2526 // CHECK5-NEXT:    [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]]
2527 // CHECK5-NEXT:    br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
2528 // CHECK5:       then12:
2529 // CHECK5-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
2530 // CHECK5-NEXT:    [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)*
2531 // CHECK5-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
2532 // CHECK5-NEXT:    [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4
2533 // CHECK5-NEXT:    [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16*
2534 // CHECK5-NEXT:    [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2
2535 // CHECK5-NEXT:    store i16 [[TMP28]], i16* [[TMP27]], align 2
2536 // CHECK5-NEXT:    br label [[IFCONT14:%.*]]
2537 // CHECK5:       else13:
2538 // CHECK5-NEXT:    br label [[IFCONT14]]
2539 // CHECK5:       ifcont14:
2540 // CHECK5-NEXT:    ret void
2541 // CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func15
2542 // CHECK5-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
2543 // CHECK5-NEXT:  entry:
2544 // CHECK5-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
2545 // CHECK5-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
2546 // CHECK5-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2
2547 // CHECK5-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2
2548 // CHECK5-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
2549 // CHECK5-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
2550 // CHECK5-NEXT:    [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
2551 // CHECK5-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
2552 // CHECK5-NEXT:    store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
2553 // CHECK5-NEXT:    store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
2554 // CHECK5-NEXT:    store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
2555 // CHECK5-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
2556 // CHECK5-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
2557 // CHECK5-NEXT:    [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
2558 // CHECK5-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
2559 // CHECK5-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
2560 // CHECK5-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
2561 // CHECK5-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
2562 // CHECK5-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
2563 // CHECK5-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
2564 // CHECK5-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
2565 // CHECK5-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
2566 // CHECK5-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
2567 // CHECK5-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
2568 // CHECK5-NEXT:    [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
2569 // CHECK5-NEXT:    [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]])
2570 // CHECK5-NEXT:    store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
2571 // CHECK5-NEXT:    [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
2572 // CHECK5-NEXT:    [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
2573 // CHECK5-NEXT:    [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
2574 // CHECK5-NEXT:    store i8* [[TMP20]], i8** [[TMP11]], align 4
2575 // CHECK5-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
2576 // CHECK5-NEXT:    [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4
2577 // CHECK5-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
2578 // CHECK5-NEXT:    [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16*
2579 // CHECK5-NEXT:    [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
2580 // CHECK5-NEXT:    [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8*
2581 // CHECK5-NEXT:    [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2
2582 // CHECK5-NEXT:    [[TMP28:%.*]] = sext i16 [[TMP27]] to i32
2583 // CHECK5-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
2584 // CHECK5-NEXT:    [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
2585 // CHECK5-NEXT:    [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
2586 // CHECK5-NEXT:    [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
2587 // CHECK5-NEXT:    store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
2588 // CHECK5-NEXT:    [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
2589 // CHECK5-NEXT:    [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
2590 // CHECK5-NEXT:    [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
2591 // CHECK5-NEXT:    store i8* [[TMP34]], i8** [[TMP23]], align 4
2592 // CHECK5-NEXT:    [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0
2593 // CHECK5-NEXT:    [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1
2594 // CHECK5-NEXT:    [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
2595 // CHECK5-NEXT:    [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]]
2596 // CHECK5-NEXT:    [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2
2597 // CHECK5-NEXT:    [[TMP40:%.*]] = and i16 [[TMP6]], 1
2598 // CHECK5-NEXT:    [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0
2599 // CHECK5-NEXT:    [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]]
2600 // CHECK5-NEXT:    [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0
2601 // CHECK5-NEXT:    [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]]
2602 // CHECK5-NEXT:    [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]]
2603 // CHECK5-NEXT:    [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]]
2604 // CHECK5-NEXT:    br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]]
2605 // CHECK5:       then:
2606 // CHECK5-NEXT:    [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
2607 // CHECK5-NEXT:    [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
2608 // CHECK5-NEXT:    call void @"_omp$reduction$reduction_func14"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]]
2609 // CHECK5-NEXT:    br label [[IFCONT:%.*]]
2610 // CHECK5:       else:
2611 // CHECK5-NEXT:    br label [[IFCONT]]
2612 // CHECK5:       ifcont:
2613 // CHECK5-NEXT:    [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1
2614 // CHECK5-NEXT:    [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
2615 // CHECK5-NEXT:    [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]]
2616 // CHECK5-NEXT:    br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
2617 // CHECK5:       then6:
2618 // CHECK5-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
2619 // CHECK5-NEXT:    [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4
2620 // CHECK5-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
2621 // CHECK5-NEXT:    [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4
2622 // CHECK5-NEXT:    [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32*
2623 // CHECK5-NEXT:    [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32*
2624 // CHECK5-NEXT:    [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4
2625 // CHECK5-NEXT:    store i32 [[TMP58]], i32* [[TMP57]], align 4
2626 // CHECK5-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
2627 // CHECK5-NEXT:    [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4
2628 // CHECK5-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
2629 // CHECK5-NEXT:    [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4
2630 // CHECK5-NEXT:    [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16*
2631 // CHECK5-NEXT:    [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16*
2632 // CHECK5-NEXT:    [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2
2633 // CHECK5-NEXT:    store i16 [[TMP65]], i16* [[TMP64]], align 2
2634 // CHECK5-NEXT:    br label [[IFCONT8:%.*]]
2635 // CHECK5:       else7:
2636 // CHECK5-NEXT:    br label [[IFCONT8]]
2637 // CHECK5:       ifcont8:
2638 // CHECK5-NEXT:    ret void
2639 // CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func16
2640 // CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
2641 // CHECK5-NEXT:  entry:
2642 // CHECK5-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
2643 // CHECK5-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
2644 // CHECK5-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
2645 // CHECK5-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
2646 // CHECK5-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
2647 // CHECK5-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
2648 // CHECK5-NEXT:    [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
2649 // CHECK5-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
2650 // CHECK5-NEXT:    [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
2651 // CHECK5-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
2652 // CHECK5-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
2653 // CHECK5-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
2654 // CHECK5-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
2655 // CHECK5-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
2656 // CHECK5-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
2657 // CHECK5:       then:
2658 // CHECK5-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
2659 // CHECK5-NEXT:    [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4
2660 // CHECK5-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
2661 // CHECK5-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
2662 // CHECK5-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4
2663 // CHECK5-NEXT:    store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4
2664 // CHECK5-NEXT:    br label [[IFCONT:%.*]]
2665 // CHECK5:       else:
2666 // CHECK5-NEXT:    br label [[IFCONT]]
2667 // CHECK5:       ifcont:
2668 // CHECK5-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
2669 // CHECK5-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
2670 // CHECK5-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
2671 // CHECK5-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
2672 // CHECK5:       then4:
2673 // CHECK5-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
2674 // CHECK5-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
2675 // CHECK5-NEXT:    [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
2676 // CHECK5-NEXT:    [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32*
2677 // CHECK5-NEXT:    [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4
2678 // CHECK5-NEXT:    store i32 [[TMP15]], i32* [[TMP14]], align 4
2679 // CHECK5-NEXT:    br label [[IFCONT6:%.*]]
2680 // CHECK5:       else5:
2681 // CHECK5-NEXT:    br label [[IFCONT6]]
2682 // CHECK5:       ifcont6:
2683 // CHECK5-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
2684 // CHECK5-NEXT:    [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
2685 // CHECK5-NEXT:    br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
2686 // CHECK5:       then8:
2687 // CHECK5-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
2688 // CHECK5-NEXT:    [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
2689 // CHECK5-NEXT:    [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16*
2690 // CHECK5-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
2691 // CHECK5-NEXT:    [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)*
2692 // CHECK5-NEXT:    [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2
2693 // CHECK5-NEXT:    store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2
2694 // CHECK5-NEXT:    br label [[IFCONT10:%.*]]
2695 // CHECK5:       else9:
2696 // CHECK5-NEXT:    br label [[IFCONT10]]
2697 // CHECK5:       ifcont10:
2698 // CHECK5-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
2699 // CHECK5-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4
2700 // CHECK5-NEXT:    [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]]
2701 // CHECK5-NEXT:    br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
2702 // CHECK5:       then12:
2703 // CHECK5-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
2704 // CHECK5-NEXT:    [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)*
2705 // CHECK5-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
2706 // CHECK5-NEXT:    [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4
2707 // CHECK5-NEXT:    [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16*
2708 // CHECK5-NEXT:    [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2
2709 // CHECK5-NEXT:    store i16 [[TMP28]], i16* [[TMP27]], align 2
2710 // CHECK5-NEXT:    br label [[IFCONT14:%.*]]
2711 // CHECK5:       else13:
2712 // CHECK5-NEXT:    br label [[IFCONT14]]
2713 // CHECK5:       ifcont14:
2714 // CHECK5-NEXT:    ret void
2715 // CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func17
2716 // CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
2717 // CHECK5-NEXT:  entry:
2718 // CHECK5-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
2719 // CHECK5-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
2720 // CHECK5-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
2721 // CHECK5-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
2722 // CHECK5-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
2723 // CHECK5-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
2724 // CHECK5-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
2725 // CHECK5-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
2726 // CHECK5-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
2727 // CHECK5-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4*
2728 // CHECK5-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
2729 // CHECK5-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
2730 // CHECK5-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
2731 // CHECK5-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
2732 // CHECK5-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0
2733 // CHECK5-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]]
2734 // CHECK5-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
2735 // CHECK5-NEXT:    store i32 [[TMP12]], i32* [[TMP11]], align 128
2736 // CHECK5-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
2737 // CHECK5-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
2738 // CHECK5-NEXT:    [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
2739 // CHECK5-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1
2740 // CHECK5-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]]
2741 // CHECK5-NEXT:    [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2
2742 // CHECK5-NEXT:    store i16 [[TMP17]], i16* [[TMP16]], align 128
2743 // CHECK5-NEXT:    ret void
2744 // CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func18
2745 // CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
2746 // CHECK5-NEXT:  entry:
2747 // CHECK5-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
2748 // CHECK5-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
2749 // CHECK5-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
2750 // CHECK5-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
2751 // CHECK5-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
2752 // CHECK5-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
2753 // CHECK5-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
2754 // CHECK5-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
2755 // CHECK5-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4*
2756 // CHECK5-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
2757 // CHECK5-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
2758 // CHECK5-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0
2759 // CHECK5-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]]
2760 // CHECK5-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
2761 // CHECK5-NEXT:    store i8* [[TMP8]], i8** [[TMP6]], align 4
2762 // CHECK5-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
2763 // CHECK5-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1
2764 // CHECK5-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]]
2765 // CHECK5-NEXT:    [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
2766 // CHECK5-NEXT:    store i8* [[TMP11]], i8** [[TMP9]], align 4
2767 // CHECK5-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
2768 // CHECK5-NEXT:    [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
2769 // CHECK5-NEXT:    call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]]
2770 // CHECK5-NEXT:    ret void
2771 // CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func19
2772 // CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
2773 // CHECK5-NEXT:  entry:
2774 // CHECK5-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
2775 // CHECK5-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
2776 // CHECK5-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
2777 // CHECK5-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
2778 // CHECK5-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
2779 // CHECK5-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
2780 // CHECK5-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
2781 // CHECK5-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
2782 // CHECK5-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
2783 // CHECK5-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4*
2784 // CHECK5-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
2785 // CHECK5-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
2786 // CHECK5-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
2787 // CHECK5-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
2788 // CHECK5-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0
2789 // CHECK5-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]]
2790 // CHECK5-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128
2791 // CHECK5-NEXT:    store i32 [[TMP12]], i32* [[TMP10]], align 4
2792 // CHECK5-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
2793 // CHECK5-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
2794 // CHECK5-NEXT:    [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
2795 // CHECK5-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1
2796 // CHECK5-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]]
2797 // CHECK5-NEXT:    [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128
2798 // CHECK5-NEXT:    store i16 [[TMP17]], i16* [[TMP15]], align 2
2799 // CHECK5-NEXT:    ret void
2800 // CHECK5-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func20
2801 // CHECK5-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
2802 // CHECK5-NEXT:  entry:
2803 // CHECK5-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
2804 // CHECK5-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
2805 // CHECK5-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
2806 // CHECK5-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
2807 // CHECK5-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
2808 // CHECK5-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
2809 // CHECK5-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
2810 // CHECK5-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
2811 // CHECK5-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4*
2812 // CHECK5-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
2813 // CHECK5-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
2814 // CHECK5-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0
2815 // CHECK5-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]]
2816 // CHECK5-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
2817 // CHECK5-NEXT:    store i8* [[TMP8]], i8** [[TMP6]], align 4
2818 // CHECK5-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
2819 // CHECK5-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1
2820 // CHECK5-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]]
2821 // CHECK5-NEXT:    [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
2822 // CHECK5-NEXT:    store i8* [[TMP11]], i8** [[TMP9]], align 4
2823 // CHECK5-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
2824 // CHECK5-NEXT:    [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
2825 // CHECK5-NEXT:    call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]]
2826 // CHECK5-NEXT:    ret void
2827 // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker
2828 // CHECK6-SAME: () #[[ATTR0:[0-9]+]] {
2829 // CHECK6-NEXT:  entry:
2830 // CHECK6-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
2831 // CHECK6-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
2832 // CHECK6-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
2833 // CHECK6-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
2834 // CHECK6-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
2835 // CHECK6:       .await.work:
2836 // CHECK6-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
2837 // CHECK6-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
2838 // CHECK6-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
2839 // CHECK6-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
2840 // CHECK6-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
2841 // CHECK6-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
2842 // CHECK6-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
2843 // CHECK6:       .select.workers:
2844 // CHECK6-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
2845 // CHECK6-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
2846 // CHECK6-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
2847 // CHECK6:       .execute.parallel:
2848 // CHECK6-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
2849 // CHECK6-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
2850 // CHECK6-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
2851 // CHECK6-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
2852 // CHECK6:       .terminate.parallel:
2853 // CHECK6-NEXT:    call void @__kmpc_kernel_end_parallel()
2854 // CHECK6-NEXT:    br label [[DOTBARRIER_PARALLEL]]
2855 // CHECK6:       .barrier.parallel:
2856 // CHECK6-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
2857 // CHECK6-NEXT:    br label [[DOTAWAIT_WORK]]
2858 // CHECK6:       .exit:
2859 // CHECK6-NEXT:    ret void
2860 // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23
2861 // CHECK6-SAME: (double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1:[0-9]+]] {
2862 // CHECK6-NEXT:  entry:
2863 // CHECK6-NEXT:    [[E_ADDR:%.*]] = alloca double*, align 4
2864 // CHECK6-NEXT:    [[E7:%.*]] = alloca double, align 8
2865 // CHECK6-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
2866 // CHECK6-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
2867 // CHECK6-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
2868 // CHECK6-NEXT:    store double* [[E]], double** [[E_ADDR]], align 4
2869 // CHECK6-NEXT:    [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
2870 // CHECK6-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
2871 // CHECK6-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
2872 // CHECK6-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
2873 // CHECK6-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
2874 // CHECK6-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
2875 // CHECK6-NEXT:    br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
2876 // CHECK6:       .worker:
2877 // CHECK6-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l23_worker() #[[ATTR3:[0-9]+]]
2878 // CHECK6-NEXT:    br label [[DOTEXIT:%.*]]
2879 // CHECK6:       .mastercheck:
2880 // CHECK6-NEXT:    [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
2881 // CHECK6-NEXT:    [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
2882 // CHECK6-NEXT:    [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
2883 // CHECK6-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
2884 // CHECK6-NEXT:    [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
2885 // CHECK6-NEXT:    [[TMP4:%.*]] = xor i32 [[TMP2]], -1
2886 // CHECK6-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
2887 // CHECK6-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
2888 // CHECK6-NEXT:    br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
2889 // CHECK6:       .master:
2890 // CHECK6-NEXT:    [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
2891 // CHECK6-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
2892 // CHECK6-NEXT:    [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
2893 // CHECK6-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
2894 // CHECK6-NEXT:    call void @__kmpc_data_sharing_init_stack()
2895 // CHECK6-NEXT:    [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
2896 // CHECK6-NEXT:    [[TMP7:%.*]] = load double, double* [[TMP0]], align 8
2897 // CHECK6-NEXT:    store double [[TMP7]], double* [[E7]], align 8
2898 // CHECK6-NEXT:    store i32 [[TMP6]], i32* [[DOTTHREADID_TEMP_]], align 4
2899 // CHECK6-NEXT:    call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E7]]) #[[ATTR3]]
2900 // CHECK6-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
2901 // CHECK6:       .termination.notifier:
2902 // CHECK6-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
2903 // CHECK6-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
2904 // CHECK6-NEXT:    br label [[DOTEXIT]]
2905 // CHECK6:       .exit:
2906 // CHECK6-NEXT:    ret void
2907 // CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__
2908 // CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] {
2909 // CHECK6-NEXT:  entry:
2910 // CHECK6-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
2911 // CHECK6-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
2912 // CHECK6-NEXT:    [[E_ADDR:%.*]] = alloca double*, align 4
2913 // CHECK6-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
2914 // CHECK6-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
2915 // CHECK6-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
2916 // CHECK6-NEXT:    store double* [[E]], double** [[E_ADDR]], align 4
2917 // CHECK6-NEXT:    [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
2918 // CHECK6-NEXT:    [[TMP1:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1)
2919 // CHECK6-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP1]] to %struct._globalized_locals_ty*
2920 // CHECK6-NEXT:    [[E1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP2]], i32 0, i32 0
2921 // CHECK6-NEXT:    store double 0.000000e+00, double* [[E1]], align 8
2922 // CHECK6-NEXT:    [[TMP3:%.*]] = load double, double* [[E1]], align 8
2923 // CHECK6-NEXT:    [[ADD:%.*]] = fadd double [[TMP3]], 5.000000e+00
2924 // CHECK6-NEXT:    store double [[ADD]], double* [[E1]], align 8
2925 // CHECK6-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
2926 // CHECK6-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
2927 // CHECK6-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
2928 // CHECK6-NEXT:    [[TMP7:%.*]] = bitcast double* [[E1]] to i8*
2929 // CHECK6-NEXT:    store i8* [[TMP7]], i8** [[TMP6]], align 4
2930 // CHECK6-NEXT:    [[TMP8:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
2931 // CHECK6-NEXT:    [[TMP9:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
2932 // CHECK6-NEXT:    [[TMP10:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i8* [[TMP9]], i32 2048, i8* [[TMP8]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func)
2933 // CHECK6-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 1
2934 // CHECK6-NEXT:    br i1 [[TMP11]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
2935 // CHECK6:       .omp.reduction.then:
2936 // CHECK6-NEXT:    [[TMP12:%.*]] = load double, double* [[TMP0]], align 8
2937 // CHECK6-NEXT:    [[TMP13:%.*]] = load double, double* [[E1]], align 8
2938 // CHECK6-NEXT:    [[ADD2:%.*]] = fadd double [[TMP12]], [[TMP13]]
2939 // CHECK6-NEXT:    store double [[ADD2]], double* [[TMP0]], align 8
2940 // CHECK6-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
2941 // CHECK6-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
2942 // CHECK6:       .omp.reduction.done:
2943 // CHECK6-NEXT:    call void @__kmpc_data_sharing_pop_stack(i8* [[TMP1]])
2944 // CHECK6-NEXT:    ret void
2945 // CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
2946 // CHECK6-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
2947 // CHECK6-NEXT:  entry:
2948 // CHECK6-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
2949 // CHECK6-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
2950 // CHECK6-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2
2951 // CHECK6-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2
2952 // CHECK6-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4
2953 // CHECK6-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8
2954 // CHECK6-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
2955 // CHECK6-NEXT:    store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
2956 // CHECK6-NEXT:    store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
2957 // CHECK6-NEXT:    store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
2958 // CHECK6-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
2959 // CHECK6-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]*
2960 // CHECK6-NEXT:    [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
2961 // CHECK6-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
2962 // CHECK6-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
2963 // CHECK6-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
2964 // CHECK6-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
2965 // CHECK6-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
2966 // CHECK6-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double*
2967 // CHECK6-NEXT:    [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1
2968 // CHECK6-NEXT:    [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8*
2969 // CHECK6-NEXT:    [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64*
2970 // CHECK6-NEXT:    [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
2971 // CHECK6-NEXT:    [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8
2972 // CHECK6-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
2973 // CHECK6-NEXT:    [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
2974 // CHECK6-NEXT:    [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]])
2975 // CHECK6-NEXT:    store i64 [[TMP19]], i64* [[TMP16]], align 8
2976 // CHECK6-NEXT:    [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1
2977 // CHECK6-NEXT:    [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1
2978 // CHECK6-NEXT:    [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
2979 // CHECK6-NEXT:    store i8* [[TMP22]], i8** [[TMP11]], align 4
2980 // CHECK6-NEXT:    [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0
2981 // CHECK6-NEXT:    [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1
2982 // CHECK6-NEXT:    [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
2983 // CHECK6-NEXT:    [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]]
2984 // CHECK6-NEXT:    [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2
2985 // CHECK6-NEXT:    [[TMP28:%.*]] = and i16 [[TMP6]], 1
2986 // CHECK6-NEXT:    [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0
2987 // CHECK6-NEXT:    [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]]
2988 // CHECK6-NEXT:    [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0
2989 // CHECK6-NEXT:    [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]]
2990 // CHECK6-NEXT:    [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]]
2991 // CHECK6-NEXT:    [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]]
2992 // CHECK6-NEXT:    br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]]
2993 // CHECK6:       then:
2994 // CHECK6-NEXT:    [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8*
2995 // CHECK6-NEXT:    [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
2996 // CHECK6-NEXT:    call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR3]]
2997 // CHECK6-NEXT:    br label [[IFCONT:%.*]]
2998 // CHECK6:       else:
2999 // CHECK6-NEXT:    br label [[IFCONT]]
3000 // CHECK6:       ifcont:
3001 // CHECK6-NEXT:    [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1
3002 // CHECK6-NEXT:    [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
3003 // CHECK6-NEXT:    [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
3004 // CHECK6-NEXT:    br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
3005 // CHECK6:       then4:
3006 // CHECK6-NEXT:    [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
3007 // CHECK6-NEXT:    [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 4
3008 // CHECK6-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
3009 // CHECK6-NEXT:    [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 4
3010 // CHECK6-NEXT:    [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double*
3011 // CHECK6-NEXT:    [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double*
3012 // CHECK6-NEXT:    [[TMP46:%.*]] = load double, double* [[TMP44]], align 8
3013 // CHECK6-NEXT:    store double [[TMP46]], double* [[TMP45]], align 8
3014 // CHECK6-NEXT:    br label [[IFCONT6:%.*]]
3015 // CHECK6:       else5:
3016 // CHECK6-NEXT:    br label [[IFCONT6]]
3017 // CHECK6:       ifcont6:
3018 // CHECK6-NEXT:    ret void
3019 // CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
3020 // CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
3021 // CHECK6-NEXT:  entry:
3022 // CHECK6-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
3023 // CHECK6-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
3024 // CHECK6-NEXT:    [[DOTCNT_ADDR:%.*]] = alloca i32, align 4
3025 // CHECK6-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
3026 // CHECK6-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
3027 // CHECK6-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
3028 // CHECK6-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
3029 // CHECK6-NEXT:    [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
3030 // CHECK6-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
3031 // CHECK6-NEXT:    [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
3032 // CHECK6-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
3033 // CHECK6-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
3034 // CHECK6-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
3035 // CHECK6-NEXT:    store i32 0, i32* [[DOTCNT_ADDR]], align 4
3036 // CHECK6-NEXT:    br label [[PRECOND:%.*]]
3037 // CHECK6:       precond:
3038 // CHECK6-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4
3039 // CHECK6-NEXT:    [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2
3040 // CHECK6-NEXT:    br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]]
3041 // CHECK6:       body:
3042 // CHECK6-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]])
3043 // CHECK6-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
3044 // CHECK6-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
3045 // CHECK6:       then:
3046 // CHECK6-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
3047 // CHECK6-NEXT:    [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 4
3048 // CHECK6-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32*
3049 // CHECK6-NEXT:    [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]]
3050 // CHECK6-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
3051 // CHECK6-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
3052 // CHECK6-NEXT:    store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4
3053 // CHECK6-NEXT:    br label [[IFCONT:%.*]]
3054 // CHECK6:       else:
3055 // CHECK6-NEXT:    br label [[IFCONT]]
3056 // CHECK6:       ifcont:
3057 // CHECK6-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
3058 // CHECK6-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
3059 // CHECK6-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]]
3060 // CHECK6-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
3061 // CHECK6:       then4:
3062 // CHECK6-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
3063 // CHECK6-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
3064 // CHECK6-NEXT:    [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4
3065 // CHECK6-NEXT:    [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32*
3066 // CHECK6-NEXT:    [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]]
3067 // CHECK6-NEXT:    [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4
3068 // CHECK6-NEXT:    store i32 [[TMP19]], i32* [[TMP18]], align 4
3069 // CHECK6-NEXT:    br label [[IFCONT6:%.*]]
3070 // CHECK6:       else5:
3071 // CHECK6-NEXT:    br label [[IFCONT6]]
3072 // CHECK6:       ifcont6:
3073 // CHECK6-NEXT:    [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1
3074 // CHECK6-NEXT:    store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4
3075 // CHECK6-NEXT:    br label [[PRECOND]]
3076 // CHECK6:       exit:
3077 // CHECK6-NEXT:    ret void
3078 // CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func
3079 // CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
3080 // CHECK6-NEXT:  entry:
3081 // CHECK6-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
3082 // CHECK6-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
3083 // CHECK6-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
3084 // CHECK6-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
3085 // CHECK6-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
3086 // CHECK6-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
3087 // CHECK6-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
3088 // CHECK6-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
3089 // CHECK6-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
3090 // CHECK6-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
3091 // CHECK6-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
3092 // CHECK6-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
3093 // CHECK6-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
3094 // CHECK6-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
3095 // CHECK6-NEXT:    [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
3096 // CHECK6-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP7]]
3097 // CHECK6-NEXT:    [[TMP12:%.*]] = load double, double* [[TMP10]], align 8
3098 // CHECK6-NEXT:    store double [[TMP12]], double* [[TMP11]], align 128
3099 // CHECK6-NEXT:    ret void
3100 // CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func
3101 // CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
3102 // CHECK6-NEXT:  entry:
3103 // CHECK6-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
3104 // CHECK6-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
3105 // CHECK6-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
3106 // CHECK6-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
3107 // CHECK6-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
3108 // CHECK6-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
3109 // CHECK6-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
3110 // CHECK6-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
3111 // CHECK6-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
3112 // CHECK6-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
3113 // CHECK6-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
3114 // CHECK6-NEXT:    [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
3115 // CHECK6-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP5]]
3116 // CHECK6-NEXT:    [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
3117 // CHECK6-NEXT:    store i8* [[TMP8]], i8** [[TMP6]], align 4
3118 // CHECK6-NEXT:    [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
3119 // CHECK6-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
3120 // CHECK6-NEXT:    call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]]
3121 // CHECK6-NEXT:    ret void
3122 // CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func
3123 // CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
3124 // CHECK6-NEXT:  entry:
3125 // CHECK6-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
3126 // CHECK6-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
3127 // CHECK6-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
3128 // CHECK6-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
3129 // CHECK6-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
3130 // CHECK6-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
3131 // CHECK6-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
3132 // CHECK6-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
3133 // CHECK6-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
3134 // CHECK6-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
3135 // CHECK6-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
3136 // CHECK6-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
3137 // CHECK6-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
3138 // CHECK6-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
3139 // CHECK6-NEXT:    [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
3140 // CHECK6-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP7]]
3141 // CHECK6-NEXT:    [[TMP12:%.*]] = load double, double* [[TMP11]], align 128
3142 // CHECK6-NEXT:    store double [[TMP12]], double* [[TMP10]], align 8
3143 // CHECK6-NEXT:    ret void
3144 // CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func
3145 // CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
3146 // CHECK6-NEXT:  entry:
3147 // CHECK6-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
3148 // CHECK6-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
3149 // CHECK6-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
3150 // CHECK6-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
3151 // CHECK6-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
3152 // CHECK6-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
3153 // CHECK6-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
3154 // CHECK6-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
3155 // CHECK6-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
3156 // CHECK6-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
3157 // CHECK6-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
3158 // CHECK6-NEXT:    [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
3159 // CHECK6-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP5]]
3160 // CHECK6-NEXT:    [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
3161 // CHECK6-NEXT:    store i8* [[TMP8]], i8** [[TMP6]], align 4
3162 // CHECK6-NEXT:    [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
3163 // CHECK6-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
3164 // CHECK6-NEXT:    call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]]
3165 // CHECK6-NEXT:    ret void
3166 // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker
3167 // CHECK6-SAME: () #[[ATTR0]] {
3168 // CHECK6-NEXT:  entry:
3169 // CHECK6-NEXT:    [[WORK_FN:%.*]] = alloca i8*, align 4
3170 // CHECK6-NEXT:    [[EXEC_STATUS:%.*]] = alloca i8, align 1
3171 // CHECK6-NEXT:    store i8* null, i8** [[WORK_FN]], align 4
3172 // CHECK6-NEXT:    store i8 0, i8* [[EXEC_STATUS]], align 1
3173 // CHECK6-NEXT:    br label [[DOTAWAIT_WORK:%.*]]
3174 // CHECK6:       .await.work:
3175 // CHECK6-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
3176 // CHECK6-NEXT:    [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
3177 // CHECK6-NEXT:    [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
3178 // CHECK6-NEXT:    store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
3179 // CHECK6-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 4
3180 // CHECK6-NEXT:    [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
3181 // CHECK6-NEXT:    br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
3182 // CHECK6:       .select.workers:
3183 // CHECK6-NEXT:    [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
3184 // CHECK6-NEXT:    [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
3185 // CHECK6-NEXT:    br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
3186 // CHECK6:       .execute.parallel:
3187 // CHECK6-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
3188 // CHECK6-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
3189 // CHECK6-NEXT:    call void [[TMP5]](i16 0, i32 [[TMP4]])
3190 // CHECK6-NEXT:    br label [[DOTTERMINATE_PARALLEL:%.*]]
3191 // CHECK6:       .terminate.parallel:
3192 // CHECK6-NEXT:    call void @__kmpc_kernel_end_parallel()
3193 // CHECK6-NEXT:    br label [[DOTBARRIER_PARALLEL]]
3194 // CHECK6:       .barrier.parallel:
3195 // CHECK6-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
3196 // CHECK6-NEXT:    br label [[DOTAWAIT_WORK]]
3197 // CHECK6:       .exit:
3198 // CHECK6-NEXT:    ret void
3199 // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29
3200 // CHECK6-SAME: (i32 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR1]] {
3201 // CHECK6-NEXT:  entry:
3202 // CHECK6-NEXT:    [[C_ADDR:%.*]] = alloca i32, align 4
3203 // CHECK6-NEXT:    [[D_ADDR:%.*]] = alloca i32, align 4
3204 // CHECK6-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
3205 // CHECK6-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
3206 // CHECK6-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
3207 // CHECK6-NEXT:    store i32 [[C]], i32* [[C_ADDR]], align 4
3208 // CHECK6-NEXT:    store i32 [[D]], i32* [[D_ADDR]], align 4
3209 // CHECK6-NEXT:    [[CONV:%.*]] = bitcast i32* [[C_ADDR]] to i8*
3210 // CHECK6-NEXT:    [[CONV1:%.*]] = bitcast i32* [[D_ADDR]] to float*
3211 // CHECK6-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
3212 // CHECK6-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
3213 // CHECK6-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
3214 // CHECK6-NEXT:    [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
3215 // CHECK6-NEXT:    [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
3216 // CHECK6-NEXT:    br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
3217 // CHECK6:       .worker:
3218 // CHECK6-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l29_worker() #[[ATTR3]]
3219 // CHECK6-NEXT:    br label [[DOTEXIT:%.*]]
3220 // CHECK6:       .mastercheck:
3221 // CHECK6-NEXT:    [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
3222 // CHECK6-NEXT:    [[NVPTX_NUM_THREADS3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
3223 // CHECK6-NEXT:    [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
3224 // CHECK6-NEXT:    [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE4]], 1
3225 // CHECK6-NEXT:    [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS3]], 1
3226 // CHECK6-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], -1
3227 // CHECK6-NEXT:    [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
3228 // CHECK6-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID2]], [[MASTER_TID]]
3229 // CHECK6-NEXT:    br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
3230 // CHECK6:       .master:
3231 // CHECK6-NEXT:    [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
3232 // CHECK6-NEXT:    [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
3233 // CHECK6-NEXT:    [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]]
3234 // CHECK6-NEXT:    call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1)
3235 // CHECK6-NEXT:    call void @__kmpc_data_sharing_init_stack()
3236 // CHECK6-NEXT:    [[TMP5:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1)
3237 // CHECK6-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1*
3238 // CHECK6-NEXT:    [[TMP7:%.*]] = load i8, i8* [[CONV]], align 4
3239 // CHECK6-NEXT:    [[C8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1
3240 // CHECK6-NEXT:    store i8 [[TMP7]], i8* [[C8]], align 4
3241 // CHECK6-NEXT:    [[TMP8:%.*]] = load float, float* [[CONV1]], align 4
3242 // CHECK6-NEXT:    [[D9:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0
3243 // CHECK6-NEXT:    store float [[TMP8]], float* [[D9]], align 4
3244 // CHECK6-NEXT:    [[TMP9:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
3245 // CHECK6-NEXT:    store i32 [[TMP9]], i32* [[DOTTHREADID_TEMP_]], align 4
3246 // CHECK6-NEXT:    call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C8]], float* [[D9]]) #[[ATTR3]]
3247 // CHECK6-NEXT:    call void @__kmpc_data_sharing_pop_stack(i8* [[TMP5]])
3248 // CHECK6-NEXT:    br label [[DOTTERMINATION_NOTIFIER:%.*]]
3249 // CHECK6:       .termination.notifier:
3250 // CHECK6-NEXT:    call void @__kmpc_kernel_deinit(i16 1)
3251 // CHECK6-NEXT:    call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
3252 // CHECK6-NEXT:    br label [[DOTEXIT]]
3253 // CHECK6:       .exit:
3254 // CHECK6-NEXT:    ret void
3255 // CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__1
3256 // CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] {
3257 // CHECK6-NEXT:  entry:
3258 // CHECK6-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
3259 // CHECK6-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
3260 // CHECK6-NEXT:    [[C_ADDR:%.*]] = alloca i8*, align 4
3261 // CHECK6-NEXT:    [[D_ADDR:%.*]] = alloca float*, align 4
3262 // CHECK6-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
3263 // CHECK6-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
3264 // CHECK6-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
3265 // CHECK6-NEXT:    store i8* [[C]], i8** [[C_ADDR]], align 4
3266 // CHECK6-NEXT:    store float* [[D]], float** [[D_ADDR]], align 4
3267 // CHECK6-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4
3268 // CHECK6-NEXT:    [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4
3269 // CHECK6-NEXT:    [[TMP2:%.*]] = call i8* @__kmpc_data_sharing_push_stack(i32 8, i16 1)
3270 // CHECK6-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP2]] to %struct._globalized_locals_ty.2*
3271 // CHECK6-NEXT:    [[C1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], %struct._globalized_locals_ty.2* [[TMP3]], i32 0, i32 1
3272 // CHECK6-NEXT:    [[D2:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], %struct._globalized_locals_ty.2* [[TMP3]], i32 0, i32 0
3273 // CHECK6-NEXT:    store i8 0, i8* [[C1]], align 4
3274 // CHECK6-NEXT:    store float 1.000000e+00, float* [[D2]], align 4
3275 // CHECK6-NEXT:    [[TMP4:%.*]] = load i8, i8* [[C1]], align 4
3276 // CHECK6-NEXT:    [[CONV:%.*]] = sext i8 [[TMP4]] to i32
3277 // CHECK6-NEXT:    [[XOR:%.*]] = xor i32 [[CONV]], 2
3278 // CHECK6-NEXT:    [[CONV3:%.*]] = trunc i32 [[XOR]] to i8
3279 // CHECK6-NEXT:    store i8 [[CONV3]], i8* [[C1]], align 4
3280 // CHECK6-NEXT:    [[TMP5:%.*]] = load float, float* [[D2]], align 4
3281 // CHECK6-NEXT:    [[MUL:%.*]] = fmul float [[TMP5]], 3.300000e+01
3282 // CHECK6-NEXT:    store float [[MUL]], float* [[D2]], align 4
3283 // CHECK6-NEXT:    [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
3284 // CHECK6-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
3285 // CHECK6-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
3286 // CHECK6-NEXT:    store i8* [[C1]], i8** [[TMP8]], align 4
3287 // CHECK6-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
3288 // CHECK6-NEXT:    [[TMP10:%.*]] = bitcast float* [[D2]] to i8*
3289 // CHECK6-NEXT:    store i8* [[TMP10]], i8** [[TMP9]], align 4
3290 // CHECK6-NEXT:    [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
3291 // CHECK6-NEXT:    [[TMP12:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
3292 // CHECK6-NEXT:    [[TMP13:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], i8* [[TMP12]], i32 2048, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func5, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func6, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func8)
3293 // CHECK6-NEXT:    [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 1
3294 // CHECK6-NEXT:    br i1 [[TMP14]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
3295 // CHECK6:       .omp.reduction.then:
3296 // CHECK6-NEXT:    [[TMP15:%.*]] = load i8, i8* [[TMP0]], align 1
3297 // CHECK6-NEXT:    [[CONV4:%.*]] = sext i8 [[TMP15]] to i32
3298 // CHECK6-NEXT:    [[TMP16:%.*]] = load i8, i8* [[C1]], align 4
3299 // CHECK6-NEXT:    [[CONV5:%.*]] = sext i8 [[TMP16]] to i32
3300 // CHECK6-NEXT:    [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]]
3301 // CHECK6-NEXT:    [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8
3302 // CHECK6-NEXT:    store i8 [[CONV7]], i8* [[TMP0]], align 1
3303 // CHECK6-NEXT:    [[TMP17:%.*]] = load float, float* [[TMP1]], align 4
3304 // CHECK6-NEXT:    [[TMP18:%.*]] = load float, float* [[D2]], align 4
3305 // CHECK6-NEXT:    [[MUL8:%.*]] = fmul float [[TMP17]], [[TMP18]]
3306 // CHECK6-NEXT:    store float [[MUL8]], float* [[TMP1]], align 4
3307 // CHECK6-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]])
3308 // CHECK6-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
3309 // CHECK6:       .omp.reduction.done:
3310 // CHECK6-NEXT:    call void @__kmpc_data_sharing_pop_stack(i8* [[TMP2]])
3311 // CHECK6-NEXT:    ret void
3312 // CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3
3313 // CHECK6-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
3314 // CHECK6-NEXT:  entry:
3315 // CHECK6-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
3316 // CHECK6-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
3317 // CHECK6-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2
3318 // CHECK6-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2
3319 // CHECK6-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
3320 // CHECK6-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1
3321 // CHECK6-NEXT:    [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4
3322 // CHECK6-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
3323 // CHECK6-NEXT:    store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
3324 // CHECK6-NEXT:    store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
3325 // CHECK6-NEXT:    store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
3326 // CHECK6-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
3327 // CHECK6-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
3328 // CHECK6-NEXT:    [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
3329 // CHECK6-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
3330 // CHECK6-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
3331 // CHECK6-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
3332 // CHECK6-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
3333 // CHECK6-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
3334 // CHECK6-NEXT:    [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1
3335 // CHECK6-NEXT:    [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1
3336 // CHECK6-NEXT:    [[TMP14:%.*]] = sext i8 [[TMP13]] to i32
3337 // CHECK6-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
3338 // CHECK6-NEXT:    [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
3339 // CHECK6-NEXT:    [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]])
3340 // CHECK6-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
3341 // CHECK6-NEXT:    store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1
3342 // CHECK6-NEXT:    [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1
3343 // CHECK6-NEXT:    [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
3344 // CHECK6-NEXT:    store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4
3345 // CHECK6-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
3346 // CHECK6-NEXT:    [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 4
3347 // CHECK6-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
3348 // CHECK6-NEXT:    [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float*
3349 // CHECK6-NEXT:    [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i32 1
3350 // CHECK6-NEXT:    [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8*
3351 // CHECK6-NEXT:    [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32*
3352 // CHECK6-NEXT:    [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32*
3353 // CHECK6-NEXT:    [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4
3354 // CHECK6-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
3355 // CHECK6-NEXT:    [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
3356 // CHECK6-NEXT:    [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
3357 // CHECK6-NEXT:    store i32 [[TMP30]], i32* [[TMP27]], align 4
3358 // CHECK6-NEXT:    [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i32 1
3359 // CHECK6-NEXT:    [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1
3360 // CHECK6-NEXT:    [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
3361 // CHECK6-NEXT:    store i8* [[TMP33]], i8** [[TMP22]], align 4
3362 // CHECK6-NEXT:    [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0
3363 // CHECK6-NEXT:    [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1
3364 // CHECK6-NEXT:    [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
3365 // CHECK6-NEXT:    [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]]
3366 // CHECK6-NEXT:    [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2
3367 // CHECK6-NEXT:    [[TMP39:%.*]] = and i16 [[TMP6]], 1
3368 // CHECK6-NEXT:    [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0
3369 // CHECK6-NEXT:    [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]]
3370 // CHECK6-NEXT:    [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0
3371 // CHECK6-NEXT:    [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]]
3372 // CHECK6-NEXT:    [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]]
3373 // CHECK6-NEXT:    [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]]
3374 // CHECK6-NEXT:    br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]]
3375 // CHECK6:       then:
3376 // CHECK6-NEXT:    [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
3377 // CHECK6-NEXT:    [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
3378 // CHECK6-NEXT:    call void @"_omp$reduction$reduction_func2"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR3]]
3379 // CHECK6-NEXT:    br label [[IFCONT:%.*]]
3380 // CHECK6:       else:
3381 // CHECK6-NEXT:    br label [[IFCONT]]
3382 // CHECK6:       ifcont:
3383 // CHECK6-NEXT:    [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1
3384 // CHECK6-NEXT:    [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
3385 // CHECK6-NEXT:    [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]]
3386 // CHECK6-NEXT:    br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
3387 // CHECK6:       then6:
3388 // CHECK6-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
3389 // CHECK6-NEXT:    [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 4
3390 // CHECK6-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
3391 // CHECK6-NEXT:    [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4
3392 // CHECK6-NEXT:    [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1
3393 // CHECK6-NEXT:    store i8 [[TMP55]], i8* [[TMP54]], align 1
3394 // CHECK6-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
3395 // CHECK6-NEXT:    [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4
3396 // CHECK6-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
3397 // CHECK6-NEXT:    [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4
3398 // CHECK6-NEXT:    [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float*
3399 // CHECK6-NEXT:    [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float*
3400 // CHECK6-NEXT:    [[TMP62:%.*]] = load float, float* [[TMP60]], align 4
3401 // CHECK6-NEXT:    store float [[TMP62]], float* [[TMP61]], align 4
3402 // CHECK6-NEXT:    br label [[IFCONT8:%.*]]
3403 // CHECK6:       else7:
3404 // CHECK6-NEXT:    br label [[IFCONT8]]
3405 // CHECK6:       ifcont8:
3406 // CHECK6-NEXT:    ret void
3407 // CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4
3408 // CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
3409 // CHECK6-NEXT:  entry:
3410 // CHECK6-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
3411 // CHECK6-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
3412 // CHECK6-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
3413 // CHECK6-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
3414 // CHECK6-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
3415 // CHECK6-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
3416 // CHECK6-NEXT:    [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
3417 // CHECK6-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
3418 // CHECK6-NEXT:    [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
3419 // CHECK6-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
3420 // CHECK6-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
3421 // CHECK6-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
3422 // CHECK6-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
3423 // CHECK6-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
3424 // CHECK6-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
3425 // CHECK6:       then:
3426 // CHECK6-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
3427 // CHECK6-NEXT:    [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4
3428 // CHECK6-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
3429 // CHECK6-NEXT:    [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)*
3430 // CHECK6-NEXT:    [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1
3431 // CHECK6-NEXT:    store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1
3432 // CHECK6-NEXT:    br label [[IFCONT:%.*]]
3433 // CHECK6:       else:
3434 // CHECK6-NEXT:    br label [[IFCONT]]
3435 // CHECK6:       ifcont:
3436 // CHECK6-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
3437 // CHECK6-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
3438 // CHECK6-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
3439 // CHECK6-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
3440 // CHECK6:       then4:
3441 // CHECK6-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
3442 // CHECK6-NEXT:    [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)*
3443 // CHECK6-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
3444 // CHECK6-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
3445 // CHECK6-NEXT:    [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1
3446 // CHECK6-NEXT:    store i8 [[TMP15]], i8* [[TMP14]], align 1
3447 // CHECK6-NEXT:    br label [[IFCONT6:%.*]]
3448 // CHECK6:       else5:
3449 // CHECK6-NEXT:    br label [[IFCONT6]]
3450 // CHECK6:       ifcont6:
3451 // CHECK6-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
3452 // CHECK6-NEXT:    [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
3453 // CHECK6-NEXT:    br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
3454 // CHECK6:       then8:
3455 // CHECK6-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
3456 // CHECK6-NEXT:    [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
3457 // CHECK6-NEXT:    [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32*
3458 // CHECK6-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
3459 // CHECK6-NEXT:    [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4
3460 // CHECK6-NEXT:    store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4
3461 // CHECK6-NEXT:    br label [[IFCONT10:%.*]]
3462 // CHECK6:       else9:
3463 // CHECK6-NEXT:    br label [[IFCONT10]]
3464 // CHECK6:       ifcont10:
3465 // CHECK6-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
3466 // CHECK6-NEXT:    [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4
3467 // CHECK6-NEXT:    [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]]
3468 // CHECK6-NEXT:    br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
3469 // CHECK6:       then12:
3470 // CHECK6-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
3471 // CHECK6-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
3472 // CHECK6-NEXT:    [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 4
3473 // CHECK6-NEXT:    [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32*
3474 // CHECK6-NEXT:    [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4
3475 // CHECK6-NEXT:    store i32 [[TMP26]], i32* [[TMP25]], align 4
3476 // CHECK6-NEXT:    br label [[IFCONT14:%.*]]
3477 // CHECK6:       else13:
3478 // CHECK6-NEXT:    br label [[IFCONT14]]
3479 // CHECK6:       ifcont14:
3480 // CHECK6-NEXT:    ret void
3481 // CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func5
3482 // CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
3483 // CHECK6-NEXT:  entry:
3484 // CHECK6-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
3485 // CHECK6-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
3486 // CHECK6-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
3487 // CHECK6-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
3488 // CHECK6-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
3489 // CHECK6-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
3490 // CHECK6-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
3491 // CHECK6-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
3492 // CHECK6-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
3493 // CHECK6-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3*
3494 // CHECK6-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
3495 // CHECK6-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
3496 // CHECK6-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
3497 // CHECK6-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0
3498 // CHECK6-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP7]]
3499 // CHECK6-NEXT:    [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1
3500 // CHECK6-NEXT:    store i8 [[TMP11]], i8* [[TMP10]], align 128
3501 // CHECK6-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
3502 // CHECK6-NEXT:    [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
3503 // CHECK6-NEXT:    [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
3504 // CHECK6-NEXT:    [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1
3505 // CHECK6-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP7]]
3506 // CHECK6-NEXT:    [[TMP16:%.*]] = load float, float* [[TMP14]], align 4
3507 // CHECK6-NEXT:    store float [[TMP16]], float* [[TMP15]], align 128
3508 // CHECK6-NEXT:    ret void
3509 // CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func6
3510 // CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
3511 // CHECK6-NEXT:  entry:
3512 // CHECK6-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
3513 // CHECK6-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
3514 // CHECK6-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
3515 // CHECK6-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
3516 // CHECK6-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
3517 // CHECK6-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
3518 // CHECK6-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
3519 // CHECK6-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
3520 // CHECK6-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3*
3521 // CHECK6-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
3522 // CHECK6-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
3523 // CHECK6-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0
3524 // CHECK6-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP5]]
3525 // CHECK6-NEXT:    store i8* [[TMP7]], i8** [[TMP6]], align 4
3526 // CHECK6-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
3527 // CHECK6-NEXT:    [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1
3528 // CHECK6-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP5]]
3529 // CHECK6-NEXT:    [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
3530 // CHECK6-NEXT:    store i8* [[TMP10]], i8** [[TMP8]], align 4
3531 // CHECK6-NEXT:    [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
3532 // CHECK6-NEXT:    [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
3533 // CHECK6-NEXT:    call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]]
3534 // CHECK6-NEXT:    ret void
3535 // CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func7
3536 // CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
3537 // CHECK6-NEXT:  entry:
3538 // CHECK6-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
3539 // CHECK6-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
3540 // CHECK6-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
3541 // CHECK6-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
3542 // CHECK6-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
3543 // CHECK6-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
3544 // CHECK6-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
3545 // CHECK6-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
3546 // CHECK6-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
3547 // CHECK6-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.3*
3548 // CHECK6-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
3549 // CHECK6-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
3550 // CHECK6-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
3551 // CHECK6-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 0
3552 // CHECK6-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP7]]
3553 // CHECK6-NEXT:    [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128
3554 // CHECK6-NEXT:    store i8 [[TMP11]], i8* [[TMP9]], align 1
3555 // CHECK6-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
3556 // CHECK6-NEXT:    [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
3557 // CHECK6-NEXT:    [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
3558 // CHECK6-NEXT:    [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP6]], i32 0, i32 1
3559 // CHECK6-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP7]]
3560 // CHECK6-NEXT:    [[TMP16:%.*]] = load float, float* [[TMP15]], align 128
3561 // CHECK6-NEXT:    store float [[TMP16]], float* [[TMP14]], align 4
3562 // CHECK6-NEXT:    ret void
3563 // CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func8
3564 // CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
3565 // CHECK6-NEXT:  entry:
3566 // CHECK6-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
3567 // CHECK6-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
3568 // CHECK6-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
3569 // CHECK6-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
3570 // CHECK6-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
3571 // CHECK6-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
3572 // CHECK6-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
3573 // CHECK6-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
3574 // CHECK6-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.3*
3575 // CHECK6-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
3576 // CHECK6-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
3577 // CHECK6-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3:%.*]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 0
3578 // CHECK6-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP5]]
3579 // CHECK6-NEXT:    store i8* [[TMP7]], i8** [[TMP6]], align 4
3580 // CHECK6-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
3581 // CHECK6-NEXT:    [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_3]], %struct._globalized_locals_ty.3* [[TMP4]], i32 0, i32 1
3582 // CHECK6-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP5]]
3583 // CHECK6-NEXT:    [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
3584 // CHECK6-NEXT:    store i8* [[TMP10]], i8** [[TMP8]], align 4
3585 // CHECK6-NEXT:    [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
3586 // CHECK6-NEXT:    [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
3587 // CHECK6-NEXT:    call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]]
3588 // CHECK6-NEXT:    ret void
3589 // CHECK6-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l36
3590 // CHECK6-SAME: (i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR1]] {
3591 // CHECK6-NEXT:  entry:
3592 // CHECK6-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
3593 // CHECK6-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
3594 // CHECK6-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
3595 // CHECK6-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
3596 // CHECK6-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
3597 // CHECK6-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
3598 // CHECK6-NEXT:    store i32 [[B]], i32* [[B_ADDR]], align 4
3599 // CHECK6-NEXT:    [[CONV:%.*]] = bitcast i32* [[B_ADDR]] to i16*
3600 // CHECK6-NEXT:    [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
3601 // CHECK6-NEXT:    call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
3602 // CHECK6-NEXT:    call void @__kmpc_data_sharing_init_stack_spmd()
3603 // CHECK6-NEXT:    br label [[DOTEXECUTE:%.*]]
3604 // CHECK6:       .execute:
3605 // CHECK6-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
3606 // CHECK6-NEXT:    store i32 [[TMP0]], i32* [[DOTTHREADID_TEMP_]], align 4
3607 // CHECK6-NEXT:    call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR3]]
3608 // CHECK6-NEXT:    br label [[DOTOMP_DEINIT:%.*]]
3609 // CHECK6:       .omp.deinit:
3610 // CHECK6-NEXT:    call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
3611 // CHECK6-NEXT:    br label [[DOTEXIT:%.*]]
3612 // CHECK6:       .exit:
3613 // CHECK6-NEXT:    ret void
3614 // CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__9
3615 // CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] {
3616 // CHECK6-NEXT:  entry:
3617 // CHECK6-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
3618 // CHECK6-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
3619 // CHECK6-NEXT:    [[A_ADDR:%.*]] = alloca i32*, align 4
3620 // CHECK6-NEXT:    [[B_ADDR:%.*]] = alloca i16*, align 4
3621 // CHECK6-NEXT:    [[A1:%.*]] = alloca i32, align 4
3622 // CHECK6-NEXT:    [[B2:%.*]] = alloca i16, align 2
3623 // CHECK6-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
3624 // CHECK6-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
3625 // CHECK6-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
3626 // CHECK6-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
3627 // CHECK6-NEXT:    store i32* [[A]], i32** [[A_ADDR]], align 4
3628 // CHECK6-NEXT:    store i16* [[B]], i16** [[B_ADDR]], align 4
3629 // CHECK6-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
3630 // CHECK6-NEXT:    [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
3631 // CHECK6-NEXT:    store i32 0, i32* [[A1]], align 4
3632 // CHECK6-NEXT:    store i16 -32768, i16* [[B2]], align 2
3633 // CHECK6-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
3634 // CHECK6-NEXT:    [[TMP3:%.*]] = bitcast i32* [[A1]] to i8*
3635 // CHECK6-NEXT:    store i8* [[TMP3]], i8** [[TMP2]], align 4
3636 // CHECK6-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
3637 // CHECK6-NEXT:    [[TMP5:%.*]] = bitcast i16* [[B2]] to i8*
3638 // CHECK6-NEXT:    store i8* [[TMP5]], i8** [[TMP4]], align 4
3639 // CHECK6-NEXT:    [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
3640 // CHECK6-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
3641 // CHECK6-NEXT:    [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
3642 // CHECK6-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__10 to i8*), i8* null, i8** [[TMP8]], i32 2)
3643 // CHECK6-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
3644 // CHECK6-NEXT:    [[TMP10:%.*]] = bitcast i32* [[A1]] to i8*
3645 // CHECK6-NEXT:    store i8* [[TMP10]], i8** [[TMP9]], align 4
3646 // CHECK6-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
3647 // CHECK6-NEXT:    [[TMP12:%.*]] = bitcast i16* [[B2]] to i8*
3648 // CHECK6-NEXT:    store i8* [[TMP12]], i8** [[TMP11]], align 4
3649 // CHECK6-NEXT:    [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
3650 // CHECK6-NEXT:    [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
3651 // CHECK6-NEXT:    [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 2048, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func15, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func16, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func17, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func18, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func20)
3652 // CHECK6-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1
3653 // CHECK6-NEXT:    br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
3654 // CHECK6:       .omp.reduction.then:
3655 // CHECK6-NEXT:    [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4
3656 // CHECK6-NEXT:    [[TMP18:%.*]] = load i32, i32* [[A1]], align 4
3657 // CHECK6-NEXT:    [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]]
3658 // CHECK6-NEXT:    store i32 [[OR]], i32* [[TMP0]], align 4
3659 // CHECK6-NEXT:    [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2
3660 // CHECK6-NEXT:    [[CONV:%.*]] = sext i16 [[TMP19]] to i32
3661 // CHECK6-NEXT:    [[TMP20:%.*]] = load i16, i16* [[B2]], align 2
3662 // CHECK6-NEXT:    [[CONV3:%.*]] = sext i16 [[TMP20]] to i32
3663 // CHECK6-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]]
3664 // CHECK6-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
3665 // CHECK6:       cond.true:
3666 // CHECK6-NEXT:    [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2
3667 // CHECK6-NEXT:    br label [[COND_END:%.*]]
3668 // CHECK6:       cond.false:
3669 // CHECK6-NEXT:    [[TMP22:%.*]] = load i16, i16* [[B2]], align 2
3670 // CHECK6-NEXT:    br label [[COND_END]]
3671 // CHECK6:       cond.end:
3672 // CHECK6-NEXT:    [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ]
3673 // CHECK6-NEXT:    store i16 [[COND]], i16* [[TMP1]], align 2
3674 // CHECK6-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]])
3675 // CHECK6-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
3676 // CHECK6:       .omp.reduction.done:
3677 // CHECK6-NEXT:    ret void
3678 // CHECK6-LABEL: define {{[^@]+}}@__omp_outlined__10
3679 // CHECK6-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] {
3680 // CHECK6-NEXT:  entry:
3681 // CHECK6-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
3682 // CHECK6-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
3683 // CHECK6-NEXT:    [[A_ADDR:%.*]] = alloca i32*, align 4
3684 // CHECK6-NEXT:    [[B_ADDR:%.*]] = alloca i16*, align 4
3685 // CHECK6-NEXT:    [[A1:%.*]] = alloca i32, align 4
3686 // CHECK6-NEXT:    [[B2:%.*]] = alloca i16, align 2
3687 // CHECK6-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
3688 // CHECK6-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
3689 // CHECK6-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
3690 // CHECK6-NEXT:    store i32* [[A]], i32** [[A_ADDR]], align 4
3691 // CHECK6-NEXT:    store i16* [[B]], i16** [[B_ADDR]], align 4
3692 // CHECK6-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
3693 // CHECK6-NEXT:    [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
3694 // CHECK6-NEXT:    store i32 0, i32* [[A1]], align 4
3695 // CHECK6-NEXT:    store i16 -32768, i16* [[B2]], align 2
3696 // CHECK6-NEXT:    [[TMP2:%.*]] = load i32, i32* [[A1]], align 4
3697 // CHECK6-NEXT:    [[OR:%.*]] = or i32 [[TMP2]], 1
3698 // CHECK6-NEXT:    store i32 [[OR]], i32* [[A1]], align 4
3699 // CHECK6-NEXT:    [[TMP3:%.*]] = load i16, i16* [[B2]], align 2
3700 // CHECK6-NEXT:    [[CONV:%.*]] = sext i16 [[TMP3]] to i32
3701 // CHECK6-NEXT:    [[CMP:%.*]] = icmp sgt i32 99, [[CONV]]
3702 // CHECK6-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
3703 // CHECK6:       cond.true:
3704 // CHECK6-NEXT:    br label [[COND_END:%.*]]
3705 // CHECK6:       cond.false:
3706 // CHECK6-NEXT:    [[TMP4:%.*]] = load i16, i16* [[B2]], align 2
3707 // CHECK6-NEXT:    [[CONV3:%.*]] = sext i16 [[TMP4]] to i32
3708 // CHECK6-NEXT:    br label [[COND_END]]
3709 // CHECK6:       cond.end:
3710 // CHECK6-NEXT:    [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ]
3711 // CHECK6-NEXT:    [[CONV4:%.*]] = trunc i32 [[COND]] to i16
3712 // CHECK6-NEXT:    store i16 [[CONV4]], i16* [[B2]], align 2
3713 // CHECK6-NEXT:    [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
3714 // CHECK6-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
3715 // CHECK6-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
3716 // CHECK6-NEXT:    [[TMP8:%.*]] = bitcast i32* [[A1]] to i8*
3717 // CHECK6-NEXT:    store i8* [[TMP8]], i8** [[TMP7]], align 4
3718 // CHECK6-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
3719 // CHECK6-NEXT:    [[TMP10:%.*]] = bitcast i16* [[B2]] to i8*
3720 // CHECK6-NEXT:    store i8* [[TMP10]], i8** [[TMP9]], align 4
3721 // CHECK6-NEXT:    [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
3722 // CHECK6-NEXT:    [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func12, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func13)
3723 // CHECK6-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1
3724 // CHECK6-NEXT:    br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
3725 // CHECK6:       .omp.reduction.then:
3726 // CHECK6-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4
3727 // CHECK6-NEXT:    [[TMP15:%.*]] = load i32, i32* [[A1]], align 4
3728 // CHECK6-NEXT:    [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]]
3729 // CHECK6-NEXT:    store i32 [[OR5]], i32* [[TMP0]], align 4
3730 // CHECK6-NEXT:    [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2
3731 // CHECK6-NEXT:    [[CONV6:%.*]] = sext i16 [[TMP16]] to i32
3732 // CHECK6-NEXT:    [[TMP17:%.*]] = load i16, i16* [[B2]], align 2
3733 // CHECK6-NEXT:    [[CONV7:%.*]] = sext i16 [[TMP17]] to i32
3734 // CHECK6-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]]
3735 // CHECK6-NEXT:    br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]]
3736 // CHECK6:       cond.true9:
3737 // CHECK6-NEXT:    [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2
3738 // CHECK6-NEXT:    br label [[COND_END11:%.*]]
3739 // CHECK6:       cond.false10:
3740 // CHECK6-NEXT:    [[TMP19:%.*]] = load i16, i16* [[B2]], align 2
3741 // CHECK6-NEXT:    br label [[COND_END11]]
3742 // CHECK6:       cond.end11:
3743 // CHECK6-NEXT:    [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ]
3744 // CHECK6-NEXT:    store i16 [[COND12]], i16* [[TMP1]], align 2
3745 // CHECK6-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]])
3746 // CHECK6-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
3747 // CHECK6:       .omp.reduction.done:
3748 // CHECK6-NEXT:    ret void
3749 // CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func12
3750 // CHECK6-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
3751 // CHECK6-NEXT:  entry:
3752 // CHECK6-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
3753 // CHECK6-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
3754 // CHECK6-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2
3755 // CHECK6-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2
3756 // CHECK6-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
3757 // CHECK6-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
3758 // CHECK6-NEXT:    [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
3759 // CHECK6-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
3760 // CHECK6-NEXT:    store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
3761 // CHECK6-NEXT:    store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
3762 // CHECK6-NEXT:    store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
3763 // CHECK6-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
3764 // CHECK6-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
3765 // CHECK6-NEXT:    [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
3766 // CHECK6-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
3767 // CHECK6-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
3768 // CHECK6-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
3769 // CHECK6-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
3770 // CHECK6-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
3771 // CHECK6-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
3772 // CHECK6-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
3773 // CHECK6-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
3774 // CHECK6-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
3775 // CHECK6-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
3776 // CHECK6-NEXT:    [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
3777 // CHECK6-NEXT:    [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]])
3778 // CHECK6-NEXT:    store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
3779 // CHECK6-NEXT:    [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
3780 // CHECK6-NEXT:    [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
3781 // CHECK6-NEXT:    [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
3782 // CHECK6-NEXT:    store i8* [[TMP20]], i8** [[TMP11]], align 4
3783 // CHECK6-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
3784 // CHECK6-NEXT:    [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4
3785 // CHECK6-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
3786 // CHECK6-NEXT:    [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16*
3787 // CHECK6-NEXT:    [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
3788 // CHECK6-NEXT:    [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8*
3789 // CHECK6-NEXT:    [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2
3790 // CHECK6-NEXT:    [[TMP28:%.*]] = sext i16 [[TMP27]] to i32
3791 // CHECK6-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
3792 // CHECK6-NEXT:    [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
3793 // CHECK6-NEXT:    [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
3794 // CHECK6-NEXT:    [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
3795 // CHECK6-NEXT:    store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
3796 // CHECK6-NEXT:    [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
3797 // CHECK6-NEXT:    [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
3798 // CHECK6-NEXT:    [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
3799 // CHECK6-NEXT:    store i8* [[TMP34]], i8** [[TMP23]], align 4
3800 // CHECK6-NEXT:    [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0
3801 // CHECK6-NEXT:    [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1
3802 // CHECK6-NEXT:    [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
3803 // CHECK6-NEXT:    [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]]
3804 // CHECK6-NEXT:    [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2
3805 // CHECK6-NEXT:    [[TMP40:%.*]] = and i16 [[TMP6]], 1
3806 // CHECK6-NEXT:    [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0
3807 // CHECK6-NEXT:    [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]]
3808 // CHECK6-NEXT:    [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0
3809 // CHECK6-NEXT:    [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]]
3810 // CHECK6-NEXT:    [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]]
3811 // CHECK6-NEXT:    [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]]
3812 // CHECK6-NEXT:    br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]]
3813 // CHECK6:       then:
3814 // CHECK6-NEXT:    [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
3815 // CHECK6-NEXT:    [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
3816 // CHECK6-NEXT:    call void @"_omp$reduction$reduction_func11"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]]
3817 // CHECK6-NEXT:    br label [[IFCONT:%.*]]
3818 // CHECK6:       else:
3819 // CHECK6-NEXT:    br label [[IFCONT]]
3820 // CHECK6:       ifcont:
3821 // CHECK6-NEXT:    [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1
3822 // CHECK6-NEXT:    [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
3823 // CHECK6-NEXT:    [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]]
3824 // CHECK6-NEXT:    br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
3825 // CHECK6:       then6:
3826 // CHECK6-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
3827 // CHECK6-NEXT:    [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4
3828 // CHECK6-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
3829 // CHECK6-NEXT:    [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4
3830 // CHECK6-NEXT:    [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32*
3831 // CHECK6-NEXT:    [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32*
3832 // CHECK6-NEXT:    [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4
3833 // CHECK6-NEXT:    store i32 [[TMP58]], i32* [[TMP57]], align 4
3834 // CHECK6-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
3835 // CHECK6-NEXT:    [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4
3836 // CHECK6-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
3837 // CHECK6-NEXT:    [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4
3838 // CHECK6-NEXT:    [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16*
3839 // CHECK6-NEXT:    [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16*
3840 // CHECK6-NEXT:    [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2
3841 // CHECK6-NEXT:    store i16 [[TMP65]], i16* [[TMP64]], align 2
3842 // CHECK6-NEXT:    br label [[IFCONT8:%.*]]
3843 // CHECK6:       else7:
3844 // CHECK6-NEXT:    br label [[IFCONT8]]
3845 // CHECK6:       ifcont8:
3846 // CHECK6-NEXT:    ret void
3847 // CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func13
3848 // CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
3849 // CHECK6-NEXT:  entry:
3850 // CHECK6-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
3851 // CHECK6-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
3852 // CHECK6-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
3853 // CHECK6-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
3854 // CHECK6-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
3855 // CHECK6-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
3856 // CHECK6-NEXT:    [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
3857 // CHECK6-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
3858 // CHECK6-NEXT:    [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
3859 // CHECK6-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
3860 // CHECK6-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
3861 // CHECK6-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
3862 // CHECK6-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]])
3863 // CHECK6-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
3864 // CHECK6-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
3865 // CHECK6:       then:
3866 // CHECK6-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
3867 // CHECK6-NEXT:    [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4
3868 // CHECK6-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
3869 // CHECK6-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
3870 // CHECK6-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4
3871 // CHECK6-NEXT:    store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4
3872 // CHECK6-NEXT:    br label [[IFCONT:%.*]]
3873 // CHECK6:       else:
3874 // CHECK6-NEXT:    br label [[IFCONT]]
3875 // CHECK6:       ifcont:
3876 // CHECK6-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
3877 // CHECK6-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
3878 // CHECK6-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
3879 // CHECK6-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
3880 // CHECK6:       then4:
3881 // CHECK6-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
3882 // CHECK6-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
3883 // CHECK6-NEXT:    [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
3884 // CHECK6-NEXT:    [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32*
3885 // CHECK6-NEXT:    [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4
3886 // CHECK6-NEXT:    store i32 [[TMP15]], i32* [[TMP14]], align 4
3887 // CHECK6-NEXT:    br label [[IFCONT6:%.*]]
3888 // CHECK6:       else5:
3889 // CHECK6-NEXT:    br label [[IFCONT6]]
3890 // CHECK6:       ifcont6:
3891 // CHECK6-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
3892 // CHECK6-NEXT:    [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
3893 // CHECK6-NEXT:    br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
3894 // CHECK6:       then8:
3895 // CHECK6-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
3896 // CHECK6-NEXT:    [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
3897 // CHECK6-NEXT:    [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16*
3898 // CHECK6-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
3899 // CHECK6-NEXT:    [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)*
3900 // CHECK6-NEXT:    [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2
3901 // CHECK6-NEXT:    store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2
3902 // CHECK6-NEXT:    br label [[IFCONT10:%.*]]
3903 // CHECK6:       else9:
3904 // CHECK6-NEXT:    br label [[IFCONT10]]
3905 // CHECK6:       ifcont10:
3906 // CHECK6-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
3907 // CHECK6-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4
3908 // CHECK6-NEXT:    [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]]
3909 // CHECK6-NEXT:    br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
3910 // CHECK6:       then12:
3911 // CHECK6-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
3912 // CHECK6-NEXT:    [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)*
3913 // CHECK6-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
3914 // CHECK6-NEXT:    [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4
3915 // CHECK6-NEXT:    [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16*
3916 // CHECK6-NEXT:    [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2
3917 // CHECK6-NEXT:    store i16 [[TMP28]], i16* [[TMP27]], align 2
3918 // CHECK6-NEXT:    br label [[IFCONT14:%.*]]
3919 // CHECK6:       else13:
3920 // CHECK6-NEXT:    br label [[IFCONT14]]
3921 // CHECK6:       ifcont14:
3922 // CHECK6-NEXT:    ret void
3923 // CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func15
3924 // CHECK6-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR0]] {
3925 // CHECK6-NEXT:  entry:
3926 // CHECK6-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
3927 // CHECK6-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
3928 // CHECK6-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2
3929 // CHECK6-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2
3930 // CHECK6-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
3931 // CHECK6-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
3932 // CHECK6-NEXT:    [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
3933 // CHECK6-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
3934 // CHECK6-NEXT:    store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
3935 // CHECK6-NEXT:    store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
3936 // CHECK6-NEXT:    store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
3937 // CHECK6-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
3938 // CHECK6-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
3939 // CHECK6-NEXT:    [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
3940 // CHECK6-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
3941 // CHECK6-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
3942 // CHECK6-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
3943 // CHECK6-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
3944 // CHECK6-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
3945 // CHECK6-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
3946 // CHECK6-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
3947 // CHECK6-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
3948 // CHECK6-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
3949 // CHECK6-NEXT:    [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
3950 // CHECK6-NEXT:    [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
3951 // CHECK6-NEXT:    [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]])
3952 // CHECK6-NEXT:    store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
3953 // CHECK6-NEXT:    [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
3954 // CHECK6-NEXT:    [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
3955 // CHECK6-NEXT:    [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
3956 // CHECK6-NEXT:    store i8* [[TMP20]], i8** [[TMP11]], align 4
3957 // CHECK6-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
3958 // CHECK6-NEXT:    [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4
3959 // CHECK6-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
3960 // CHECK6-NEXT:    [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16*
3961 // CHECK6-NEXT:    [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
3962 // CHECK6-NEXT:    [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8*
3963 // CHECK6-NEXT:    [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2
3964 // CHECK6-NEXT:    [[TMP28:%.*]] = sext i16 [[TMP27]] to i32
3965 // CHECK6-NEXT:    [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
3966 // CHECK6-NEXT:    [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16
3967 // CHECK6-NEXT:    [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]])
3968 // CHECK6-NEXT:    [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
3969 // CHECK6-NEXT:    store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
3970 // CHECK6-NEXT:    [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1
3971 // CHECK6-NEXT:    [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
3972 // CHECK6-NEXT:    [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
3973 // CHECK6-NEXT:    store i8* [[TMP34]], i8** [[TMP23]], align 4
3974 // CHECK6-NEXT:    [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0
3975 // CHECK6-NEXT:    [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1
3976 // CHECK6-NEXT:    [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
3977 // CHECK6-NEXT:    [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]]
3978 // CHECK6-NEXT:    [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2
3979 // CHECK6-NEXT:    [[TMP40:%.*]] = and i16 [[TMP6]], 1
3980 // CHECK6-NEXT:    [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0
3981 // CHECK6-NEXT:    [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]]
3982 // CHECK6-NEXT:    [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0
3983 // CHECK6-NEXT:    [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]]
3984 // CHECK6-NEXT:    [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]]
3985 // CHECK6-NEXT:    [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]]
3986 // CHECK6-NEXT:    br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]]
3987 // CHECK6:       then:
3988 // CHECK6-NEXT:    [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
3989 // CHECK6-NEXT:    [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
3990 // CHECK6-NEXT:    call void @"_omp$reduction$reduction_func14"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR3]]
3991 // CHECK6-NEXT:    br label [[IFCONT:%.*]]
3992 // CHECK6:       else:
3993 // CHECK6-NEXT:    br label [[IFCONT]]
3994 // CHECK6:       ifcont:
3995 // CHECK6-NEXT:    [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1
3996 // CHECK6-NEXT:    [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
3997 // CHECK6-NEXT:    [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]]
3998 // CHECK6-NEXT:    br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
3999 // CHECK6:       then6:
4000 // CHECK6-NEXT:    [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
4001 // CHECK6-NEXT:    [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4
4002 // CHECK6-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
4003 // CHECK6-NEXT:    [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4
4004 // CHECK6-NEXT:    [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32*
4005 // CHECK6-NEXT:    [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32*
4006 // CHECK6-NEXT:    [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4
4007 // CHECK6-NEXT:    store i32 [[TMP58]], i32* [[TMP57]], align 4
4008 // CHECK6-NEXT:    [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
4009 // CHECK6-NEXT:    [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4
4010 // CHECK6-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
4011 // CHECK6-NEXT:    [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4
4012 // CHECK6-NEXT:    [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16*
4013 // CHECK6-NEXT:    [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16*
4014 // CHECK6-NEXT:    [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2
4015 // CHECK6-NEXT:    store i16 [[TMP65]], i16* [[TMP64]], align 2
4016 // CHECK6-NEXT:    br label [[IFCONT8:%.*]]
4017 // CHECK6:       else7:
4018 // CHECK6-NEXT:    br label [[IFCONT8]]
4019 // CHECK6:       ifcont8:
4020 // CHECK6-NEXT:    ret void
4021 // CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func16
4022 // CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
4023 // CHECK6-NEXT:  entry:
4024 // CHECK6-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
4025 // CHECK6-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
4026 // CHECK6-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
4027 // CHECK6-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
4028 // CHECK6-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
4029 // CHECK6-NEXT:    [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
4030 // CHECK6-NEXT:    [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
4031 // CHECK6-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
4032 // CHECK6-NEXT:    [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
4033 // CHECK6-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
4034 // CHECK6-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
4035 // CHECK6-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
4036 // CHECK6-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
4037 // CHECK6-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
4038 // CHECK6-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
4039 // CHECK6:       then:
4040 // CHECK6-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
4041 // CHECK6-NEXT:    [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4
4042 // CHECK6-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32*
4043 // CHECK6-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
4044 // CHECK6-NEXT:    [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4
4045 // CHECK6-NEXT:    store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4
4046 // CHECK6-NEXT:    br label [[IFCONT:%.*]]
4047 // CHECK6:       else:
4048 // CHECK6-NEXT:    br label [[IFCONT]]
4049 // CHECK6:       ifcont:
4050 // CHECK6-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
4051 // CHECK6-NEXT:    [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4
4052 // CHECK6-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]]
4053 // CHECK6-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
4054 // CHECK6:       then4:
4055 // CHECK6-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
4056 // CHECK6-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
4057 // CHECK6-NEXT:    [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
4058 // CHECK6-NEXT:    [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32*
4059 // CHECK6-NEXT:    [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4
4060 // CHECK6-NEXT:    store i32 [[TMP15]], i32* [[TMP14]], align 4
4061 // CHECK6-NEXT:    br label [[IFCONT6:%.*]]
4062 // CHECK6:       else5:
4063 // CHECK6-NEXT:    br label [[IFCONT6]]
4064 // CHECK6:       ifcont6:
4065 // CHECK6-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
4066 // CHECK6-NEXT:    [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
4067 // CHECK6-NEXT:    br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]]
4068 // CHECK6:       then8:
4069 // CHECK6-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
4070 // CHECK6-NEXT:    [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
4071 // CHECK6-NEXT:    [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16*
4072 // CHECK6-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
4073 // CHECK6-NEXT:    [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)*
4074 // CHECK6-NEXT:    [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2
4075 // CHECK6-NEXT:    store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2
4076 // CHECK6-NEXT:    br label [[IFCONT10:%.*]]
4077 // CHECK6:       else9:
4078 // CHECK6-NEXT:    br label [[IFCONT10]]
4079 // CHECK6:       ifcont10:
4080 // CHECK6-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
4081 // CHECK6-NEXT:    [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4
4082 // CHECK6-NEXT:    [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]]
4083 // CHECK6-NEXT:    br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]]
4084 // CHECK6:       then12:
4085 // CHECK6-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
4086 // CHECK6-NEXT:    [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)*
4087 // CHECK6-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
4088 // CHECK6-NEXT:    [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4
4089 // CHECK6-NEXT:    [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16*
4090 // CHECK6-NEXT:    [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2
4091 // CHECK6-NEXT:    store i16 [[TMP28]], i16* [[TMP27]], align 2
4092 // CHECK6-NEXT:    br label [[IFCONT14:%.*]]
4093 // CHECK6:       else13:
4094 // CHECK6-NEXT:    br label [[IFCONT14]]
4095 // CHECK6:       ifcont14:
4096 // CHECK6-NEXT:    ret void
4097 // CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func17
4098 // CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
4099 // CHECK6-NEXT:  entry:
4100 // CHECK6-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
4101 // CHECK6-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
4102 // CHECK6-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
4103 // CHECK6-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
4104 // CHECK6-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
4105 // CHECK6-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
4106 // CHECK6-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
4107 // CHECK6-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
4108 // CHECK6-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
4109 // CHECK6-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4*
4110 // CHECK6-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
4111 // CHECK6-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
4112 // CHECK6-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
4113 // CHECK6-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
4114 // CHECK6-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0
4115 // CHECK6-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]]
4116 // CHECK6-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
4117 // CHECK6-NEXT:    store i32 [[TMP12]], i32* [[TMP11]], align 128
4118 // CHECK6-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
4119 // CHECK6-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
4120 // CHECK6-NEXT:    [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
4121 // CHECK6-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1
4122 // CHECK6-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]]
4123 // CHECK6-NEXT:    [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2
4124 // CHECK6-NEXT:    store i16 [[TMP17]], i16* [[TMP16]], align 128
4125 // CHECK6-NEXT:    ret void
4126 // CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func18
4127 // CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
4128 // CHECK6-NEXT:  entry:
4129 // CHECK6-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
4130 // CHECK6-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
4131 // CHECK6-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
4132 // CHECK6-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
4133 // CHECK6-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
4134 // CHECK6-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
4135 // CHECK6-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
4136 // CHECK6-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
4137 // CHECK6-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4*
4138 // CHECK6-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
4139 // CHECK6-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
4140 // CHECK6-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0
4141 // CHECK6-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]]
4142 // CHECK6-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
4143 // CHECK6-NEXT:    store i8* [[TMP8]], i8** [[TMP6]], align 4
4144 // CHECK6-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
4145 // CHECK6-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1
4146 // CHECK6-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]]
4147 // CHECK6-NEXT:    [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
4148 // CHECK6-NEXT:    store i8* [[TMP11]], i8** [[TMP9]], align 4
4149 // CHECK6-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
4150 // CHECK6-NEXT:    [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
4151 // CHECK6-NEXT:    call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]]
4152 // CHECK6-NEXT:    ret void
4153 // CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func19
4154 // CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
4155 // CHECK6-NEXT:  entry:
4156 // CHECK6-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
4157 // CHECK6-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
4158 // CHECK6-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
4159 // CHECK6-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
4160 // CHECK6-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
4161 // CHECK6-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
4162 // CHECK6-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
4163 // CHECK6-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
4164 // CHECK6-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
4165 // CHECK6-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.4*
4166 // CHECK6-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
4167 // CHECK6-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
4168 // CHECK6-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
4169 // CHECK6-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
4170 // CHECK6-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 0
4171 // CHECK6-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]]
4172 // CHECK6-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128
4173 // CHECK6-NEXT:    store i32 [[TMP12]], i32* [[TMP10]], align 4
4174 // CHECK6-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
4175 // CHECK6-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
4176 // CHECK6-NEXT:    [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
4177 // CHECK6-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP6]], i32 0, i32 1
4178 // CHECK6-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]]
4179 // CHECK6-NEXT:    [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128
4180 // CHECK6-NEXT:    store i16 [[TMP17]], i16* [[TMP15]], align 2
4181 // CHECK6-NEXT:    ret void
4182 // CHECK6-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func20
4183 // CHECK6-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]], i8* [[TMP2:%.*]]) #[[ATTR0]] {
4184 // CHECK6-NEXT:  entry:
4185 // CHECK6-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
4186 // CHECK6-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
4187 // CHECK6-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
4188 // CHECK6-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
4189 // CHECK6-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
4190 // CHECK6-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
4191 // CHECK6-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
4192 // CHECK6-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
4193 // CHECK6-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.4*
4194 // CHECK6-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
4195 // CHECK6-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
4196 // CHECK6-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4:%.*]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 0
4197 // CHECK6-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]]
4198 // CHECK6-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
4199 // CHECK6-NEXT:    store i8* [[TMP8]], i8** [[TMP6]], align 4
4200 // CHECK6-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
4201 // CHECK6-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_4]], %struct._globalized_locals_ty.4* [[TMP4]], i32 0, i32 1
4202 // CHECK6-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]]
4203 // CHECK6-NEXT:    [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
4204 // CHECK6-NEXT:    store i8* [[TMP11]], i8** [[TMP9]], align 4
4205 // CHECK6-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
4206 // CHECK6-NEXT:    [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
4207 // CHECK6-NEXT:    call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]]
4208 // CHECK6-NEXT:    ret void
4209 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l20
4210 // CHECK1-SAME: (i64 noundef [[E:%.*]]) #[[ATTR0:[0-9]+]] {
4211 // CHECK1-NEXT:  entry:
4212 // CHECK1-NEXT:    [[E_ADDR:%.*]] = alloca i64, align 8
4213 // CHECK1-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
4214 // CHECK1-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
4215 // CHECK1-NEXT:    store i64 [[E]], i64* [[E_ADDR]], align 8
4216 // CHECK1-NEXT:    [[CONV:%.*]] = bitcast i64* [[E_ADDR]] to double*
4217 // CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true)
4218 // CHECK1-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
4219 // CHECK1-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
4220 // CHECK1:       user_code.entry:
4221 // CHECK1-NEXT:    [[TMP1:%.*]] = load double, double* [[CONV]], align 8
4222 // CHECK1-NEXT:    [[E1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 8)
4223 // CHECK1-NEXT:    [[E_ON_STACK:%.*]] = bitcast i8* [[E1]] to double*
4224 // CHECK1-NEXT:    store double [[TMP1]], double* [[E_ON_STACK]], align 8
4225 // CHECK1-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
4226 // CHECK1-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
4227 // CHECK1-NEXT:    store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4
4228 // CHECK1-NEXT:    call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E_ON_STACK]]) #[[ATTR3:[0-9]+]]
4229 // CHECK1-NEXT:    call void @__kmpc_free_shared(i8* [[E1]], i64 8)
4230 // CHECK1-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
4231 // CHECK1-NEXT:    ret void
4232 // CHECK1:       worker.exit:
4233 // CHECK1-NEXT:    ret void
4234 //
4235 //
4236 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__
4237 // CHECK1-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], double* noundef nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR0]] {
4238 // CHECK1-NEXT:  entry:
4239 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
4240 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
4241 // CHECK1-NEXT:    [[E_ADDR:%.*]] = alloca double*, align 8
4242 // CHECK1-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8
4243 // CHECK1-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
4244 // CHECK1-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
4245 // CHECK1-NEXT:    store double* [[E]], double** [[E_ADDR]], align 8
4246 // CHECK1-NEXT:    [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 8
4247 // CHECK1-NEXT:    [[E1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 8)
4248 // CHECK1-NEXT:    [[E_ON_STACK:%.*]] = bitcast i8* [[E1]] to double*
4249 // CHECK1-NEXT:    store double 0.000000e+00, double* [[E_ON_STACK]], align 8
4250 // CHECK1-NEXT:    [[TMP1:%.*]] = load double, double* [[E_ON_STACK]], align 8
4251 // CHECK1-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], 5.000000e+00
4252 // CHECK1-NEXT:    store double [[ADD]], double* [[E_ON_STACK]], align 8
4253 // CHECK1-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
4254 // CHECK1-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
4255 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
4256 // CHECK1-NEXT:    [[TMP5:%.*]] = bitcast double* [[E_ON_STACK]] to i8*
4257 // CHECK1-NEXT:    store i8* [[TMP5]], i8** [[TMP4]], align 8
4258 // CHECK1-NEXT:    [[TMP6:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
4259 // CHECK1-NEXT:    [[TMP7:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 8
4260 // CHECK1-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i8* [[TMP7]], i32 1024, i8* [[TMP6]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func)
4261 // CHECK1-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1
4262 // CHECK1-NEXT:    br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
4263 // CHECK1:       .omp.reduction.then:
4264 // CHECK1-NEXT:    [[TMP10:%.*]] = load double, double* [[TMP0]], align 8
4265 // CHECK1-NEXT:    [[TMP11:%.*]] = load double, double* [[E_ON_STACK]], align 8
4266 // CHECK1-NEXT:    [[ADD2:%.*]] = fadd double [[TMP10]], [[TMP11]]
4267 // CHECK1-NEXT:    store double [[ADD2]], double* [[TMP0]], align 8
4268 // CHECK1-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]])
4269 // CHECK1-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
4270 // CHECK1:       .omp.reduction.done:
4271 // CHECK1-NEXT:    call void @__kmpc_free_shared(i8* [[E1]], i64 8)
4272 // CHECK1-NEXT:    ret void
4273 //
4274 //
4275 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
4276 // CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR2:[0-9]+]] {
4277 // CHECK1-NEXT:  entry:
4278 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 8
4279 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
4280 // CHECK1-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2
4281 // CHECK1-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2
4282 // CHECK1-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 8
4283 // CHECK1-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8
4284 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
4285 // CHECK1-NEXT:    store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
4286 // CHECK1-NEXT:    store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
4287 // CHECK1-NEXT:    store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
4288 // CHECK1-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
4289 // CHECK1-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]*
4290 // CHECK1-NEXT:    [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
4291 // CHECK1-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
4292 // CHECK1-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
4293 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
4294 // CHECK1-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
4295 // CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
4296 // CHECK1-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double*
4297 // CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i64 1
4298 // CHECK1-NEXT:    [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8*
4299 // CHECK1-NEXT:    [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64*
4300 // CHECK1-NEXT:    [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
4301 // CHECK1-NEXT:    [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8
4302 // CHECK1-NEXT:    [[TMP18:%.*]] = call i32 @__kmpc_get_warp_size()
4303 // CHECK1-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
4304 // CHECK1-NEXT:    [[TMP20:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP19]])
4305 // CHECK1-NEXT:    store i64 [[TMP20]], i64* [[TMP16]], align 8
4306 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr i64, i64* [[TMP15]], i64 1
4307 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr i64, i64* [[TMP16]], i64 1
4308 // CHECK1-NEXT:    [[TMP23:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
4309 // CHECK1-NEXT:    store i8* [[TMP23]], i8** [[TMP11]], align 8
4310 // CHECK1-NEXT:    [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 0
4311 // CHECK1-NEXT:    [[TMP25:%.*]] = icmp eq i16 [[TMP8]], 1
4312 // CHECK1-NEXT:    [[TMP26:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
4313 // CHECK1-NEXT:    [[TMP27:%.*]] = and i1 [[TMP25]], [[TMP26]]
4314 // CHECK1-NEXT:    [[TMP28:%.*]] = icmp eq i16 [[TMP8]], 2
4315 // CHECK1-NEXT:    [[TMP29:%.*]] = and i16 [[TMP6]], 1
4316 // CHECK1-NEXT:    [[TMP30:%.*]] = icmp eq i16 [[TMP29]], 0
4317 // CHECK1-NEXT:    [[TMP31:%.*]] = and i1 [[TMP28]], [[TMP30]]
4318 // CHECK1-NEXT:    [[TMP32:%.*]] = icmp sgt i16 [[TMP7]], 0
4319 // CHECK1-NEXT:    [[TMP33:%.*]] = and i1 [[TMP31]], [[TMP32]]
4320 // CHECK1-NEXT:    [[TMP34:%.*]] = or i1 [[TMP24]], [[TMP27]]
4321 // CHECK1-NEXT:    [[TMP35:%.*]] = or i1 [[TMP34]], [[TMP33]]
4322 // CHECK1-NEXT:    br i1 [[TMP35]], label [[THEN:%.*]], label [[ELSE:%.*]]
4323 // CHECK1:       then:
4324 // CHECK1-NEXT:    [[TMP36:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8*
4325 // CHECK1-NEXT:    [[TMP37:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
4326 // CHECK1-NEXT:    call void @"_omp$reduction$reduction_func"(i8* [[TMP36]], i8* [[TMP37]]) #[[ATTR3]]
4327 // CHECK1-NEXT:    br label [[IFCONT:%.*]]
4328 // CHECK1:       else:
4329 // CHECK1-NEXT:    br label [[IFCONT]]
4330 // CHECK1:       ifcont:
4331 // CHECK1-NEXT:    [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1
4332 // CHECK1-NEXT:    [[TMP39:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
4333 // CHECK1-NEXT:    [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]]
4334 // CHECK1-NEXT:    br i1 [[TMP40]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
4335 // CHECK1:       then4:
4336 // CHECK1-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
4337 // CHECK1-NEXT:    [[TMP42:%.*]] = load i8*, i8** [[TMP41]], align 8
4338 // CHECK1-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0
4339 // CHECK1-NEXT:    [[TMP44:%.*]] = load i8*, i8** [[TMP43]], align 8
4340 // CHECK1-NEXT:    [[TMP45:%.*]] = bitcast i8* [[TMP42]] to double*
4341 // CHECK1-NEXT:    [[TMP46:%.*]] = bitcast i8* [[TMP44]] to double*
4342 // CHECK1-NEXT:    [[TMP47:%.*]] = load double, double* [[TMP45]], align 8
4343 // CHECK1-NEXT:    store double [[TMP47]], double* [[TMP46]], align 8
4344 // CHECK1-NEXT:    br label [[IFCONT6:%.*]]
4345 // CHECK1:       else5:
4346 // CHECK1-NEXT:    br label [[IFCONT6]]
4347 // CHECK1:       ifcont6:
4348 // CHECK1-NEXT:    ret void
4349 //
4350 //
4351 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
4352 // CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
4353 // CHECK1-NEXT:  entry:
4354 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 8
4355 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
4356 // CHECK1-NEXT:    [[DOTCNT_ADDR:%.*]] = alloca i32, align 4
4357 // CHECK1-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
4358 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
4359 // CHECK1-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
4360 // CHECK1-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
4361 // CHECK1-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
4362 // CHECK1-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
4363 // CHECK1-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
4364 // CHECK1-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
4365 // CHECK1-NEXT:    [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 8
4366 // CHECK1-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [1 x i8*]*
4367 // CHECK1-NEXT:    store i32 0, i32* [[DOTCNT_ADDR]], align 4
4368 // CHECK1-NEXT:    br label [[PRECOND:%.*]]
4369 // CHECK1:       precond:
4370 // CHECK1-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4
4371 // CHECK1-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 2
4372 // CHECK1-NEXT:    br i1 [[TMP9]], label [[BODY:%.*]], label [[EXIT:%.*]]
4373 // CHECK1:       body:
4374 // CHECK1-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]])
4375 // CHECK1-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
4376 // CHECK1-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
4377 // CHECK1:       then:
4378 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP7]], i64 0, i64 0
4379 // CHECK1-NEXT:    [[TMP11:%.*]] = load i8*, i8** [[TMP10]], align 8
4380 // CHECK1-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP11]] to i32*
4381 // CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 [[TMP8]]
4382 // CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
4383 // CHECK1-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP13]], align 4
4384 // CHECK1-NEXT:    store volatile i32 [[TMP15]], i32 addrspace(3)* [[TMP14]], align 4
4385 // CHECK1-NEXT:    br label [[IFCONT:%.*]]
4386 // CHECK1:       else:
4387 // CHECK1-NEXT:    br label [[IFCONT]]
4388 // CHECK1:       ifcont:
4389 // CHECK1-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
4390 // CHECK1-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTADDR1]], align 4
4391 // CHECK1-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP16]]
4392 // CHECK1-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
4393 // CHECK1:       then2:
4394 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
4395 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP7]], i64 0, i64 0
4396 // CHECK1-NEXT:    [[TMP19:%.*]] = load i8*, i8** [[TMP18]], align 8
4397 // CHECK1-NEXT:    [[TMP20:%.*]] = bitcast i8* [[TMP19]] to i32*
4398 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr i32, i32* [[TMP20]], i32 [[TMP8]]
4399 // CHECK1-NEXT:    [[TMP22:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP17]], align 4
4400 // CHECK1-NEXT:    store i32 [[TMP22]], i32* [[TMP21]], align 4
4401 // CHECK1-NEXT:    br label [[IFCONT4:%.*]]
4402 // CHECK1:       else3:
4403 // CHECK1-NEXT:    br label [[IFCONT4]]
4404 // CHECK1:       ifcont4:
4405 // CHECK1-NEXT:    [[TMP23:%.*]] = add nsw i32 [[TMP8]], 1
4406 // CHECK1-NEXT:    store i32 [[TMP23]], i32* [[DOTCNT_ADDR]], align 4
4407 // CHECK1-NEXT:    br label [[PRECOND]]
4408 // CHECK1:       exit:
4409 // CHECK1-NEXT:    ret void
4410 //
4411 //
4412 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func
4413 // CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
4414 // CHECK1-NEXT:  entry:
4415 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 8
4416 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
4417 // CHECK1-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 8
4418 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
4419 // CHECK1-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
4420 // CHECK1-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
4421 // CHECK1-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
4422 // CHECK1-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
4423 // CHECK1-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8
4424 // CHECK1-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty*
4425 // CHECK1-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
4426 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0
4427 // CHECK1-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
4428 // CHECK1-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
4429 // CHECK1-NEXT:    [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0
4430 // CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]]
4431 // CHECK1-NEXT:    [[TMP12:%.*]] = load double, double* [[TMP10]], align 8
4432 // CHECK1-NEXT:    store double [[TMP12]], double* [[TMP11]], align 128
4433 // CHECK1-NEXT:    ret void
4434 //
4435 //
4436 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func
4437 // CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
4438 // CHECK1-NEXT:  entry:
4439 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 8
4440 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
4441 // CHECK1-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 8
4442 // CHECK1-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8
4443 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
4444 // CHECK1-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
4445 // CHECK1-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
4446 // CHECK1-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
4447 // CHECK1-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty*
4448 // CHECK1-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
4449 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
4450 // CHECK1-NEXT:    [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0
4451 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]]
4452 // CHECK1-NEXT:    [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
4453 // CHECK1-NEXT:    store i8* [[TMP8]], i8** [[TMP6]], align 8
4454 // CHECK1-NEXT:    [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
4455 // CHECK1-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
4456 // CHECK1-NEXT:    call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]]
4457 // CHECK1-NEXT:    ret void
4458 //
4459 //
4460 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func
4461 // CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
4462 // CHECK1-NEXT:  entry:
4463 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 8
4464 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
4465 // CHECK1-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 8
4466 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
4467 // CHECK1-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
4468 // CHECK1-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
4469 // CHECK1-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
4470 // CHECK1-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
4471 // CHECK1-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8
4472 // CHECK1-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty*
4473 // CHECK1-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
4474 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0
4475 // CHECK1-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
4476 // CHECK1-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
4477 // CHECK1-NEXT:    [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0
4478 // CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]]
4479 // CHECK1-NEXT:    [[TMP12:%.*]] = load double, double* [[TMP11]], align 128
4480 // CHECK1-NEXT:    store double [[TMP12]], double* [[TMP10]], align 8
4481 // CHECK1-NEXT:    ret void
4482 //
4483 //
4484 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func
4485 // CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
4486 // CHECK1-NEXT:  entry:
4487 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 8
4488 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
4489 // CHECK1-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 8
4490 // CHECK1-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8
4491 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
4492 // CHECK1-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
4493 // CHECK1-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
4494 // CHECK1-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
4495 // CHECK1-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty*
4496 // CHECK1-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
4497 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
4498 // CHECK1-NEXT:    [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0
4499 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]]
4500 // CHECK1-NEXT:    [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
4501 // CHECK1-NEXT:    store i8* [[TMP8]], i8** [[TMP6]], align 8
4502 // CHECK1-NEXT:    [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
4503 // CHECK1-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
4504 // CHECK1-NEXT:    call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]]
4505 // CHECK1-NEXT:    ret void
4506 //
4507 //
4508 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l26
4509 // CHECK1-SAME: (i64 noundef [[C:%.*]], i64 noundef [[D:%.*]]) #[[ATTR0]] {
4510 // CHECK1-NEXT:  entry:
4511 // CHECK1-NEXT:    [[C_ADDR:%.*]] = alloca i64, align 8
4512 // CHECK1-NEXT:    [[D_ADDR:%.*]] = alloca i64, align 8
4513 // CHECK1-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
4514 // CHECK1-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
4515 // CHECK1-NEXT:    store i64 [[C]], i64* [[C_ADDR]], align 8
4516 // CHECK1-NEXT:    store i64 [[D]], i64* [[D_ADDR]], align 8
4517 // CHECK1-NEXT:    [[CONV:%.*]] = bitcast i64* [[C_ADDR]] to i8*
4518 // CHECK1-NEXT:    [[CONV1:%.*]] = bitcast i64* [[D_ADDR]] to float*
4519 // CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true)
4520 // CHECK1-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
4521 // CHECK1-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
4522 // CHECK1:       user_code.entry:
4523 // CHECK1-NEXT:    [[TMP1:%.*]] = load i8, i8* [[CONV]], align 1
4524 // CHECK1-NEXT:    [[C2:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 1)
4525 // CHECK1-NEXT:    store i8 [[TMP1]], i8* [[C2]], align 1
4526 // CHECK1-NEXT:    [[TMP2:%.*]] = load float, float* [[CONV1]], align 4
4527 // CHECK1-NEXT:    [[D3:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 4)
4528 // CHECK1-NEXT:    [[D_ON_STACK:%.*]] = bitcast i8* [[D3]] to float*
4529 // CHECK1-NEXT:    store float [[TMP2]], float* [[D_ON_STACK]], align 4
4530 // CHECK1-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
4531 // CHECK1-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
4532 // CHECK1-NEXT:    store i32 [[TMP3]], i32* [[DOTTHREADID_TEMP_]], align 4
4533 // CHECK1-NEXT:    call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C2]], float* [[D_ON_STACK]]) #[[ATTR3]]
4534 // CHECK1-NEXT:    call void @__kmpc_free_shared(i8* [[D3]], i64 4)
4535 // CHECK1-NEXT:    call void @__kmpc_free_shared(i8* [[C2]], i64 1)
4536 // CHECK1-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
4537 // CHECK1-NEXT:    ret void
4538 // CHECK1:       worker.exit:
4539 // CHECK1-NEXT:    ret void
4540 //
4541 //
4542 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1
4543 // CHECK1-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i8* noundef nonnull align 1 dereferenceable(1) [[C:%.*]], float* noundef nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR0]] {
4544 // CHECK1-NEXT:  entry:
4545 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
4546 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
4547 // CHECK1-NEXT:    [[C_ADDR:%.*]] = alloca i8*, align 8
4548 // CHECK1-NEXT:    [[D_ADDR:%.*]] = alloca float*, align 8
4549 // CHECK1-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
4550 // CHECK1-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
4551 // CHECK1-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
4552 // CHECK1-NEXT:    store i8* [[C]], i8** [[C_ADDR]], align 8
4553 // CHECK1-NEXT:    store float* [[D]], float** [[D_ADDR]], align 8
4554 // CHECK1-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 8
4555 // CHECK1-NEXT:    [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 8
4556 // CHECK1-NEXT:    [[C1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 1)
4557 // CHECK1-NEXT:    [[D2:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i64 4)
4558 // CHECK1-NEXT:    [[D_ON_STACK:%.*]] = bitcast i8* [[D2]] to float*
4559 // CHECK1-NEXT:    store i8 0, i8* [[C1]], align 1
4560 // CHECK1-NEXT:    store float 1.000000e+00, float* [[D_ON_STACK]], align 4
4561 // CHECK1-NEXT:    [[TMP2:%.*]] = load i8, i8* [[C1]], align 1
4562 // CHECK1-NEXT:    [[CONV:%.*]] = sext i8 [[TMP2]] to i32
4563 // CHECK1-NEXT:    [[XOR:%.*]] = xor i32 [[CONV]], 2
4564 // CHECK1-NEXT:    [[CONV3:%.*]] = trunc i32 [[XOR]] to i8
4565 // CHECK1-NEXT:    store i8 [[CONV3]], i8* [[C1]], align 1
4566 // CHECK1-NEXT:    [[TMP3:%.*]] = load float, float* [[D_ON_STACK]], align 4
4567 // CHECK1-NEXT:    [[MUL:%.*]] = fmul float [[TMP3]], 3.300000e+01
4568 // CHECK1-NEXT:    store float [[MUL]], float* [[D_ON_STACK]], align 4
4569 // CHECK1-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
4570 // CHECK1-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
4571 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
4572 // CHECK1-NEXT:    store i8* [[C1]], i8** [[TMP6]], align 8
4573 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
4574 // CHECK1-NEXT:    [[TMP8:%.*]] = bitcast float* [[D_ON_STACK]] to i8*
4575 // CHECK1-NEXT:    store i8* [[TMP8]], i8** [[TMP7]], align 8
4576 // CHECK1-NEXT:    [[TMP9:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
4577 // CHECK1-NEXT:    [[TMP10:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 8
4578 // CHECK1-NEXT:    [[TMP11:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i8* [[TMP10]], i32 1024, i8* [[TMP9]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func5, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func6, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func8)
4579 // CHECK1-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[TMP11]], 1
4580 // CHECK1-NEXT:    br i1 [[TMP12]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
4581 // CHECK1:       .omp.reduction.then:
4582 // CHECK1-NEXT:    [[TMP13:%.*]] = load i8, i8* [[TMP0]], align 1
4583 // CHECK1-NEXT:    [[CONV4:%.*]] = sext i8 [[TMP13]] to i32
4584 // CHECK1-NEXT:    [[TMP14:%.*]] = load i8, i8* [[C1]], align 1
4585 // CHECK1-NEXT:    [[CONV5:%.*]] = sext i8 [[TMP14]] to i32
4586 // CHECK1-NEXT:    [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]]
4587 // CHECK1-NEXT:    [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8
4588 // CHECK1-NEXT:    store i8 [[CONV7]], i8* [[TMP0]], align 1
4589 // CHECK1-NEXT:    [[TMP15:%.*]] = load float, float* [[TMP1]], align 4
4590 // CHECK1-NEXT:    [[TMP16:%.*]] = load float, float* [[D_ON_STACK]], align 4
4591 // CHECK1-NEXT:    [[MUL8:%.*]] = fmul float [[TMP15]], [[TMP16]]
4592 // CHECK1-NEXT:    store float [[MUL8]], float* [[TMP1]], align 4
4593 // CHECK1-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
4594 // CHECK1-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
4595 // CHECK1:       .omp.reduction.done:
4596 // CHECK1-NEXT:    call void @__kmpc_free_shared(i8* [[D2]], i64 4)
4597 // CHECK1-NEXT:    call void @__kmpc_free_shared(i8* [[C1]], i64 1)
4598 // CHECK1-NEXT:    ret void
4599 //
4600 //
4601 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3
4602 // CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR2]] {
4603 // CHECK1-NEXT:  entry:
4604 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 8
4605 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
4606 // CHECK1-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2
4607 // CHECK1-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2
4608 // CHECK1-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 8
4609 // CHECK1-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1
4610 // CHECK1-NEXT:    [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4
4611 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
4612 // CHECK1-NEXT:    store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
4613 // CHECK1-NEXT:    store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
4614 // CHECK1-NEXT:    store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
4615 // CHECK1-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
4616 // CHECK1-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
4617 // CHECK1-NEXT:    [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
4618 // CHECK1-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
4619 // CHECK1-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
4620 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
4621 // CHECK1-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
4622 // CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
4623 // CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i64 1
4624 // CHECK1-NEXT:    [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1
4625 // CHECK1-NEXT:    [[TMP14:%.*]] = sext i8 [[TMP13]] to i32
4626 // CHECK1-NEXT:    [[TMP15:%.*]] = call i32 @__kmpc_get_warp_size()
4627 // CHECK1-NEXT:    [[TMP16:%.*]] = trunc i32 [[TMP15]] to i16
4628 // CHECK1-NEXT:    [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP16]])
4629 // CHECK1-NEXT:    [[TMP18:%.*]] = trunc i32 [[TMP17]] to i8
4630 // CHECK1-NEXT:    store i8 [[TMP18]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1
4631 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr i8, i8* [[TMP10]], i64 1
4632 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i64 1
4633 // CHECK1-NEXT:    store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 8
4634 // CHECK1-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
4635 // CHECK1-NEXT:    [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 8
4636 // CHECK1-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
4637 // CHECK1-NEXT:    [[TMP24:%.*]] = bitcast i8* [[TMP22]] to float*
4638 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr float, float* [[TMP24]], i64 1
4639 // CHECK1-NEXT:    [[TMP26:%.*]] = bitcast float* [[TMP25]] to i8*
4640 // CHECK1-NEXT:    [[TMP27:%.*]] = bitcast float* [[TMP24]] to i32*
4641 // CHECK1-NEXT:    [[TMP28:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32*
4642 // CHECK1-NEXT:    [[TMP29:%.*]] = load i32, i32* [[TMP27]], align 4
4643 // CHECK1-NEXT:    [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
4644 // CHECK1-NEXT:    [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
4645 // CHECK1-NEXT:    [[TMP32:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP29]], i16 [[TMP7]], i16 [[TMP31]])
4646 // CHECK1-NEXT:    store i32 [[TMP32]], i32* [[TMP28]], align 4
4647 // CHECK1-NEXT:    [[TMP33:%.*]] = getelementptr i32, i32* [[TMP27]], i64 1
4648 // CHECK1-NEXT:    [[TMP34:%.*]] = getelementptr i32, i32* [[TMP28]], i64 1
4649 // CHECK1-NEXT:    [[TMP35:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
4650 // CHECK1-NEXT:    store i8* [[TMP35]], i8** [[TMP23]], align 8
4651 // CHECK1-NEXT:    [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 0
4652 // CHECK1-NEXT:    [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1
4653 // CHECK1-NEXT:    [[TMP38:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
4654 // CHECK1-NEXT:    [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
4655 // CHECK1-NEXT:    [[TMP40:%.*]] = icmp eq i16 [[TMP8]], 2
4656 // CHECK1-NEXT:    [[TMP41:%.*]] = and i16 [[TMP6]], 1
4657 // CHECK1-NEXT:    [[TMP42:%.*]] = icmp eq i16 [[TMP41]], 0
4658 // CHECK1-NEXT:    [[TMP43:%.*]] = and i1 [[TMP40]], [[TMP42]]
4659 // CHECK1-NEXT:    [[TMP44:%.*]] = icmp sgt i16 [[TMP7]], 0
4660 // CHECK1-NEXT:    [[TMP45:%.*]] = and i1 [[TMP43]], [[TMP44]]
4661 // CHECK1-NEXT:    [[TMP46:%.*]] = or i1 [[TMP36]], [[TMP39]]
4662 // CHECK1-NEXT:    [[TMP47:%.*]] = or i1 [[TMP46]], [[TMP45]]
4663 // CHECK1-NEXT:    br i1 [[TMP47]], label [[THEN:%.*]], label [[ELSE:%.*]]
4664 // CHECK1:       then:
4665 // CHECK1-NEXT:    [[TMP48:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
4666 // CHECK1-NEXT:    [[TMP49:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
4667 // CHECK1-NEXT:    call void @"_omp$reduction$reduction_func2"(i8* [[TMP48]], i8* [[TMP49]]) #[[ATTR3]]
4668 // CHECK1-NEXT:    br label [[IFCONT:%.*]]
4669 // CHECK1:       else:
4670 // CHECK1-NEXT:    br label [[IFCONT]]
4671 // CHECK1:       ifcont:
4672 // CHECK1-NEXT:    [[TMP50:%.*]] = icmp eq i16 [[TMP8]], 1
4673 // CHECK1-NEXT:    [[TMP51:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
4674 // CHECK1-NEXT:    [[TMP52:%.*]] = and i1 [[TMP50]], [[TMP51]]
4675 // CHECK1-NEXT:    br i1 [[TMP52]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
4676 // CHECK1:       then5:
4677 // CHECK1-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
4678 // CHECK1-NEXT:    [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 8
4679 // CHECK1-NEXT:    [[TMP55:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
4680 // CHECK1-NEXT:    [[TMP56:%.*]] = load i8*, i8** [[TMP55]], align 8
4681 // CHECK1-NEXT:    [[TMP57:%.*]] = load i8, i8* [[TMP54]], align 1
4682 // CHECK1-NEXT:    store i8 [[TMP57]], i8* [[TMP56]], align 1
4683 // CHECK1-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
4684 // CHECK1-NEXT:    [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 8
4685 // CHECK1-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
4686 // CHECK1-NEXT:    [[TMP61:%.*]] = load i8*, i8** [[TMP60]], align 8
4687 // CHECK1-NEXT:    [[TMP62:%.*]] = bitcast i8* [[TMP59]] to float*
4688 // CHECK1-NEXT:    [[TMP63:%.*]] = bitcast i8* [[TMP61]] to float*
4689 // CHECK1-NEXT:    [[TMP64:%.*]] = load float, float* [[TMP62]], align 4
4690 // CHECK1-NEXT:    store float [[TMP64]], float* [[TMP63]], align 4
4691 // CHECK1-NEXT:    br label [[IFCONT7:%.*]]
4692 // CHECK1:       else6:
4693 // CHECK1-NEXT:    br label [[IFCONT7]]
4694 // CHECK1:       ifcont7:
4695 // CHECK1-NEXT:    ret void
4696 //
4697 //
4698 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4
4699 // CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
4700 // CHECK1-NEXT:  entry:
4701 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 8
4702 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
4703 // CHECK1-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
4704 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
4705 // CHECK1-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
4706 // CHECK1-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
4707 // CHECK1-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
4708 // CHECK1-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
4709 // CHECK1-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
4710 // CHECK1-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
4711 // CHECK1-NEXT:    [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 8
4712 // CHECK1-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [2 x i8*]*
4713 // CHECK1-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
4714 // CHECK1-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
4715 // CHECK1-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
4716 // CHECK1:       then:
4717 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 0
4718 // CHECK1-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
4719 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
4720 // CHECK1-NEXT:    [[TMP11:%.*]] = bitcast i32 addrspace(3)* [[TMP10]] to i8 addrspace(3)*
4721 // CHECK1-NEXT:    [[TMP12:%.*]] = load i8, i8* [[TMP9]], align 1
4722 // CHECK1-NEXT:    store volatile i8 [[TMP12]], i8 addrspace(3)* [[TMP11]], align 1
4723 // CHECK1-NEXT:    br label [[IFCONT:%.*]]
4724 // CHECK1:       else:
4725 // CHECK1-NEXT:    br label [[IFCONT]]
4726 // CHECK1:       ifcont:
4727 // CHECK1-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
4728 // CHECK1-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
4729 // CHECK1-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP13]]
4730 // CHECK1-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
4731 // CHECK1:       then2:
4732 // CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
4733 // CHECK1-NEXT:    [[TMP15:%.*]] = bitcast i32 addrspace(3)* [[TMP14]] to i8 addrspace(3)*
4734 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 0
4735 // CHECK1-NEXT:    [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 8
4736 // CHECK1-NEXT:    [[TMP18:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP15]], align 1
4737 // CHECK1-NEXT:    store i8 [[TMP18]], i8* [[TMP17]], align 1
4738 // CHECK1-NEXT:    br label [[IFCONT4:%.*]]
4739 // CHECK1:       else3:
4740 // CHECK1-NEXT:    br label [[IFCONT4]]
4741 // CHECK1:       ifcont4:
4742 // CHECK1-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
4743 // CHECK1-NEXT:    [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
4744 // CHECK1-NEXT:    br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
4745 // CHECK1:       then6:
4746 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 1
4747 // CHECK1-NEXT:    [[TMP20:%.*]] = load i8*, i8** [[TMP19]], align 8
4748 // CHECK1-NEXT:    [[TMP21:%.*]] = bitcast i8* [[TMP20]] to i32*
4749 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
4750 // CHECK1-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP21]], align 4
4751 // CHECK1-NEXT:    store volatile i32 [[TMP23]], i32 addrspace(3)* [[TMP22]], align 4
4752 // CHECK1-NEXT:    br label [[IFCONT8:%.*]]
4753 // CHECK1:       else7:
4754 // CHECK1-NEXT:    br label [[IFCONT8]]
4755 // CHECK1:       ifcont8:
4756 // CHECK1-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
4757 // CHECK1-NEXT:    [[TMP24:%.*]] = load i32, i32* [[DOTADDR1]], align 4
4758 // CHECK1-NEXT:    [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP24]]
4759 // CHECK1-NEXT:    br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
4760 // CHECK1:       then10:
4761 // CHECK1-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
4762 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 1
4763 // CHECK1-NEXT:    [[TMP27:%.*]] = load i8*, i8** [[TMP26]], align 8
4764 // CHECK1-NEXT:    [[TMP28:%.*]] = bitcast i8* [[TMP27]] to i32*
4765 // CHECK1-NEXT:    [[TMP29:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP25]], align 4
4766 // CHECK1-NEXT:    store i32 [[TMP29]], i32* [[TMP28]], align 4
4767 // CHECK1-NEXT:    br label [[IFCONT12:%.*]]
4768 // CHECK1:       else11:
4769 // CHECK1-NEXT:    br label [[IFCONT12]]
4770 // CHECK1:       ifcont12:
4771 // CHECK1-NEXT:    ret void
4772 //
4773 //
4774 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func5
4775 // CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
4776 // CHECK1-NEXT:  entry:
4777 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 8
4778 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
4779 // CHECK1-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 8
4780 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
4781 // CHECK1-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
4782 // CHECK1-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
4783 // CHECK1-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
4784 // CHECK1-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
4785 // CHECK1-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8
4786 // CHECK1-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
4787 // CHECK1-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
4788 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
4789 // CHECK1-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
4790 // CHECK1-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
4791 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]]
4792 // CHECK1-NEXT:    [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1
4793 // CHECK1-NEXT:    store i8 [[TMP11]], i8* [[TMP10]], align 128
4794 // CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
4795 // CHECK1-NEXT:    [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8
4796 // CHECK1-NEXT:    [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
4797 // CHECK1-NEXT:    [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 1
4798 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]]
4799 // CHECK1-NEXT:    [[TMP16:%.*]] = load float, float* [[TMP14]], align 4
4800 // CHECK1-NEXT:    store float [[TMP16]], float* [[TMP15]], align 128
4801 // CHECK1-NEXT:    ret void
4802 //
4803 //
4804 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func6
4805 // CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
4806 // CHECK1-NEXT:  entry:
4807 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 8
4808 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
4809 // CHECK1-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 8
4810 // CHECK1-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
4811 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
4812 // CHECK1-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
4813 // CHECK1-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
4814 // CHECK1-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
4815 // CHECK1-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
4816 // CHECK1-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
4817 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
4818 // CHECK1-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
4819 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]]
4820 // CHECK1-NEXT:    store i8* [[TMP7]], i8** [[TMP6]], align 8
4821 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
4822 // CHECK1-NEXT:    [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 1
4823 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]]
4824 // CHECK1-NEXT:    [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
4825 // CHECK1-NEXT:    store i8* [[TMP10]], i8** [[TMP8]], align 8
4826 // CHECK1-NEXT:    [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
4827 // CHECK1-NEXT:    [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
4828 // CHECK1-NEXT:    call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]]
4829 // CHECK1-NEXT:    ret void
4830 //
4831 //
4832 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func7
4833 // CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
4834 // CHECK1-NEXT:  entry:
4835 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 8
4836 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
4837 // CHECK1-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 8
4838 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
4839 // CHECK1-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
4840 // CHECK1-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
4841 // CHECK1-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
4842 // CHECK1-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
4843 // CHECK1-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8
4844 // CHECK1-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
4845 // CHECK1-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
4846 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
4847 // CHECK1-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
4848 // CHECK1-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
4849 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]]
4850 // CHECK1-NEXT:    [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128
4851 // CHECK1-NEXT:    store i8 [[TMP11]], i8* [[TMP9]], align 1
4852 // CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
4853 // CHECK1-NEXT:    [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8
4854 // CHECK1-NEXT:    [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
4855 // CHECK1-NEXT:    [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 1
4856 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]]
4857 // CHECK1-NEXT:    [[TMP16:%.*]] = load float, float* [[TMP15]], align 128
4858 // CHECK1-NEXT:    store float [[TMP16]], float* [[TMP14]], align 4
4859 // CHECK1-NEXT:    ret void
4860 //
4861 //
4862 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func8
4863 // CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
4864 // CHECK1-NEXT:  entry:
4865 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 8
4866 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
4867 // CHECK1-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 8
4868 // CHECK1-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
4869 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
4870 // CHECK1-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
4871 // CHECK1-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
4872 // CHECK1-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
4873 // CHECK1-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
4874 // CHECK1-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
4875 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
4876 // CHECK1-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
4877 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]]
4878 // CHECK1-NEXT:    store i8* [[TMP7]], i8** [[TMP6]], align 8
4879 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
4880 // CHECK1-NEXT:    [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 1
4881 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]]
4882 // CHECK1-NEXT:    [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
4883 // CHECK1-NEXT:    store i8* [[TMP10]], i8** [[TMP8]], align 8
4884 // CHECK1-NEXT:    [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
4885 // CHECK1-NEXT:    [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
4886 // CHECK1-NEXT:    call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]]
4887 // CHECK1-NEXT:    ret void
4888 //
4889 //
4890 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33
4891 // CHECK1-SAME: (i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
4892 // CHECK1-NEXT:  entry:
4893 // CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i64, align 8
4894 // CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca i64, align 8
4895 // CHECK1-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
4896 // CHECK1-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
4897 // CHECK1-NEXT:    store i64 [[A]], i64* [[A_ADDR]], align 8
4898 // CHECK1-NEXT:    store i64 [[B]], i64* [[B_ADDR]], align 8
4899 // CHECK1-NEXT:    [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32*
4900 // CHECK1-NEXT:    [[CONV1:%.*]] = bitcast i64* [[B_ADDR]] to i16*
4901 // CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true)
4902 // CHECK1-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
4903 // CHECK1-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
4904 // CHECK1:       user_code.entry:
4905 // CHECK1-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
4906 // CHECK1-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
4907 // CHECK1-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
4908 // CHECK1-NEXT:    call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[CONV]], i16* [[CONV1]]) #[[ATTR3]]
4909 // CHECK1-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true)
4910 // CHECK1-NEXT:    ret void
4911 // CHECK1:       worker.exit:
4912 // CHECK1-NEXT:    ret void
4913 //
4914 //
4915 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__9
4916 // CHECK1-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32* noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i16* noundef nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR0]] {
4917 // CHECK1-NEXT:  entry:
4918 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
4919 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
4920 // CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i32*, align 8
4921 // CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca i16*, align 8
4922 // CHECK1-NEXT:    [[A1:%.*]] = alloca i32, align 4
4923 // CHECK1-NEXT:    [[B2:%.*]] = alloca i16, align 2
4924 // CHECK1-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
4925 // CHECK1-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
4926 // CHECK1-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
4927 // CHECK1-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
4928 // CHECK1-NEXT:    store i32* [[A]], i32** [[A_ADDR]], align 8
4929 // CHECK1-NEXT:    store i16* [[B]], i16** [[B_ADDR]], align 8
4930 // CHECK1-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
4931 // CHECK1-NEXT:    [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 8
4932 // CHECK1-NEXT:    store i32 0, i32* [[A1]], align 4
4933 // CHECK1-NEXT:    store i16 -32768, i16* [[B2]], align 2
4934 // CHECK1-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
4935 // CHECK1-NEXT:    [[TMP3:%.*]] = bitcast i32* [[A1]] to i8*
4936 // CHECK1-NEXT:    store i8* [[TMP3]], i8** [[TMP2]], align 8
4937 // CHECK1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
4938 // CHECK1-NEXT:    [[TMP5:%.*]] = bitcast i16* [[B2]] to i8*
4939 // CHECK1-NEXT:    store i8* [[TMP5]], i8** [[TMP4]], align 8
4940 // CHECK1-NEXT:    [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
4941 // CHECK1-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
4942 // CHECK1-NEXT:    [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
4943 // CHECK1-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__10 to i8*), i8* null, i8** [[TMP8]], i64 2)
4944 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
4945 // CHECK1-NEXT:    [[TMP10:%.*]] = bitcast i32* [[A1]] to i8*
4946 // CHECK1-NEXT:    store i8* [[TMP10]], i8** [[TMP9]], align 8
4947 // CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
4948 // CHECK1-NEXT:    [[TMP12:%.*]] = bitcast i16* [[B2]] to i8*
4949 // CHECK1-NEXT:    store i8* [[TMP12]], i8** [[TMP11]], align 8
4950 // CHECK1-NEXT:    [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
4951 // CHECK1-NEXT:    [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 8
4952 // CHECK1-NEXT:    [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 1024, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func15, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func16, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func17, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func18, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func20)
4953 // CHECK1-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1
4954 // CHECK1-NEXT:    br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
4955 // CHECK1:       .omp.reduction.then:
4956 // CHECK1-NEXT:    [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4
4957 // CHECK1-NEXT:    [[TMP18:%.*]] = load i32, i32* [[A1]], align 4
4958 // CHECK1-NEXT:    [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]]
4959 // CHECK1-NEXT:    store i32 [[OR]], i32* [[TMP0]], align 4
4960 // CHECK1-NEXT:    [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2
4961 // CHECK1-NEXT:    [[CONV:%.*]] = sext i16 [[TMP19]] to i32
4962 // CHECK1-NEXT:    [[TMP20:%.*]] = load i16, i16* [[B2]], align 2
4963 // CHECK1-NEXT:    [[CONV3:%.*]] = sext i16 [[TMP20]] to i32
4964 // CHECK1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]]
4965 // CHECK1-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
4966 // CHECK1:       cond.true:
4967 // CHECK1-NEXT:    [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2
4968 // CHECK1-NEXT:    br label [[COND_END:%.*]]
4969 // CHECK1:       cond.false:
4970 // CHECK1-NEXT:    [[TMP22:%.*]] = load i16, i16* [[B2]], align 2
4971 // CHECK1-NEXT:    br label [[COND_END]]
4972 // CHECK1:       cond.end:
4973 // CHECK1-NEXT:    [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ]
4974 // CHECK1-NEXT:    store i16 [[COND]], i16* [[TMP1]], align 2
4975 // CHECK1-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]])
4976 // CHECK1-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
4977 // CHECK1:       .omp.reduction.done:
4978 // CHECK1-NEXT:    ret void
4979 //
4980 //
4981 // CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__10
4982 // CHECK1-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32* noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i16* noundef nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR0]] {
4983 // CHECK1-NEXT:  entry:
4984 // CHECK1-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
4985 // CHECK1-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
4986 // CHECK1-NEXT:    [[A_ADDR:%.*]] = alloca i32*, align 8
4987 // CHECK1-NEXT:    [[B_ADDR:%.*]] = alloca i16*, align 8
4988 // CHECK1-NEXT:    [[A1:%.*]] = alloca i32, align 4
4989 // CHECK1-NEXT:    [[B2:%.*]] = alloca i16, align 2
4990 // CHECK1-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
4991 // CHECK1-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
4992 // CHECK1-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
4993 // CHECK1-NEXT:    store i32* [[A]], i32** [[A_ADDR]], align 8
4994 // CHECK1-NEXT:    store i16* [[B]], i16** [[B_ADDR]], align 8
4995 // CHECK1-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
4996 // CHECK1-NEXT:    [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 8
4997 // CHECK1-NEXT:    store i32 0, i32* [[A1]], align 4
4998 // CHECK1-NEXT:    store i16 -32768, i16* [[B2]], align 2
4999 // CHECK1-NEXT:    [[TMP2:%.*]] = load i32, i32* [[A1]], align 4
5000 // CHECK1-NEXT:    [[OR:%.*]] = or i32 [[TMP2]], 1
5001 // CHECK1-NEXT:    store i32 [[OR]], i32* [[A1]], align 4
5002 // CHECK1-NEXT:    [[TMP3:%.*]] = load i16, i16* [[B2]], align 2
5003 // CHECK1-NEXT:    [[CONV:%.*]] = sext i16 [[TMP3]] to i32
5004 // CHECK1-NEXT:    [[CMP:%.*]] = icmp sgt i32 99, [[CONV]]
5005 // CHECK1-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
5006 // CHECK1:       cond.true:
5007 // CHECK1-NEXT:    br label [[COND_END:%.*]]
5008 // CHECK1:       cond.false:
5009 // CHECK1-NEXT:    [[TMP4:%.*]] = load i16, i16* [[B2]], align 2
5010 // CHECK1-NEXT:    [[CONV3:%.*]] = sext i16 [[TMP4]] to i32
5011 // CHECK1-NEXT:    br label [[COND_END]]
5012 // CHECK1:       cond.end:
5013 // CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ]
5014 // CHECK1-NEXT:    [[CONV4:%.*]] = trunc i32 [[COND]] to i16
5015 // CHECK1-NEXT:    store i16 [[CONV4]], i16* [[B2]], align 2
5016 // CHECK1-NEXT:    [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
5017 // CHECK1-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
5018 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
5019 // CHECK1-NEXT:    [[TMP8:%.*]] = bitcast i32* [[A1]] to i8*
5020 // CHECK1-NEXT:    store i8* [[TMP8]], i8** [[TMP7]], align 8
5021 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
5022 // CHECK1-NEXT:    [[TMP10:%.*]] = bitcast i16* [[B2]] to i8*
5023 // CHECK1-NEXT:    store i8* [[TMP10]], i8** [[TMP9]], align 8
5024 // CHECK1-NEXT:    [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
5025 // CHECK1-NEXT:    [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i64 16, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func12, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func13)
5026 // CHECK1-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1
5027 // CHECK1-NEXT:    br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
5028 // CHECK1:       .omp.reduction.then:
5029 // CHECK1-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4
5030 // CHECK1-NEXT:    [[TMP15:%.*]] = load i32, i32* [[A1]], align 4
5031 // CHECK1-NEXT:    [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]]
5032 // CHECK1-NEXT:    store i32 [[OR5]], i32* [[TMP0]], align 4
5033 // CHECK1-NEXT:    [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2
5034 // CHECK1-NEXT:    [[CONV6:%.*]] = sext i16 [[TMP16]] to i32
5035 // CHECK1-NEXT:    [[TMP17:%.*]] = load i16, i16* [[B2]], align 2
5036 // CHECK1-NEXT:    [[CONV7:%.*]] = sext i16 [[TMP17]] to i32
5037 // CHECK1-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]]
5038 // CHECK1-NEXT:    br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]]
5039 // CHECK1:       cond.true9:
5040 // CHECK1-NEXT:    [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2
5041 // CHECK1-NEXT:    br label [[COND_END11:%.*]]
5042 // CHECK1:       cond.false10:
5043 // CHECK1-NEXT:    [[TMP19:%.*]] = load i16, i16* [[B2]], align 2
5044 // CHECK1-NEXT:    br label [[COND_END11]]
5045 // CHECK1:       cond.end11:
5046 // CHECK1-NEXT:    [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ]
5047 // CHECK1-NEXT:    store i16 [[COND12]], i16* [[TMP1]], align 2
5048 // CHECK1-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]])
5049 // CHECK1-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
5050 // CHECK1:       .omp.reduction.done:
5051 // CHECK1-NEXT:    ret void
5052 //
5053 //
5054 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func12
5055 // CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR2]] {
5056 // CHECK1-NEXT:  entry:
5057 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 8
5058 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
5059 // CHECK1-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2
5060 // CHECK1-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2
5061 // CHECK1-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 8
5062 // CHECK1-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
5063 // CHECK1-NEXT:    [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
5064 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
5065 // CHECK1-NEXT:    store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
5066 // CHECK1-NEXT:    store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
5067 // CHECK1-NEXT:    store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
5068 // CHECK1-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
5069 // CHECK1-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
5070 // CHECK1-NEXT:    [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
5071 // CHECK1-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
5072 // CHECK1-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
5073 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
5074 // CHECK1-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
5075 // CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
5076 // CHECK1-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
5077 // CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1
5078 // CHECK1-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
5079 // CHECK1-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
5080 // CHECK1-NEXT:    [[TMP16:%.*]] = call i32 @__kmpc_get_warp_size()
5081 // CHECK1-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
5082 // CHECK1-NEXT:    [[TMP18:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP17]])
5083 // CHECK1-NEXT:    store i32 [[TMP18]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
5084 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1
5085 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i64 1
5086 // CHECK1-NEXT:    [[TMP21:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
5087 // CHECK1-NEXT:    store i8* [[TMP21]], i8** [[TMP11]], align 8
5088 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
5089 // CHECK1-NEXT:    [[TMP23:%.*]] = load i8*, i8** [[TMP22]], align 8
5090 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
5091 // CHECK1-NEXT:    [[TMP25:%.*]] = bitcast i8* [[TMP23]] to i16*
5092 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr i16, i16* [[TMP25]], i64 1
5093 // CHECK1-NEXT:    [[TMP27:%.*]] = bitcast i16* [[TMP26]] to i8*
5094 // CHECK1-NEXT:    [[TMP28:%.*]] = load i16, i16* [[TMP25]], align 2
5095 // CHECK1-NEXT:    [[TMP29:%.*]] = sext i16 [[TMP28]] to i32
5096 // CHECK1-NEXT:    [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
5097 // CHECK1-NEXT:    [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
5098 // CHECK1-NEXT:    [[TMP32:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP29]], i16 [[TMP7]], i16 [[TMP31]])
5099 // CHECK1-NEXT:    [[TMP33:%.*]] = trunc i32 [[TMP32]] to i16
5100 // CHECK1-NEXT:    store i16 [[TMP33]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
5101 // CHECK1-NEXT:    [[TMP34:%.*]] = getelementptr i16, i16* [[TMP25]], i64 1
5102 // CHECK1-NEXT:    [[TMP35:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i64 1
5103 // CHECK1-NEXT:    [[TMP36:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
5104 // CHECK1-NEXT:    store i8* [[TMP36]], i8** [[TMP24]], align 8
5105 // CHECK1-NEXT:    [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 0
5106 // CHECK1-NEXT:    [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1
5107 // CHECK1-NEXT:    [[TMP39:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
5108 // CHECK1-NEXT:    [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]]
5109 // CHECK1-NEXT:    [[TMP41:%.*]] = icmp eq i16 [[TMP8]], 2
5110 // CHECK1-NEXT:    [[TMP42:%.*]] = and i16 [[TMP6]], 1
5111 // CHECK1-NEXT:    [[TMP43:%.*]] = icmp eq i16 [[TMP42]], 0
5112 // CHECK1-NEXT:    [[TMP44:%.*]] = and i1 [[TMP41]], [[TMP43]]
5113 // CHECK1-NEXT:    [[TMP45:%.*]] = icmp sgt i16 [[TMP7]], 0
5114 // CHECK1-NEXT:    [[TMP46:%.*]] = and i1 [[TMP44]], [[TMP45]]
5115 // CHECK1-NEXT:    [[TMP47:%.*]] = or i1 [[TMP37]], [[TMP40]]
5116 // CHECK1-NEXT:    [[TMP48:%.*]] = or i1 [[TMP47]], [[TMP46]]
5117 // CHECK1-NEXT:    br i1 [[TMP48]], label [[THEN:%.*]], label [[ELSE:%.*]]
5118 // CHECK1:       then:
5119 // CHECK1-NEXT:    [[TMP49:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
5120 // CHECK1-NEXT:    [[TMP50:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
5121 // CHECK1-NEXT:    call void @"_omp$reduction$reduction_func11"(i8* [[TMP49]], i8* [[TMP50]]) #[[ATTR3]]
5122 // CHECK1-NEXT:    br label [[IFCONT:%.*]]
5123 // CHECK1:       else:
5124 // CHECK1-NEXT:    br label [[IFCONT]]
5125 // CHECK1:       ifcont:
5126 // CHECK1-NEXT:    [[TMP51:%.*]] = icmp eq i16 [[TMP8]], 1
5127 // CHECK1-NEXT:    [[TMP52:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
5128 // CHECK1-NEXT:    [[TMP53:%.*]] = and i1 [[TMP51]], [[TMP52]]
5129 // CHECK1-NEXT:    br i1 [[TMP53]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
5130 // CHECK1:       then5:
5131 // CHECK1-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
5132 // CHECK1-NEXT:    [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 8
5133 // CHECK1-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
5134 // CHECK1-NEXT:    [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 8
5135 // CHECK1-NEXT:    [[TMP58:%.*]] = bitcast i8* [[TMP55]] to i32*
5136 // CHECK1-NEXT:    [[TMP59:%.*]] = bitcast i8* [[TMP57]] to i32*
5137 // CHECK1-NEXT:    [[TMP60:%.*]] = load i32, i32* [[TMP58]], align 4
5138 // CHECK1-NEXT:    store i32 [[TMP60]], i32* [[TMP59]], align 4
5139 // CHECK1-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
5140 // CHECK1-NEXT:    [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 8
5141 // CHECK1-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
5142 // CHECK1-NEXT:    [[TMP64:%.*]] = load i8*, i8** [[TMP63]], align 8
5143 // CHECK1-NEXT:    [[TMP65:%.*]] = bitcast i8* [[TMP62]] to i16*
5144 // CHECK1-NEXT:    [[TMP66:%.*]] = bitcast i8* [[TMP64]] to i16*
5145 // CHECK1-NEXT:    [[TMP67:%.*]] = load i16, i16* [[TMP65]], align 2
5146 // CHECK1-NEXT:    store i16 [[TMP67]], i16* [[TMP66]], align 2
5147 // CHECK1-NEXT:    br label [[IFCONT7:%.*]]
5148 // CHECK1:       else6:
5149 // CHECK1-NEXT:    br label [[IFCONT7]]
5150 // CHECK1:       ifcont7:
5151 // CHECK1-NEXT:    ret void
5152 //
5153 //
5154 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func13
5155 // CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
5156 // CHECK1-NEXT:  entry:
5157 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 8
5158 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
5159 // CHECK1-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
5160 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
5161 // CHECK1-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
5162 // CHECK1-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
5163 // CHECK1-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
5164 // CHECK1-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
5165 // CHECK1-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
5166 // CHECK1-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
5167 // CHECK1-NEXT:    [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 8
5168 // CHECK1-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [2 x i8*]*
5169 // CHECK1-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]])
5170 // CHECK1-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
5171 // CHECK1-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
5172 // CHECK1:       then:
5173 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 0
5174 // CHECK1-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
5175 // CHECK1-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
5176 // CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
5177 // CHECK1-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
5178 // CHECK1-NEXT:    store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4
5179 // CHECK1-NEXT:    br label [[IFCONT:%.*]]
5180 // CHECK1:       else:
5181 // CHECK1-NEXT:    br label [[IFCONT]]
5182 // CHECK1:       ifcont:
5183 // CHECK1-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
5184 // CHECK1-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
5185 // CHECK1-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP13]]
5186 // CHECK1-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
5187 // CHECK1:       then2:
5188 // CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
5189 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 0
5190 // CHECK1-NEXT:    [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 8
5191 // CHECK1-NEXT:    [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32*
5192 // CHECK1-NEXT:    [[TMP18:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4
5193 // CHECK1-NEXT:    store i32 [[TMP18]], i32* [[TMP17]], align 4
5194 // CHECK1-NEXT:    br label [[IFCONT4:%.*]]
5195 // CHECK1:       else3:
5196 // CHECK1-NEXT:    br label [[IFCONT4]]
5197 // CHECK1:       ifcont4:
5198 // CHECK1-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
5199 // CHECK1-NEXT:    [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
5200 // CHECK1-NEXT:    br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
5201 // CHECK1:       then6:
5202 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 1
5203 // CHECK1-NEXT:    [[TMP20:%.*]] = load i8*, i8** [[TMP19]], align 8
5204 // CHECK1-NEXT:    [[TMP21:%.*]] = bitcast i8* [[TMP20]] to i16*
5205 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
5206 // CHECK1-NEXT:    [[TMP23:%.*]] = bitcast i32 addrspace(3)* [[TMP22]] to i16 addrspace(3)*
5207 // CHECK1-NEXT:    [[TMP24:%.*]] = load i16, i16* [[TMP21]], align 2
5208 // CHECK1-NEXT:    store volatile i16 [[TMP24]], i16 addrspace(3)* [[TMP23]], align 2
5209 // CHECK1-NEXT:    br label [[IFCONT8:%.*]]
5210 // CHECK1:       else7:
5211 // CHECK1-NEXT:    br label [[IFCONT8]]
5212 // CHECK1:       ifcont8:
5213 // CHECK1-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
5214 // CHECK1-NEXT:    [[TMP25:%.*]] = load i32, i32* [[DOTADDR1]], align 4
5215 // CHECK1-NEXT:    [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP25]]
5216 // CHECK1-NEXT:    br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
5217 // CHECK1:       then10:
5218 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
5219 // CHECK1-NEXT:    [[TMP27:%.*]] = bitcast i32 addrspace(3)* [[TMP26]] to i16 addrspace(3)*
5220 // CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 1
5221 // CHECK1-NEXT:    [[TMP29:%.*]] = load i8*, i8** [[TMP28]], align 8
5222 // CHECK1-NEXT:    [[TMP30:%.*]] = bitcast i8* [[TMP29]] to i16*
5223 // CHECK1-NEXT:    [[TMP31:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP27]], align 2
5224 // CHECK1-NEXT:    store i16 [[TMP31]], i16* [[TMP30]], align 2
5225 // CHECK1-NEXT:    br label [[IFCONT12:%.*]]
5226 // CHECK1:       else11:
5227 // CHECK1-NEXT:    br label [[IFCONT12]]
5228 // CHECK1:       ifcont12:
5229 // CHECK1-NEXT:    ret void
5230 //
5231 //
5232 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func15
5233 // CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR2]] {
5234 // CHECK1-NEXT:  entry:
5235 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 8
5236 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
5237 // CHECK1-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2
5238 // CHECK1-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2
5239 // CHECK1-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 8
5240 // CHECK1-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
5241 // CHECK1-NEXT:    [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
5242 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
5243 // CHECK1-NEXT:    store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
5244 // CHECK1-NEXT:    store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
5245 // CHECK1-NEXT:    store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
5246 // CHECK1-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8
5247 // CHECK1-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
5248 // CHECK1-NEXT:    [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
5249 // CHECK1-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
5250 // CHECK1-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
5251 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
5252 // CHECK1-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8
5253 // CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
5254 // CHECK1-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
5255 // CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1
5256 // CHECK1-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
5257 // CHECK1-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
5258 // CHECK1-NEXT:    [[TMP16:%.*]] = call i32 @__kmpc_get_warp_size()
5259 // CHECK1-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
5260 // CHECK1-NEXT:    [[TMP18:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP17]])
5261 // CHECK1-NEXT:    store i32 [[TMP18]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
5262 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1
5263 // CHECK1-NEXT:    [[TMP20:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i64 1
5264 // CHECK1-NEXT:    [[TMP21:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
5265 // CHECK1-NEXT:    store i8* [[TMP21]], i8** [[TMP11]], align 8
5266 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
5267 // CHECK1-NEXT:    [[TMP23:%.*]] = load i8*, i8** [[TMP22]], align 8
5268 // CHECK1-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
5269 // CHECK1-NEXT:    [[TMP25:%.*]] = bitcast i8* [[TMP23]] to i16*
5270 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr i16, i16* [[TMP25]], i64 1
5271 // CHECK1-NEXT:    [[TMP27:%.*]] = bitcast i16* [[TMP26]] to i8*
5272 // CHECK1-NEXT:    [[TMP28:%.*]] = load i16, i16* [[TMP25]], align 2
5273 // CHECK1-NEXT:    [[TMP29:%.*]] = sext i16 [[TMP28]] to i32
5274 // CHECK1-NEXT:    [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
5275 // CHECK1-NEXT:    [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
5276 // CHECK1-NEXT:    [[TMP32:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP29]], i16 [[TMP7]], i16 [[TMP31]])
5277 // CHECK1-NEXT:    [[TMP33:%.*]] = trunc i32 [[TMP32]] to i16
5278 // CHECK1-NEXT:    store i16 [[TMP33]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
5279 // CHECK1-NEXT:    [[TMP34:%.*]] = getelementptr i16, i16* [[TMP25]], i64 1
5280 // CHECK1-NEXT:    [[TMP35:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i64 1
5281 // CHECK1-NEXT:    [[TMP36:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
5282 // CHECK1-NEXT:    store i8* [[TMP36]], i8** [[TMP24]], align 8
5283 // CHECK1-NEXT:    [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 0
5284 // CHECK1-NEXT:    [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1
5285 // CHECK1-NEXT:    [[TMP39:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
5286 // CHECK1-NEXT:    [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]]
5287 // CHECK1-NEXT:    [[TMP41:%.*]] = icmp eq i16 [[TMP8]], 2
5288 // CHECK1-NEXT:    [[TMP42:%.*]] = and i16 [[TMP6]], 1
5289 // CHECK1-NEXT:    [[TMP43:%.*]] = icmp eq i16 [[TMP42]], 0
5290 // CHECK1-NEXT:    [[TMP44:%.*]] = and i1 [[TMP41]], [[TMP43]]
5291 // CHECK1-NEXT:    [[TMP45:%.*]] = icmp sgt i16 [[TMP7]], 0
5292 // CHECK1-NEXT:    [[TMP46:%.*]] = and i1 [[TMP44]], [[TMP45]]
5293 // CHECK1-NEXT:    [[TMP47:%.*]] = or i1 [[TMP37]], [[TMP40]]
5294 // CHECK1-NEXT:    [[TMP48:%.*]] = or i1 [[TMP47]], [[TMP46]]
5295 // CHECK1-NEXT:    br i1 [[TMP48]], label [[THEN:%.*]], label [[ELSE:%.*]]
5296 // CHECK1:       then:
5297 // CHECK1-NEXT:    [[TMP49:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
5298 // CHECK1-NEXT:    [[TMP50:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
5299 // CHECK1-NEXT:    call void @"_omp$reduction$reduction_func14"(i8* [[TMP49]], i8* [[TMP50]]) #[[ATTR3]]
5300 // CHECK1-NEXT:    br label [[IFCONT:%.*]]
5301 // CHECK1:       else:
5302 // CHECK1-NEXT:    br label [[IFCONT]]
5303 // CHECK1:       ifcont:
5304 // CHECK1-NEXT:    [[TMP51:%.*]] = icmp eq i16 [[TMP8]], 1
5305 // CHECK1-NEXT:    [[TMP52:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
5306 // CHECK1-NEXT:    [[TMP53:%.*]] = and i1 [[TMP51]], [[TMP52]]
5307 // CHECK1-NEXT:    br i1 [[TMP53]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
5308 // CHECK1:       then5:
5309 // CHECK1-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
5310 // CHECK1-NEXT:    [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 8
5311 // CHECK1-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0
5312 // CHECK1-NEXT:    [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 8
5313 // CHECK1-NEXT:    [[TMP58:%.*]] = bitcast i8* [[TMP55]] to i32*
5314 // CHECK1-NEXT:    [[TMP59:%.*]] = bitcast i8* [[TMP57]] to i32*
5315 // CHECK1-NEXT:    [[TMP60:%.*]] = load i32, i32* [[TMP58]], align 4
5316 // CHECK1-NEXT:    store i32 [[TMP60]], i32* [[TMP59]], align 4
5317 // CHECK1-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1
5318 // CHECK1-NEXT:    [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 8
5319 // CHECK1-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1
5320 // CHECK1-NEXT:    [[TMP64:%.*]] = load i8*, i8** [[TMP63]], align 8
5321 // CHECK1-NEXT:    [[TMP65:%.*]] = bitcast i8* [[TMP62]] to i16*
5322 // CHECK1-NEXT:    [[TMP66:%.*]] = bitcast i8* [[TMP64]] to i16*
5323 // CHECK1-NEXT:    [[TMP67:%.*]] = load i16, i16* [[TMP65]], align 2
5324 // CHECK1-NEXT:    store i16 [[TMP67]], i16* [[TMP66]], align 2
5325 // CHECK1-NEXT:    br label [[IFCONT7:%.*]]
5326 // CHECK1:       else6:
5327 // CHECK1-NEXT:    br label [[IFCONT7]]
5328 // CHECK1:       ifcont7:
5329 // CHECK1-NEXT:    ret void
5330 //
5331 //
5332 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func16
5333 // CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
5334 // CHECK1-NEXT:  entry:
5335 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 8
5336 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
5337 // CHECK1-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
5338 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
5339 // CHECK1-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
5340 // CHECK1-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
5341 // CHECK1-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
5342 // CHECK1-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
5343 // CHECK1-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
5344 // CHECK1-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
5345 // CHECK1-NEXT:    [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 8
5346 // CHECK1-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [2 x i8*]*
5347 // CHECK1-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
5348 // CHECK1-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
5349 // CHECK1-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
5350 // CHECK1:       then:
5351 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 0
5352 // CHECK1-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
5353 // CHECK1-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
5354 // CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
5355 // CHECK1-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
5356 // CHECK1-NEXT:    store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4
5357 // CHECK1-NEXT:    br label [[IFCONT:%.*]]
5358 // CHECK1:       else:
5359 // CHECK1-NEXT:    br label [[IFCONT]]
5360 // CHECK1:       ifcont:
5361 // CHECK1-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
5362 // CHECK1-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
5363 // CHECK1-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP13]]
5364 // CHECK1-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
5365 // CHECK1:       then2:
5366 // CHECK1-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
5367 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 0
5368 // CHECK1-NEXT:    [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 8
5369 // CHECK1-NEXT:    [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32*
5370 // CHECK1-NEXT:    [[TMP18:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4
5371 // CHECK1-NEXT:    store i32 [[TMP18]], i32* [[TMP17]], align 4
5372 // CHECK1-NEXT:    br label [[IFCONT4:%.*]]
5373 // CHECK1:       else3:
5374 // CHECK1-NEXT:    br label [[IFCONT4]]
5375 // CHECK1:       ifcont4:
5376 // CHECK1-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
5377 // CHECK1-NEXT:    [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
5378 // CHECK1-NEXT:    br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
5379 // CHECK1:       then6:
5380 // CHECK1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 1
5381 // CHECK1-NEXT:    [[TMP20:%.*]] = load i8*, i8** [[TMP19]], align 8
5382 // CHECK1-NEXT:    [[TMP21:%.*]] = bitcast i8* [[TMP20]] to i16*
5383 // CHECK1-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
5384 // CHECK1-NEXT:    [[TMP23:%.*]] = bitcast i32 addrspace(3)* [[TMP22]] to i16 addrspace(3)*
5385 // CHECK1-NEXT:    [[TMP24:%.*]] = load i16, i16* [[TMP21]], align 2
5386 // CHECK1-NEXT:    store volatile i16 [[TMP24]], i16 addrspace(3)* [[TMP23]], align 2
5387 // CHECK1-NEXT:    br label [[IFCONT8:%.*]]
5388 // CHECK1:       else7:
5389 // CHECK1-NEXT:    br label [[IFCONT8]]
5390 // CHECK1:       ifcont8:
5391 // CHECK1-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
5392 // CHECK1-NEXT:    [[TMP25:%.*]] = load i32, i32* [[DOTADDR1]], align 4
5393 // CHECK1-NEXT:    [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP25]]
5394 // CHECK1-NEXT:    br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
5395 // CHECK1:       then10:
5396 // CHECK1-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
5397 // CHECK1-NEXT:    [[TMP27:%.*]] = bitcast i32 addrspace(3)* [[TMP26]] to i16 addrspace(3)*
5398 // CHECK1-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i64 0, i64 1
5399 // CHECK1-NEXT:    [[TMP29:%.*]] = load i8*, i8** [[TMP28]], align 8
5400 // CHECK1-NEXT:    [[TMP30:%.*]] = bitcast i8* [[TMP29]] to i16*
5401 // CHECK1-NEXT:    [[TMP31:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP27]], align 2
5402 // CHECK1-NEXT:    store i16 [[TMP31]], i16* [[TMP30]], align 2
5403 // CHECK1-NEXT:    br label [[IFCONT12:%.*]]
5404 // CHECK1:       else11:
5405 // CHECK1-NEXT:    br label [[IFCONT12]]
5406 // CHECK1:       ifcont12:
5407 // CHECK1-NEXT:    ret void
5408 //
5409 //
5410 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func17
5411 // CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
5412 // CHECK1-NEXT:  entry:
5413 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 8
5414 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
5415 // CHECK1-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 8
5416 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
5417 // CHECK1-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
5418 // CHECK1-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
5419 // CHECK1-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
5420 // CHECK1-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
5421 // CHECK1-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8
5422 // CHECK1-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1*
5423 // CHECK1-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
5424 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
5425 // CHECK1-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
5426 // CHECK1-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
5427 // CHECK1-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0
5428 // CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]]
5429 // CHECK1-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
5430 // CHECK1-NEXT:    store i32 [[TMP12]], i32* [[TMP11]], align 128
5431 // CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
5432 // CHECK1-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 8
5433 // CHECK1-NEXT:    [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
5434 // CHECK1-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1
5435 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]]
5436 // CHECK1-NEXT:    [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2
5437 // CHECK1-NEXT:    store i16 [[TMP17]], i16* [[TMP16]], align 128
5438 // CHECK1-NEXT:    ret void
5439 //
5440 //
5441 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func18
5442 // CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
5443 // CHECK1-NEXT:  entry:
5444 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 8
5445 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
5446 // CHECK1-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 8
5447 // CHECK1-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
5448 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
5449 // CHECK1-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
5450 // CHECK1-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
5451 // CHECK1-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
5452 // CHECK1-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1*
5453 // CHECK1-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
5454 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
5455 // CHECK1-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0
5456 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]]
5457 // CHECK1-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
5458 // CHECK1-NEXT:    store i8* [[TMP8]], i8** [[TMP6]], align 8
5459 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
5460 // CHECK1-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 1
5461 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]]
5462 // CHECK1-NEXT:    [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
5463 // CHECK1-NEXT:    store i8* [[TMP11]], i8** [[TMP9]], align 8
5464 // CHECK1-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
5465 // CHECK1-NEXT:    [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
5466 // CHECK1-NEXT:    call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]]
5467 // CHECK1-NEXT:    ret void
5468 //
5469 //
5470 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func19
5471 // CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
5472 // CHECK1-NEXT:  entry:
5473 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 8
5474 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
5475 // CHECK1-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 8
5476 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
5477 // CHECK1-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
5478 // CHECK1-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
5479 // CHECK1-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
5480 // CHECK1-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
5481 // CHECK1-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 8
5482 // CHECK1-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1*
5483 // CHECK1-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
5484 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0
5485 // CHECK1-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 8
5486 // CHECK1-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
5487 // CHECK1-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0
5488 // CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]]
5489 // CHECK1-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128
5490 // CHECK1-NEXT:    store i32 [[TMP12]], i32* [[TMP10]], align 4
5491 // CHECK1-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1
5492 // CHECK1-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 8
5493 // CHECK1-NEXT:    [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
5494 // CHECK1-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1
5495 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]]
5496 // CHECK1-NEXT:    [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128
5497 // CHECK1-NEXT:    store i16 [[TMP17]], i16* [[TMP15]], align 2
5498 // CHECK1-NEXT:    ret void
5499 //
5500 //
5501 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func20
5502 // CHECK1-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
5503 // CHECK1-NEXT:  entry:
5504 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 8
5505 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
5506 // CHECK1-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 8
5507 // CHECK1-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8
5508 // CHECK1-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 8
5509 // CHECK1-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
5510 // CHECK1-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 8
5511 // CHECK1-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
5512 // CHECK1-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1*
5513 // CHECK1-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
5514 // CHECK1-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
5515 // CHECK1-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0
5516 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]]
5517 // CHECK1-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
5518 // CHECK1-NEXT:    store i8* [[TMP8]], i8** [[TMP6]], align 8
5519 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
5520 // CHECK1-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 1
5521 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]]
5522 // CHECK1-NEXT:    [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
5523 // CHECK1-NEXT:    store i8* [[TMP11]], i8** [[TMP9]], align 8
5524 // CHECK1-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
5525 // CHECK1-NEXT:    [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 8
5526 // CHECK1-NEXT:    call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]]
5527 // CHECK1-NEXT:    ret void
5528 //
5529 //
5530 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l20
5531 // CHECK2-SAME: (double* noundef nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR0:[0-9]+]] {
5532 // CHECK2-NEXT:  entry:
5533 // CHECK2-NEXT:    [[E_ADDR:%.*]] = alloca double*, align 4
5534 // CHECK2-NEXT:    [[E1:%.*]] = alloca double, align 8
5535 // CHECK2-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
5536 // CHECK2-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
5537 // CHECK2-NEXT:    store double* [[E]], double** [[E_ADDR]], align 4
5538 // CHECK2-NEXT:    [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
5539 // CHECK2-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true)
5540 // CHECK2-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
5541 // CHECK2-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
5542 // CHECK2:       user_code.entry:
5543 // CHECK2-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
5544 // CHECK2-NEXT:    [[TMP3:%.*]] = load double, double* [[TMP0]], align 8
5545 // CHECK2-NEXT:    store double [[TMP3]], double* [[E1]], align 8
5546 // CHECK2-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
5547 // CHECK2-NEXT:    store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4
5548 // CHECK2-NEXT:    call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E1]]) #[[ATTR3:[0-9]+]]
5549 // CHECK2-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
5550 // CHECK2-NEXT:    ret void
5551 // CHECK2:       worker.exit:
5552 // CHECK2-NEXT:    ret void
5553 //
5554 //
5555 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__
5556 // CHECK2-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], double* noundef nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR0]] {
5557 // CHECK2-NEXT:  entry:
5558 // CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
5559 // CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
5560 // CHECK2-NEXT:    [[E_ADDR:%.*]] = alloca double*, align 4
5561 // CHECK2-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
5562 // CHECK2-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
5563 // CHECK2-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
5564 // CHECK2-NEXT:    store double* [[E]], double** [[E_ADDR]], align 4
5565 // CHECK2-NEXT:    [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
5566 // CHECK2-NEXT:    [[E1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 8)
5567 // CHECK2-NEXT:    [[E_ON_STACK:%.*]] = bitcast i8* [[E1]] to double*
5568 // CHECK2-NEXT:    store double 0.000000e+00, double* [[E_ON_STACK]], align 8
5569 // CHECK2-NEXT:    [[TMP1:%.*]] = load double, double* [[E_ON_STACK]], align 8
5570 // CHECK2-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], 5.000000e+00
5571 // CHECK2-NEXT:    store double [[ADD]], double* [[E_ON_STACK]], align 8
5572 // CHECK2-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
5573 // CHECK2-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
5574 // CHECK2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
5575 // CHECK2-NEXT:    [[TMP5:%.*]] = bitcast double* [[E_ON_STACK]] to i8*
5576 // CHECK2-NEXT:    store i8* [[TMP5]], i8** [[TMP4]], align 4
5577 // CHECK2-NEXT:    [[TMP6:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
5578 // CHECK2-NEXT:    [[TMP7:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
5579 // CHECK2-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i8* [[TMP7]], i32 1024, i8* [[TMP6]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func)
5580 // CHECK2-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1
5581 // CHECK2-NEXT:    br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
5582 // CHECK2:       .omp.reduction.then:
5583 // CHECK2-NEXT:    [[TMP10:%.*]] = load double, double* [[TMP0]], align 8
5584 // CHECK2-NEXT:    [[TMP11:%.*]] = load double, double* [[E_ON_STACK]], align 8
5585 // CHECK2-NEXT:    [[ADD2:%.*]] = fadd double [[TMP10]], [[TMP11]]
5586 // CHECK2-NEXT:    store double [[ADD2]], double* [[TMP0]], align 8
5587 // CHECK2-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]])
5588 // CHECK2-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
5589 // CHECK2:       .omp.reduction.done:
5590 // CHECK2-NEXT:    call void @__kmpc_free_shared(i8* [[E1]], i32 8)
5591 // CHECK2-NEXT:    ret void
5592 //
5593 //
5594 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
5595 // CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR2:[0-9]+]] {
5596 // CHECK2-NEXT:  entry:
5597 // CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
5598 // CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
5599 // CHECK2-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2
5600 // CHECK2-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2
5601 // CHECK2-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4
5602 // CHECK2-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8
5603 // CHECK2-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
5604 // CHECK2-NEXT:    store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
5605 // CHECK2-NEXT:    store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
5606 // CHECK2-NEXT:    store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
5607 // CHECK2-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
5608 // CHECK2-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]*
5609 // CHECK2-NEXT:    [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
5610 // CHECK2-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
5611 // CHECK2-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
5612 // CHECK2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
5613 // CHECK2-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
5614 // CHECK2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
5615 // CHECK2-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double*
5616 // CHECK2-NEXT:    [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1
5617 // CHECK2-NEXT:    [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8*
5618 // CHECK2-NEXT:    [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64*
5619 // CHECK2-NEXT:    [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
5620 // CHECK2-NEXT:    [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8
5621 // CHECK2-NEXT:    [[TMP18:%.*]] = call i32 @__kmpc_get_warp_size()
5622 // CHECK2-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
5623 // CHECK2-NEXT:    [[TMP20:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP19]])
5624 // CHECK2-NEXT:    store i64 [[TMP20]], i64* [[TMP16]], align 8
5625 // CHECK2-NEXT:    [[TMP21:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1
5626 // CHECK2-NEXT:    [[TMP22:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1
5627 // CHECK2-NEXT:    [[TMP23:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
5628 // CHECK2-NEXT:    store i8* [[TMP23]], i8** [[TMP11]], align 4
5629 // CHECK2-NEXT:    [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 0
5630 // CHECK2-NEXT:    [[TMP25:%.*]] = icmp eq i16 [[TMP8]], 1
5631 // CHECK2-NEXT:    [[TMP26:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
5632 // CHECK2-NEXT:    [[TMP27:%.*]] = and i1 [[TMP25]], [[TMP26]]
5633 // CHECK2-NEXT:    [[TMP28:%.*]] = icmp eq i16 [[TMP8]], 2
5634 // CHECK2-NEXT:    [[TMP29:%.*]] = and i16 [[TMP6]], 1
5635 // CHECK2-NEXT:    [[TMP30:%.*]] = icmp eq i16 [[TMP29]], 0
5636 // CHECK2-NEXT:    [[TMP31:%.*]] = and i1 [[TMP28]], [[TMP30]]
5637 // CHECK2-NEXT:    [[TMP32:%.*]] = icmp sgt i16 [[TMP7]], 0
5638 // CHECK2-NEXT:    [[TMP33:%.*]] = and i1 [[TMP31]], [[TMP32]]
5639 // CHECK2-NEXT:    [[TMP34:%.*]] = or i1 [[TMP24]], [[TMP27]]
5640 // CHECK2-NEXT:    [[TMP35:%.*]] = or i1 [[TMP34]], [[TMP33]]
5641 // CHECK2-NEXT:    br i1 [[TMP35]], label [[THEN:%.*]], label [[ELSE:%.*]]
5642 // CHECK2:       then:
5643 // CHECK2-NEXT:    [[TMP36:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8*
5644 // CHECK2-NEXT:    [[TMP37:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
5645 // CHECK2-NEXT:    call void @"_omp$reduction$reduction_func"(i8* [[TMP36]], i8* [[TMP37]]) #[[ATTR3]]
5646 // CHECK2-NEXT:    br label [[IFCONT:%.*]]
5647 // CHECK2:       else:
5648 // CHECK2-NEXT:    br label [[IFCONT]]
5649 // CHECK2:       ifcont:
5650 // CHECK2-NEXT:    [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1
5651 // CHECK2-NEXT:    [[TMP39:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
5652 // CHECK2-NEXT:    [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]]
5653 // CHECK2-NEXT:    br i1 [[TMP40]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
5654 // CHECK2:       then4:
5655 // CHECK2-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
5656 // CHECK2-NEXT:    [[TMP42:%.*]] = load i8*, i8** [[TMP41]], align 4
5657 // CHECK2-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
5658 // CHECK2-NEXT:    [[TMP44:%.*]] = load i8*, i8** [[TMP43]], align 4
5659 // CHECK2-NEXT:    [[TMP45:%.*]] = bitcast i8* [[TMP42]] to double*
5660 // CHECK2-NEXT:    [[TMP46:%.*]] = bitcast i8* [[TMP44]] to double*
5661 // CHECK2-NEXT:    [[TMP47:%.*]] = load double, double* [[TMP45]], align 8
5662 // CHECK2-NEXT:    store double [[TMP47]], double* [[TMP46]], align 8
5663 // CHECK2-NEXT:    br label [[IFCONT6:%.*]]
5664 // CHECK2:       else5:
5665 // CHECK2-NEXT:    br label [[IFCONT6]]
5666 // CHECK2:       ifcont6:
5667 // CHECK2-NEXT:    ret void
5668 //
5669 //
5670 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
5671 // CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
5672 // CHECK2-NEXT:  entry:
5673 // CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
5674 // CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
5675 // CHECK2-NEXT:    [[DOTCNT_ADDR:%.*]] = alloca i32, align 4
5676 // CHECK2-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
5677 // CHECK2-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
5678 // CHECK2-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
5679 // CHECK2-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
5680 // CHECK2-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
5681 // CHECK2-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
5682 // CHECK2-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
5683 // CHECK2-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
5684 // CHECK2-NEXT:    [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 4
5685 // CHECK2-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [1 x i8*]*
5686 // CHECK2-NEXT:    store i32 0, i32* [[DOTCNT_ADDR]], align 4
5687 // CHECK2-NEXT:    br label [[PRECOND:%.*]]
5688 // CHECK2:       precond:
5689 // CHECK2-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4
5690 // CHECK2-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 2
5691 // CHECK2-NEXT:    br i1 [[TMP9]], label [[BODY:%.*]], label [[EXIT:%.*]]
5692 // CHECK2:       body:
5693 // CHECK2-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]])
5694 // CHECK2-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
5695 // CHECK2-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
5696 // CHECK2:       then:
5697 // CHECK2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP7]], i32 0, i32 0
5698 // CHECK2-NEXT:    [[TMP11:%.*]] = load i8*, i8** [[TMP10]], align 4
5699 // CHECK2-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP11]] to i32*
5700 // CHECK2-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 [[TMP8]]
5701 // CHECK2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
5702 // CHECK2-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP13]], align 4
5703 // CHECK2-NEXT:    store volatile i32 [[TMP15]], i32 addrspace(3)* [[TMP14]], align 4
5704 // CHECK2-NEXT:    br label [[IFCONT:%.*]]
5705 // CHECK2:       else:
5706 // CHECK2-NEXT:    br label [[IFCONT]]
5707 // CHECK2:       ifcont:
5708 // CHECK2-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
5709 // CHECK2-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTADDR1]], align 4
5710 // CHECK2-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP16]]
5711 // CHECK2-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
5712 // CHECK2:       then2:
5713 // CHECK2-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
5714 // CHECK2-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP7]], i32 0, i32 0
5715 // CHECK2-NEXT:    [[TMP19:%.*]] = load i8*, i8** [[TMP18]], align 4
5716 // CHECK2-NEXT:    [[TMP20:%.*]] = bitcast i8* [[TMP19]] to i32*
5717 // CHECK2-NEXT:    [[TMP21:%.*]] = getelementptr i32, i32* [[TMP20]], i32 [[TMP8]]
5718 // CHECK2-NEXT:    [[TMP22:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP17]], align 4
5719 // CHECK2-NEXT:    store i32 [[TMP22]], i32* [[TMP21]], align 4
5720 // CHECK2-NEXT:    br label [[IFCONT4:%.*]]
5721 // CHECK2:       else3:
5722 // CHECK2-NEXT:    br label [[IFCONT4]]
5723 // CHECK2:       ifcont4:
5724 // CHECK2-NEXT:    [[TMP23:%.*]] = add nsw i32 [[TMP8]], 1
5725 // CHECK2-NEXT:    store i32 [[TMP23]], i32* [[DOTCNT_ADDR]], align 4
5726 // CHECK2-NEXT:    br label [[PRECOND]]
5727 // CHECK2:       exit:
5728 // CHECK2-NEXT:    ret void
5729 //
5730 //
5731 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func
5732 // CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
5733 // CHECK2-NEXT:  entry:
5734 // CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
5735 // CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
5736 // CHECK2-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
5737 // CHECK2-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
5738 // CHECK2-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
5739 // CHECK2-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
5740 // CHECK2-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
5741 // CHECK2-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
5742 // CHECK2-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
5743 // CHECK2-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty*
5744 // CHECK2-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
5745 // CHECK2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
5746 // CHECK2-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
5747 // CHECK2-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
5748 // CHECK2-NEXT:    [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0
5749 // CHECK2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]]
5750 // CHECK2-NEXT:    [[TMP12:%.*]] = load double, double* [[TMP10]], align 8
5751 // CHECK2-NEXT:    store double [[TMP12]], double* [[TMP11]], align 128
5752 // CHECK2-NEXT:    ret void
5753 //
5754 //
5755 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func
5756 // CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
5757 // CHECK2-NEXT:  entry:
5758 // CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
5759 // CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
5760 // CHECK2-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
5761 // CHECK2-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
5762 // CHECK2-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
5763 // CHECK2-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
5764 // CHECK2-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
5765 // CHECK2-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
5766 // CHECK2-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty*
5767 // CHECK2-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
5768 // CHECK2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
5769 // CHECK2-NEXT:    [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0
5770 // CHECK2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]]
5771 // CHECK2-NEXT:    [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
5772 // CHECK2-NEXT:    store i8* [[TMP8]], i8** [[TMP6]], align 4
5773 // CHECK2-NEXT:    [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
5774 // CHECK2-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
5775 // CHECK2-NEXT:    call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]]
5776 // CHECK2-NEXT:    ret void
5777 //
5778 //
5779 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func
5780 // CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
5781 // CHECK2-NEXT:  entry:
5782 // CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
5783 // CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
5784 // CHECK2-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
5785 // CHECK2-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
5786 // CHECK2-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
5787 // CHECK2-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
5788 // CHECK2-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
5789 // CHECK2-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
5790 // CHECK2-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
5791 // CHECK2-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty*
5792 // CHECK2-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
5793 // CHECK2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
5794 // CHECK2-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
5795 // CHECK2-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
5796 // CHECK2-NEXT:    [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0
5797 // CHECK2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP7]]
5798 // CHECK2-NEXT:    [[TMP12:%.*]] = load double, double* [[TMP11]], align 128
5799 // CHECK2-NEXT:    store double [[TMP12]], double* [[TMP10]], align 8
5800 // CHECK2-NEXT:    ret void
5801 //
5802 //
5803 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func
5804 // CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
5805 // CHECK2-NEXT:  entry:
5806 // CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
5807 // CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
5808 // CHECK2-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
5809 // CHECK2-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
5810 // CHECK2-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
5811 // CHECK2-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
5812 // CHECK2-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
5813 // CHECK2-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
5814 // CHECK2-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty*
5815 // CHECK2-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
5816 // CHECK2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
5817 // CHECK2-NEXT:    [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0
5818 // CHECK2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1024 x double], [1024 x double]* [[E]], i32 0, i32 [[TMP5]]
5819 // CHECK2-NEXT:    [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
5820 // CHECK2-NEXT:    store i8* [[TMP8]], i8** [[TMP6]], align 4
5821 // CHECK2-NEXT:    [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
5822 // CHECK2-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
5823 // CHECK2-NEXT:    call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]]
5824 // CHECK2-NEXT:    ret void
5825 //
5826 //
5827 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l26
5828 // CHECK2-SAME: (i32 noundef [[C:%.*]], i32 noundef [[D:%.*]]) #[[ATTR0]] {
5829 // CHECK2-NEXT:  entry:
5830 // CHECK2-NEXT:    [[C_ADDR:%.*]] = alloca i32, align 4
5831 // CHECK2-NEXT:    [[D_ADDR:%.*]] = alloca i32, align 4
5832 // CHECK2-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
5833 // CHECK2-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
5834 // CHECK2-NEXT:    store i32 [[C]], i32* [[C_ADDR]], align 4
5835 // CHECK2-NEXT:    store i32 [[D]], i32* [[D_ADDR]], align 4
5836 // CHECK2-NEXT:    [[CONV:%.*]] = bitcast i32* [[C_ADDR]] to i8*
5837 // CHECK2-NEXT:    [[CONV1:%.*]] = bitcast i32* [[D_ADDR]] to float*
5838 // CHECK2-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true)
5839 // CHECK2-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
5840 // CHECK2-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
5841 // CHECK2:       user_code.entry:
5842 // CHECK2-NEXT:    [[TMP1:%.*]] = load i8, i8* [[CONV]], align 1
5843 // CHECK2-NEXT:    [[C2:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 1)
5844 // CHECK2-NEXT:    store i8 [[TMP1]], i8* [[C2]], align 1
5845 // CHECK2-NEXT:    [[TMP2:%.*]] = load float, float* [[CONV1]], align 4
5846 // CHECK2-NEXT:    [[D3:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4)
5847 // CHECK2-NEXT:    [[D_ON_STACK:%.*]] = bitcast i8* [[D3]] to float*
5848 // CHECK2-NEXT:    store float [[TMP2]], float* [[D_ON_STACK]], align 4
5849 // CHECK2-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
5850 // CHECK2-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
5851 // CHECK2-NEXT:    store i32 [[TMP3]], i32* [[DOTTHREADID_TEMP_]], align 4
5852 // CHECK2-NEXT:    call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C2]], float* [[D_ON_STACK]]) #[[ATTR3]]
5853 // CHECK2-NEXT:    call void @__kmpc_free_shared(i8* [[D3]], i32 4)
5854 // CHECK2-NEXT:    call void @__kmpc_free_shared(i8* [[C2]], i32 1)
5855 // CHECK2-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
5856 // CHECK2-NEXT:    ret void
5857 // CHECK2:       worker.exit:
5858 // CHECK2-NEXT:    ret void
5859 //
5860 //
5861 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1
5862 // CHECK2-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i8* noundef nonnull align 1 dereferenceable(1) [[C:%.*]], float* noundef nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR0]] {
5863 // CHECK2-NEXT:  entry:
5864 // CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
5865 // CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
5866 // CHECK2-NEXT:    [[C_ADDR:%.*]] = alloca i8*, align 4
5867 // CHECK2-NEXT:    [[D_ADDR:%.*]] = alloca float*, align 4
5868 // CHECK2-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
5869 // CHECK2-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
5870 // CHECK2-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
5871 // CHECK2-NEXT:    store i8* [[C]], i8** [[C_ADDR]], align 4
5872 // CHECK2-NEXT:    store float* [[D]], float** [[D_ADDR]], align 4
5873 // CHECK2-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4
5874 // CHECK2-NEXT:    [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4
5875 // CHECK2-NEXT:    [[C1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 1)
5876 // CHECK2-NEXT:    [[D2:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4)
5877 // CHECK2-NEXT:    [[D_ON_STACK:%.*]] = bitcast i8* [[D2]] to float*
5878 // CHECK2-NEXT:    store i8 0, i8* [[C1]], align 1
5879 // CHECK2-NEXT:    store float 1.000000e+00, float* [[D_ON_STACK]], align 4
5880 // CHECK2-NEXT:    [[TMP2:%.*]] = load i8, i8* [[C1]], align 1
5881 // CHECK2-NEXT:    [[CONV:%.*]] = sext i8 [[TMP2]] to i32
5882 // CHECK2-NEXT:    [[XOR:%.*]] = xor i32 [[CONV]], 2
5883 // CHECK2-NEXT:    [[CONV3:%.*]] = trunc i32 [[XOR]] to i8
5884 // CHECK2-NEXT:    store i8 [[CONV3]], i8* [[C1]], align 1
5885 // CHECK2-NEXT:    [[TMP3:%.*]] = load float, float* [[D_ON_STACK]], align 4
5886 // CHECK2-NEXT:    [[MUL:%.*]] = fmul float [[TMP3]], 3.300000e+01
5887 // CHECK2-NEXT:    store float [[MUL]], float* [[D_ON_STACK]], align 4
5888 // CHECK2-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
5889 // CHECK2-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
5890 // CHECK2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
5891 // CHECK2-NEXT:    store i8* [[C1]], i8** [[TMP6]], align 4
5892 // CHECK2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
5893 // CHECK2-NEXT:    [[TMP8:%.*]] = bitcast float* [[D_ON_STACK]] to i8*
5894 // CHECK2-NEXT:    store i8* [[TMP8]], i8** [[TMP7]], align 4
5895 // CHECK2-NEXT:    [[TMP9:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
5896 // CHECK2-NEXT:    [[TMP10:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
5897 // CHECK2-NEXT:    [[TMP11:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i8* [[TMP10]], i32 1024, i8* [[TMP9]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func5, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func6, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func8)
5898 // CHECK2-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[TMP11]], 1
5899 // CHECK2-NEXT:    br i1 [[TMP12]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
5900 // CHECK2:       .omp.reduction.then:
5901 // CHECK2-NEXT:    [[TMP13:%.*]] = load i8, i8* [[TMP0]], align 1
5902 // CHECK2-NEXT:    [[CONV4:%.*]] = sext i8 [[TMP13]] to i32
5903 // CHECK2-NEXT:    [[TMP14:%.*]] = load i8, i8* [[C1]], align 1
5904 // CHECK2-NEXT:    [[CONV5:%.*]] = sext i8 [[TMP14]] to i32
5905 // CHECK2-NEXT:    [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]]
5906 // CHECK2-NEXT:    [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8
5907 // CHECK2-NEXT:    store i8 [[CONV7]], i8* [[TMP0]], align 1
5908 // CHECK2-NEXT:    [[TMP15:%.*]] = load float, float* [[TMP1]], align 4
5909 // CHECK2-NEXT:    [[TMP16:%.*]] = load float, float* [[D_ON_STACK]], align 4
5910 // CHECK2-NEXT:    [[MUL8:%.*]] = fmul float [[TMP15]], [[TMP16]]
5911 // CHECK2-NEXT:    store float [[MUL8]], float* [[TMP1]], align 4
5912 // CHECK2-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
5913 // CHECK2-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
5914 // CHECK2:       .omp.reduction.done:
5915 // CHECK2-NEXT:    call void @__kmpc_free_shared(i8* [[D2]], i32 4)
5916 // CHECK2-NEXT:    call void @__kmpc_free_shared(i8* [[C1]], i32 1)
5917 // CHECK2-NEXT:    ret void
5918 //
5919 //
5920 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3
5921 // CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR2]] {
5922 // CHECK2-NEXT:  entry:
5923 // CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
5924 // CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
5925 // CHECK2-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2
5926 // CHECK2-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2
5927 // CHECK2-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
5928 // CHECK2-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1
5929 // CHECK2-NEXT:    [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4
5930 // CHECK2-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
5931 // CHECK2-NEXT:    store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
5932 // CHECK2-NEXT:    store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
5933 // CHECK2-NEXT:    store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
5934 // CHECK2-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
5935 // CHECK2-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
5936 // CHECK2-NEXT:    [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
5937 // CHECK2-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
5938 // CHECK2-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
5939 // CHECK2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
5940 // CHECK2-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
5941 // CHECK2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
5942 // CHECK2-NEXT:    [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1
5943 // CHECK2-NEXT:    [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1
5944 // CHECK2-NEXT:    [[TMP14:%.*]] = sext i8 [[TMP13]] to i32
5945 // CHECK2-NEXT:    [[TMP15:%.*]] = call i32 @__kmpc_get_warp_size()
5946 // CHECK2-NEXT:    [[TMP16:%.*]] = trunc i32 [[TMP15]] to i16
5947 // CHECK2-NEXT:    [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP16]])
5948 // CHECK2-NEXT:    [[TMP18:%.*]] = trunc i32 [[TMP17]] to i8
5949 // CHECK2-NEXT:    store i8 [[TMP18]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1
5950 // CHECK2-NEXT:    [[TMP19:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1
5951 // CHECK2-NEXT:    [[TMP20:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
5952 // CHECK2-NEXT:    store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4
5953 // CHECK2-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
5954 // CHECK2-NEXT:    [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4
5955 // CHECK2-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
5956 // CHECK2-NEXT:    [[TMP24:%.*]] = bitcast i8* [[TMP22]] to float*
5957 // CHECK2-NEXT:    [[TMP25:%.*]] = getelementptr float, float* [[TMP24]], i32 1
5958 // CHECK2-NEXT:    [[TMP26:%.*]] = bitcast float* [[TMP25]] to i8*
5959 // CHECK2-NEXT:    [[TMP27:%.*]] = bitcast float* [[TMP24]] to i32*
5960 // CHECK2-NEXT:    [[TMP28:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32*
5961 // CHECK2-NEXT:    [[TMP29:%.*]] = load i32, i32* [[TMP27]], align 4
5962 // CHECK2-NEXT:    [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
5963 // CHECK2-NEXT:    [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
5964 // CHECK2-NEXT:    [[TMP32:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP29]], i16 [[TMP7]], i16 [[TMP31]])
5965 // CHECK2-NEXT:    store i32 [[TMP32]], i32* [[TMP28]], align 4
5966 // CHECK2-NEXT:    [[TMP33:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1
5967 // CHECK2-NEXT:    [[TMP34:%.*]] = getelementptr i32, i32* [[TMP28]], i32 1
5968 // CHECK2-NEXT:    [[TMP35:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
5969 // CHECK2-NEXT:    store i8* [[TMP35]], i8** [[TMP23]], align 4
5970 // CHECK2-NEXT:    [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 0
5971 // CHECK2-NEXT:    [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1
5972 // CHECK2-NEXT:    [[TMP38:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
5973 // CHECK2-NEXT:    [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
5974 // CHECK2-NEXT:    [[TMP40:%.*]] = icmp eq i16 [[TMP8]], 2
5975 // CHECK2-NEXT:    [[TMP41:%.*]] = and i16 [[TMP6]], 1
5976 // CHECK2-NEXT:    [[TMP42:%.*]] = icmp eq i16 [[TMP41]], 0
5977 // CHECK2-NEXT:    [[TMP43:%.*]] = and i1 [[TMP40]], [[TMP42]]
5978 // CHECK2-NEXT:    [[TMP44:%.*]] = icmp sgt i16 [[TMP7]], 0
5979 // CHECK2-NEXT:    [[TMP45:%.*]] = and i1 [[TMP43]], [[TMP44]]
5980 // CHECK2-NEXT:    [[TMP46:%.*]] = or i1 [[TMP36]], [[TMP39]]
5981 // CHECK2-NEXT:    [[TMP47:%.*]] = or i1 [[TMP46]], [[TMP45]]
5982 // CHECK2-NEXT:    br i1 [[TMP47]], label [[THEN:%.*]], label [[ELSE:%.*]]
5983 // CHECK2:       then:
5984 // CHECK2-NEXT:    [[TMP48:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
5985 // CHECK2-NEXT:    [[TMP49:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
5986 // CHECK2-NEXT:    call void @"_omp$reduction$reduction_func2"(i8* [[TMP48]], i8* [[TMP49]]) #[[ATTR3]]
5987 // CHECK2-NEXT:    br label [[IFCONT:%.*]]
5988 // CHECK2:       else:
5989 // CHECK2-NEXT:    br label [[IFCONT]]
5990 // CHECK2:       ifcont:
5991 // CHECK2-NEXT:    [[TMP50:%.*]] = icmp eq i16 [[TMP8]], 1
5992 // CHECK2-NEXT:    [[TMP51:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
5993 // CHECK2-NEXT:    [[TMP52:%.*]] = and i1 [[TMP50]], [[TMP51]]
5994 // CHECK2-NEXT:    br i1 [[TMP52]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
5995 // CHECK2:       then5:
5996 // CHECK2-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
5997 // CHECK2-NEXT:    [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4
5998 // CHECK2-NEXT:    [[TMP55:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
5999 // CHECK2-NEXT:    [[TMP56:%.*]] = load i8*, i8** [[TMP55]], align 4
6000 // CHECK2-NEXT:    [[TMP57:%.*]] = load i8, i8* [[TMP54]], align 1
6001 // CHECK2-NEXT:    store i8 [[TMP57]], i8* [[TMP56]], align 1
6002 // CHECK2-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
6003 // CHECK2-NEXT:    [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4
6004 // CHECK2-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
6005 // CHECK2-NEXT:    [[TMP61:%.*]] = load i8*, i8** [[TMP60]], align 4
6006 // CHECK2-NEXT:    [[TMP62:%.*]] = bitcast i8* [[TMP59]] to float*
6007 // CHECK2-NEXT:    [[TMP63:%.*]] = bitcast i8* [[TMP61]] to float*
6008 // CHECK2-NEXT:    [[TMP64:%.*]] = load float, float* [[TMP62]], align 4
6009 // CHECK2-NEXT:    store float [[TMP64]], float* [[TMP63]], align 4
6010 // CHECK2-NEXT:    br label [[IFCONT7:%.*]]
6011 // CHECK2:       else6:
6012 // CHECK2-NEXT:    br label [[IFCONT7]]
6013 // CHECK2:       ifcont7:
6014 // CHECK2-NEXT:    ret void
6015 //
6016 //
6017 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4
6018 // CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
6019 // CHECK2-NEXT:  entry:
6020 // CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
6021 // CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
6022 // CHECK2-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
6023 // CHECK2-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
6024 // CHECK2-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
6025 // CHECK2-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
6026 // CHECK2-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
6027 // CHECK2-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
6028 // CHECK2-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
6029 // CHECK2-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
6030 // CHECK2-NEXT:    [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 4
6031 // CHECK2-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [2 x i8*]*
6032 // CHECK2-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
6033 // CHECK2-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
6034 // CHECK2-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
6035 // CHECK2:       then:
6036 // CHECK2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
6037 // CHECK2-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
6038 // CHECK2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
6039 // CHECK2-NEXT:    [[TMP11:%.*]] = bitcast i32 addrspace(3)* [[TMP10]] to i8 addrspace(3)*
6040 // CHECK2-NEXT:    [[TMP12:%.*]] = load i8, i8* [[TMP9]], align 1
6041 // CHECK2-NEXT:    store volatile i8 [[TMP12]], i8 addrspace(3)* [[TMP11]], align 1
6042 // CHECK2-NEXT:    br label [[IFCONT:%.*]]
6043 // CHECK2:       else:
6044 // CHECK2-NEXT:    br label [[IFCONT]]
6045 // CHECK2:       ifcont:
6046 // CHECK2-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
6047 // CHECK2-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
6048 // CHECK2-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP13]]
6049 // CHECK2-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
6050 // CHECK2:       then2:
6051 // CHECK2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
6052 // CHECK2-NEXT:    [[TMP15:%.*]] = bitcast i32 addrspace(3)* [[TMP14]] to i8 addrspace(3)*
6053 // CHECK2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
6054 // CHECK2-NEXT:    [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
6055 // CHECK2-NEXT:    [[TMP18:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP15]], align 1
6056 // CHECK2-NEXT:    store i8 [[TMP18]], i8* [[TMP17]], align 1
6057 // CHECK2-NEXT:    br label [[IFCONT4:%.*]]
6058 // CHECK2:       else3:
6059 // CHECK2-NEXT:    br label [[IFCONT4]]
6060 // CHECK2:       ifcont4:
6061 // CHECK2-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
6062 // CHECK2-NEXT:    [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
6063 // CHECK2-NEXT:    br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
6064 // CHECK2:       then6:
6065 // CHECK2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
6066 // CHECK2-NEXT:    [[TMP20:%.*]] = load i8*, i8** [[TMP19]], align 4
6067 // CHECK2-NEXT:    [[TMP21:%.*]] = bitcast i8* [[TMP20]] to i32*
6068 // CHECK2-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
6069 // CHECK2-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP21]], align 4
6070 // CHECK2-NEXT:    store volatile i32 [[TMP23]], i32 addrspace(3)* [[TMP22]], align 4
6071 // CHECK2-NEXT:    br label [[IFCONT8:%.*]]
6072 // CHECK2:       else7:
6073 // CHECK2-NEXT:    br label [[IFCONT8]]
6074 // CHECK2:       ifcont8:
6075 // CHECK2-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
6076 // CHECK2-NEXT:    [[TMP24:%.*]] = load i32, i32* [[DOTADDR1]], align 4
6077 // CHECK2-NEXT:    [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP24]]
6078 // CHECK2-NEXT:    br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
6079 // CHECK2:       then10:
6080 // CHECK2-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
6081 // CHECK2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
6082 // CHECK2-NEXT:    [[TMP27:%.*]] = load i8*, i8** [[TMP26]], align 4
6083 // CHECK2-NEXT:    [[TMP28:%.*]] = bitcast i8* [[TMP27]] to i32*
6084 // CHECK2-NEXT:    [[TMP29:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP25]], align 4
6085 // CHECK2-NEXT:    store i32 [[TMP29]], i32* [[TMP28]], align 4
6086 // CHECK2-NEXT:    br label [[IFCONT12:%.*]]
6087 // CHECK2:       else11:
6088 // CHECK2-NEXT:    br label [[IFCONT12]]
6089 // CHECK2:       ifcont12:
6090 // CHECK2-NEXT:    ret void
6091 //
6092 //
6093 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func5
6094 // CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
6095 // CHECK2-NEXT:  entry:
6096 // CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
6097 // CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
6098 // CHECK2-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
6099 // CHECK2-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
6100 // CHECK2-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
6101 // CHECK2-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
6102 // CHECK2-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
6103 // CHECK2-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
6104 // CHECK2-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
6105 // CHECK2-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
6106 // CHECK2-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
6107 // CHECK2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
6108 // CHECK2-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
6109 // CHECK2-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
6110 // CHECK2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]]
6111 // CHECK2-NEXT:    [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1
6112 // CHECK2-NEXT:    store i8 [[TMP11]], i8* [[TMP10]], align 128
6113 // CHECK2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
6114 // CHECK2-NEXT:    [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
6115 // CHECK2-NEXT:    [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
6116 // CHECK2-NEXT:    [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 1
6117 // CHECK2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]]
6118 // CHECK2-NEXT:    [[TMP16:%.*]] = load float, float* [[TMP14]], align 4
6119 // CHECK2-NEXT:    store float [[TMP16]], float* [[TMP15]], align 128
6120 // CHECK2-NEXT:    ret void
6121 //
6122 //
6123 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func6
6124 // CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
6125 // CHECK2-NEXT:  entry:
6126 // CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
6127 // CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
6128 // CHECK2-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
6129 // CHECK2-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
6130 // CHECK2-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
6131 // CHECK2-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
6132 // CHECK2-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
6133 // CHECK2-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
6134 // CHECK2-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
6135 // CHECK2-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
6136 // CHECK2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
6137 // CHECK2-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
6138 // CHECK2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]]
6139 // CHECK2-NEXT:    store i8* [[TMP7]], i8** [[TMP6]], align 4
6140 // CHECK2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
6141 // CHECK2-NEXT:    [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 1
6142 // CHECK2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]]
6143 // CHECK2-NEXT:    [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
6144 // CHECK2-NEXT:    store i8* [[TMP10]], i8** [[TMP8]], align 4
6145 // CHECK2-NEXT:    [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
6146 // CHECK2-NEXT:    [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
6147 // CHECK2-NEXT:    call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]]
6148 // CHECK2-NEXT:    ret void
6149 //
6150 //
6151 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func7
6152 // CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
6153 // CHECK2-NEXT:  entry:
6154 // CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
6155 // CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
6156 // CHECK2-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
6157 // CHECK2-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
6158 // CHECK2-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
6159 // CHECK2-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
6160 // CHECK2-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
6161 // CHECK2-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
6162 // CHECK2-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
6163 // CHECK2-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
6164 // CHECK2-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
6165 // CHECK2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
6166 // CHECK2-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
6167 // CHECK2-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
6168 // CHECK2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP7]]
6169 // CHECK2-NEXT:    [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128
6170 // CHECK2-NEXT:    store i8 [[TMP11]], i8* [[TMP9]], align 1
6171 // CHECK2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
6172 // CHECK2-NEXT:    [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
6173 // CHECK2-NEXT:    [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
6174 // CHECK2-NEXT:    [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 1
6175 // CHECK2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP7]]
6176 // CHECK2-NEXT:    [[TMP16:%.*]] = load float, float* [[TMP15]], align 128
6177 // CHECK2-NEXT:    store float [[TMP16]], float* [[TMP14]], align 4
6178 // CHECK2-NEXT:    ret void
6179 //
6180 //
6181 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func8
6182 // CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
6183 // CHECK2-NEXT:  entry:
6184 // CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
6185 // CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
6186 // CHECK2-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
6187 // CHECK2-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
6188 // CHECK2-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
6189 // CHECK2-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
6190 // CHECK2-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
6191 // CHECK2-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
6192 // CHECK2-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
6193 // CHECK2-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
6194 // CHECK2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
6195 // CHECK2-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
6196 // CHECK2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* [[C]], i32 0, i32 [[TMP5]]
6197 // CHECK2-NEXT:    store i8* [[TMP7]], i8** [[TMP6]], align 4
6198 // CHECK2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
6199 // CHECK2-NEXT:    [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 1
6200 // CHECK2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1024 x float], [1024 x float]* [[D]], i32 0, i32 [[TMP5]]
6201 // CHECK2-NEXT:    [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
6202 // CHECK2-NEXT:    store i8* [[TMP10]], i8** [[TMP8]], align 4
6203 // CHECK2-NEXT:    [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
6204 // CHECK2-NEXT:    [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
6205 // CHECK2-NEXT:    call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]]
6206 // CHECK2-NEXT:    ret void
6207 //
6208 //
6209 // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33
6210 // CHECK2-SAME: (i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
6211 // CHECK2-NEXT:  entry:
6212 // CHECK2-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
6213 // CHECK2-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
6214 // CHECK2-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
6215 // CHECK2-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
6216 // CHECK2-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
6217 // CHECK2-NEXT:    store i32 [[B]], i32* [[B_ADDR]], align 4
6218 // CHECK2-NEXT:    [[CONV:%.*]] = bitcast i32* [[B_ADDR]] to i16*
6219 // CHECK2-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true)
6220 // CHECK2-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
6221 // CHECK2-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
6222 // CHECK2:       user_code.entry:
6223 // CHECK2-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
6224 // CHECK2-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
6225 // CHECK2-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
6226 // CHECK2-NEXT:    call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR3]]
6227 // CHECK2-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true)
6228 // CHECK2-NEXT:    ret void
6229 // CHECK2:       worker.exit:
6230 // CHECK2-NEXT:    ret void
6231 //
6232 //
6233 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__9
6234 // CHECK2-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32* noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i16* noundef nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR0]] {
6235 // CHECK2-NEXT:  entry:
6236 // CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
6237 // CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
6238 // CHECK2-NEXT:    [[A_ADDR:%.*]] = alloca i32*, align 4
6239 // CHECK2-NEXT:    [[B_ADDR:%.*]] = alloca i16*, align 4
6240 // CHECK2-NEXT:    [[A1:%.*]] = alloca i32, align 4
6241 // CHECK2-NEXT:    [[B2:%.*]] = alloca i16, align 2
6242 // CHECK2-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
6243 // CHECK2-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
6244 // CHECK2-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
6245 // CHECK2-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
6246 // CHECK2-NEXT:    store i32* [[A]], i32** [[A_ADDR]], align 4
6247 // CHECK2-NEXT:    store i16* [[B]], i16** [[B_ADDR]], align 4
6248 // CHECK2-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
6249 // CHECK2-NEXT:    [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
6250 // CHECK2-NEXT:    store i32 0, i32* [[A1]], align 4
6251 // CHECK2-NEXT:    store i16 -32768, i16* [[B2]], align 2
6252 // CHECK2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
6253 // CHECK2-NEXT:    [[TMP3:%.*]] = bitcast i32* [[A1]] to i8*
6254 // CHECK2-NEXT:    store i8* [[TMP3]], i8** [[TMP2]], align 4
6255 // CHECK2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
6256 // CHECK2-NEXT:    [[TMP5:%.*]] = bitcast i16* [[B2]] to i8*
6257 // CHECK2-NEXT:    store i8* [[TMP5]], i8** [[TMP4]], align 4
6258 // CHECK2-NEXT:    [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
6259 // CHECK2-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
6260 // CHECK2-NEXT:    [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
6261 // CHECK2-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__10 to i8*), i8* null, i8** [[TMP8]], i32 2)
6262 // CHECK2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
6263 // CHECK2-NEXT:    [[TMP10:%.*]] = bitcast i32* [[A1]] to i8*
6264 // CHECK2-NEXT:    store i8* [[TMP10]], i8** [[TMP9]], align 4
6265 // CHECK2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
6266 // CHECK2-NEXT:    [[TMP12:%.*]] = bitcast i16* [[B2]] to i8*
6267 // CHECK2-NEXT:    store i8* [[TMP12]], i8** [[TMP11]], align 4
6268 // CHECK2-NEXT:    [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
6269 // CHECK2-NEXT:    [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
6270 // CHECK2-NEXT:    [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 1024, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func15, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func16, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func17, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func18, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func20)
6271 // CHECK2-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1
6272 // CHECK2-NEXT:    br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
6273 // CHECK2:       .omp.reduction.then:
6274 // CHECK2-NEXT:    [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4
6275 // CHECK2-NEXT:    [[TMP18:%.*]] = load i32, i32* [[A1]], align 4
6276 // CHECK2-NEXT:    [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]]
6277 // CHECK2-NEXT:    store i32 [[OR]], i32* [[TMP0]], align 4
6278 // CHECK2-NEXT:    [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2
6279 // CHECK2-NEXT:    [[CONV:%.*]] = sext i16 [[TMP19]] to i32
6280 // CHECK2-NEXT:    [[TMP20:%.*]] = load i16, i16* [[B2]], align 2
6281 // CHECK2-NEXT:    [[CONV3:%.*]] = sext i16 [[TMP20]] to i32
6282 // CHECK2-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]]
6283 // CHECK2-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
6284 // CHECK2:       cond.true:
6285 // CHECK2-NEXT:    [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2
6286 // CHECK2-NEXT:    br label [[COND_END:%.*]]
6287 // CHECK2:       cond.false:
6288 // CHECK2-NEXT:    [[TMP22:%.*]] = load i16, i16* [[B2]], align 2
6289 // CHECK2-NEXT:    br label [[COND_END]]
6290 // CHECK2:       cond.end:
6291 // CHECK2-NEXT:    [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ]
6292 // CHECK2-NEXT:    store i16 [[COND]], i16* [[TMP1]], align 2
6293 // CHECK2-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]])
6294 // CHECK2-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
6295 // CHECK2:       .omp.reduction.done:
6296 // CHECK2-NEXT:    ret void
6297 //
6298 //
6299 // CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__10
6300 // CHECK2-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32* noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i16* noundef nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR0]] {
6301 // CHECK2-NEXT:  entry:
6302 // CHECK2-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
6303 // CHECK2-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
6304 // CHECK2-NEXT:    [[A_ADDR:%.*]] = alloca i32*, align 4
6305 // CHECK2-NEXT:    [[B_ADDR:%.*]] = alloca i16*, align 4
6306 // CHECK2-NEXT:    [[A1:%.*]] = alloca i32, align 4
6307 // CHECK2-NEXT:    [[B2:%.*]] = alloca i16, align 2
6308 // CHECK2-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
6309 // CHECK2-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
6310 // CHECK2-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
6311 // CHECK2-NEXT:    store i32* [[A]], i32** [[A_ADDR]], align 4
6312 // CHECK2-NEXT:    store i16* [[B]], i16** [[B_ADDR]], align 4
6313 // CHECK2-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
6314 // CHECK2-NEXT:    [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
6315 // CHECK2-NEXT:    store i32 0, i32* [[A1]], align 4
6316 // CHECK2-NEXT:    store i16 -32768, i16* [[B2]], align 2
6317 // CHECK2-NEXT:    [[TMP2:%.*]] = load i32, i32* [[A1]], align 4
6318 // CHECK2-NEXT:    [[OR:%.*]] = or i32 [[TMP2]], 1
6319 // CHECK2-NEXT:    store i32 [[OR]], i32* [[A1]], align 4
6320 // CHECK2-NEXT:    [[TMP3:%.*]] = load i16, i16* [[B2]], align 2
6321 // CHECK2-NEXT:    [[CONV:%.*]] = sext i16 [[TMP3]] to i32
6322 // CHECK2-NEXT:    [[CMP:%.*]] = icmp sgt i32 99, [[CONV]]
6323 // CHECK2-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
6324 // CHECK2:       cond.true:
6325 // CHECK2-NEXT:    br label [[COND_END:%.*]]
6326 // CHECK2:       cond.false:
6327 // CHECK2-NEXT:    [[TMP4:%.*]] = load i16, i16* [[B2]], align 2
6328 // CHECK2-NEXT:    [[CONV3:%.*]] = sext i16 [[TMP4]] to i32
6329 // CHECK2-NEXT:    br label [[COND_END]]
6330 // CHECK2:       cond.end:
6331 // CHECK2-NEXT:    [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ]
6332 // CHECK2-NEXT:    [[CONV4:%.*]] = trunc i32 [[COND]] to i16
6333 // CHECK2-NEXT:    store i16 [[CONV4]], i16* [[B2]], align 2
6334 // CHECK2-NEXT:    [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
6335 // CHECK2-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
6336 // CHECK2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
6337 // CHECK2-NEXT:    [[TMP8:%.*]] = bitcast i32* [[A1]] to i8*
6338 // CHECK2-NEXT:    store i8* [[TMP8]], i8** [[TMP7]], align 4
6339 // CHECK2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
6340 // CHECK2-NEXT:    [[TMP10:%.*]] = bitcast i16* [[B2]] to i8*
6341 // CHECK2-NEXT:    store i8* [[TMP10]], i8** [[TMP9]], align 4
6342 // CHECK2-NEXT:    [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
6343 // CHECK2-NEXT:    [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func12, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func13)
6344 // CHECK2-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1
6345 // CHECK2-NEXT:    br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
6346 // CHECK2:       .omp.reduction.then:
6347 // CHECK2-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4
6348 // CHECK2-NEXT:    [[TMP15:%.*]] = load i32, i32* [[A1]], align 4
6349 // CHECK2-NEXT:    [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]]
6350 // CHECK2-NEXT:    store i32 [[OR5]], i32* [[TMP0]], align 4
6351 // CHECK2-NEXT:    [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2
6352 // CHECK2-NEXT:    [[CONV6:%.*]] = sext i16 [[TMP16]] to i32
6353 // CHECK2-NEXT:    [[TMP17:%.*]] = load i16, i16* [[B2]], align 2
6354 // CHECK2-NEXT:    [[CONV7:%.*]] = sext i16 [[TMP17]] to i32
6355 // CHECK2-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]]
6356 // CHECK2-NEXT:    br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]]
6357 // CHECK2:       cond.true9:
6358 // CHECK2-NEXT:    [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2
6359 // CHECK2-NEXT:    br label [[COND_END11:%.*]]
6360 // CHECK2:       cond.false10:
6361 // CHECK2-NEXT:    [[TMP19:%.*]] = load i16, i16* [[B2]], align 2
6362 // CHECK2-NEXT:    br label [[COND_END11]]
6363 // CHECK2:       cond.end11:
6364 // CHECK2-NEXT:    [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ]
6365 // CHECK2-NEXT:    store i16 [[COND12]], i16* [[TMP1]], align 2
6366 // CHECK2-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]])
6367 // CHECK2-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
6368 // CHECK2:       .omp.reduction.done:
6369 // CHECK2-NEXT:    ret void
6370 //
6371 //
6372 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func12
6373 // CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR2]] {
6374 // CHECK2-NEXT:  entry:
6375 // CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
6376 // CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
6377 // CHECK2-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2
6378 // CHECK2-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2
6379 // CHECK2-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
6380 // CHECK2-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
6381 // CHECK2-NEXT:    [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
6382 // CHECK2-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
6383 // CHECK2-NEXT:    store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
6384 // CHECK2-NEXT:    store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
6385 // CHECK2-NEXT:    store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
6386 // CHECK2-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
6387 // CHECK2-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
6388 // CHECK2-NEXT:    [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
6389 // CHECK2-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
6390 // CHECK2-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
6391 // CHECK2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
6392 // CHECK2-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
6393 // CHECK2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
6394 // CHECK2-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
6395 // CHECK2-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
6396 // CHECK2-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
6397 // CHECK2-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
6398 // CHECK2-NEXT:    [[TMP16:%.*]] = call i32 @__kmpc_get_warp_size()
6399 // CHECK2-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
6400 // CHECK2-NEXT:    [[TMP18:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP17]])
6401 // CHECK2-NEXT:    store i32 [[TMP18]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
6402 // CHECK2-NEXT:    [[TMP19:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
6403 // CHECK2-NEXT:    [[TMP20:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
6404 // CHECK2-NEXT:    [[TMP21:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
6405 // CHECK2-NEXT:    store i8* [[TMP21]], i8** [[TMP11]], align 4
6406 // CHECK2-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
6407 // CHECK2-NEXT:    [[TMP23:%.*]] = load i8*, i8** [[TMP22]], align 4
6408 // CHECK2-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
6409 // CHECK2-NEXT:    [[TMP25:%.*]] = bitcast i8* [[TMP23]] to i16*
6410 // CHECK2-NEXT:    [[TMP26:%.*]] = getelementptr i16, i16* [[TMP25]], i32 1
6411 // CHECK2-NEXT:    [[TMP27:%.*]] = bitcast i16* [[TMP26]] to i8*
6412 // CHECK2-NEXT:    [[TMP28:%.*]] = load i16, i16* [[TMP25]], align 2
6413 // CHECK2-NEXT:    [[TMP29:%.*]] = sext i16 [[TMP28]] to i32
6414 // CHECK2-NEXT:    [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
6415 // CHECK2-NEXT:    [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
6416 // CHECK2-NEXT:    [[TMP32:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP29]], i16 [[TMP7]], i16 [[TMP31]])
6417 // CHECK2-NEXT:    [[TMP33:%.*]] = trunc i32 [[TMP32]] to i16
6418 // CHECK2-NEXT:    store i16 [[TMP33]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
6419 // CHECK2-NEXT:    [[TMP34:%.*]] = getelementptr i16, i16* [[TMP25]], i32 1
6420 // CHECK2-NEXT:    [[TMP35:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
6421 // CHECK2-NEXT:    [[TMP36:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
6422 // CHECK2-NEXT:    store i8* [[TMP36]], i8** [[TMP24]], align 4
6423 // CHECK2-NEXT:    [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 0
6424 // CHECK2-NEXT:    [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1
6425 // CHECK2-NEXT:    [[TMP39:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
6426 // CHECK2-NEXT:    [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]]
6427 // CHECK2-NEXT:    [[TMP41:%.*]] = icmp eq i16 [[TMP8]], 2
6428 // CHECK2-NEXT:    [[TMP42:%.*]] = and i16 [[TMP6]], 1
6429 // CHECK2-NEXT:    [[TMP43:%.*]] = icmp eq i16 [[TMP42]], 0
6430 // CHECK2-NEXT:    [[TMP44:%.*]] = and i1 [[TMP41]], [[TMP43]]
6431 // CHECK2-NEXT:    [[TMP45:%.*]] = icmp sgt i16 [[TMP7]], 0
6432 // CHECK2-NEXT:    [[TMP46:%.*]] = and i1 [[TMP44]], [[TMP45]]
6433 // CHECK2-NEXT:    [[TMP47:%.*]] = or i1 [[TMP37]], [[TMP40]]
6434 // CHECK2-NEXT:    [[TMP48:%.*]] = or i1 [[TMP47]], [[TMP46]]
6435 // CHECK2-NEXT:    br i1 [[TMP48]], label [[THEN:%.*]], label [[ELSE:%.*]]
6436 // CHECK2:       then:
6437 // CHECK2-NEXT:    [[TMP49:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
6438 // CHECK2-NEXT:    [[TMP50:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
6439 // CHECK2-NEXT:    call void @"_omp$reduction$reduction_func11"(i8* [[TMP49]], i8* [[TMP50]]) #[[ATTR3]]
6440 // CHECK2-NEXT:    br label [[IFCONT:%.*]]
6441 // CHECK2:       else:
6442 // CHECK2-NEXT:    br label [[IFCONT]]
6443 // CHECK2:       ifcont:
6444 // CHECK2-NEXT:    [[TMP51:%.*]] = icmp eq i16 [[TMP8]], 1
6445 // CHECK2-NEXT:    [[TMP52:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
6446 // CHECK2-NEXT:    [[TMP53:%.*]] = and i1 [[TMP51]], [[TMP52]]
6447 // CHECK2-NEXT:    br i1 [[TMP53]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
6448 // CHECK2:       then5:
6449 // CHECK2-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
6450 // CHECK2-NEXT:    [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4
6451 // CHECK2-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
6452 // CHECK2-NEXT:    [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4
6453 // CHECK2-NEXT:    [[TMP58:%.*]] = bitcast i8* [[TMP55]] to i32*
6454 // CHECK2-NEXT:    [[TMP59:%.*]] = bitcast i8* [[TMP57]] to i32*
6455 // CHECK2-NEXT:    [[TMP60:%.*]] = load i32, i32* [[TMP58]], align 4
6456 // CHECK2-NEXT:    store i32 [[TMP60]], i32* [[TMP59]], align 4
6457 // CHECK2-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
6458 // CHECK2-NEXT:    [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4
6459 // CHECK2-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
6460 // CHECK2-NEXT:    [[TMP64:%.*]] = load i8*, i8** [[TMP63]], align 4
6461 // CHECK2-NEXT:    [[TMP65:%.*]] = bitcast i8* [[TMP62]] to i16*
6462 // CHECK2-NEXT:    [[TMP66:%.*]] = bitcast i8* [[TMP64]] to i16*
6463 // CHECK2-NEXT:    [[TMP67:%.*]] = load i16, i16* [[TMP65]], align 2
6464 // CHECK2-NEXT:    store i16 [[TMP67]], i16* [[TMP66]], align 2
6465 // CHECK2-NEXT:    br label [[IFCONT7:%.*]]
6466 // CHECK2:       else6:
6467 // CHECK2-NEXT:    br label [[IFCONT7]]
6468 // CHECK2:       ifcont7:
6469 // CHECK2-NEXT:    ret void
6470 //
6471 //
6472 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func13
6473 // CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
6474 // CHECK2-NEXT:  entry:
6475 // CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
6476 // CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
6477 // CHECK2-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
6478 // CHECK2-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
6479 // CHECK2-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
6480 // CHECK2-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
6481 // CHECK2-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
6482 // CHECK2-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
6483 // CHECK2-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
6484 // CHECK2-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
6485 // CHECK2-NEXT:    [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 4
6486 // CHECK2-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [2 x i8*]*
6487 // CHECK2-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]])
6488 // CHECK2-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
6489 // CHECK2-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
6490 // CHECK2:       then:
6491 // CHECK2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
6492 // CHECK2-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
6493 // CHECK2-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
6494 // CHECK2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
6495 // CHECK2-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
6496 // CHECK2-NEXT:    store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4
6497 // CHECK2-NEXT:    br label [[IFCONT:%.*]]
6498 // CHECK2:       else:
6499 // CHECK2-NEXT:    br label [[IFCONT]]
6500 // CHECK2:       ifcont:
6501 // CHECK2-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
6502 // CHECK2-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
6503 // CHECK2-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP13]]
6504 // CHECK2-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
6505 // CHECK2:       then2:
6506 // CHECK2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
6507 // CHECK2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
6508 // CHECK2-NEXT:    [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4
6509 // CHECK2-NEXT:    [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32*
6510 // CHECK2-NEXT:    [[TMP18:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4
6511 // CHECK2-NEXT:    store i32 [[TMP18]], i32* [[TMP17]], align 4
6512 // CHECK2-NEXT:    br label [[IFCONT4:%.*]]
6513 // CHECK2:       else3:
6514 // CHECK2-NEXT:    br label [[IFCONT4]]
6515 // CHECK2:       ifcont4:
6516 // CHECK2-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
6517 // CHECK2-NEXT:    [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
6518 // CHECK2-NEXT:    br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
6519 // CHECK2:       then6:
6520 // CHECK2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
6521 // CHECK2-NEXT:    [[TMP20:%.*]] = load i8*, i8** [[TMP19]], align 4
6522 // CHECK2-NEXT:    [[TMP21:%.*]] = bitcast i8* [[TMP20]] to i16*
6523 // CHECK2-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
6524 // CHECK2-NEXT:    [[TMP23:%.*]] = bitcast i32 addrspace(3)* [[TMP22]] to i16 addrspace(3)*
6525 // CHECK2-NEXT:    [[TMP24:%.*]] = load i16, i16* [[TMP21]], align 2
6526 // CHECK2-NEXT:    store volatile i16 [[TMP24]], i16 addrspace(3)* [[TMP23]], align 2
6527 // CHECK2-NEXT:    br label [[IFCONT8:%.*]]
6528 // CHECK2:       else7:
6529 // CHECK2-NEXT:    br label [[IFCONT8]]
6530 // CHECK2:       ifcont8:
6531 // CHECK2-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
6532 // CHECK2-NEXT:    [[TMP25:%.*]] = load i32, i32* [[DOTADDR1]], align 4
6533 // CHECK2-NEXT:    [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP25]]
6534 // CHECK2-NEXT:    br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
6535 // CHECK2:       then10:
6536 // CHECK2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
6537 // CHECK2-NEXT:    [[TMP27:%.*]] = bitcast i32 addrspace(3)* [[TMP26]] to i16 addrspace(3)*
6538 // CHECK2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
6539 // CHECK2-NEXT:    [[TMP29:%.*]] = load i8*, i8** [[TMP28]], align 4
6540 // CHECK2-NEXT:    [[TMP30:%.*]] = bitcast i8* [[TMP29]] to i16*
6541 // CHECK2-NEXT:    [[TMP31:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP27]], align 2
6542 // CHECK2-NEXT:    store i16 [[TMP31]], i16* [[TMP30]], align 2
6543 // CHECK2-NEXT:    br label [[IFCONT12:%.*]]
6544 // CHECK2:       else11:
6545 // CHECK2-NEXT:    br label [[IFCONT12]]
6546 // CHECK2:       ifcont12:
6547 // CHECK2-NEXT:    ret void
6548 //
6549 //
6550 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func15
6551 // CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR2]] {
6552 // CHECK2-NEXT:  entry:
6553 // CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
6554 // CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
6555 // CHECK2-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2
6556 // CHECK2-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2
6557 // CHECK2-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
6558 // CHECK2-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
6559 // CHECK2-NEXT:    [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
6560 // CHECK2-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
6561 // CHECK2-NEXT:    store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
6562 // CHECK2-NEXT:    store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
6563 // CHECK2-NEXT:    store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
6564 // CHECK2-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
6565 // CHECK2-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
6566 // CHECK2-NEXT:    [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
6567 // CHECK2-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
6568 // CHECK2-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
6569 // CHECK2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
6570 // CHECK2-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
6571 // CHECK2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
6572 // CHECK2-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
6573 // CHECK2-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
6574 // CHECK2-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
6575 // CHECK2-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
6576 // CHECK2-NEXT:    [[TMP16:%.*]] = call i32 @__kmpc_get_warp_size()
6577 // CHECK2-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
6578 // CHECK2-NEXT:    [[TMP18:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP17]])
6579 // CHECK2-NEXT:    store i32 [[TMP18]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
6580 // CHECK2-NEXT:    [[TMP19:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
6581 // CHECK2-NEXT:    [[TMP20:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
6582 // CHECK2-NEXT:    [[TMP21:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
6583 // CHECK2-NEXT:    store i8* [[TMP21]], i8** [[TMP11]], align 4
6584 // CHECK2-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
6585 // CHECK2-NEXT:    [[TMP23:%.*]] = load i8*, i8** [[TMP22]], align 4
6586 // CHECK2-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
6587 // CHECK2-NEXT:    [[TMP25:%.*]] = bitcast i8* [[TMP23]] to i16*
6588 // CHECK2-NEXT:    [[TMP26:%.*]] = getelementptr i16, i16* [[TMP25]], i32 1
6589 // CHECK2-NEXT:    [[TMP27:%.*]] = bitcast i16* [[TMP26]] to i8*
6590 // CHECK2-NEXT:    [[TMP28:%.*]] = load i16, i16* [[TMP25]], align 2
6591 // CHECK2-NEXT:    [[TMP29:%.*]] = sext i16 [[TMP28]] to i32
6592 // CHECK2-NEXT:    [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
6593 // CHECK2-NEXT:    [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
6594 // CHECK2-NEXT:    [[TMP32:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP29]], i16 [[TMP7]], i16 [[TMP31]])
6595 // CHECK2-NEXT:    [[TMP33:%.*]] = trunc i32 [[TMP32]] to i16
6596 // CHECK2-NEXT:    store i16 [[TMP33]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
6597 // CHECK2-NEXT:    [[TMP34:%.*]] = getelementptr i16, i16* [[TMP25]], i32 1
6598 // CHECK2-NEXT:    [[TMP35:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
6599 // CHECK2-NEXT:    [[TMP36:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
6600 // CHECK2-NEXT:    store i8* [[TMP36]], i8** [[TMP24]], align 4
6601 // CHECK2-NEXT:    [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 0
6602 // CHECK2-NEXT:    [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1
6603 // CHECK2-NEXT:    [[TMP39:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
6604 // CHECK2-NEXT:    [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]]
6605 // CHECK2-NEXT:    [[TMP41:%.*]] = icmp eq i16 [[TMP8]], 2
6606 // CHECK2-NEXT:    [[TMP42:%.*]] = and i16 [[TMP6]], 1
6607 // CHECK2-NEXT:    [[TMP43:%.*]] = icmp eq i16 [[TMP42]], 0
6608 // CHECK2-NEXT:    [[TMP44:%.*]] = and i1 [[TMP41]], [[TMP43]]
6609 // CHECK2-NEXT:    [[TMP45:%.*]] = icmp sgt i16 [[TMP7]], 0
6610 // CHECK2-NEXT:    [[TMP46:%.*]] = and i1 [[TMP44]], [[TMP45]]
6611 // CHECK2-NEXT:    [[TMP47:%.*]] = or i1 [[TMP37]], [[TMP40]]
6612 // CHECK2-NEXT:    [[TMP48:%.*]] = or i1 [[TMP47]], [[TMP46]]
6613 // CHECK2-NEXT:    br i1 [[TMP48]], label [[THEN:%.*]], label [[ELSE:%.*]]
6614 // CHECK2:       then:
6615 // CHECK2-NEXT:    [[TMP49:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
6616 // CHECK2-NEXT:    [[TMP50:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
6617 // CHECK2-NEXT:    call void @"_omp$reduction$reduction_func14"(i8* [[TMP49]], i8* [[TMP50]]) #[[ATTR3]]
6618 // CHECK2-NEXT:    br label [[IFCONT:%.*]]
6619 // CHECK2:       else:
6620 // CHECK2-NEXT:    br label [[IFCONT]]
6621 // CHECK2:       ifcont:
6622 // CHECK2-NEXT:    [[TMP51:%.*]] = icmp eq i16 [[TMP8]], 1
6623 // CHECK2-NEXT:    [[TMP52:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
6624 // CHECK2-NEXT:    [[TMP53:%.*]] = and i1 [[TMP51]], [[TMP52]]
6625 // CHECK2-NEXT:    br i1 [[TMP53]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
6626 // CHECK2:       then5:
6627 // CHECK2-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
6628 // CHECK2-NEXT:    [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4
6629 // CHECK2-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
6630 // CHECK2-NEXT:    [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4
6631 // CHECK2-NEXT:    [[TMP58:%.*]] = bitcast i8* [[TMP55]] to i32*
6632 // CHECK2-NEXT:    [[TMP59:%.*]] = bitcast i8* [[TMP57]] to i32*
6633 // CHECK2-NEXT:    [[TMP60:%.*]] = load i32, i32* [[TMP58]], align 4
6634 // CHECK2-NEXT:    store i32 [[TMP60]], i32* [[TMP59]], align 4
6635 // CHECK2-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
6636 // CHECK2-NEXT:    [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4
6637 // CHECK2-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
6638 // CHECK2-NEXT:    [[TMP64:%.*]] = load i8*, i8** [[TMP63]], align 4
6639 // CHECK2-NEXT:    [[TMP65:%.*]] = bitcast i8* [[TMP62]] to i16*
6640 // CHECK2-NEXT:    [[TMP66:%.*]] = bitcast i8* [[TMP64]] to i16*
6641 // CHECK2-NEXT:    [[TMP67:%.*]] = load i16, i16* [[TMP65]], align 2
6642 // CHECK2-NEXT:    store i16 [[TMP67]], i16* [[TMP66]], align 2
6643 // CHECK2-NEXT:    br label [[IFCONT7:%.*]]
6644 // CHECK2:       else6:
6645 // CHECK2-NEXT:    br label [[IFCONT7]]
6646 // CHECK2:       ifcont7:
6647 // CHECK2-NEXT:    ret void
6648 //
6649 //
6650 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func16
6651 // CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
6652 // CHECK2-NEXT:  entry:
6653 // CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
6654 // CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
6655 // CHECK2-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
6656 // CHECK2-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
6657 // CHECK2-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
6658 // CHECK2-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
6659 // CHECK2-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
6660 // CHECK2-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
6661 // CHECK2-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
6662 // CHECK2-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
6663 // CHECK2-NEXT:    [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 4
6664 // CHECK2-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [2 x i8*]*
6665 // CHECK2-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
6666 // CHECK2-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
6667 // CHECK2-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
6668 // CHECK2:       then:
6669 // CHECK2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
6670 // CHECK2-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
6671 // CHECK2-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
6672 // CHECK2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
6673 // CHECK2-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
6674 // CHECK2-NEXT:    store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4
6675 // CHECK2-NEXT:    br label [[IFCONT:%.*]]
6676 // CHECK2:       else:
6677 // CHECK2-NEXT:    br label [[IFCONT]]
6678 // CHECK2:       ifcont:
6679 // CHECK2-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
6680 // CHECK2-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
6681 // CHECK2-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP13]]
6682 // CHECK2-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
6683 // CHECK2:       then2:
6684 // CHECK2-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
6685 // CHECK2-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
6686 // CHECK2-NEXT:    [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4
6687 // CHECK2-NEXT:    [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32*
6688 // CHECK2-NEXT:    [[TMP18:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4
6689 // CHECK2-NEXT:    store i32 [[TMP18]], i32* [[TMP17]], align 4
6690 // CHECK2-NEXT:    br label [[IFCONT4:%.*]]
6691 // CHECK2:       else3:
6692 // CHECK2-NEXT:    br label [[IFCONT4]]
6693 // CHECK2:       ifcont4:
6694 // CHECK2-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
6695 // CHECK2-NEXT:    [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
6696 // CHECK2-NEXT:    br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
6697 // CHECK2:       then6:
6698 // CHECK2-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
6699 // CHECK2-NEXT:    [[TMP20:%.*]] = load i8*, i8** [[TMP19]], align 4
6700 // CHECK2-NEXT:    [[TMP21:%.*]] = bitcast i8* [[TMP20]] to i16*
6701 // CHECK2-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
6702 // CHECK2-NEXT:    [[TMP23:%.*]] = bitcast i32 addrspace(3)* [[TMP22]] to i16 addrspace(3)*
6703 // CHECK2-NEXT:    [[TMP24:%.*]] = load i16, i16* [[TMP21]], align 2
6704 // CHECK2-NEXT:    store volatile i16 [[TMP24]], i16 addrspace(3)* [[TMP23]], align 2
6705 // CHECK2-NEXT:    br label [[IFCONT8:%.*]]
6706 // CHECK2:       else7:
6707 // CHECK2-NEXT:    br label [[IFCONT8]]
6708 // CHECK2:       ifcont8:
6709 // CHECK2-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
6710 // CHECK2-NEXT:    [[TMP25:%.*]] = load i32, i32* [[DOTADDR1]], align 4
6711 // CHECK2-NEXT:    [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP25]]
6712 // CHECK2-NEXT:    br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
6713 // CHECK2:       then10:
6714 // CHECK2-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
6715 // CHECK2-NEXT:    [[TMP27:%.*]] = bitcast i32 addrspace(3)* [[TMP26]] to i16 addrspace(3)*
6716 // CHECK2-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
6717 // CHECK2-NEXT:    [[TMP29:%.*]] = load i8*, i8** [[TMP28]], align 4
6718 // CHECK2-NEXT:    [[TMP30:%.*]] = bitcast i8* [[TMP29]] to i16*
6719 // CHECK2-NEXT:    [[TMP31:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP27]], align 2
6720 // CHECK2-NEXT:    store i16 [[TMP31]], i16* [[TMP30]], align 2
6721 // CHECK2-NEXT:    br label [[IFCONT12:%.*]]
6722 // CHECK2:       else11:
6723 // CHECK2-NEXT:    br label [[IFCONT12]]
6724 // CHECK2:       ifcont12:
6725 // CHECK2-NEXT:    ret void
6726 //
6727 //
6728 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func17
6729 // CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
6730 // CHECK2-NEXT:  entry:
6731 // CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
6732 // CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
6733 // CHECK2-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
6734 // CHECK2-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
6735 // CHECK2-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
6736 // CHECK2-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
6737 // CHECK2-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
6738 // CHECK2-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
6739 // CHECK2-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
6740 // CHECK2-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1*
6741 // CHECK2-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
6742 // CHECK2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
6743 // CHECK2-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
6744 // CHECK2-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
6745 // CHECK2-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0
6746 // CHECK2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]]
6747 // CHECK2-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
6748 // CHECK2-NEXT:    store i32 [[TMP12]], i32* [[TMP11]], align 128
6749 // CHECK2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
6750 // CHECK2-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
6751 // CHECK2-NEXT:    [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
6752 // CHECK2-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1
6753 // CHECK2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]]
6754 // CHECK2-NEXT:    [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2
6755 // CHECK2-NEXT:    store i16 [[TMP17]], i16* [[TMP16]], align 128
6756 // CHECK2-NEXT:    ret void
6757 //
6758 //
6759 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func18
6760 // CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
6761 // CHECK2-NEXT:  entry:
6762 // CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
6763 // CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
6764 // CHECK2-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
6765 // CHECK2-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
6766 // CHECK2-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
6767 // CHECK2-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
6768 // CHECK2-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
6769 // CHECK2-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
6770 // CHECK2-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1*
6771 // CHECK2-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
6772 // CHECK2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
6773 // CHECK2-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0
6774 // CHECK2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]]
6775 // CHECK2-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
6776 // CHECK2-NEXT:    store i8* [[TMP8]], i8** [[TMP6]], align 4
6777 // CHECK2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
6778 // CHECK2-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 1
6779 // CHECK2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]]
6780 // CHECK2-NEXT:    [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
6781 // CHECK2-NEXT:    store i8* [[TMP11]], i8** [[TMP9]], align 4
6782 // CHECK2-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
6783 // CHECK2-NEXT:    [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
6784 // CHECK2-NEXT:    call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]]
6785 // CHECK2-NEXT:    ret void
6786 //
6787 //
6788 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func19
6789 // CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
6790 // CHECK2-NEXT:  entry:
6791 // CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
6792 // CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
6793 // CHECK2-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
6794 // CHECK2-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
6795 // CHECK2-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
6796 // CHECK2-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
6797 // CHECK2-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
6798 // CHECK2-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
6799 // CHECK2-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
6800 // CHECK2-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1*
6801 // CHECK2-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
6802 // CHECK2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
6803 // CHECK2-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
6804 // CHECK2-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
6805 // CHECK2-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0
6806 // CHECK2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP7]]
6807 // CHECK2-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128
6808 // CHECK2-NEXT:    store i32 [[TMP12]], i32* [[TMP10]], align 4
6809 // CHECK2-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
6810 // CHECK2-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
6811 // CHECK2-NEXT:    [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
6812 // CHECK2-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1
6813 // CHECK2-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP7]]
6814 // CHECK2-NEXT:    [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128
6815 // CHECK2-NEXT:    store i16 [[TMP17]], i16* [[TMP15]], align 2
6816 // CHECK2-NEXT:    ret void
6817 //
6818 //
6819 // CHECK2-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func20
6820 // CHECK2-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
6821 // CHECK2-NEXT:  entry:
6822 // CHECK2-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
6823 // CHECK2-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
6824 // CHECK2-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
6825 // CHECK2-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
6826 // CHECK2-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
6827 // CHECK2-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
6828 // CHECK2-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
6829 // CHECK2-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
6830 // CHECK2-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1*
6831 // CHECK2-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
6832 // CHECK2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
6833 // CHECK2-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0
6834 // CHECK2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* [[A]], i32 0, i32 [[TMP5]]
6835 // CHECK2-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
6836 // CHECK2-NEXT:    store i8* [[TMP8]], i8** [[TMP6]], align 4
6837 // CHECK2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
6838 // CHECK2-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 1
6839 // CHECK2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], [1024 x i16]* [[B]], i32 0, i32 [[TMP5]]
6840 // CHECK2-NEXT:    [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
6841 // CHECK2-NEXT:    store i8* [[TMP11]], i8** [[TMP9]], align 4
6842 // CHECK2-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
6843 // CHECK2-NEXT:    [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
6844 // CHECK2-NEXT:    call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]]
6845 // CHECK2-NEXT:    ret void
6846 //
6847 //
6848 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l20
6849 // CHECK3-SAME: (double* noundef nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR0:[0-9]+]] {
6850 // CHECK3-NEXT:  entry:
6851 // CHECK3-NEXT:    [[E_ADDR:%.*]] = alloca double*, align 4
6852 // CHECK3-NEXT:    [[E1:%.*]] = alloca double, align 8
6853 // CHECK3-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
6854 // CHECK3-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
6855 // CHECK3-NEXT:    store double* [[E]], double** [[E_ADDR]], align 4
6856 // CHECK3-NEXT:    [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
6857 // CHECK3-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true)
6858 // CHECK3-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
6859 // CHECK3-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
6860 // CHECK3:       user_code.entry:
6861 // CHECK3-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
6862 // CHECK3-NEXT:    [[TMP3:%.*]] = load double, double* [[TMP0]], align 8
6863 // CHECK3-NEXT:    store double [[TMP3]], double* [[E1]], align 8
6864 // CHECK3-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
6865 // CHECK3-NEXT:    store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4
6866 // CHECK3-NEXT:    call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E1]]) #[[ATTR3:[0-9]+]]
6867 // CHECK3-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
6868 // CHECK3-NEXT:    ret void
6869 // CHECK3:       worker.exit:
6870 // CHECK3-NEXT:    ret void
6871 //
6872 //
6873 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__
6874 // CHECK3-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], double* noundef nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR0]] {
6875 // CHECK3-NEXT:  entry:
6876 // CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
6877 // CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
6878 // CHECK3-NEXT:    [[E_ADDR:%.*]] = alloca double*, align 4
6879 // CHECK3-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
6880 // CHECK3-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
6881 // CHECK3-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
6882 // CHECK3-NEXT:    store double* [[E]], double** [[E_ADDR]], align 4
6883 // CHECK3-NEXT:    [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4
6884 // CHECK3-NEXT:    [[E1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 8)
6885 // CHECK3-NEXT:    [[E_ON_STACK:%.*]] = bitcast i8* [[E1]] to double*
6886 // CHECK3-NEXT:    store double 0.000000e+00, double* [[E_ON_STACK]], align 8
6887 // CHECK3-NEXT:    [[TMP1:%.*]] = load double, double* [[E_ON_STACK]], align 8
6888 // CHECK3-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], 5.000000e+00
6889 // CHECK3-NEXT:    store double [[ADD]], double* [[E_ON_STACK]], align 8
6890 // CHECK3-NEXT:    [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
6891 // CHECK3-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
6892 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
6893 // CHECK3-NEXT:    [[TMP5:%.*]] = bitcast double* [[E_ON_STACK]] to i8*
6894 // CHECK3-NEXT:    store i8* [[TMP5]], i8** [[TMP4]], align 4
6895 // CHECK3-NEXT:    [[TMP6:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
6896 // CHECK3-NEXT:    [[TMP7:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
6897 // CHECK3-NEXT:    [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]], i8* [[TMP7]], i32 2048, i8* [[TMP6]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func)
6898 // CHECK3-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1
6899 // CHECK3-NEXT:    br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
6900 // CHECK3:       .omp.reduction.then:
6901 // CHECK3-NEXT:    [[TMP10:%.*]] = load double, double* [[TMP0]], align 8
6902 // CHECK3-NEXT:    [[TMP11:%.*]] = load double, double* [[E_ON_STACK]], align 8
6903 // CHECK3-NEXT:    [[ADD2:%.*]] = fadd double [[TMP10]], [[TMP11]]
6904 // CHECK3-NEXT:    store double [[ADD2]], double* [[TMP0]], align 8
6905 // CHECK3-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]])
6906 // CHECK3-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
6907 // CHECK3:       .omp.reduction.done:
6908 // CHECK3-NEXT:    call void @__kmpc_free_shared(i8* [[E1]], i32 8)
6909 // CHECK3-NEXT:    ret void
6910 //
6911 //
6912 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
6913 // CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR2:[0-9]+]] {
6914 // CHECK3-NEXT:  entry:
6915 // CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
6916 // CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
6917 // CHECK3-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2
6918 // CHECK3-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2
6919 // CHECK3-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4
6920 // CHECK3-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8
6921 // CHECK3-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
6922 // CHECK3-NEXT:    store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
6923 // CHECK3-NEXT:    store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
6924 // CHECK3-NEXT:    store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
6925 // CHECK3-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
6926 // CHECK3-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]*
6927 // CHECK3-NEXT:    [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
6928 // CHECK3-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
6929 // CHECK3-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
6930 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
6931 // CHECK3-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
6932 // CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
6933 // CHECK3-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double*
6934 // CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1
6935 // CHECK3-NEXT:    [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8*
6936 // CHECK3-NEXT:    [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64*
6937 // CHECK3-NEXT:    [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
6938 // CHECK3-NEXT:    [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8
6939 // CHECK3-NEXT:    [[TMP18:%.*]] = call i32 @__kmpc_get_warp_size()
6940 // CHECK3-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
6941 // CHECK3-NEXT:    [[TMP20:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP19]])
6942 // CHECK3-NEXT:    store i64 [[TMP20]], i64* [[TMP16]], align 8
6943 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1
6944 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1
6945 // CHECK3-NEXT:    [[TMP23:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
6946 // CHECK3-NEXT:    store i8* [[TMP23]], i8** [[TMP11]], align 4
6947 // CHECK3-NEXT:    [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 0
6948 // CHECK3-NEXT:    [[TMP25:%.*]] = icmp eq i16 [[TMP8]], 1
6949 // CHECK3-NEXT:    [[TMP26:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
6950 // CHECK3-NEXT:    [[TMP27:%.*]] = and i1 [[TMP25]], [[TMP26]]
6951 // CHECK3-NEXT:    [[TMP28:%.*]] = icmp eq i16 [[TMP8]], 2
6952 // CHECK3-NEXT:    [[TMP29:%.*]] = and i16 [[TMP6]], 1
6953 // CHECK3-NEXT:    [[TMP30:%.*]] = icmp eq i16 [[TMP29]], 0
6954 // CHECK3-NEXT:    [[TMP31:%.*]] = and i1 [[TMP28]], [[TMP30]]
6955 // CHECK3-NEXT:    [[TMP32:%.*]] = icmp sgt i16 [[TMP7]], 0
6956 // CHECK3-NEXT:    [[TMP33:%.*]] = and i1 [[TMP31]], [[TMP32]]
6957 // CHECK3-NEXT:    [[TMP34:%.*]] = or i1 [[TMP24]], [[TMP27]]
6958 // CHECK3-NEXT:    [[TMP35:%.*]] = or i1 [[TMP34]], [[TMP33]]
6959 // CHECK3-NEXT:    br i1 [[TMP35]], label [[THEN:%.*]], label [[ELSE:%.*]]
6960 // CHECK3:       then:
6961 // CHECK3-NEXT:    [[TMP36:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8*
6962 // CHECK3-NEXT:    [[TMP37:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
6963 // CHECK3-NEXT:    call void @"_omp$reduction$reduction_func"(i8* [[TMP36]], i8* [[TMP37]]) #[[ATTR3]]
6964 // CHECK3-NEXT:    br label [[IFCONT:%.*]]
6965 // CHECK3:       else:
6966 // CHECK3-NEXT:    br label [[IFCONT]]
6967 // CHECK3:       ifcont:
6968 // CHECK3-NEXT:    [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1
6969 // CHECK3-NEXT:    [[TMP39:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
6970 // CHECK3-NEXT:    [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]]
6971 // CHECK3-NEXT:    br i1 [[TMP40]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
6972 // CHECK3:       then4:
6973 // CHECK3-NEXT:    [[TMP41:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
6974 // CHECK3-NEXT:    [[TMP42:%.*]] = load i8*, i8** [[TMP41]], align 4
6975 // CHECK3-NEXT:    [[TMP43:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0
6976 // CHECK3-NEXT:    [[TMP44:%.*]] = load i8*, i8** [[TMP43]], align 4
6977 // CHECK3-NEXT:    [[TMP45:%.*]] = bitcast i8* [[TMP42]] to double*
6978 // CHECK3-NEXT:    [[TMP46:%.*]] = bitcast i8* [[TMP44]] to double*
6979 // CHECK3-NEXT:    [[TMP47:%.*]] = load double, double* [[TMP45]], align 8
6980 // CHECK3-NEXT:    store double [[TMP47]], double* [[TMP46]], align 8
6981 // CHECK3-NEXT:    br label [[IFCONT6:%.*]]
6982 // CHECK3:       else5:
6983 // CHECK3-NEXT:    br label [[IFCONT6]]
6984 // CHECK3:       ifcont6:
6985 // CHECK3-NEXT:    ret void
6986 //
6987 //
6988 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
6989 // CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
6990 // CHECK3-NEXT:  entry:
6991 // CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
6992 // CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
6993 // CHECK3-NEXT:    [[DOTCNT_ADDR:%.*]] = alloca i32, align 4
6994 // CHECK3-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
6995 // CHECK3-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
6996 // CHECK3-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
6997 // CHECK3-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
6998 // CHECK3-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
6999 // CHECK3-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
7000 // CHECK3-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
7001 // CHECK3-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
7002 // CHECK3-NEXT:    [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 4
7003 // CHECK3-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [1 x i8*]*
7004 // CHECK3-NEXT:    store i32 0, i32* [[DOTCNT_ADDR]], align 4
7005 // CHECK3-NEXT:    br label [[PRECOND:%.*]]
7006 // CHECK3:       precond:
7007 // CHECK3-NEXT:    [[TMP8:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4
7008 // CHECK3-NEXT:    [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 2
7009 // CHECK3-NEXT:    br i1 [[TMP9]], label [[BODY:%.*]], label [[EXIT:%.*]]
7010 // CHECK3:       body:
7011 // CHECK3-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2:[0-9]+]], i32 [[TMP2]])
7012 // CHECK3-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
7013 // CHECK3-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
7014 // CHECK3:       then:
7015 // CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP7]], i32 0, i32 0
7016 // CHECK3-NEXT:    [[TMP11:%.*]] = load i8*, i8** [[TMP10]], align 4
7017 // CHECK3-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP11]] to i32*
7018 // CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 [[TMP8]]
7019 // CHECK3-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
7020 // CHECK3-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP13]], align 4
7021 // CHECK3-NEXT:    store volatile i32 [[TMP15]], i32 addrspace(3)* [[TMP14]], align 4
7022 // CHECK3-NEXT:    br label [[IFCONT:%.*]]
7023 // CHECK3:       else:
7024 // CHECK3-NEXT:    br label [[IFCONT]]
7025 // CHECK3:       ifcont:
7026 // CHECK3-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
7027 // CHECK3-NEXT:    [[TMP16:%.*]] = load i32, i32* [[DOTADDR1]], align 4
7028 // CHECK3-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP16]]
7029 // CHECK3-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
7030 // CHECK3:       then2:
7031 // CHECK3-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
7032 // CHECK3-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP7]], i32 0, i32 0
7033 // CHECK3-NEXT:    [[TMP19:%.*]] = load i8*, i8** [[TMP18]], align 4
7034 // CHECK3-NEXT:    [[TMP20:%.*]] = bitcast i8* [[TMP19]] to i32*
7035 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr i32, i32* [[TMP20]], i32 [[TMP8]]
7036 // CHECK3-NEXT:    [[TMP22:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP17]], align 4
7037 // CHECK3-NEXT:    store i32 [[TMP22]], i32* [[TMP21]], align 4
7038 // CHECK3-NEXT:    br label [[IFCONT4:%.*]]
7039 // CHECK3:       else3:
7040 // CHECK3-NEXT:    br label [[IFCONT4]]
7041 // CHECK3:       ifcont4:
7042 // CHECK3-NEXT:    [[TMP23:%.*]] = add nsw i32 [[TMP8]], 1
7043 // CHECK3-NEXT:    store i32 [[TMP23]], i32* [[DOTCNT_ADDR]], align 4
7044 // CHECK3-NEXT:    br label [[PRECOND]]
7045 // CHECK3:       exit:
7046 // CHECK3-NEXT:    ret void
7047 //
7048 //
7049 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func
7050 // CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
7051 // CHECK3-NEXT:  entry:
7052 // CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
7053 // CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
7054 // CHECK3-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
7055 // CHECK3-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
7056 // CHECK3-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
7057 // CHECK3-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
7058 // CHECK3-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
7059 // CHECK3-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
7060 // CHECK3-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
7061 // CHECK3-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty*
7062 // CHECK3-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
7063 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
7064 // CHECK3-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
7065 // CHECK3-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
7066 // CHECK3-NEXT:    [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0
7067 // CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP7]]
7068 // CHECK3-NEXT:    [[TMP12:%.*]] = load double, double* [[TMP10]], align 8
7069 // CHECK3-NEXT:    store double [[TMP12]], double* [[TMP11]], align 128
7070 // CHECK3-NEXT:    ret void
7071 //
7072 //
7073 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func
7074 // CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
7075 // CHECK3-NEXT:  entry:
7076 // CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
7077 // CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
7078 // CHECK3-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
7079 // CHECK3-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
7080 // CHECK3-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
7081 // CHECK3-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
7082 // CHECK3-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
7083 // CHECK3-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
7084 // CHECK3-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty*
7085 // CHECK3-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
7086 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
7087 // CHECK3-NEXT:    [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0
7088 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP5]]
7089 // CHECK3-NEXT:    [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
7090 // CHECK3-NEXT:    store i8* [[TMP8]], i8** [[TMP6]], align 4
7091 // CHECK3-NEXT:    [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
7092 // CHECK3-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
7093 // CHECK3-NEXT:    call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR3]]
7094 // CHECK3-NEXT:    ret void
7095 //
7096 //
7097 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func
7098 // CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
7099 // CHECK3-NEXT:  entry:
7100 // CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
7101 // CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
7102 // CHECK3-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
7103 // CHECK3-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
7104 // CHECK3-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
7105 // CHECK3-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
7106 // CHECK3-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
7107 // CHECK3-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
7108 // CHECK3-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
7109 // CHECK3-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty*
7110 // CHECK3-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
7111 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0
7112 // CHECK3-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
7113 // CHECK3-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to double*
7114 // CHECK3-NEXT:    [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP6]], i32 0, i32 0
7115 // CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP7]]
7116 // CHECK3-NEXT:    [[TMP12:%.*]] = load double, double* [[TMP11]], align 128
7117 // CHECK3-NEXT:    store double [[TMP12]], double* [[TMP10]], align 8
7118 // CHECK3-NEXT:    ret void
7119 //
7120 //
7121 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func
7122 // CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
7123 // CHECK3-NEXT:  entry:
7124 // CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
7125 // CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
7126 // CHECK3-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
7127 // CHECK3-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4
7128 // CHECK3-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
7129 // CHECK3-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
7130 // CHECK3-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
7131 // CHECK3-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
7132 // CHECK3-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty*
7133 // CHECK3-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
7134 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
7135 // CHECK3-NEXT:    [[E:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[TMP4]], i32 0, i32 0
7136 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2048 x double], [2048 x double]* [[E]], i32 0, i32 [[TMP5]]
7137 // CHECK3-NEXT:    [[TMP8:%.*]] = bitcast double* [[TMP7]] to i8*
7138 // CHECK3-NEXT:    store i8* [[TMP8]], i8** [[TMP6]], align 4
7139 // CHECK3-NEXT:    [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
7140 // CHECK3-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
7141 // CHECK3-NEXT:    call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR3]]
7142 // CHECK3-NEXT:    ret void
7143 //
7144 //
7145 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l26
7146 // CHECK3-SAME: (i32 noundef [[C:%.*]], i32 noundef [[D:%.*]]) #[[ATTR0]] {
7147 // CHECK3-NEXT:  entry:
7148 // CHECK3-NEXT:    [[C_ADDR:%.*]] = alloca i32, align 4
7149 // CHECK3-NEXT:    [[D_ADDR:%.*]] = alloca i32, align 4
7150 // CHECK3-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
7151 // CHECK3-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
7152 // CHECK3-NEXT:    store i32 [[C]], i32* [[C_ADDR]], align 4
7153 // CHECK3-NEXT:    store i32 [[D]], i32* [[D_ADDR]], align 4
7154 // CHECK3-NEXT:    [[CONV:%.*]] = bitcast i32* [[C_ADDR]] to i8*
7155 // CHECK3-NEXT:    [[CONV1:%.*]] = bitcast i32* [[D_ADDR]] to float*
7156 // CHECK3-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true)
7157 // CHECK3-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
7158 // CHECK3-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
7159 // CHECK3:       user_code.entry:
7160 // CHECK3-NEXT:    [[TMP1:%.*]] = load i8, i8* [[CONV]], align 1
7161 // CHECK3-NEXT:    [[C2:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 1)
7162 // CHECK3-NEXT:    store i8 [[TMP1]], i8* [[C2]], align 1
7163 // CHECK3-NEXT:    [[TMP2:%.*]] = load float, float* [[CONV1]], align 4
7164 // CHECK3-NEXT:    [[D3:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4)
7165 // CHECK3-NEXT:    [[D_ON_STACK:%.*]] = bitcast i8* [[D3]] to float*
7166 // CHECK3-NEXT:    store float [[TMP2]], float* [[D_ON_STACK]], align 4
7167 // CHECK3-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
7168 // CHECK3-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
7169 // CHECK3-NEXT:    store i32 [[TMP3]], i32* [[DOTTHREADID_TEMP_]], align 4
7170 // CHECK3-NEXT:    call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C2]], float* [[D_ON_STACK]]) #[[ATTR3]]
7171 // CHECK3-NEXT:    call void @__kmpc_free_shared(i8* [[D3]], i32 4)
7172 // CHECK3-NEXT:    call void @__kmpc_free_shared(i8* [[C2]], i32 1)
7173 // CHECK3-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true)
7174 // CHECK3-NEXT:    ret void
7175 // CHECK3:       worker.exit:
7176 // CHECK3-NEXT:    ret void
7177 //
7178 //
7179 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1
7180 // CHECK3-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i8* noundef nonnull align 1 dereferenceable(1) [[C:%.*]], float* noundef nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR0]] {
7181 // CHECK3-NEXT:  entry:
7182 // CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
7183 // CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
7184 // CHECK3-NEXT:    [[C_ADDR:%.*]] = alloca i8*, align 4
7185 // CHECK3-NEXT:    [[D_ADDR:%.*]] = alloca float*, align 4
7186 // CHECK3-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
7187 // CHECK3-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
7188 // CHECK3-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
7189 // CHECK3-NEXT:    store i8* [[C]], i8** [[C_ADDR]], align 4
7190 // CHECK3-NEXT:    store float* [[D]], float** [[D_ADDR]], align 4
7191 // CHECK3-NEXT:    [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4
7192 // CHECK3-NEXT:    [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4
7193 // CHECK3-NEXT:    [[C1:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 1)
7194 // CHECK3-NEXT:    [[D2:%.*]] = call align 8 i8* @__kmpc_alloc_shared(i32 4)
7195 // CHECK3-NEXT:    [[D_ON_STACK:%.*]] = bitcast i8* [[D2]] to float*
7196 // CHECK3-NEXT:    store i8 0, i8* [[C1]], align 1
7197 // CHECK3-NEXT:    store float 1.000000e+00, float* [[D_ON_STACK]], align 4
7198 // CHECK3-NEXT:    [[TMP2:%.*]] = load i8, i8* [[C1]], align 1
7199 // CHECK3-NEXT:    [[CONV:%.*]] = sext i8 [[TMP2]] to i32
7200 // CHECK3-NEXT:    [[XOR:%.*]] = xor i32 [[CONV]], 2
7201 // CHECK3-NEXT:    [[CONV3:%.*]] = trunc i32 [[XOR]] to i8
7202 // CHECK3-NEXT:    store i8 [[CONV3]], i8* [[C1]], align 1
7203 // CHECK3-NEXT:    [[TMP3:%.*]] = load float, float* [[D_ON_STACK]], align 4
7204 // CHECK3-NEXT:    [[MUL:%.*]] = fmul float [[TMP3]], 3.300000e+01
7205 // CHECK3-NEXT:    store float [[MUL]], float* [[D_ON_STACK]], align 4
7206 // CHECK3-NEXT:    [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
7207 // CHECK3-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4
7208 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
7209 // CHECK3-NEXT:    store i8* [[C1]], i8** [[TMP6]], align 4
7210 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
7211 // CHECK3-NEXT:    [[TMP8:%.*]] = bitcast float* [[D_ON_STACK]] to i8*
7212 // CHECK3-NEXT:    store i8* [[TMP8]], i8** [[TMP7]], align 4
7213 // CHECK3-NEXT:    [[TMP9:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
7214 // CHECK3-NEXT:    [[TMP10:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
7215 // CHECK3-NEXT:    [[TMP11:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i8* [[TMP10]], i32 2048, i8* [[TMP9]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func5, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func6, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func7, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func8)
7216 // CHECK3-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[TMP11]], 1
7217 // CHECK3-NEXT:    br i1 [[TMP12]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
7218 // CHECK3:       .omp.reduction.then:
7219 // CHECK3-NEXT:    [[TMP13:%.*]] = load i8, i8* [[TMP0]], align 1
7220 // CHECK3-NEXT:    [[CONV4:%.*]] = sext i8 [[TMP13]] to i32
7221 // CHECK3-NEXT:    [[TMP14:%.*]] = load i8, i8* [[C1]], align 1
7222 // CHECK3-NEXT:    [[CONV5:%.*]] = sext i8 [[TMP14]] to i32
7223 // CHECK3-NEXT:    [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]]
7224 // CHECK3-NEXT:    [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8
7225 // CHECK3-NEXT:    store i8 [[CONV7]], i8* [[TMP0]], align 1
7226 // CHECK3-NEXT:    [[TMP15:%.*]] = load float, float* [[TMP1]], align 4
7227 // CHECK3-NEXT:    [[TMP16:%.*]] = load float, float* [[D_ON_STACK]], align 4
7228 // CHECK3-NEXT:    [[MUL8:%.*]] = fmul float [[TMP15]], [[TMP16]]
7229 // CHECK3-NEXT:    store float [[MUL8]], float* [[TMP1]], align 4
7230 // CHECK3-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
7231 // CHECK3-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
7232 // CHECK3:       .omp.reduction.done:
7233 // CHECK3-NEXT:    call void @__kmpc_free_shared(i8* [[D2]], i32 4)
7234 // CHECK3-NEXT:    call void @__kmpc_free_shared(i8* [[C1]], i32 1)
7235 // CHECK3-NEXT:    ret void
7236 //
7237 //
7238 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3
7239 // CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR2]] {
7240 // CHECK3-NEXT:  entry:
7241 // CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
7242 // CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
7243 // CHECK3-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2
7244 // CHECK3-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2
7245 // CHECK3-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
7246 // CHECK3-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1
7247 // CHECK3-NEXT:    [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4
7248 // CHECK3-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
7249 // CHECK3-NEXT:    store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
7250 // CHECK3-NEXT:    store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
7251 // CHECK3-NEXT:    store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
7252 // CHECK3-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
7253 // CHECK3-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
7254 // CHECK3-NEXT:    [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
7255 // CHECK3-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
7256 // CHECK3-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
7257 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
7258 // CHECK3-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
7259 // CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
7260 // CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1
7261 // CHECK3-NEXT:    [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1
7262 // CHECK3-NEXT:    [[TMP14:%.*]] = sext i8 [[TMP13]] to i32
7263 // CHECK3-NEXT:    [[TMP15:%.*]] = call i32 @__kmpc_get_warp_size()
7264 // CHECK3-NEXT:    [[TMP16:%.*]] = trunc i32 [[TMP15]] to i16
7265 // CHECK3-NEXT:    [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP16]])
7266 // CHECK3-NEXT:    [[TMP18:%.*]] = trunc i32 [[TMP17]] to i8
7267 // CHECK3-NEXT:    store i8 [[TMP18]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1
7268 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1
7269 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
7270 // CHECK3-NEXT:    store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4
7271 // CHECK3-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
7272 // CHECK3-NEXT:    [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4
7273 // CHECK3-NEXT:    [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
7274 // CHECK3-NEXT:    [[TMP24:%.*]] = bitcast i8* [[TMP22]] to float*
7275 // CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr float, float* [[TMP24]], i32 1
7276 // CHECK3-NEXT:    [[TMP26:%.*]] = bitcast float* [[TMP25]] to i8*
7277 // CHECK3-NEXT:    [[TMP27:%.*]] = bitcast float* [[TMP24]] to i32*
7278 // CHECK3-NEXT:    [[TMP28:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32*
7279 // CHECK3-NEXT:    [[TMP29:%.*]] = load i32, i32* [[TMP27]], align 4
7280 // CHECK3-NEXT:    [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
7281 // CHECK3-NEXT:    [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
7282 // CHECK3-NEXT:    [[TMP32:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP29]], i16 [[TMP7]], i16 [[TMP31]])
7283 // CHECK3-NEXT:    store i32 [[TMP32]], i32* [[TMP28]], align 4
7284 // CHECK3-NEXT:    [[TMP33:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1
7285 // CHECK3-NEXT:    [[TMP34:%.*]] = getelementptr i32, i32* [[TMP28]], i32 1
7286 // CHECK3-NEXT:    [[TMP35:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
7287 // CHECK3-NEXT:    store i8* [[TMP35]], i8** [[TMP23]], align 4
7288 // CHECK3-NEXT:    [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 0
7289 // CHECK3-NEXT:    [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1
7290 // CHECK3-NEXT:    [[TMP38:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
7291 // CHECK3-NEXT:    [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]]
7292 // CHECK3-NEXT:    [[TMP40:%.*]] = icmp eq i16 [[TMP8]], 2
7293 // CHECK3-NEXT:    [[TMP41:%.*]] = and i16 [[TMP6]], 1
7294 // CHECK3-NEXT:    [[TMP42:%.*]] = icmp eq i16 [[TMP41]], 0
7295 // CHECK3-NEXT:    [[TMP43:%.*]] = and i1 [[TMP40]], [[TMP42]]
7296 // CHECK3-NEXT:    [[TMP44:%.*]] = icmp sgt i16 [[TMP7]], 0
7297 // CHECK3-NEXT:    [[TMP45:%.*]] = and i1 [[TMP43]], [[TMP44]]
7298 // CHECK3-NEXT:    [[TMP46:%.*]] = or i1 [[TMP36]], [[TMP39]]
7299 // CHECK3-NEXT:    [[TMP47:%.*]] = or i1 [[TMP46]], [[TMP45]]
7300 // CHECK3-NEXT:    br i1 [[TMP47]], label [[THEN:%.*]], label [[ELSE:%.*]]
7301 // CHECK3:       then:
7302 // CHECK3-NEXT:    [[TMP48:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
7303 // CHECK3-NEXT:    [[TMP49:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
7304 // CHECK3-NEXT:    call void @"_omp$reduction$reduction_func2"(i8* [[TMP48]], i8* [[TMP49]]) #[[ATTR3]]
7305 // CHECK3-NEXT:    br label [[IFCONT:%.*]]
7306 // CHECK3:       else:
7307 // CHECK3-NEXT:    br label [[IFCONT]]
7308 // CHECK3:       ifcont:
7309 // CHECK3-NEXT:    [[TMP50:%.*]] = icmp eq i16 [[TMP8]], 1
7310 // CHECK3-NEXT:    [[TMP51:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
7311 // CHECK3-NEXT:    [[TMP52:%.*]] = and i1 [[TMP50]], [[TMP51]]
7312 // CHECK3-NEXT:    br i1 [[TMP52]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
7313 // CHECK3:       then5:
7314 // CHECK3-NEXT:    [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
7315 // CHECK3-NEXT:    [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4
7316 // CHECK3-NEXT:    [[TMP55:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
7317 // CHECK3-NEXT:    [[TMP56:%.*]] = load i8*, i8** [[TMP55]], align 4
7318 // CHECK3-NEXT:    [[TMP57:%.*]] = load i8, i8* [[TMP54]], align 1
7319 // CHECK3-NEXT:    store i8 [[TMP57]], i8* [[TMP56]], align 1
7320 // CHECK3-NEXT:    [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
7321 // CHECK3-NEXT:    [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4
7322 // CHECK3-NEXT:    [[TMP60:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
7323 // CHECK3-NEXT:    [[TMP61:%.*]] = load i8*, i8** [[TMP60]], align 4
7324 // CHECK3-NEXT:    [[TMP62:%.*]] = bitcast i8* [[TMP59]] to float*
7325 // CHECK3-NEXT:    [[TMP63:%.*]] = bitcast i8* [[TMP61]] to float*
7326 // CHECK3-NEXT:    [[TMP64:%.*]] = load float, float* [[TMP62]], align 4
7327 // CHECK3-NEXT:    store float [[TMP64]], float* [[TMP63]], align 4
7328 // CHECK3-NEXT:    br label [[IFCONT7:%.*]]
7329 // CHECK3:       else6:
7330 // CHECK3-NEXT:    br label [[IFCONT7]]
7331 // CHECK3:       ifcont7:
7332 // CHECK3-NEXT:    ret void
7333 //
7334 //
7335 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4
7336 // CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
7337 // CHECK3-NEXT:  entry:
7338 // CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
7339 // CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
7340 // CHECK3-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
7341 // CHECK3-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
7342 // CHECK3-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
7343 // CHECK3-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
7344 // CHECK3-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
7345 // CHECK3-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
7346 // CHECK3-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
7347 // CHECK3-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
7348 // CHECK3-NEXT:    [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 4
7349 // CHECK3-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [2 x i8*]*
7350 // CHECK3-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
7351 // CHECK3-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
7352 // CHECK3-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
7353 // CHECK3:       then:
7354 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
7355 // CHECK3-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
7356 // CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
7357 // CHECK3-NEXT:    [[TMP11:%.*]] = bitcast i32 addrspace(3)* [[TMP10]] to i8 addrspace(3)*
7358 // CHECK3-NEXT:    [[TMP12:%.*]] = load i8, i8* [[TMP9]], align 1
7359 // CHECK3-NEXT:    store volatile i8 [[TMP12]], i8 addrspace(3)* [[TMP11]], align 1
7360 // CHECK3-NEXT:    br label [[IFCONT:%.*]]
7361 // CHECK3:       else:
7362 // CHECK3-NEXT:    br label [[IFCONT]]
7363 // CHECK3:       ifcont:
7364 // CHECK3-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
7365 // CHECK3-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
7366 // CHECK3-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP13]]
7367 // CHECK3-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
7368 // CHECK3:       then2:
7369 // CHECK3-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
7370 // CHECK3-NEXT:    [[TMP15:%.*]] = bitcast i32 addrspace(3)* [[TMP14]] to i8 addrspace(3)*
7371 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
7372 // CHECK3-NEXT:    [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4
7373 // CHECK3-NEXT:    [[TMP18:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP15]], align 1
7374 // CHECK3-NEXT:    store i8 [[TMP18]], i8* [[TMP17]], align 1
7375 // CHECK3-NEXT:    br label [[IFCONT4:%.*]]
7376 // CHECK3:       else3:
7377 // CHECK3-NEXT:    br label [[IFCONT4]]
7378 // CHECK3:       ifcont4:
7379 // CHECK3-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
7380 // CHECK3-NEXT:    [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
7381 // CHECK3-NEXT:    br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
7382 // CHECK3:       then6:
7383 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
7384 // CHECK3-NEXT:    [[TMP20:%.*]] = load i8*, i8** [[TMP19]], align 4
7385 // CHECK3-NEXT:    [[TMP21:%.*]] = bitcast i8* [[TMP20]] to i32*
7386 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
7387 // CHECK3-NEXT:    [[TMP23:%.*]] = load i32, i32* [[TMP21]], align 4
7388 // CHECK3-NEXT:    store volatile i32 [[TMP23]], i32 addrspace(3)* [[TMP22]], align 4
7389 // CHECK3-NEXT:    br label [[IFCONT8:%.*]]
7390 // CHECK3:       else7:
7391 // CHECK3-NEXT:    br label [[IFCONT8]]
7392 // CHECK3:       ifcont8:
7393 // CHECK3-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]])
7394 // CHECK3-NEXT:    [[TMP24:%.*]] = load i32, i32* [[DOTADDR1]], align 4
7395 // CHECK3-NEXT:    [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP24]]
7396 // CHECK3-NEXT:    br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
7397 // CHECK3:       then10:
7398 // CHECK3-NEXT:    [[TMP25:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
7399 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
7400 // CHECK3-NEXT:    [[TMP27:%.*]] = load i8*, i8** [[TMP26]], align 4
7401 // CHECK3-NEXT:    [[TMP28:%.*]] = bitcast i8* [[TMP27]] to i32*
7402 // CHECK3-NEXT:    [[TMP29:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP25]], align 4
7403 // CHECK3-NEXT:    store i32 [[TMP29]], i32* [[TMP28]], align 4
7404 // CHECK3-NEXT:    br label [[IFCONT12:%.*]]
7405 // CHECK3:       else11:
7406 // CHECK3-NEXT:    br label [[IFCONT12]]
7407 // CHECK3:       ifcont12:
7408 // CHECK3-NEXT:    ret void
7409 //
7410 //
7411 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func5
7412 // CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
7413 // CHECK3-NEXT:  entry:
7414 // CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
7415 // CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
7416 // CHECK3-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
7417 // CHECK3-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
7418 // CHECK3-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
7419 // CHECK3-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
7420 // CHECK3-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
7421 // CHECK3-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
7422 // CHECK3-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
7423 // CHECK3-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
7424 // CHECK3-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
7425 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
7426 // CHECK3-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
7427 // CHECK3-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
7428 // CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP7]]
7429 // CHECK3-NEXT:    [[TMP11:%.*]] = load i8, i8* [[TMP9]], align 1
7430 // CHECK3-NEXT:    store i8 [[TMP11]], i8* [[TMP10]], align 128
7431 // CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
7432 // CHECK3-NEXT:    [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
7433 // CHECK3-NEXT:    [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
7434 // CHECK3-NEXT:    [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 1
7435 // CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP7]]
7436 // CHECK3-NEXT:    [[TMP16:%.*]] = load float, float* [[TMP14]], align 4
7437 // CHECK3-NEXT:    store float [[TMP16]], float* [[TMP15]], align 128
7438 // CHECK3-NEXT:    ret void
7439 //
7440 //
7441 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func6
7442 // CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
7443 // CHECK3-NEXT:  entry:
7444 // CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
7445 // CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
7446 // CHECK3-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
7447 // CHECK3-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
7448 // CHECK3-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
7449 // CHECK3-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
7450 // CHECK3-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
7451 // CHECK3-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
7452 // CHECK3-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
7453 // CHECK3-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
7454 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
7455 // CHECK3-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
7456 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP5]]
7457 // CHECK3-NEXT:    store i8* [[TMP7]], i8** [[TMP6]], align 4
7458 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
7459 // CHECK3-NEXT:    [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 1
7460 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP5]]
7461 // CHECK3-NEXT:    [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
7462 // CHECK3-NEXT:    store i8* [[TMP10]], i8** [[TMP8]], align 4
7463 // CHECK3-NEXT:    [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
7464 // CHECK3-NEXT:    [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
7465 // CHECK3-NEXT:    call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR3]]
7466 // CHECK3-NEXT:    ret void
7467 //
7468 //
7469 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func7
7470 // CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
7471 // CHECK3-NEXT:  entry:
7472 // CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
7473 // CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
7474 // CHECK3-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
7475 // CHECK3-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
7476 // CHECK3-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
7477 // CHECK3-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
7478 // CHECK3-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
7479 // CHECK3-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
7480 // CHECK3-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
7481 // CHECK3-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.0*
7482 // CHECK3-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
7483 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
7484 // CHECK3-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
7485 // CHECK3-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 0
7486 // CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP7]]
7487 // CHECK3-NEXT:    [[TMP11:%.*]] = load i8, i8* [[TMP10]], align 128
7488 // CHECK3-NEXT:    store i8 [[TMP11]], i8* [[TMP9]], align 1
7489 // CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
7490 // CHECK3-NEXT:    [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4
7491 // CHECK3-NEXT:    [[TMP14:%.*]] = bitcast i8* [[TMP13]] to float*
7492 // CHECK3-NEXT:    [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP6]], i32 0, i32 1
7493 // CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP7]]
7494 // CHECK3-NEXT:    [[TMP16:%.*]] = load float, float* [[TMP15]], align 128
7495 // CHECK3-NEXT:    store float [[TMP16]], float* [[TMP14]], align 4
7496 // CHECK3-NEXT:    ret void
7497 //
7498 //
7499 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func8
7500 // CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
7501 // CHECK3-NEXT:  entry:
7502 // CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
7503 // CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
7504 // CHECK3-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
7505 // CHECK3-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
7506 // CHECK3-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
7507 // CHECK3-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
7508 // CHECK3-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
7509 // CHECK3-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
7510 // CHECK3-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.0*
7511 // CHECK3-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
7512 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
7513 // CHECK3-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 0
7514 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2048 x i8], [2048 x i8]* [[C]], i32 0, i32 [[TMP5]]
7515 // CHECK3-NEXT:    store i8* [[TMP7]], i8** [[TMP6]], align 4
7516 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
7517 // CHECK3-NEXT:    [[D:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], %struct._globalized_locals_ty.0* [[TMP4]], i32 0, i32 1
7518 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2048 x float], [2048 x float]* [[D]], i32 0, i32 [[TMP5]]
7519 // CHECK3-NEXT:    [[TMP10:%.*]] = bitcast float* [[TMP9]] to i8*
7520 // CHECK3-NEXT:    store i8* [[TMP10]], i8** [[TMP8]], align 4
7521 // CHECK3-NEXT:    [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
7522 // CHECK3-NEXT:    [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
7523 // CHECK3-NEXT:    call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR3]]
7524 // CHECK3-NEXT:    ret void
7525 //
7526 //
7527 // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l33
7528 // CHECK3-SAME: (i32 noundef [[A:%.*]], i32 noundef [[B:%.*]]) #[[ATTR0]] {
7529 // CHECK3-NEXT:  entry:
7530 // CHECK3-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
7531 // CHECK3-NEXT:    [[B_ADDR:%.*]] = alloca i32, align 4
7532 // CHECK3-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
7533 // CHECK3-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
7534 // CHECK3-NEXT:    store i32 [[A]], i32* [[A_ADDR]], align 4
7535 // CHECK3-NEXT:    store i32 [[B]], i32* [[B_ADDR]], align 4
7536 // CHECK3-NEXT:    [[CONV:%.*]] = bitcast i32* [[B_ADDR]] to i16*
7537 // CHECK3-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true)
7538 // CHECK3-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
7539 // CHECK3-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
7540 // CHECK3:       user_code.entry:
7541 // CHECK3-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
7542 // CHECK3-NEXT:    store i32 0, i32* [[DOTZERO_ADDR]], align 4
7543 // CHECK3-NEXT:    store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
7544 // CHECK3-NEXT:    call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR3]]
7545 // CHECK3-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true)
7546 // CHECK3-NEXT:    ret void
7547 // CHECK3:       worker.exit:
7548 // CHECK3-NEXT:    ret void
7549 //
7550 //
7551 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__9
7552 // CHECK3-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32* noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i16* noundef nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR0]] {
7553 // CHECK3-NEXT:  entry:
7554 // CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
7555 // CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
7556 // CHECK3-NEXT:    [[A_ADDR:%.*]] = alloca i32*, align 4
7557 // CHECK3-NEXT:    [[B_ADDR:%.*]] = alloca i16*, align 4
7558 // CHECK3-NEXT:    [[A1:%.*]] = alloca i32, align 4
7559 // CHECK3-NEXT:    [[B2:%.*]] = alloca i16, align 2
7560 // CHECK3-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
7561 // CHECK3-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
7562 // CHECK3-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
7563 // CHECK3-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
7564 // CHECK3-NEXT:    store i32* [[A]], i32** [[A_ADDR]], align 4
7565 // CHECK3-NEXT:    store i16* [[B]], i16** [[B_ADDR]], align 4
7566 // CHECK3-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
7567 // CHECK3-NEXT:    [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
7568 // CHECK3-NEXT:    store i32 0, i32* [[A1]], align 4
7569 // CHECK3-NEXT:    store i16 -32768, i16* [[B2]], align 2
7570 // CHECK3-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
7571 // CHECK3-NEXT:    [[TMP3:%.*]] = bitcast i32* [[A1]] to i8*
7572 // CHECK3-NEXT:    store i8* [[TMP3]], i8** [[TMP2]], align 4
7573 // CHECK3-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
7574 // CHECK3-NEXT:    [[TMP5:%.*]] = bitcast i16* [[B2]] to i8*
7575 // CHECK3-NEXT:    store i8* [[TMP5]], i8** [[TMP4]], align 4
7576 // CHECK3-NEXT:    [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
7577 // CHECK3-NEXT:    [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
7578 // CHECK3-NEXT:    [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
7579 // CHECK3-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__10 to i8*), i8* null, i8** [[TMP8]], i32 2)
7580 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
7581 // CHECK3-NEXT:    [[TMP10:%.*]] = bitcast i32* [[A1]] to i8*
7582 // CHECK3-NEXT:    store i8* [[TMP10]], i8** [[TMP9]], align 4
7583 // CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
7584 // CHECK3-NEXT:    [[TMP12:%.*]] = bitcast i16* [[B2]] to i8*
7585 // CHECK3-NEXT:    store i8* [[TMP12]], i8** [[TMP11]], align 4
7586 // CHECK3-NEXT:    [[TMP13:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
7587 // CHECK3-NEXT:    [[TMP14:%.*]] = load i8*, i8** @"_openmp_teams_reductions_buffer_$_$ptr", align 4
7588 // CHECK3-NEXT:    [[TMP15:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP7]], i8* [[TMP14]], i32 2048, i8* [[TMP13]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func15, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func16, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_copy_func17, void (i8*, i32, i8*)* @_omp_reduction_list_to_global_reduce_func18, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_copy_func19, void (i8*, i32, i8*)* @_omp_reduction_global_to_list_reduce_func20)
7589 // CHECK3-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 1
7590 // CHECK3-NEXT:    br i1 [[TMP16]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
7591 // CHECK3:       .omp.reduction.then:
7592 // CHECK3-NEXT:    [[TMP17:%.*]] = load i32, i32* [[TMP0]], align 4
7593 // CHECK3-NEXT:    [[TMP18:%.*]] = load i32, i32* [[A1]], align 4
7594 // CHECK3-NEXT:    [[OR:%.*]] = or i32 [[TMP17]], [[TMP18]]
7595 // CHECK3-NEXT:    store i32 [[OR]], i32* [[TMP0]], align 4
7596 // CHECK3-NEXT:    [[TMP19:%.*]] = load i16, i16* [[TMP1]], align 2
7597 // CHECK3-NEXT:    [[CONV:%.*]] = sext i16 [[TMP19]] to i32
7598 // CHECK3-NEXT:    [[TMP20:%.*]] = load i16, i16* [[B2]], align 2
7599 // CHECK3-NEXT:    [[CONV3:%.*]] = sext i16 [[TMP20]] to i32
7600 // CHECK3-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]]
7601 // CHECK3-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
7602 // CHECK3:       cond.true:
7603 // CHECK3-NEXT:    [[TMP21:%.*]] = load i16, i16* [[TMP1]], align 2
7604 // CHECK3-NEXT:    br label [[COND_END:%.*]]
7605 // CHECK3:       cond.false:
7606 // CHECK3-NEXT:    [[TMP22:%.*]] = load i16, i16* [[B2]], align 2
7607 // CHECK3-NEXT:    br label [[COND_END]]
7608 // CHECK3:       cond.end:
7609 // CHECK3-NEXT:    [[COND:%.*]] = phi i16 [ [[TMP21]], [[COND_TRUE]] ], [ [[TMP22]], [[COND_FALSE]] ]
7610 // CHECK3-NEXT:    store i16 [[COND]], i16* [[TMP1]], align 2
7611 // CHECK3-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP7]])
7612 // CHECK3-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
7613 // CHECK3:       .omp.reduction.done:
7614 // CHECK3-NEXT:    ret void
7615 //
7616 //
7617 // CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__10
7618 // CHECK3-SAME: (i32* noalias noundef [[DOTGLOBAL_TID_:%.*]], i32* noalias noundef [[DOTBOUND_TID_:%.*]], i32* noundef nonnull align 4 dereferenceable(4) [[A:%.*]], i16* noundef nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR0]] {
7619 // CHECK3-NEXT:  entry:
7620 // CHECK3-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4
7621 // CHECK3-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4
7622 // CHECK3-NEXT:    [[A_ADDR:%.*]] = alloca i32*, align 4
7623 // CHECK3-NEXT:    [[B_ADDR:%.*]] = alloca i16*, align 4
7624 // CHECK3-NEXT:    [[A1:%.*]] = alloca i32, align 4
7625 // CHECK3-NEXT:    [[B2:%.*]] = alloca i16, align 2
7626 // CHECK3-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
7627 // CHECK3-NEXT:    store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4
7628 // CHECK3-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4
7629 // CHECK3-NEXT:    store i32* [[A]], i32** [[A_ADDR]], align 4
7630 // CHECK3-NEXT:    store i16* [[B]], i16** [[B_ADDR]], align 4
7631 // CHECK3-NEXT:    [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
7632 // CHECK3-NEXT:    [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4
7633 // CHECK3-NEXT:    store i32 0, i32* [[A1]], align 4
7634 // CHECK3-NEXT:    store i16 -32768, i16* [[B2]], align 2
7635 // CHECK3-NEXT:    [[TMP2:%.*]] = load i32, i32* [[A1]], align 4
7636 // CHECK3-NEXT:    [[OR:%.*]] = or i32 [[TMP2]], 1
7637 // CHECK3-NEXT:    store i32 [[OR]], i32* [[A1]], align 4
7638 // CHECK3-NEXT:    [[TMP3:%.*]] = load i16, i16* [[B2]], align 2
7639 // CHECK3-NEXT:    [[CONV:%.*]] = sext i16 [[TMP3]] to i32
7640 // CHECK3-NEXT:    [[CMP:%.*]] = icmp sgt i32 99, [[CONV]]
7641 // CHECK3-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
7642 // CHECK3:       cond.true:
7643 // CHECK3-NEXT:    br label [[COND_END:%.*]]
7644 // CHECK3:       cond.false:
7645 // CHECK3-NEXT:    [[TMP4:%.*]] = load i16, i16* [[B2]], align 2
7646 // CHECK3-NEXT:    [[CONV3:%.*]] = sext i16 [[TMP4]] to i32
7647 // CHECK3-NEXT:    br label [[COND_END]]
7648 // CHECK3:       cond.end:
7649 // CHECK3-NEXT:    [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ]
7650 // CHECK3-NEXT:    [[CONV4:%.*]] = trunc i32 [[COND]] to i16
7651 // CHECK3-NEXT:    store i16 [[CONV4]], i16* [[B2]], align 2
7652 // CHECK3-NEXT:    [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
7653 // CHECK3-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
7654 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
7655 // CHECK3-NEXT:    [[TMP8:%.*]] = bitcast i32* [[A1]] to i8*
7656 // CHECK3-NEXT:    store i8* [[TMP8]], i8** [[TMP7]], align 4
7657 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
7658 // CHECK3-NEXT:    [[TMP10:%.*]] = bitcast i16* [[B2]] to i8*
7659 // CHECK3-NEXT:    store i8* [[TMP10]], i8** [[TMP9]], align 4
7660 // CHECK3-NEXT:    [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
7661 // CHECK3-NEXT:    [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB3]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func12, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func13)
7662 // CHECK3-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1
7663 // CHECK3-NEXT:    br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
7664 // CHECK3:       .omp.reduction.then:
7665 // CHECK3-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4
7666 // CHECK3-NEXT:    [[TMP15:%.*]] = load i32, i32* [[A1]], align 4
7667 // CHECK3-NEXT:    [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]]
7668 // CHECK3-NEXT:    store i32 [[OR5]], i32* [[TMP0]], align 4
7669 // CHECK3-NEXT:    [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2
7670 // CHECK3-NEXT:    [[CONV6:%.*]] = sext i16 [[TMP16]] to i32
7671 // CHECK3-NEXT:    [[TMP17:%.*]] = load i16, i16* [[B2]], align 2
7672 // CHECK3-NEXT:    [[CONV7:%.*]] = sext i16 [[TMP17]] to i32
7673 // CHECK3-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]]
7674 // CHECK3-NEXT:    br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]]
7675 // CHECK3:       cond.true9:
7676 // CHECK3-NEXT:    [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2
7677 // CHECK3-NEXT:    br label [[COND_END11:%.*]]
7678 // CHECK3:       cond.false10:
7679 // CHECK3-NEXT:    [[TMP19:%.*]] = load i16, i16* [[B2]], align 2
7680 // CHECK3-NEXT:    br label [[COND_END11]]
7681 // CHECK3:       cond.end11:
7682 // CHECK3-NEXT:    [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ]
7683 // CHECK3-NEXT:    store i16 [[COND12]], i16* [[TMP1]], align 2
7684 // CHECK3-NEXT:    call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]])
7685 // CHECK3-NEXT:    br label [[DOTOMP_REDUCTION_DONE]]
7686 // CHECK3:       .omp.reduction.done:
7687 // CHECK3-NEXT:    ret void
7688 //
7689 //
7690 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func12
7691 // CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR2]] {
7692 // CHECK3-NEXT:  entry:
7693 // CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
7694 // CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
7695 // CHECK3-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2
7696 // CHECK3-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2
7697 // CHECK3-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
7698 // CHECK3-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
7699 // CHECK3-NEXT:    [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
7700 // CHECK3-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
7701 // CHECK3-NEXT:    store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
7702 // CHECK3-NEXT:    store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
7703 // CHECK3-NEXT:    store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
7704 // CHECK3-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
7705 // CHECK3-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
7706 // CHECK3-NEXT:    [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
7707 // CHECK3-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
7708 // CHECK3-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
7709 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
7710 // CHECK3-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
7711 // CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
7712 // CHECK3-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
7713 // CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
7714 // CHECK3-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
7715 // CHECK3-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
7716 // CHECK3-NEXT:    [[TMP16:%.*]] = call i32 @__kmpc_get_warp_size()
7717 // CHECK3-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
7718 // CHECK3-NEXT:    [[TMP18:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP17]])
7719 // CHECK3-NEXT:    store i32 [[TMP18]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
7720 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
7721 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
7722 // CHECK3-NEXT:    [[TMP21:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
7723 // CHECK3-NEXT:    store i8* [[TMP21]], i8** [[TMP11]], align 4
7724 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
7725 // CHECK3-NEXT:    [[TMP23:%.*]] = load i8*, i8** [[TMP22]], align 4
7726 // CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
7727 // CHECK3-NEXT:    [[TMP25:%.*]] = bitcast i8* [[TMP23]] to i16*
7728 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr i16, i16* [[TMP25]], i32 1
7729 // CHECK3-NEXT:    [[TMP27:%.*]] = bitcast i16* [[TMP26]] to i8*
7730 // CHECK3-NEXT:    [[TMP28:%.*]] = load i16, i16* [[TMP25]], align 2
7731 // CHECK3-NEXT:    [[TMP29:%.*]] = sext i16 [[TMP28]] to i32
7732 // CHECK3-NEXT:    [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
7733 // CHECK3-NEXT:    [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
7734 // CHECK3-NEXT:    [[TMP32:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP29]], i16 [[TMP7]], i16 [[TMP31]])
7735 // CHECK3-NEXT:    [[TMP33:%.*]] = trunc i32 [[TMP32]] to i16
7736 // CHECK3-NEXT:    store i16 [[TMP33]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
7737 // CHECK3-NEXT:    [[TMP34:%.*]] = getelementptr i16, i16* [[TMP25]], i32 1
7738 // CHECK3-NEXT:    [[TMP35:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
7739 // CHECK3-NEXT:    [[TMP36:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
7740 // CHECK3-NEXT:    store i8* [[TMP36]], i8** [[TMP24]], align 4
7741 // CHECK3-NEXT:    [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 0
7742 // CHECK3-NEXT:    [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1
7743 // CHECK3-NEXT:    [[TMP39:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
7744 // CHECK3-NEXT:    [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]]
7745 // CHECK3-NEXT:    [[TMP41:%.*]] = icmp eq i16 [[TMP8]], 2
7746 // CHECK3-NEXT:    [[TMP42:%.*]] = and i16 [[TMP6]], 1
7747 // CHECK3-NEXT:    [[TMP43:%.*]] = icmp eq i16 [[TMP42]], 0
7748 // CHECK3-NEXT:    [[TMP44:%.*]] = and i1 [[TMP41]], [[TMP43]]
7749 // CHECK3-NEXT:    [[TMP45:%.*]] = icmp sgt i16 [[TMP7]], 0
7750 // CHECK3-NEXT:    [[TMP46:%.*]] = and i1 [[TMP44]], [[TMP45]]
7751 // CHECK3-NEXT:    [[TMP47:%.*]] = or i1 [[TMP37]], [[TMP40]]
7752 // CHECK3-NEXT:    [[TMP48:%.*]] = or i1 [[TMP47]], [[TMP46]]
7753 // CHECK3-NEXT:    br i1 [[TMP48]], label [[THEN:%.*]], label [[ELSE:%.*]]
7754 // CHECK3:       then:
7755 // CHECK3-NEXT:    [[TMP49:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
7756 // CHECK3-NEXT:    [[TMP50:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
7757 // CHECK3-NEXT:    call void @"_omp$reduction$reduction_func11"(i8* [[TMP49]], i8* [[TMP50]]) #[[ATTR3]]
7758 // CHECK3-NEXT:    br label [[IFCONT:%.*]]
7759 // CHECK3:       else:
7760 // CHECK3-NEXT:    br label [[IFCONT]]
7761 // CHECK3:       ifcont:
7762 // CHECK3-NEXT:    [[TMP51:%.*]] = icmp eq i16 [[TMP8]], 1
7763 // CHECK3-NEXT:    [[TMP52:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
7764 // CHECK3-NEXT:    [[TMP53:%.*]] = and i1 [[TMP51]], [[TMP52]]
7765 // CHECK3-NEXT:    br i1 [[TMP53]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
7766 // CHECK3:       then5:
7767 // CHECK3-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
7768 // CHECK3-NEXT:    [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4
7769 // CHECK3-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
7770 // CHECK3-NEXT:    [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4
7771 // CHECK3-NEXT:    [[TMP58:%.*]] = bitcast i8* [[TMP55]] to i32*
7772 // CHECK3-NEXT:    [[TMP59:%.*]] = bitcast i8* [[TMP57]] to i32*
7773 // CHECK3-NEXT:    [[TMP60:%.*]] = load i32, i32* [[TMP58]], align 4
7774 // CHECK3-NEXT:    store i32 [[TMP60]], i32* [[TMP59]], align 4
7775 // CHECK3-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
7776 // CHECK3-NEXT:    [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4
7777 // CHECK3-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
7778 // CHECK3-NEXT:    [[TMP64:%.*]] = load i8*, i8** [[TMP63]], align 4
7779 // CHECK3-NEXT:    [[TMP65:%.*]] = bitcast i8* [[TMP62]] to i16*
7780 // CHECK3-NEXT:    [[TMP66:%.*]] = bitcast i8* [[TMP64]] to i16*
7781 // CHECK3-NEXT:    [[TMP67:%.*]] = load i16, i16* [[TMP65]], align 2
7782 // CHECK3-NEXT:    store i16 [[TMP67]], i16* [[TMP66]], align 2
7783 // CHECK3-NEXT:    br label [[IFCONT7:%.*]]
7784 // CHECK3:       else6:
7785 // CHECK3-NEXT:    br label [[IFCONT7]]
7786 // CHECK3:       ifcont7:
7787 // CHECK3-NEXT:    ret void
7788 //
7789 //
7790 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func13
7791 // CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
7792 // CHECK3-NEXT:  entry:
7793 // CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
7794 // CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
7795 // CHECK3-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
7796 // CHECK3-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
7797 // CHECK3-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
7798 // CHECK3-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
7799 // CHECK3-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
7800 // CHECK3-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
7801 // CHECK3-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
7802 // CHECK3-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
7803 // CHECK3-NEXT:    [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 4
7804 // CHECK3-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [2 x i8*]*
7805 // CHECK3-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4:[0-9]+]], i32 [[TMP2]])
7806 // CHECK3-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
7807 // CHECK3-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
7808 // CHECK3:       then:
7809 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
7810 // CHECK3-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
7811 // CHECK3-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
7812 // CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
7813 // CHECK3-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
7814 // CHECK3-NEXT:    store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4
7815 // CHECK3-NEXT:    br label [[IFCONT:%.*]]
7816 // CHECK3:       else:
7817 // CHECK3-NEXT:    br label [[IFCONT]]
7818 // CHECK3:       ifcont:
7819 // CHECK3-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
7820 // CHECK3-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
7821 // CHECK3-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP13]]
7822 // CHECK3-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
7823 // CHECK3:       then2:
7824 // CHECK3-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
7825 // CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
7826 // CHECK3-NEXT:    [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4
7827 // CHECK3-NEXT:    [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32*
7828 // CHECK3-NEXT:    [[TMP18:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4
7829 // CHECK3-NEXT:    store i32 [[TMP18]], i32* [[TMP17]], align 4
7830 // CHECK3-NEXT:    br label [[IFCONT4:%.*]]
7831 // CHECK3:       else3:
7832 // CHECK3-NEXT:    br label [[IFCONT4]]
7833 // CHECK3:       ifcont4:
7834 // CHECK3-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
7835 // CHECK3-NEXT:    [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
7836 // CHECK3-NEXT:    br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
7837 // CHECK3:       then6:
7838 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
7839 // CHECK3-NEXT:    [[TMP20:%.*]] = load i8*, i8** [[TMP19]], align 4
7840 // CHECK3-NEXT:    [[TMP21:%.*]] = bitcast i8* [[TMP20]] to i16*
7841 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
7842 // CHECK3-NEXT:    [[TMP23:%.*]] = bitcast i32 addrspace(3)* [[TMP22]] to i16 addrspace(3)*
7843 // CHECK3-NEXT:    [[TMP24:%.*]] = load i16, i16* [[TMP21]], align 2
7844 // CHECK3-NEXT:    store volatile i16 [[TMP24]], i16 addrspace(3)* [[TMP23]], align 2
7845 // CHECK3-NEXT:    br label [[IFCONT8:%.*]]
7846 // CHECK3:       else7:
7847 // CHECK3-NEXT:    br label [[IFCONT8]]
7848 // CHECK3:       ifcont8:
7849 // CHECK3-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
7850 // CHECK3-NEXT:    [[TMP25:%.*]] = load i32, i32* [[DOTADDR1]], align 4
7851 // CHECK3-NEXT:    [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP25]]
7852 // CHECK3-NEXT:    br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
7853 // CHECK3:       then10:
7854 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
7855 // CHECK3-NEXT:    [[TMP27:%.*]] = bitcast i32 addrspace(3)* [[TMP26]] to i16 addrspace(3)*
7856 // CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
7857 // CHECK3-NEXT:    [[TMP29:%.*]] = load i8*, i8** [[TMP28]], align 4
7858 // CHECK3-NEXT:    [[TMP30:%.*]] = bitcast i8* [[TMP29]] to i16*
7859 // CHECK3-NEXT:    [[TMP31:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP27]], align 2
7860 // CHECK3-NEXT:    store i16 [[TMP31]], i16* [[TMP30]], align 2
7861 // CHECK3-NEXT:    br label [[IFCONT12:%.*]]
7862 // CHECK3:       else11:
7863 // CHECK3-NEXT:    br label [[IFCONT12]]
7864 // CHECK3:       ifcont12:
7865 // CHECK3-NEXT:    ret void
7866 //
7867 //
7868 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func15
7869 // CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR2]] {
7870 // CHECK3-NEXT:  entry:
7871 // CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
7872 // CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
7873 // CHECK3-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2
7874 // CHECK3-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2
7875 // CHECK3-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4
7876 // CHECK3-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4
7877 // CHECK3-NEXT:    [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2
7878 // CHECK3-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
7879 // CHECK3-NEXT:    store i16 [[TMP1]], i16* [[DOTADDR1]], align 2
7880 // CHECK3-NEXT:    store i16 [[TMP2]], i16* [[DOTADDR2]], align 2
7881 // CHECK3-NEXT:    store i16 [[TMP3]], i16* [[DOTADDR3]], align 2
7882 // CHECK3-NEXT:    [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4
7883 // CHECK3-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]*
7884 // CHECK3-NEXT:    [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2
7885 // CHECK3-NEXT:    [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2
7886 // CHECK3-NEXT:    [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2
7887 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
7888 // CHECK3-NEXT:    [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4
7889 // CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
7890 // CHECK3-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32*
7891 // CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
7892 // CHECK3-NEXT:    [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8*
7893 // CHECK3-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4
7894 // CHECK3-NEXT:    [[TMP16:%.*]] = call i32 @__kmpc_get_warp_size()
7895 // CHECK3-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
7896 // CHECK3-NEXT:    [[TMP18:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP17]])
7897 // CHECK3-NEXT:    store i32 [[TMP18]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4
7898 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1
7899 // CHECK3-NEXT:    [[TMP20:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1
7900 // CHECK3-NEXT:    [[TMP21:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8*
7901 // CHECK3-NEXT:    store i8* [[TMP21]], i8** [[TMP11]], align 4
7902 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
7903 // CHECK3-NEXT:    [[TMP23:%.*]] = load i8*, i8** [[TMP22]], align 4
7904 // CHECK3-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
7905 // CHECK3-NEXT:    [[TMP25:%.*]] = bitcast i8* [[TMP23]] to i16*
7906 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr i16, i16* [[TMP25]], i32 1
7907 // CHECK3-NEXT:    [[TMP27:%.*]] = bitcast i16* [[TMP26]] to i8*
7908 // CHECK3-NEXT:    [[TMP28:%.*]] = load i16, i16* [[TMP25]], align 2
7909 // CHECK3-NEXT:    [[TMP29:%.*]] = sext i16 [[TMP28]] to i32
7910 // CHECK3-NEXT:    [[TMP30:%.*]] = call i32 @__kmpc_get_warp_size()
7911 // CHECK3-NEXT:    [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16
7912 // CHECK3-NEXT:    [[TMP32:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP29]], i16 [[TMP7]], i16 [[TMP31]])
7913 // CHECK3-NEXT:    [[TMP33:%.*]] = trunc i32 [[TMP32]] to i16
7914 // CHECK3-NEXT:    store i16 [[TMP33]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2
7915 // CHECK3-NEXT:    [[TMP34:%.*]] = getelementptr i16, i16* [[TMP25]], i32 1
7916 // CHECK3-NEXT:    [[TMP35:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1
7917 // CHECK3-NEXT:    [[TMP36:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8*
7918 // CHECK3-NEXT:    store i8* [[TMP36]], i8** [[TMP24]], align 4
7919 // CHECK3-NEXT:    [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 0
7920 // CHECK3-NEXT:    [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 1
7921 // CHECK3-NEXT:    [[TMP39:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]]
7922 // CHECK3-NEXT:    [[TMP40:%.*]] = and i1 [[TMP38]], [[TMP39]]
7923 // CHECK3-NEXT:    [[TMP41:%.*]] = icmp eq i16 [[TMP8]], 2
7924 // CHECK3-NEXT:    [[TMP42:%.*]] = and i16 [[TMP6]], 1
7925 // CHECK3-NEXT:    [[TMP43:%.*]] = icmp eq i16 [[TMP42]], 0
7926 // CHECK3-NEXT:    [[TMP44:%.*]] = and i1 [[TMP41]], [[TMP43]]
7927 // CHECK3-NEXT:    [[TMP45:%.*]] = icmp sgt i16 [[TMP7]], 0
7928 // CHECK3-NEXT:    [[TMP46:%.*]] = and i1 [[TMP44]], [[TMP45]]
7929 // CHECK3-NEXT:    [[TMP47:%.*]] = or i1 [[TMP37]], [[TMP40]]
7930 // CHECK3-NEXT:    [[TMP48:%.*]] = or i1 [[TMP47]], [[TMP46]]
7931 // CHECK3-NEXT:    br i1 [[TMP48]], label [[THEN:%.*]], label [[ELSE:%.*]]
7932 // CHECK3:       then:
7933 // CHECK3-NEXT:    [[TMP49:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8*
7934 // CHECK3-NEXT:    [[TMP50:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8*
7935 // CHECK3-NEXT:    call void @"_omp$reduction$reduction_func14"(i8* [[TMP49]], i8* [[TMP50]]) #[[ATTR3]]
7936 // CHECK3-NEXT:    br label [[IFCONT:%.*]]
7937 // CHECK3:       else:
7938 // CHECK3-NEXT:    br label [[IFCONT]]
7939 // CHECK3:       ifcont:
7940 // CHECK3-NEXT:    [[TMP51:%.*]] = icmp eq i16 [[TMP8]], 1
7941 // CHECK3-NEXT:    [[TMP52:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]]
7942 // CHECK3-NEXT:    [[TMP53:%.*]] = and i1 [[TMP51]], [[TMP52]]
7943 // CHECK3-NEXT:    br i1 [[TMP53]], label [[THEN5:%.*]], label [[ELSE6:%.*]]
7944 // CHECK3:       then5:
7945 // CHECK3-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0
7946 // CHECK3-NEXT:    [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4
7947 // CHECK3-NEXT:    [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0
7948 // CHECK3-NEXT:    [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4
7949 // CHECK3-NEXT:    [[TMP58:%.*]] = bitcast i8* [[TMP55]] to i32*
7950 // CHECK3-NEXT:    [[TMP59:%.*]] = bitcast i8* [[TMP57]] to i32*
7951 // CHECK3-NEXT:    [[TMP60:%.*]] = load i32, i32* [[TMP58]], align 4
7952 // CHECK3-NEXT:    store i32 [[TMP60]], i32* [[TMP59]], align 4
7953 // CHECK3-NEXT:    [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1
7954 // CHECK3-NEXT:    [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4
7955 // CHECK3-NEXT:    [[TMP63:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1
7956 // CHECK3-NEXT:    [[TMP64:%.*]] = load i8*, i8** [[TMP63]], align 4
7957 // CHECK3-NEXT:    [[TMP65:%.*]] = bitcast i8* [[TMP62]] to i16*
7958 // CHECK3-NEXT:    [[TMP66:%.*]] = bitcast i8* [[TMP64]] to i16*
7959 // CHECK3-NEXT:    [[TMP67:%.*]] = load i16, i16* [[TMP65]], align 2
7960 // CHECK3-NEXT:    store i16 [[TMP67]], i16* [[TMP66]], align 2
7961 // CHECK3-NEXT:    br label [[IFCONT7:%.*]]
7962 // CHECK3:       else6:
7963 // CHECK3-NEXT:    br label [[IFCONT7]]
7964 // CHECK3:       ifcont7:
7965 // CHECK3-NEXT:    ret void
7966 //
7967 //
7968 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func16
7969 // CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2]] {
7970 // CHECK3-NEXT:  entry:
7971 // CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
7972 // CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
7973 // CHECK3-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3]])
7974 // CHECK3-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
7975 // CHECK3-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
7976 // CHECK3-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
7977 // CHECK3-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
7978 // CHECK3-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
7979 // CHECK3-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
7980 // CHECK3-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
7981 // CHECK3-NEXT:    [[TMP6:%.*]] = load i8*, i8** [[DOTADDR]], align 4
7982 // CHECK3-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP6]] to [2 x i8*]*
7983 // CHECK3-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
7984 // CHECK3-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
7985 // CHECK3-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
7986 // CHECK3:       then:
7987 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
7988 // CHECK3-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
7989 // CHECK3-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
7990 // CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
7991 // CHECK3-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
7992 // CHECK3-NEXT:    store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4
7993 // CHECK3-NEXT:    br label [[IFCONT:%.*]]
7994 // CHECK3:       else:
7995 // CHECK3-NEXT:    br label [[IFCONT]]
7996 // CHECK3:       ifcont:
7997 // CHECK3-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
7998 // CHECK3-NEXT:    [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4
7999 // CHECK3-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP13]]
8000 // CHECK3-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
8001 // CHECK3:       then2:
8002 // CHECK3-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
8003 // CHECK3-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 0
8004 // CHECK3-NEXT:    [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4
8005 // CHECK3-NEXT:    [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32*
8006 // CHECK3-NEXT:    [[TMP18:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4
8007 // CHECK3-NEXT:    store i32 [[TMP18]], i32* [[TMP17]], align 4
8008 // CHECK3-NEXT:    br label [[IFCONT4:%.*]]
8009 // CHECK3:       else3:
8010 // CHECK3-NEXT:    br label [[IFCONT4]]
8011 // CHECK3:       ifcont4:
8012 // CHECK3-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
8013 // CHECK3-NEXT:    [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
8014 // CHECK3-NEXT:    br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
8015 // CHECK3:       then6:
8016 // CHECK3-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
8017 // CHECK3-NEXT:    [[TMP20:%.*]] = load i8*, i8** [[TMP19]], align 4
8018 // CHECK3-NEXT:    [[TMP21:%.*]] = bitcast i8* [[TMP20]] to i16*
8019 // CHECK3-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
8020 // CHECK3-NEXT:    [[TMP23:%.*]] = bitcast i32 addrspace(3)* [[TMP22]] to i16 addrspace(3)*
8021 // CHECK3-NEXT:    [[TMP24:%.*]] = load i16, i16* [[TMP21]], align 2
8022 // CHECK3-NEXT:    store volatile i16 [[TMP24]], i16 addrspace(3)* [[TMP23]], align 2
8023 // CHECK3-NEXT:    br label [[IFCONT8:%.*]]
8024 // CHECK3:       else7:
8025 // CHECK3-NEXT:    br label [[IFCONT8]]
8026 // CHECK3:       ifcont8:
8027 // CHECK3-NEXT:    call void @__kmpc_barrier(%struct.ident_t* @[[GLOB4]], i32 [[TMP2]])
8028 // CHECK3-NEXT:    [[TMP25:%.*]] = load i32, i32* [[DOTADDR1]], align 4
8029 // CHECK3-NEXT:    [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP25]]
8030 // CHECK3-NEXT:    br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
8031 // CHECK3:       then10:
8032 // CHECK3-NEXT:    [[TMP26:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
8033 // CHECK3-NEXT:    [[TMP27:%.*]] = bitcast i32 addrspace(3)* [[TMP26]] to i16 addrspace(3)*
8034 // CHECK3-NEXT:    [[TMP28:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP7]], i32 0, i32 1
8035 // CHECK3-NEXT:    [[TMP29:%.*]] = load i8*, i8** [[TMP28]], align 4
8036 // CHECK3-NEXT:    [[TMP30:%.*]] = bitcast i8* [[TMP29]] to i16*
8037 // CHECK3-NEXT:    [[TMP31:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP27]], align 2
8038 // CHECK3-NEXT:    store i16 [[TMP31]], i16* [[TMP30]], align 2
8039 // CHECK3-NEXT:    br label [[IFCONT12:%.*]]
8040 // CHECK3:       else11:
8041 // CHECK3-NEXT:    br label [[IFCONT12]]
8042 // CHECK3:       ifcont12:
8043 // CHECK3-NEXT:    ret void
8044 //
8045 //
8046 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_copy_func17
8047 // CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
8048 // CHECK3-NEXT:  entry:
8049 // CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
8050 // CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
8051 // CHECK3-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
8052 // CHECK3-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
8053 // CHECK3-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
8054 // CHECK3-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
8055 // CHECK3-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
8056 // CHECK3-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
8057 // CHECK3-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
8058 // CHECK3-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1*
8059 // CHECK3-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
8060 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
8061 // CHECK3-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
8062 // CHECK3-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
8063 // CHECK3-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0
8064 // CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]]
8065 // CHECK3-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4
8066 // CHECK3-NEXT:    store i32 [[TMP12]], i32* [[TMP11]], align 128
8067 // CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
8068 // CHECK3-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
8069 // CHECK3-NEXT:    [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
8070 // CHECK3-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1
8071 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]]
8072 // CHECK3-NEXT:    [[TMP17:%.*]] = load i16, i16* [[TMP15]], align 2
8073 // CHECK3-NEXT:    store i16 [[TMP17]], i16* [[TMP16]], align 128
8074 // CHECK3-NEXT:    ret void
8075 //
8076 //
8077 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_list_to_global_reduce_func18
8078 // CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
8079 // CHECK3-NEXT:  entry:
8080 // CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
8081 // CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
8082 // CHECK3-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
8083 // CHECK3-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
8084 // CHECK3-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
8085 // CHECK3-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
8086 // CHECK3-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
8087 // CHECK3-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
8088 // CHECK3-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1*
8089 // CHECK3-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
8090 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
8091 // CHECK3-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0
8092 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]]
8093 // CHECK3-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
8094 // CHECK3-NEXT:    store i8* [[TMP8]], i8** [[TMP6]], align 4
8095 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
8096 // CHECK3-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 1
8097 // CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]]
8098 // CHECK3-NEXT:    [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
8099 // CHECK3-NEXT:    store i8* [[TMP11]], i8** [[TMP9]], align 4
8100 // CHECK3-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
8101 // CHECK3-NEXT:    [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
8102 // CHECK3-NEXT:    call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR3]]
8103 // CHECK3-NEXT:    ret void
8104 //
8105 //
8106 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_copy_func19
8107 // CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
8108 // CHECK3-NEXT:  entry:
8109 // CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
8110 // CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
8111 // CHECK3-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
8112 // CHECK3-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
8113 // CHECK3-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
8114 // CHECK3-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
8115 // CHECK3-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
8116 // CHECK3-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]*
8117 // CHECK3-NEXT:    [[TMP5:%.*]] = load i8*, i8** [[DOTADDR]], align 4
8118 // CHECK3-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to %struct._globalized_locals_ty.1*
8119 // CHECK3-NEXT:    [[TMP7:%.*]] = load i32, i32* [[DOTADDR1]], align 4
8120 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0
8121 // CHECK3-NEXT:    [[TMP9:%.*]] = load i8*, i8** [[TMP8]], align 4
8122 // CHECK3-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to i32*
8123 // CHECK3-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 0
8124 // CHECK3-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP7]]
8125 // CHECK3-NEXT:    [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 128
8126 // CHECK3-NEXT:    store i32 [[TMP12]], i32* [[TMP10]], align 4
8127 // CHECK3-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1
8128 // CHECK3-NEXT:    [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4
8129 // CHECK3-NEXT:    [[TMP15:%.*]] = bitcast i8* [[TMP14]] to i16*
8130 // CHECK3-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP6]], i32 0, i32 1
8131 // CHECK3-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP7]]
8132 // CHECK3-NEXT:    [[TMP17:%.*]] = load i16, i16* [[TMP16]], align 128
8133 // CHECK3-NEXT:    store i16 [[TMP17]], i16* [[TMP15]], align 2
8134 // CHECK3-NEXT:    ret void
8135 //
8136 //
8137 // CHECK3-LABEL: define {{[^@]+}}@_omp_reduction_global_to_list_reduce_func20
8138 // CHECK3-SAME: (i8* noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]], i8* noundef [[TMP2:%.*]]) #[[ATTR2]] {
8139 // CHECK3-NEXT:  entry:
8140 // CHECK3-NEXT:    [[DOTADDR:%.*]] = alloca i8*, align 4
8141 // CHECK3-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
8142 // CHECK3-NEXT:    [[DOTADDR2:%.*]] = alloca i8*, align 4
8143 // CHECK3-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4
8144 // CHECK3-NEXT:    store i8* [[TMP0]], i8** [[DOTADDR]], align 4
8145 // CHECK3-NEXT:    store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
8146 // CHECK3-NEXT:    store i8* [[TMP2]], i8** [[DOTADDR2]], align 4
8147 // CHECK3-NEXT:    [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4
8148 // CHECK3-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to %struct._globalized_locals_ty.1*
8149 // CHECK3-NEXT:    [[TMP5:%.*]] = load i32, i32* [[DOTADDR1]], align 4
8150 // CHECK3-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
8151 // CHECK3-NEXT:    [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 0
8152 // CHECK3-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* [[A]], i32 0, i32 [[TMP5]]
8153 // CHECK3-NEXT:    [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
8154 // CHECK3-NEXT:    store i8* [[TMP8]], i8** [[TMP6]], align 4
8155 // CHECK3-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
8156 // CHECK3-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], %struct._globalized_locals_ty.1* [[TMP4]], i32 0, i32 1
8157 // CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2048 x i16], [2048 x i16]* [[B]], i32 0, i32 [[TMP5]]
8158 // CHECK3-NEXT:    [[TMP11:%.*]] = bitcast i16* [[TMP10]] to i8*
8159 // CHECK3-NEXT:    store i8* [[TMP11]], i8** [[TMP9]], align 4
8160 // CHECK3-NEXT:    [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8*
8161 // CHECK3-NEXT:    [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4
8162 // CHECK3-NEXT:    call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR3]]
8163 // CHECK3-NEXT:    ret void
8164 //
8165