1 // REQUIRES: nvptx-registered-target
2 // RUN: %clang_cc1 -no-opaque-pointers -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
3 // RUN: -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
4 // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP32 %s
5 // RUN: %clang_cc1 -no-opaque-pointers -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
6 // RUN: -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
7 // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP64 %s
8 // RUN: %clang_cc1 -no-opaque-pointers -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_60 \
9 // RUN: -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
10 // RUN: | FileCheck -check-prefix=CHECK -check-prefix=LP32 %s
11 // RUN: %clang_cc1 -no-opaque-pointers -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_60 \
12 // RUN: -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
13 // RUN: | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s
14 // RUN: %clang_cc1 -no-opaque-pointers -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_61 \
15 // RUN: -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
16 // RUN: | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s
17 // RUN: %clang_cc1 -no-opaque-pointers -triple nvptx-unknown-unknown -target-cpu sm_53 \
18 // RUN: -DERROR_CHECK -fcuda-is-device -S -o /dev/null -x cuda -verify %s
19 // RUN: %clang_cc1 -no-opaque-pointers -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \
20 // RUN: -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
21 // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP32 %s
22 // RUN: %clang_cc1 -no-opaque-pointers -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \
23 // RUN: -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
24 // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP64 %s
25
26 #define __device__ __attribute__((device))
27 #define __global__ __attribute__((global))
28 #define __shared__ __attribute__((shared))
29 #define __constant__ __attribute__((constant))
30
// Reads all four thread-index special registers (%tid.x/y/z/w) via the
// __nvvm_read_ptx_sreg_tid_* builtins and checks each lowers to the
// corresponding llvm.nvvm.read.ptx.sreg.tid.* intrinsic.
__device__ int read_tid() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.w()

  int x = __nvvm_read_ptx_sreg_tid_x();
  int y = __nvvm_read_ptx_sreg_tid_y();
  int z = __nvvm_read_ptx_sreg_tid_z();
  int w = __nvvm_read_ptx_sreg_tid_w();

  // Sum keeps all four reads live so none is optimized away.
  return x + y + z + w;

}
46
// Reads the block-dimension special registers (%ntid.x/y/z/w) and checks
// each builtin lowers to its llvm.nvvm.read.ptx.sreg.ntid.* intrinsic.
__device__ int read_ntid() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.w()

  int x = __nvvm_read_ptx_sreg_ntid_x();
  int y = __nvvm_read_ptx_sreg_ntid_y();
  int z = __nvvm_read_ptx_sreg_ntid_z();
  int w = __nvvm_read_ptx_sreg_ntid_w();

  // Sum keeps all four reads live so none is optimized away.
  return x + y + z + w;

}
62
// Reads the block-index special registers (%ctaid.x/y/z/w) and checks each
// builtin lowers to its llvm.nvvm.read.ptx.sreg.ctaid.* intrinsic.
__device__ int read_ctaid() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.w()

  int x = __nvvm_read_ptx_sreg_ctaid_x();
  int y = __nvvm_read_ptx_sreg_ctaid_y();
  int z = __nvvm_read_ptx_sreg_ctaid_z();
  int w = __nvvm_read_ptx_sreg_ctaid_w();

  // Sum keeps all four reads live so none is optimized away.
  return x + y + z + w;

}
78
// Reads the grid-dimension special registers (%nctaid.x/y/z/w) and checks
// each builtin lowers to its llvm.nvvm.read.ptx.sreg.nctaid.* intrinsic.
__device__ int read_nctaid() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.w()

  int x = __nvvm_read_ptx_sreg_nctaid_x();
  int y = __nvvm_read_ptx_sreg_nctaid_y();
  int z = __nvvm_read_ptx_sreg_nctaid_z();
  int w = __nvvm_read_ptx_sreg_nctaid_w();

  // Sum keeps all four reads live so none is optimized away.
  return x + y + z + w;

}
94
// Reads the lane/warp/SM/grid identifier special registers and checks each
// builtin lowers to its llvm.nvvm.read.ptx.sreg.* intrinsic.
__device__ int read_ids() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.laneid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.warpid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nwarpid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.smid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nsmid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.gridid()

  int a = __nvvm_read_ptx_sreg_laneid();
  int b = __nvvm_read_ptx_sreg_warpid();
  int c = __nvvm_read_ptx_sreg_nwarpid();
  int d = __nvvm_read_ptx_sreg_smid();
  int e = __nvvm_read_ptx_sreg_nsmid();
  int f = __nvvm_read_ptx_sreg_gridid();

  // Sum keeps all six reads live so none is optimized away.
  return a + b + c + d + e + f;

}
114
// Reads the five lanemask special registers (eq/le/lt/ge/gt) and checks each
// builtin lowers to its llvm.nvvm.read.ptx.sreg.lanemask.* intrinsic.
__device__ int read_lanemasks() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.eq()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.le()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.lt()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.ge()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.gt()

  int a = __nvvm_read_ptx_sreg_lanemask_eq();
  int b = __nvvm_read_ptx_sreg_lanemask_le();
  int c = __nvvm_read_ptx_sreg_lanemask_lt();
  int d = __nvvm_read_ptx_sreg_lanemask_ge();
  int e = __nvvm_read_ptx_sreg_lanemask_gt();

  // Sum keeps all five reads live so none is optimized away.
  return a + b + c + d + e;

}
132
// Reads the 32-bit and 64-bit clock special registers; note the 64-bit
// variant returns i64 while the 32-bit one returns i32.
__device__ long long read_clocks() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.clock()
// CHECK: call i64 @llvm.nvvm.read.ptx.sreg.clock64()

  int a = __nvvm_read_ptx_sreg_clock();
  long long b = __nvvm_read_ptx_sreg_clock64();

  // Sum keeps both reads live so neither is optimized away.
  return a + b;
}
143
// Reads the four performance-monitor special registers (%pm0-%pm3) and
// checks each builtin lowers to its llvm.nvvm.read.ptx.sreg.pm* intrinsic.
__device__ int read_pms() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm0()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm1()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm2()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm3()

  int a = __nvvm_read_ptx_sreg_pm0();
  int b = __nvvm_read_ptx_sreg_pm1();
  int c = __nvvm_read_ptx_sreg_pm2();
  int d = __nvvm_read_ptx_sreg_pm3();

  // Sum keeps all four reads live so none is optimized away.
  return a + b + c + d;

}
159
// Checks that __nvvm_bar_sync lowers to the llvm.nvvm.bar.sync intrinsic
// with its immediate barrier operand preserved.
__device__ void sync() {

// CHECK: call void @llvm.nvvm.bar.sync(i32 0)

  __nvvm_bar_sync(0);

}
167
168
169 // NVVM intrinsics
170
171 // The idea is not to test all intrinsics, just that Clang is recognizing the
172 // builtins defined in BuiltinsNVPTX.def
// Spot-checks a sampling of math builtins (fmax/fmin, round-to-nearest
// sqrt/rcp/add in float and double) plus the membar and barrier builtins;
// each must lower to its matching llvm.nvvm.* intrinsic.
__device__ void nvvm_math(float f1, float f2, double d1, double d2) {
  // CHECK: call float @llvm.nvvm.fmax.f
  float t1 = __nvvm_fmax_f(f1, f2);
  // CHECK: call float @llvm.nvvm.fmin.f
  float t2 = __nvvm_fmin_f(f1, f2);
  // CHECK: call float @llvm.nvvm.sqrt.rn.f
  float t3 = __nvvm_sqrt_rn_f(f1);
  // CHECK: call float @llvm.nvvm.rcp.rn.f
  float t4 = __nvvm_rcp_rn_f(f2);
  // CHECK: call float @llvm.nvvm.add.rn.f
  float t5 = __nvvm_add_rn_f(f1, f2);

  // CHECK: call double @llvm.nvvm.fmax.d
  double td1 = __nvvm_fmax_d(d1, d2);
  // CHECK: call double @llvm.nvvm.fmin.d
  double td2 = __nvvm_fmin_d(d1, d2);
  // CHECK: call double @llvm.nvvm.sqrt.rn.d
  double td3 = __nvvm_sqrt_rn_d(d1);
  // CHECK: call double @llvm.nvvm.rcp.rn.d
  double td4 = __nvvm_rcp_rn_d(d2);

  // CHECK: call void @llvm.nvvm.membar.cta()
  __nvvm_membar_cta();
  // CHECK: call void @llvm.nvvm.membar.gl()
  __nvvm_membar_gl();
  // CHECK: call void @llvm.nvvm.membar.sys()
  __nvvm_membar_sys();
  // CHECK: call void @llvm.nvvm.barrier0()
  __syncthreads();
}
203
// Module-scope variables in the device (global) and shared address spaces,
// one per integer width; nvvm_atom() below uses &dl and &sll as operands of
// the atomic builtins.
__device__ int di;
__shared__ int si;
__device__ long dl;
__shared__ long sl;
__device__ long long dll;
__shared__ long long sll;
210
211 // Check for atomic intrinsics
212 // CHECK-LABEL: nvvm_atom
// Exercises the atomic builtins. Unscoped atomics lower to plain LLVM
// atomicrmw/cmpxchg instructions; the cta/sys-scoped variants (guarded by
// the #if below, sm_60+ only) lower to llvm.nvvm.atomic.* intrinsics. In the
// sm_53 -verify run (-DERROR_CHECK) every scoped builtin instead produces a
// "needs target feature sm_60" diagnostic, matched by the expected-error@+1
// annotations — do not insert lines between an annotation and its statement.
// The LP32/LP64 prefixes distinguish 'long' being i32 vs. i64.
__device__ void nvvm_atom(float *fp, float f, double *dfp, double df, int *ip,
                          int i, unsigned int *uip, unsigned ui, long *lp,
                          long l, long long *llp, long long ll) {
  // CHECK: atomicrmw add i32* {{.*}} seq_cst, align 4
  __nvvm_atom_add_gen_i(ip, i);
  // CHECK: atomicrmw add i{{32|64}}* {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_add_gen_l(&dl, l);
  // CHECK: atomicrmw add i64* {{.*}} seq_cst, align 8
  __nvvm_atom_add_gen_ll(&sll, ll);

  // CHECK: atomicrmw sub i32* {{.*}} seq_cst, align 4
  __nvvm_atom_sub_gen_i(ip, i);
  // CHECK: atomicrmw sub i{{32|64}}* {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_sub_gen_l(&dl, l);
  // CHECK: atomicrmw sub i64* {{.*}} seq_cst, align 8
  __nvvm_atom_sub_gen_ll(&sll, ll);

  // CHECK: atomicrmw and i32* {{.*}} seq_cst, align 4
  __nvvm_atom_and_gen_i(ip, i);
  // CHECK: atomicrmw and i{{32|64}}* {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_and_gen_l(&dl, l);
  // CHECK: atomicrmw and i64* {{.*}} seq_cst, align 8
  __nvvm_atom_and_gen_ll(&sll, ll);

  // CHECK: atomicrmw or i32* {{.*}} seq_cst, align 4
  __nvvm_atom_or_gen_i(ip, i);
  // CHECK: atomicrmw or i{{32|64}}* {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_or_gen_l(&dl, l);
  // CHECK: atomicrmw or i64* {{.*}} seq_cst, align 8
  __nvvm_atom_or_gen_ll(&sll, ll);

  // CHECK: atomicrmw xor i32* {{.*}} seq_cst, align 4
  __nvvm_atom_xor_gen_i(ip, i);
  // CHECK: atomicrmw xor i{{32|64}}* {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_xor_gen_l(&dl, l);
  // CHECK: atomicrmw xor i64* {{.*}} seq_cst, align 8
  __nvvm_atom_xor_gen_ll(&sll, ll);

  // CHECK: atomicrmw xchg i32* {{.*}} seq_cst, align 4
  __nvvm_atom_xchg_gen_i(ip, i);
  // CHECK: atomicrmw xchg i{{32|64}}* {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_xchg_gen_l(&dl, l);
  // CHECK: atomicrmw xchg i64* {{.*}} seq_cst, align 8
  __nvvm_atom_xchg_gen_ll(&sll, ll);

  // CHECK: atomicrmw max i32* {{.*}} seq_cst, align 4
  __nvvm_atom_max_gen_i(ip, i);
  // CHECK: atomicrmw umax i32* {{.*}} seq_cst, align 4
  __nvvm_atom_max_gen_ui((unsigned int *)ip, i);
  // CHECK: atomicrmw max i{{32|64}}* {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_max_gen_l(&dl, l);
  // CHECK: atomicrmw umax i{{32|64}}* {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_max_gen_ul((unsigned long *)&dl, l);
  // CHECK: atomicrmw max i64* {{.*}} seq_cst, align 8
  __nvvm_atom_max_gen_ll(&sll, ll);
  // CHECK: atomicrmw umax i64* {{.*}} seq_cst, align 8
  __nvvm_atom_max_gen_ull((unsigned long long *)&sll, ll);

  // CHECK: atomicrmw min i32* {{.*}} seq_cst, align 4
  __nvvm_atom_min_gen_i(ip, i);
  // CHECK: atomicrmw umin i32* {{.*}} seq_cst, align 4
  __nvvm_atom_min_gen_ui((unsigned int *)ip, i);
  // CHECK: atomicrmw min i{{32|64}}* {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_min_gen_l(&dl, l);
  // CHECK: atomicrmw umin i{{32|64}}* {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_min_gen_ul((unsigned long *)&dl, l);
  // CHECK: atomicrmw min i64* {{.*}} seq_cst, align 8
  __nvvm_atom_min_gen_ll(&sll, ll);
  // CHECK: atomicrmw umin i64* {{.*}} seq_cst, align 8
  __nvvm_atom_min_gen_ull((unsigned long long *)&sll, ll);

  // CHECK: cmpxchg i32* {{.*}} seq_cst seq_cst, align 4
  // CHECK-NEXT: extractvalue { i32, i1 } {{%[0-9]+}}, 0
  __nvvm_atom_cas_gen_i(ip, 0, i);
  // CHECK: cmpxchg i{{32|64}}* {{.*}} seq_cst seq_cst, align {{4|8}}
  // CHECK-NEXT: extractvalue { {{i32|i64}}, i1 } {{%[0-9]+}}, 0
  __nvvm_atom_cas_gen_l(&dl, 0, l);
  // CHECK: cmpxchg i64* {{.*}} seq_cst seq_cst, align 8
  // CHECK-NEXT: extractvalue { i64, i1 } {{%[0-9]+}}, 0
  __nvvm_atom_cas_gen_ll(&sll, 0, ll);

  // CHECK: atomicrmw fadd float* {{.*}} seq_cst, align 4
  __nvvm_atom_add_gen_f(fp, f);

  // CHECK: call i32 @llvm.nvvm.atomic.load.inc.32.p0i32
  __nvvm_atom_inc_gen_ui(uip, ui);

  // CHECK: call i32 @llvm.nvvm.atomic.load.dec.32.p0i32
  __nvvm_atom_dec_gen_ui(uip, ui);


  //////////////////////////////////////////////////////////////////
  // Atomics with scope (only supported on sm_60+).

#if ERROR_CHECK || __CUDA_ARCH__ >= 600

  // CHECK: call i32 @llvm.nvvm.atomic.add.gen.i.cta.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.add.gen.i.cta.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.add.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.add.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_ll(&sll, ll);
  // CHECK: call i32 @llvm.nvvm.atomic.add.gen.i.sys.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.add.gen.i.sys.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.add.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.add.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_ll(&sll, ll);

  // CHECK: call float @llvm.nvvm.atomic.add.gen.f.cta.f32.p0f32
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_f' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_f(fp, f);
  // CHECK: call double @llvm.nvvm.atomic.add.gen.f.cta.f64.p0f64
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_d' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_d(dfp, df);
  // CHECK: call float @llvm.nvvm.atomic.add.gen.f.sys.f32.p0f32
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_f' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_f(fp, f);
  // CHECK: call double @llvm.nvvm.atomic.add.gen.f.sys.f64.p0f64
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_d' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_d(dfp, df);

  // CHECK: call i32 @llvm.nvvm.atomic.exch.gen.i.cta.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_cta_xchg_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_xchg_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.exch.gen.i.cta.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.exch.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_xchg_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_xchg_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.exch.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_xchg_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_xchg_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.exch.gen.i.sys.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_sys_xchg_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_xchg_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.exch.gen.i.sys.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.exch.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_xchg_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_xchg_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.exch.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_xchg_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_xchg_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_i(ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ui' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_ui((unsigned int *)ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_l(&dl, l);
  // LP32: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ul' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_ul((unsigned long *)lp, l);
  // CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_ll(&sll, ll);
  // CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ull' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_ull((unsigned long long *)llp, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_i(ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ui' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_ui((unsigned int *)ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_l(&dl, l);
  // LP32: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ul' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_ul((unsigned long *)lp, l);
  // CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_ll(&sll, ll);
  // CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ull' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_ull((unsigned long long *)llp, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_i(ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ui' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_ui((unsigned int *)ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_l(&dl, l);
  // LP32: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ul' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_ul((unsigned long *)lp, l);
  // CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_ll(&sll, ll);
  // CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ull' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_ull((unsigned long long *)llp, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_i(ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ui' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_ui((unsigned int *)ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_l(&dl, l);
  // LP32: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ul' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_ul((unsigned long *)lp, l);
  // CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_ll(&sll, ll);
  // CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ull' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_ull((unsigned long long *)llp, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.inc.gen.i.cta.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_cta_inc_gen_ui' needs target feature sm_60}}
  __nvvm_atom_cta_inc_gen_ui((unsigned int *)ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.inc.gen.i.sys.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_sys_inc_gen_ui' needs target feature sm_60}}
  __nvvm_atom_sys_inc_gen_ui((unsigned int *)ip, i);

  // CHECK: call i32 @llvm.nvvm.atomic.dec.gen.i.cta.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_cta_dec_gen_ui' needs target feature sm_60}}
  __nvvm_atom_cta_dec_gen_ui((unsigned int *)ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.dec.gen.i.sys.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_sys_dec_gen_ui' needs target feature sm_60}}
  __nvvm_atom_sys_dec_gen_ui((unsigned int *)ip, i);

  // CHECK: call i32 @llvm.nvvm.atomic.and.gen.i.cta.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_cta_and_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_and_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.and.gen.i.cta.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.and.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_and_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_and_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.and.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_and_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_and_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.and.gen.i.sys.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_sys_and_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_and_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.and.gen.i.sys.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.and.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_and_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_and_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.and.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_and_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_and_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.or.gen.i.cta.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_cta_or_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_or_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.or.gen.i.cta.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.or.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_or_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_or_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.or.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_or_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_or_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.or.gen.i.sys.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_sys_or_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_or_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.or.gen.i.sys.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.or.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_or_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_or_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.or.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_or_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_or_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.xor.gen.i.cta.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_cta_xor_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_xor_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.xor.gen.i.cta.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.xor.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_xor_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_xor_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.xor.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_xor_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_xor_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.xor.gen.i.sys.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_sys_xor_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_xor_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.xor.gen.i.sys.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.xor.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_xor_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_xor_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.xor.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_xor_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_xor_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.cas.gen.i.cta.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_cta_cas_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_cas_gen_i(ip, i, 0);
  // LP32: call i32 @llvm.nvvm.atomic.cas.gen.i.cta.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.cas.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_cas_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_cas_gen_l(&dl, l, 0);
  // CHECK: call i64 @llvm.nvvm.atomic.cas.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_cas_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_cas_gen_ll(&sll, ll, 0);

  // CHECK: call i32 @llvm.nvvm.atomic.cas.gen.i.sys.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_sys_cas_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_cas_gen_i(ip, i, 0);
  // LP32: call i32 @llvm.nvvm.atomic.cas.gen.i.sys.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.cas.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_cas_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_cas_gen_l(&dl, l, 0);
  // CHECK: call i64 @llvm.nvvm.atomic.cas.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_cas_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_cas_gen_ll(&sll, ll, 0);
#endif

  // CHECK: ret
}
554
555 // CHECK-LABEL: nvvm_ldg
// Exercises the __nvvm_ldg_* (read-only global load) builtins for scalar and
// ext_vector types, checking each lowers to an llvm.nvvm.ldg.global.{i,f}.*
// intrinsic whose last operand is the alignment Clang assigns to the
// pointee type.
__device__ void nvvm_ldg(const void *p) {
  // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0i8(i8* {{%[0-9]+}}, i32 1)
  // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0i8(i8* {{%[0-9]+}}, i32 1)
  __nvvm_ldg_c((const char *)p);
  __nvvm_ldg_uc((const unsigned char *)p);

  // CHECK: call i16 @llvm.nvvm.ldg.global.i.i16.p0i16(i16* {{%[0-9]+}}, i32 2)
  // CHECK: call i16 @llvm.nvvm.ldg.global.i.i16.p0i16(i16* {{%[0-9]+}}, i32 2)
  __nvvm_ldg_s((const short *)p);
  __nvvm_ldg_us((const unsigned short *)p);

  // CHECK: call i32 @llvm.nvvm.ldg.global.i.i32.p0i32(i32* {{%[0-9]+}}, i32 4)
  // CHECK: call i32 @llvm.nvvm.ldg.global.i.i32.p0i32(i32* {{%[0-9]+}}, i32 4)
  __nvvm_ldg_i((const int *)p);
  __nvvm_ldg_ui((const unsigned int *)p);

  // LP32: call i32 @llvm.nvvm.ldg.global.i.i32.p0i32(i32* {{%[0-9]+}}, i32 4)
  // LP32: call i32 @llvm.nvvm.ldg.global.i.i32.p0i32(i32* {{%[0-9]+}}, i32 4)
  // LP64: call i64 @llvm.nvvm.ldg.global.i.i64.p0i64(i64* {{%[0-9]+}}, i32 8)
  // LP64: call i64 @llvm.nvvm.ldg.global.i.i64.p0i64(i64* {{%[0-9]+}}, i32 8)
  __nvvm_ldg_l((const long *)p);
  __nvvm_ldg_ul((const unsigned long *)p);

  // CHECK: call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* {{%[0-9]+}}, i32 4)
  __nvvm_ldg_f((const float *)p);
  // CHECK: call double @llvm.nvvm.ldg.global.f.f64.p0f64(double* {{%[0-9]+}}, i32 8)
  __nvvm_ldg_d((const double *)p);

  // In practice, the pointers we pass to __ldg will be aligned as appropriate
  // for the CUDA <type>N vector types (e.g. short4), which are not the same as
  // the LLVM vector types. However, each LLVM vector type has an alignment
  // less than or equal to its corresponding CUDA type, so we're OK.
  //
  // PTX Interoperability section 2.2: "For a vector with an even number of
  // elements, its alignment is set to number of elements times the alignment of
  // its member: n*alignof(t)."

  // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0v2i8(<2 x i8>* {{%[0-9]+}}, i32 2)
  // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0v2i8(<2 x i8>* {{%[0-9]+}}, i32 2)
  typedef char char2 __attribute__((ext_vector_type(2)));
  typedef unsigned char uchar2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_c2((const char2 *)p);
  __nvvm_ldg_uc2((const uchar2 *)p);

  // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0v4i8(<4 x i8>* {{%[0-9]+}}, i32 4)
  // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0v4i8(<4 x i8>* {{%[0-9]+}}, i32 4)
  typedef char char4 __attribute__((ext_vector_type(4)));
  typedef unsigned char uchar4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_c4((const char4 *)p);
  __nvvm_ldg_uc4((const uchar4 *)p);

  // CHECK: call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0v2i16(<2 x i16>* {{%[0-9]+}}, i32 4)
  // CHECK: call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0v2i16(<2 x i16>* {{%[0-9]+}}, i32 4)
  typedef short short2 __attribute__((ext_vector_type(2)));
  typedef unsigned short ushort2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_s2((const short2 *)p);
  __nvvm_ldg_us2((const ushort2 *)p);

  // CHECK: call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0v4i16(<4 x i16>* {{%[0-9]+}}, i32 8)
  // CHECK: call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0v4i16(<4 x i16>* {{%[0-9]+}}, i32 8)
  typedef short short4 __attribute__((ext_vector_type(4)));
  typedef unsigned short ushort4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_s4((const short4 *)p);
  __nvvm_ldg_us4((const ushort4 *)p);

  // CHECK: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0v2i32(<2 x i32>* {{%[0-9]+}}, i32 8)
  // CHECK: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0v2i32(<2 x i32>* {{%[0-9]+}}, i32 8)
  typedef int int2 __attribute__((ext_vector_type(2)));
  typedef unsigned int uint2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_i2((const int2 *)p);
  __nvvm_ldg_ui2((const uint2 *)p);

  // CHECK: call <4 x i32> @llvm.nvvm.ldg.global.i.v4i32.p0v4i32(<4 x i32>* {{%[0-9]+}}, i32 16)
  // CHECK: call <4 x i32> @llvm.nvvm.ldg.global.i.v4i32.p0v4i32(<4 x i32>* {{%[0-9]+}}, i32 16)
  typedef int int4 __attribute__((ext_vector_type(4)));
  typedef unsigned int uint4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_i4((const int4 *)p);
  __nvvm_ldg_ui4((const uint4 *)p);

  // CHECK: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0v2i64(<2 x i64>* {{%[0-9]+}}, i32 16)
  // CHECK: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0v2i64(<2 x i64>* {{%[0-9]+}}, i32 16)
  typedef long long longlong2 __attribute__((ext_vector_type(2)));
  typedef unsigned long long ulonglong2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_ll2((const longlong2 *)p);
  __nvvm_ldg_ull2((const ulonglong2 *)p);

  // CHECK: call <2 x float> @llvm.nvvm.ldg.global.f.v2f32.p0v2f32(<2 x float>* {{%[0-9]+}}, i32 8)
  typedef float float2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_f2((const float2 *)p);

  // CHECK: call <4 x float> @llvm.nvvm.ldg.global.f.v4f32.p0v4f32(<4 x float>* {{%[0-9]+}}, i32 16)
  typedef float float4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_f4((const float4 *)p);

  // CHECK: call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0v2f64(<2 x double>* {{%[0-9]+}}, i32 16)
  typedef double double2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_d2((const double2 *)p);
}
654
// Verifies that the legacy (non-sync) warp-shuffle builtins lower to the
// corresponding llvm.nvvm.shfl.* intrinsics.
// CHECK-LABEL: nvvm_shfl
__device__ void nvvm_shfl(int i, float f, int a, int b) {
  // CHECK: call i32 @llvm.nvvm.shfl.down.i32(i32
  __nvvm_shfl_down_i32(i, a, b);
  // CHECK: call float @llvm.nvvm.shfl.down.f32(float
  __nvvm_shfl_down_f32(f, a, b);
  // CHECK: call i32 @llvm.nvvm.shfl.up.i32(i32
  __nvvm_shfl_up_i32(i, a, b);
  // CHECK: call float @llvm.nvvm.shfl.up.f32(float
  __nvvm_shfl_up_f32(f, a, b);
  // CHECK: call i32 @llvm.nvvm.shfl.bfly.i32(i32
  __nvvm_shfl_bfly_i32(i, a, b);
  // CHECK: call float @llvm.nvvm.shfl.bfly.f32(float
  __nvvm_shfl_bfly_f32(f, a, b);
  // CHECK: call i32 @llvm.nvvm.shfl.idx.i32(i32
  __nvvm_shfl_idx_i32(i, a, b);
  // CHECK: call float @llvm.nvvm.shfl.idx.f32(float
  __nvvm_shfl_idx_f32(f, a, b);
  // CHECK: ret void
}

// Verifies that the legacy (non-sync) vote builtins lower to the
// corresponding llvm.nvvm.vote.* intrinsics.
__device__ void nvvm_vote(int pred) {
  // CHECK: call i1 @llvm.nvvm.vote.all(i1
  __nvvm_vote_all(pred);
  // CHECK: call i1 @llvm.nvvm.vote.any(i1
  __nvvm_vote_any(pred);
  // CHECK: call i1 @llvm.nvvm.vote.uni(i1
  __nvvm_vote_uni(pred);
  // CHECK: call i32 @llvm.nvvm.vote.ballot(i1
  __nvvm_vote_ballot(pred);
  // CHECK: ret void
}

// Exercises the SM80 mbarrier builtins, both the generic-pointer forms and
// the shared-memory (address space 3) forms; guarded so the body is only
// compiled for __CUDA_ARCH__ >= 800.
// CHECK-LABEL: nvvm_mbarrier
__device__ void nvvm_mbarrier(long long* addr, __attribute__((address_space(3))) long long* sharedAddr, int count, long long state) {
#if __CUDA_ARCH__ >= 800
  __nvvm_mbarrier_init(addr, count);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.init
  __nvvm_mbarrier_init_shared(sharedAddr, count);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.init.shared

  __nvvm_mbarrier_inval(addr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.inval
  __nvvm_mbarrier_inval_shared(sharedAddr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.inval.shared

  __nvvm_mbarrier_arrive(addr);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive
  __nvvm_mbarrier_arrive_shared(sharedAddr);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.shared
  __nvvm_mbarrier_arrive_noComplete(addr, count);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.noComplete
  __nvvm_mbarrier_arrive_noComplete_shared(sharedAddr, count);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.noComplete.shared

  __nvvm_mbarrier_arrive_drop(addr);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop
  __nvvm_mbarrier_arrive_drop_shared(sharedAddr);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.shared
  __nvvm_mbarrier_arrive_drop_noComplete(addr, count);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete
  __nvvm_mbarrier_arrive_drop_noComplete_shared(sharedAddr, count);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete.shared

  __nvvm_mbarrier_test_wait(addr, state);
  // CHECK_PTX70_SM80: call i1 @llvm.nvvm.mbarrier.test.wait
  __nvvm_mbarrier_test_wait_shared(sharedAddr, state);
  // CHECK_PTX70_SM80: call i1 @llvm.nvvm.mbarrier.test.wait.shared

  __nvvm_mbarrier_pending_count(state);
  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.mbarrier.pending.count
#endif
  // CHECK: ret void
}

// Exercises the SM80 cp.async builtins: mbarrier-arrive variants, the
// shared<-global copy builtins (4/8/16 bytes, ca and cg), and the
// commit/wait group builtins. Guarded for __CUDA_ARCH__ >= 800.
// CHECK-LABEL: nvvm_async_copy
__device__ void nvvm_async_copy(__attribute__((address_space(3))) void* dst, __attribute__((address_space(1))) const void* src, long long* addr, __attribute__((address_space(3))) long long* sharedAddr) {
#if __CUDA_ARCH__ >= 800
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive
  __nvvm_cp_async_mbarrier_arrive(addr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.shared
  __nvvm_cp_async_mbarrier_arrive_shared(sharedAddr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc
  __nvvm_cp_async_mbarrier_arrive_noinc(addr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared
  __nvvm_cp_async_mbarrier_arrive_noinc_shared(sharedAddr);

  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.4
  __nvvm_cp_async_ca_shared_global_4(dst, src);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.8
  __nvvm_cp_async_ca_shared_global_8(dst, src);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.16
  __nvvm_cp_async_ca_shared_global_16(dst, src);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.cg.shared.global.16
  __nvvm_cp_async_cg_shared_global_16(dst, src);

  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.commit.group
  __nvvm_cp_async_commit_group();
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 0)
  __nvvm_cp_async_wait_group(0);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 8)
  __nvvm_cp_async_wait_group(8);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 16)
  __nvvm_cp_async_wait_group(16);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.all
  __nvvm_cp_async_wait_all();
#endif
  // CHECK: ret void
}

// Exercises the SM80 float-to-bf16/f16x2/tf32 conversion builtins with
// constant operands, checking the constants survive into the intrinsic
// calls. Guarded for __CUDA_ARCH__ >= 800.
// CHECK-LABEL: nvvm_cvt_sm80
__device__ void nvvm_cvt_sm80() {
#if __CUDA_ARCH__ >= 800
  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.ff2bf16x2.rn(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2bf16x2_rn(1, 1);
  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.ff2bf16x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2bf16x2_rn_relu(1, 1);
  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.ff2bf16x2.rz(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2bf16x2_rz(1, 1);
  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.ff2bf16x2.rz.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2bf16x2_rz_relu(1, 1);

  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rn(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2f16x2_rn(1, 1);
  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2f16x2_rn_relu(1, 1);
  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rz(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2f16x2_rz(1, 1);
  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rz.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2f16x2_rz_relu(1, 1);

  // CHECK_PTX70_SM80: call i16 @llvm.nvvm.f2bf16.rn(float 1.000000e+00)
  __nvvm_f2bf16_rn(1);
  // CHECK_PTX70_SM80: call i16 @llvm.nvvm.f2bf16.rn.relu(float 1.000000e+00)
  __nvvm_f2bf16_rn_relu(1);
  // CHECK_PTX70_SM80: call i16 @llvm.nvvm.f2bf16.rz(float 1.000000e+00)
  __nvvm_f2bf16_rz(1);
  // CHECK_PTX70_SM80: call i16 @llvm.nvvm.f2bf16.rz.relu(float 1.000000e+00)
  __nvvm_f2bf16_rz_relu(1);

  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.f2tf32.rna(float 1.000000e+00)
  __nvvm_f2tf32_rna(1);
#endif
  // CHECK: ret void
}

// Exercises the SM80 bf16/bf16x2 abs and neg builtins; the all-ones bit
// patterns appear in the IR as the signed constant -1.
// CHECK-LABEL: nvvm_abs_neg_bf16_bf16x2_sm80
__device__ void nvvm_abs_neg_bf16_bf16x2_sm80() {
#if __CUDA_ARCH__ >= 800

  // CHECK_PTX70_SM80: call i16 @llvm.nvvm.abs.bf16(i16 -1)
  __nvvm_abs_bf16(0xFFFF);
  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.abs.bf16x2(i32 -1)
  __nvvm_abs_bf16x2(0xFFFFFFFF);

  // CHECK_PTX70_SM80: call i16 @llvm.nvvm.neg.bf16(i16 -1)
  __nvvm_neg_bf16(0xFFFF);
  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.neg.bf16x2(i32 -1)
  __nvvm_neg_bf16x2(0xFFFFFFFF);
#endif
  // CHECK: ret void
}

// Constant bit patterns passed as arguments to the min/max builtins; the
// names denote the intended f32/bf16 NaN encodings and representative
// bf16 / packed-bf16x2 values.
#define NAN32 0x7FBFFFFF
#define NAN16 0x7FBF
#define BF16 0x1234
#define BF16_2 0x4321
#define NANBF16 0xFFC1
#define BF16X2 0x12341234
#define BF16X2_2 0x32343234
#define NANBF16X2 0xFFC1FFC1

// Exercises the SM80 fmin/fmax builtins: the NaN-propagating and ftz f32
// forms plus the bf16 and packed bf16x2 forms. Guarded for
// __CUDA_ARCH__ >= 800.
// CHECK-LABEL: nvvm_min_max_sm80
__device__ void nvvm_min_max_sm80() {
#if __CUDA_ARCH__ >= 800

  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmin.nan.f
  __nvvm_fmin_nan_f(0.1f, (float)NAN32);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmin.ftz.nan.f
  __nvvm_fmin_ftz_nan_f(0.1f, (float)NAN32);

  // CHECK_PTX70_SM80: call i16 @llvm.nvvm.fmin.bf16
  __nvvm_fmin_bf16(BF16, BF16_2);
  // CHECK_PTX70_SM80: call i16 @llvm.nvvm.fmin.nan.bf16
  __nvvm_fmin_nan_bf16(BF16, NANBF16);
  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.fmin.bf16x2
  __nvvm_fmin_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.fmin.nan.bf16x2
  __nvvm_fmin_nan_bf16x2(BF16X2, NANBF16X2);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.nan.f
  __nvvm_fmax_nan_f(0.1f, 0.11f);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.ftz.nan.f
  __nvvm_fmax_ftz_nan_f(0.1f, (float)NAN32);

  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.nan.f
  __nvvm_fmax_nan_f(0.1f, (float)NAN32);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.ftz.nan.f
  __nvvm_fmax_ftz_nan_f(0.1f, (float)NAN32);
  // CHECK_PTX70_SM80: call i16 @llvm.nvvm.fmax.bf16
  __nvvm_fmax_bf16(BF16, BF16_2);
  // CHECK_PTX70_SM80: call i16 @llvm.nvvm.fmax.nan.bf16
  __nvvm_fmax_nan_bf16(BF16, NANBF16);
  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.fmax.bf16x2
  __nvvm_fmax_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.fmax.nan.bf16x2
  __nvvm_fmax_nan_bf16x2(NANBF16X2, BF16X2);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.nan.f
  __nvvm_fmax_nan_f(0.1f, (float)NAN32);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.ftz.nan.f
  __nvvm_fmax_ftz_nan_f(0.1f, (float)NAN32);

#endif
  // CHECK: ret void
}

// Exercises the SM80 bf16 and packed bf16x2 fused-multiply-add builtins
// (round-to-nearest and the relu-saturating variants).
// CHECK-LABEL: nvvm_fma_bf16_bf16x2_sm80
__device__ void nvvm_fma_bf16_bf16x2_sm80() {
#if __CUDA_ARCH__ >= 800
  // CHECK_PTX70_SM80: call i16 @llvm.nvvm.fma.rn.bf16
  __nvvm_fma_rn_bf16(0x1234, 0x7FBF, 0x1234);
  // CHECK_PTX70_SM80: call i16 @llvm.nvvm.fma.rn.relu.bf16
  __nvvm_fma_rn_relu_bf16(0x1234, 0x7FBF, 0x1234);
  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.fma.rn.bf16x2
  __nvvm_fma_rn_bf16x2(0x7FBFFFFF, 0xFFFFFFFF, 0x7FBFFFFF);
  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.fma.rn.relu.bf16x2
  __nvvm_fma_rn_relu_bf16x2(0x7FBFFFFF, 0xFFFFFFFF, 0x7FBFFFFF);
#endif
  // CHECK: ret void
}

// Exercises the SM86 xorsign.abs fmin/fmax builtins in all variants
// (plain, ftz, nan-propagating, and their combinations) for f32, bf16,
// and packed bf16x2 operands. Guarded for __CUDA_ARCH__ >= 860.
// CHECK-LABEL: nvvm_min_max_sm86
__device__ void nvvm_min_max_sm86() {
#if __CUDA_ARCH__ >= 860

  // CHECK_PTX72_SM86: call i16 @llvm.nvvm.fmin.xorsign.abs.bf16
  __nvvm_fmin_xorsign_abs_bf16(BF16, BF16_2);
  // CHECK_PTX72_SM86: call i16 @llvm.nvvm.fmin.nan.xorsign.abs.bf16
  __nvvm_fmin_nan_xorsign_abs_bf16(BF16, NANBF16);
  // CHECK_PTX72_SM86: call i32 @llvm.nvvm.fmin.xorsign.abs.bf16x2
  __nvvm_fmin_xorsign_abs_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX72_SM86: call i32 @llvm.nvvm.fmin.nan.xorsign.abs.bf16x2
  __nvvm_fmin_nan_xorsign_abs_bf16x2(BF16X2, NANBF16X2);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.xorsign.abs.f
  __nvvm_fmin_xorsign_abs_f(-0.1f, 0.1f);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.ftz.xorsign.abs.f
  __nvvm_fmin_ftz_xorsign_abs_f(-0.1f, 0.1f);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.nan.xorsign.abs.f
  __nvvm_fmin_nan_xorsign_abs_f(-0.1f, (float)NAN32);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f
  __nvvm_fmin_ftz_nan_xorsign_abs_f(-0.1f, (float)NAN32);

  // CHECK_PTX72_SM86: call i16 @llvm.nvvm.fmax.xorsign.abs.bf16
  __nvvm_fmax_xorsign_abs_bf16(BF16, BF16_2);
  // CHECK_PTX72_SM86: call i16 @llvm.nvvm.fmax.nan.xorsign.abs.bf16
  __nvvm_fmax_nan_xorsign_abs_bf16(BF16, NANBF16);
  // CHECK_PTX72_SM86: call i32 @llvm.nvvm.fmax.xorsign.abs.bf16x2
  __nvvm_fmax_xorsign_abs_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX72_SM86: call i32 @llvm.nvvm.fmax.nan.xorsign.abs.bf16x2
  __nvvm_fmax_nan_xorsign_abs_bf16x2(BF16X2, NANBF16X2);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.xorsign.abs.f
  __nvvm_fmax_xorsign_abs_f(-0.1f, 0.1f);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.ftz.xorsign.abs.f
  __nvvm_fmax_ftz_xorsign_abs_f(-0.1f, 0.1f);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.nan.xorsign.abs.f
  __nvvm_fmax_nan_xorsign_abs_f(-0.1f, (float)NAN32);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f
  __nvvm_fmax_ftz_nan_xorsign_abs_f(-0.1f, (float)NAN32);
#endif
  // CHECK: ret void
}
