// REQUIRES: nvptx-registered-target
// RUN: %clang_cc1 -no-opaque-pointers -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
// RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP32 %s
// RUN: %clang_cc1 -no-opaque-pointers -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
// RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP64 %s
// RUN: %clang_cc1 -no-opaque-pointers -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_60 \
// RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=LP32 %s
// RUN: %clang_cc1 -no-opaque-pointers -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_60 \
// RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s
// RUN: %clang_cc1 -no-opaque-pointers -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_61 \
// RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s
// RUN: %clang_cc1 -no-opaque-pointers -triple nvptx-unknown-unknown -target-cpu sm_53 \
// RUN:   -DERROR_CHECK -fcuda-is-device -S -o /dev/null -x cuda -verify %s
// RUN: %clang_cc1 -no-opaque-pointers -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \
// RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP32 %s
// RUN: %clang_cc1 -no-opaque-pointers -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \
// RUN:            -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
// RUN:   | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP64 %s

// CUDA attribute shorthands; this test compiles as plain CUDA without the
// CUDA headers, so the usual qualifier macros are defined by hand.
#define __device__ __attribute__((device))
#define __global__ __attribute__((global))
#define __shared__ __attribute__((shared))
#define __constant__ __attribute__((constant))

// Reads the thread-index special registers (%tid.{x,y,z,w}).
__device__ int read_tid() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.w()

  int x = __nvvm_read_ptx_sreg_tid_x();
  int y = __nvvm_read_ptx_sreg_tid_y();
  int z = __nvvm_read_ptx_sreg_tid_z();
  int w = __nvvm_read_ptx_sreg_tid_w();

  return x + y + z + w;

}

// Reads the block-dimension special registers (%ntid.{x,y,z,w}).
__device__ int read_ntid() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.w()

  int x = __nvvm_read_ptx_sreg_ntid_x();
  int y = __nvvm_read_ptx_sreg_ntid_y();
  int z = __nvvm_read_ptx_sreg_ntid_z();
  int w = __nvvm_read_ptx_sreg_ntid_w();

  return x + y + z + w;

}

// Reads the block-index special registers (%ctaid.{x,y,z,w}).
__device__ int read_ctaid() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.w()

  int x = __nvvm_read_ptx_sreg_ctaid_x();
  int y = __nvvm_read_ptx_sreg_ctaid_y();
  int z = __nvvm_read_ptx_sreg_ctaid_z();
  int w = __nvvm_read_ptx_sreg_ctaid_w();

  return x + y + z + w;

}

// Reads the grid-dimension special registers (%nctaid.{x,y,z,w}).
__device__ int read_nctaid() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.w()

  int x = __nvvm_read_ptx_sreg_nctaid_x();
  int y = __nvvm_read_ptx_sreg_nctaid_y();
  int z = __nvvm_read_ptx_sreg_nctaid_z();
  int w = __nvvm_read_ptx_sreg_nctaid_w();

  return x + y + z + w;

}

// Reads the lane/warp/SM/grid identifier special registers.
__device__ int read_ids() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.laneid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.warpid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nwarpid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.smid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nsmid()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.gridid()

  int a = __nvvm_read_ptx_sreg_laneid();
  int b = __nvvm_read_ptx_sreg_warpid();
  int c = __nvvm_read_ptx_sreg_nwarpid();
  int d = __nvvm_read_ptx_sreg_smid();
  int e = __nvvm_read_ptx_sreg_nsmid();
  int f = __nvvm_read_ptx_sreg_gridid();

  return a + b + c + d + e + f;

}

// Reads the lanemask comparison special registers (%lanemask_{eq,le,lt,ge,gt}).
__device__ int read_lanemasks() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.eq()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.le()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.lt()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.ge()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.gt()

  int a = __nvvm_read_ptx_sreg_lanemask_eq();
  int b = __nvvm_read_ptx_sreg_lanemask_le();
  int c = __nvvm_read_ptx_sreg_lanemask_lt();
  int d = __nvvm_read_ptx_sreg_lanemask_ge();
  int e = __nvvm_read_ptx_sreg_lanemask_gt();

  return a + b + c + d + e;

}

// Reads the 32- and 64-bit clock special registers.
__device__ long long read_clocks() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.clock()
// CHECK: call i64 @llvm.nvvm.read.ptx.sreg.clock64()

  int a = __nvvm_read_ptx_sreg_clock();
  long long b = __nvvm_read_ptx_sreg_clock64();

  return a + b;
}

// Reads the performance-monitor special registers (%pm0-%pm3).
__device__ int read_pms() {

// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm0()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm1()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm2()
// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm3()

  int a = __nvvm_read_ptx_sreg_pm0();
  int b = __nvvm_read_ptx_sreg_pm1();
  int c = __nvvm_read_ptx_sreg_pm2();
  int d = __nvvm_read_ptx_sreg_pm3();

  return a + b + c + d;

}

// Exercises the named barrier-sync builtin.
__device__ void sync() {

// CHECK: call void @llvm.nvvm.bar.sync(i32 0)

  __nvvm_bar_sync(0);

}


// NVVM intrinsics

// The idea is not to test all intrinsics, just that Clang is recognizing the
// builtins defined in BuiltinsNVPTX.def
// Spot-checks float/double math builtins and the membar/barrier builtins.
__device__ void nvvm_math(float f1, float f2, double d1, double d2) {
// CHECK: call float @llvm.nvvm.fmax.f
  float t1 = __nvvm_fmax_f(f1, f2);
// CHECK: call float @llvm.nvvm.fmin.f
  float t2 = __nvvm_fmin_f(f1, f2);
// CHECK: call float @llvm.nvvm.sqrt.rn.f
  float t3 = __nvvm_sqrt_rn_f(f1);
// CHECK: call float @llvm.nvvm.rcp.rn.f
  float t4 = __nvvm_rcp_rn_f(f2);
// CHECK: call float @llvm.nvvm.add.rn.f
  float t5 = __nvvm_add_rn_f(f1, f2);

// CHECK: call double @llvm.nvvm.fmax.d
  double td1 = __nvvm_fmax_d(d1, d2);
// CHECK: call double @llvm.nvvm.fmin.d
  double td2 = __nvvm_fmin_d(d1, d2);
// CHECK: call double @llvm.nvvm.sqrt.rn.d
  double td3 = __nvvm_sqrt_rn_d(d1);
// CHECK: call double @llvm.nvvm.rcp.rn.d
  double td4 = __nvvm_rcp_rn_d(d2);

// CHECK: call void @llvm.nvvm.membar.cta()
  __nvvm_membar_cta();
// CHECK: call void @llvm.nvvm.membar.gl()
  __nvvm_membar_gl();
// CHECK: call void @llvm.nvvm.membar.sys()
  __nvvm_membar_sys();
// CHECK: call void @llvm.nvvm.barrier0()
  __syncthreads();
}

// Global-/shared-space variables used as atomic targets below.
__device__ int di;
__shared__ int si;
__device__ long dl;
__shared__ long sl;
__device__ long long dll;
__shared__ long long sll;

// Check for atomic intrinsics
// CHECK-LABEL: nvvm_atom
__device__ void nvvm_atom(float *fp, float f, double *dfp, double df, int *ip,
                          int i, unsigned int *uip, unsigned ui, long *lp,
                          long l, long long *llp, long long ll) {
  // CHECK: atomicrmw add i32* {{.*}} seq_cst, align 4
  __nvvm_atom_add_gen_i(ip, i);
  // CHECK: atomicrmw add i{{32|64}}* {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_add_gen_l(&dl, l);
  // CHECK: atomicrmw add i64* {{.*}} seq_cst, align 8
  __nvvm_atom_add_gen_ll(&sll, ll);

  // CHECK: atomicrmw sub i32* {{.*}} seq_cst, align 4
  __nvvm_atom_sub_gen_i(ip, i);
  // CHECK: atomicrmw sub i{{32|64}}* {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_sub_gen_l(&dl, l);
  // CHECK: atomicrmw sub i64* {{.*}} seq_cst, align 8
  __nvvm_atom_sub_gen_ll(&sll, ll);

  // CHECK: atomicrmw and i32* {{.*}} seq_cst, align 4
  __nvvm_atom_and_gen_i(ip, i);
  // CHECK: atomicrmw and i{{32|64}}* {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_and_gen_l(&dl, l);
  // CHECK: atomicrmw and i64* {{.*}} seq_cst, align 8
  __nvvm_atom_and_gen_ll(&sll, ll);

  // CHECK: atomicrmw or i32* {{.*}} seq_cst, align 4
  __nvvm_atom_or_gen_i(ip, i);
  // CHECK: atomicrmw or i{{32|64}}* {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_or_gen_l(&dl, l);
  // CHECK: atomicrmw or i64* {{.*}} seq_cst, align 8
  __nvvm_atom_or_gen_ll(&sll, ll);

  // CHECK: atomicrmw xor i32* {{.*}} seq_cst, align 4
  __nvvm_atom_xor_gen_i(ip, i);
  // CHECK: atomicrmw xor i{{32|64}}* {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_xor_gen_l(&dl, l);
  // CHECK: atomicrmw xor i64* {{.*}} seq_cst, align 8
  __nvvm_atom_xor_gen_ll(&sll, ll);

  // CHECK: atomicrmw xchg i32* {{.*}} seq_cst, align 4
  __nvvm_atom_xchg_gen_i(ip, i);
  // CHECK: atomicrmw xchg i{{32|64}}* {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_xchg_gen_l(&dl, l);
  // CHECK: atomicrmw xchg i64* {{.*}} seq_cst, align 8
  __nvvm_atom_xchg_gen_ll(&sll, ll);

  // CHECK: atomicrmw max i32* {{.*}} seq_cst, align 4
  __nvvm_atom_max_gen_i(ip, i);
  // CHECK: atomicrmw umax i32* {{.*}} seq_cst, align 4
  __nvvm_atom_max_gen_ui((unsigned int *)ip, i);
  // CHECK: atomicrmw max i{{32|64}}* {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_max_gen_l(&dl, l);
  // CHECK: atomicrmw umax i{{32|64}}* {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_max_gen_ul((unsigned long *)&dl, l);
  // CHECK: atomicrmw max i64* {{.*}} seq_cst, align 8
  __nvvm_atom_max_gen_ll(&sll, ll);
  // CHECK: atomicrmw umax i64* {{.*}} seq_cst, align 8
  __nvvm_atom_max_gen_ull((unsigned long long *)&sll, ll);

  // CHECK: atomicrmw min i32* {{.*}} seq_cst, align 4
  __nvvm_atom_min_gen_i(ip, i);
  // CHECK: atomicrmw umin i32* {{.*}} seq_cst, align 4
  __nvvm_atom_min_gen_ui((unsigned int *)ip, i);
  // CHECK: atomicrmw min i{{32|64}}* {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_min_gen_l(&dl, l);
  // CHECK: atomicrmw umin i{{32|64}}* {{.*}} seq_cst, align {{4|8}}
  __nvvm_atom_min_gen_ul((unsigned long *)&dl, l);
  // CHECK: atomicrmw min i64* {{.*}} seq_cst, align 8
  __nvvm_atom_min_gen_ll(&sll, ll);
  // CHECK: atomicrmw umin i64* {{.*}} seq_cst, align 8
  __nvvm_atom_min_gen_ull((unsigned long long *)&sll, ll);

  // CHECK: cmpxchg i32* {{.*}} seq_cst seq_cst, align 4
  // CHECK-NEXT: extractvalue { i32, i1 } {{%[0-9]+}}, 0
  __nvvm_atom_cas_gen_i(ip, 0, i);
  // CHECK: cmpxchg i{{32|64}}* {{.*}} seq_cst seq_cst, align {{4|8}}
  // CHECK-NEXT: extractvalue { {{i32|i64}}, i1 } {{%[0-9]+}}, 0
  __nvvm_atom_cas_gen_l(&dl, 0, l);
  // CHECK: cmpxchg i64* {{.*}} seq_cst seq_cst, align 8
  // CHECK-NEXT: extractvalue { i64, i1 } {{%[0-9]+}}, 0
  __nvvm_atom_cas_gen_ll(&sll, 0, ll);

  // CHECK: atomicrmw fadd float* {{.*}} seq_cst, align 4
  __nvvm_atom_add_gen_f(fp, f);

  // CHECK: call i32 @llvm.nvvm.atomic.load.inc.32.p0i32
  __nvvm_atom_inc_gen_ui(uip, ui);

  // CHECK: call i32 @llvm.nvvm.atomic.load.dec.32.p0i32
  __nvvm_atom_dec_gen_ui(uip, ui);


  //////////////////////////////////////////////////////////////////
  // Atomics with scope (only supported on sm_60+).

#if ERROR_CHECK || __CUDA_ARCH__ >= 600

  // CHECK: call i32 @llvm.nvvm.atomic.add.gen.i.cta.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.add.gen.i.cta.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.add.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.add.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_ll(&sll, ll);
  // CHECK: call i32 @llvm.nvvm.atomic.add.gen.i.sys.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.add.gen.i.sys.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.add.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.add.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_ll(&sll, ll);

  // CHECK: call float @llvm.nvvm.atomic.add.gen.f.cta.f32.p0f32
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_f' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_f(fp, f);
  // CHECK: call double @llvm.nvvm.atomic.add.gen.f.cta.f64.p0f64
  // expected-error@+1 {{'__nvvm_atom_cta_add_gen_d' needs target feature sm_60}}
  __nvvm_atom_cta_add_gen_d(dfp, df);
  // CHECK: call float @llvm.nvvm.atomic.add.gen.f.sys.f32.p0f32
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_f' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_f(fp, f);
  // CHECK: call double @llvm.nvvm.atomic.add.gen.f.sys.f64.p0f64
  // expected-error@+1 {{'__nvvm_atom_sys_add_gen_d' needs target feature sm_60}}
  __nvvm_atom_sys_add_gen_d(dfp, df);

  // CHECK: call i32 @llvm.nvvm.atomic.exch.gen.i.cta.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_cta_xchg_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_xchg_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.exch.gen.i.cta.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.exch.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_xchg_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_xchg_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.exch.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_xchg_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_xchg_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.exch.gen.i.sys.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_sys_xchg_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_xchg_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.exch.gen.i.sys.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.exch.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_xchg_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_xchg_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.exch.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_xchg_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_xchg_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_i(ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ui' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_ui((unsigned int *)ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_l(&dl, l);
  // LP32: call i32 @llvm.nvvm.atomic.max.gen.i.cta.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ul' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_ul((unsigned long *)lp, l);
  // CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_ll(&sll, ll);
  // CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_max_gen_ull' needs target feature sm_60}}
  __nvvm_atom_cta_max_gen_ull((unsigned long long *)llp, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_i(ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ui' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_ui((unsigned int *)ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_l(&dl, l);
  // LP32: call i32 @llvm.nvvm.atomic.max.gen.i.sys.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ul' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_ul((unsigned long *)lp, l);
  // CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_ll(&sll, ll);
  // CHECK: call i64 @llvm.nvvm.atomic.max.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_max_gen_ull' needs target feature sm_60}}
  __nvvm_atom_sys_max_gen_ull((unsigned long long *)llp, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_i(ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ui' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_ui((unsigned int *)ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_l(&dl, l);
  // LP32: call i32 @llvm.nvvm.atomic.min.gen.i.cta.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ul' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_ul((unsigned long *)lp, l);
  // CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_ll(&sll, ll);
  // CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_min_gen_ull' needs target feature sm_60}}
  __nvvm_atom_cta_min_gen_ull((unsigned long long *)llp, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_i(ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ui' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_ui((unsigned int *)ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_l(&dl, l);
  // LP32: call i32 @llvm.nvvm.atomic.min.gen.i.sys.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ul' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_ul((unsigned long *)lp, l);
  // CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_ll(&sll, ll);
  // CHECK: call i64 @llvm.nvvm.atomic.min.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_min_gen_ull' needs target feature sm_60}}
  __nvvm_atom_sys_min_gen_ull((unsigned long long *)llp, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.inc.gen.i.cta.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_cta_inc_gen_ui' needs target feature sm_60}}
  __nvvm_atom_cta_inc_gen_ui((unsigned int *)ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.inc.gen.i.sys.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_sys_inc_gen_ui' needs target feature sm_60}}
  __nvvm_atom_sys_inc_gen_ui((unsigned int *)ip, i);

  // CHECK: call i32 @llvm.nvvm.atomic.dec.gen.i.cta.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_cta_dec_gen_ui' needs target feature sm_60}}
  __nvvm_atom_cta_dec_gen_ui((unsigned int *)ip, i);
  // CHECK: call i32 @llvm.nvvm.atomic.dec.gen.i.sys.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_sys_dec_gen_ui' needs target feature sm_60}}
  __nvvm_atom_sys_dec_gen_ui((unsigned int *)ip, i);

  // CHECK: call i32 @llvm.nvvm.atomic.and.gen.i.cta.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_cta_and_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_and_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.and.gen.i.cta.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.and.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_and_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_and_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.and.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_and_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_and_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.and.gen.i.sys.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_sys_and_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_and_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.and.gen.i.sys.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.and.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_and_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_and_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.and.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_and_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_and_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.or.gen.i.cta.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_cta_or_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_or_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.or.gen.i.cta.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.or.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_or_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_or_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.or.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_or_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_or_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.or.gen.i.sys.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_sys_or_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_or_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.or.gen.i.sys.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.or.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_or_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_or_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.or.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_or_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_or_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.xor.gen.i.cta.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_cta_xor_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_xor_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.xor.gen.i.cta.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.xor.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_xor_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_xor_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.xor.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_xor_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_xor_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.xor.gen.i.sys.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_sys_xor_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_xor_gen_i(ip, i);
  // LP32: call i32 @llvm.nvvm.atomic.xor.gen.i.sys.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.xor.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_xor_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_xor_gen_l(&dl, l);
  // CHECK: call i64 @llvm.nvvm.atomic.xor.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_xor_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_xor_gen_ll(&sll, ll);

  // CHECK: call i32 @llvm.nvvm.atomic.cas.gen.i.cta.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_cta_cas_gen_i' needs target feature sm_60}}
  __nvvm_atom_cta_cas_gen_i(ip, i, 0);
  // LP32: call i32 @llvm.nvvm.atomic.cas.gen.i.cta.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.cas.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_cas_gen_l' needs target feature sm_60}}
  __nvvm_atom_cta_cas_gen_l(&dl, l, 0);
  // CHECK: call i64 @llvm.nvvm.atomic.cas.gen.i.cta.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_cta_cas_gen_ll' needs target feature sm_60}}
  __nvvm_atom_cta_cas_gen_ll(&sll, ll, 0);

  // CHECK: call i32 @llvm.nvvm.atomic.cas.gen.i.sys.i32.p0i32
  // expected-error@+1 {{'__nvvm_atom_sys_cas_gen_i' needs target feature sm_60}}
  __nvvm_atom_sys_cas_gen_i(ip, i, 0);
  // LP32: call i32 @llvm.nvvm.atomic.cas.gen.i.sys.i32.p0i32
  // LP64: call i64 @llvm.nvvm.atomic.cas.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_cas_gen_l' needs target feature sm_60}}
  __nvvm_atom_sys_cas_gen_l(&dl, l, 0);
  // CHECK: call i64 @llvm.nvvm.atomic.cas.gen.i.sys.i64.p0i64
  // expected-error@+1 {{'__nvvm_atom_sys_cas_gen_ll' needs target feature sm_60}}
  __nvvm_atom_sys_cas_gen_ll(&sll, ll, 0);
#endif

  // CHECK: ret
}

// CHECK-LABEL: nvvm_ldg
__device__ void nvvm_ldg(const void *p) {
  // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0i8(i8* {{%[0-9]+}}, i32 1)
  // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0i8(i8* {{%[0-9]+}}, i32 1)
  __nvvm_ldg_c((const char *)p);
  __nvvm_ldg_uc((const unsigned char *)p);

  // CHECK: call i16 @llvm.nvvm.ldg.global.i.i16.p0i16(i16* {{%[0-9]+}}, i32 2)
  // CHECK: call i16 @llvm.nvvm.ldg.global.i.i16.p0i16(i16* {{%[0-9]+}}, i32 2)
  __nvvm_ldg_s((const short *)p);
  __nvvm_ldg_us((const unsigned short *)p);

  // CHECK: call i32 @llvm.nvvm.ldg.global.i.i32.p0i32(i32* {{%[0-9]+}}, i32 4)
  // CHECK: call i32 @llvm.nvvm.ldg.global.i.i32.p0i32(i32* {{%[0-9]+}}, i32 4)
  __nvvm_ldg_i((const int *)p);
  __nvvm_ldg_ui((const unsigned int *)p);

  // LP32: call i32 @llvm.nvvm.ldg.global.i.i32.p0i32(i32* {{%[0-9]+}}, i32 4)
  // LP32: call i32 @llvm.nvvm.ldg.global.i.i32.p0i32(i32* {{%[0-9]+}}, i32 4)
  // LP64: call i64 @llvm.nvvm.ldg.global.i.i64.p0i64(i64* {{%[0-9]+}}, i32 8)
  // LP64: call i64 @llvm.nvvm.ldg.global.i.i64.p0i64(i64* {{%[0-9]+}}, i32 8)
  __nvvm_ldg_l((const long *)p);
  __nvvm_ldg_ul((const unsigned long *)p);

  // CHECK: call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* {{%[0-9]+}}, i32 4)
  __nvvm_ldg_f((const float *)p);
  // CHECK: call double @llvm.nvvm.ldg.global.f.f64.p0f64(double* {{%[0-9]+}}, i32 8)
  __nvvm_ldg_d((const double *)p);

  // In practice, the pointers we pass to __ldg will be aligned as appropriate
  // for the CUDA <type>N vector types (e.g. short4), which are not the same as
  // the LLVM vector types.  However, each LLVM vector type has an alignment
  // less than or equal to its corresponding CUDA type, so we're OK.
  //
  // PTX Interoperability section 2.2: "For a vector with an even number of
  // elements, its alignment is set to number of elements times the alignment of
  // its member: n*alignof(t)."

  // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0v2i8(<2 x i8>* {{%[0-9]+}}, i32 2)
  // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0v2i8(<2 x i8>* {{%[0-9]+}}, i32 2)
  typedef char char2 __attribute__((ext_vector_type(2)));
  typedef unsigned char uchar2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_c2((const char2 *)p);
  __nvvm_ldg_uc2((const uchar2 *)p);

  // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0v4i8(<4 x i8>* {{%[0-9]+}}, i32 4)
  // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0v4i8(<4 x i8>* {{%[0-9]+}}, i32 4)
  typedef char char4 __attribute__((ext_vector_type(4)));
  typedef unsigned char uchar4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_c4((const char4 *)p);
  __nvvm_ldg_uc4((const uchar4 *)p);

  // CHECK: call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0v2i16(<2 x i16>* {{%[0-9]+}}, i32 4)
  // CHECK: call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0v2i16(<2 x i16>* {{%[0-9]+}}, i32 4)
  typedef short short2 __attribute__((ext_vector_type(2)));
  typedef unsigned short ushort2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_s2((const short2 *)p);
  __nvvm_ldg_us2((const ushort2 *)p);

  // CHECK: call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0v4i16(<4 x i16>* {{%[0-9]+}}, i32 8)
  // CHECK: call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0v4i16(<4 x i16>* {{%[0-9]+}}, i32 8)
  typedef short short4 __attribute__((ext_vector_type(4)));
  typedef unsigned short ushort4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_s4((const short4 *)p);
  __nvvm_ldg_us4((const ushort4 *)p);

  // CHECK: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0v2i32(<2 x i32>* {{%[0-9]+}}, i32 8)
  // CHECK: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0v2i32(<2 x i32>* {{%[0-9]+}}, i32 8)
  typedef int int2 __attribute__((ext_vector_type(2)));
  typedef unsigned int uint2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_i2((const int2 *)p);
  __nvvm_ldg_ui2((const uint2 *)p);

  // CHECK: call <4 x i32> @llvm.nvvm.ldg.global.i.v4i32.p0v4i32(<4 x i32>* {{%[0-9]+}}, i32 16)
  // CHECK: call <4 x i32> @llvm.nvvm.ldg.global.i.v4i32.p0v4i32(<4 x i32>* {{%[0-9]+}}, i32 16)
  typedef int int4 __attribute__((ext_vector_type(4)));
  typedef unsigned int uint4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_i4((const int4 *)p);
  __nvvm_ldg_ui4((const uint4 *)p);

  // CHECK: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0v2i64(<2 x i64>* {{%[0-9]+}}, i32 16)
  // CHECK: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0v2i64(<2 x i64>* {{%[0-9]+}}, i32 16)
  typedef long long longlong2 __attribute__((ext_vector_type(2)));
  typedef unsigned long long ulonglong2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_ll2((const longlong2 *)p);
  __nvvm_ldg_ull2((const ulonglong2 *)p);

  // CHECK: call <2 x float> @llvm.nvvm.ldg.global.f.v2f32.p0v2f32(<2 x float>* {{%[0-9]+}}, i32 8)
  typedef float float2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_f2((const float2 *)p);

  // CHECK: call <4 x float> @llvm.nvvm.ldg.global.f.v4f32.p0v4f32(<4 x float>* {{%[0-9]+}}, i32 16)
  typedef float float4 __attribute__((ext_vector_type(4)));
  __nvvm_ldg_f4((const float4 *)p);

  // CHECK: call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0v2f64(<2 x double>* {{%[0-9]+}}, i32 16)
  typedef double double2 __attribute__((ext_vector_type(2)));
  __nvvm_ldg_d2((const double2 *)p);
}

// CHECK-LABEL: nvvm_shfl
// Exercises the pre-Volta (mask-less) warp shuffle builtins; each call must
// lower to the matching llvm.nvvm.shfl.* intrinsic.
__device__ void nvvm_shfl(int i, float f, int a, int b) {
  // CHECK: call i32 @llvm.nvvm.shfl.down.i32(i32
  __nvvm_shfl_down_i32(i, a, b);
  // CHECK: call float @llvm.nvvm.shfl.down.f32(float
  __nvvm_shfl_down_f32(f, a, b);
  // CHECK: call i32 @llvm.nvvm.shfl.up.i32(i32
  __nvvm_shfl_up_i32(i, a, b);
  // CHECK: call float @llvm.nvvm.shfl.up.f32(float
  __nvvm_shfl_up_f32(f, a, b);
  // CHECK: call i32 @llvm.nvvm.shfl.bfly.i32(i32
  __nvvm_shfl_bfly_i32(i, a, b);
  // CHECK: call float @llvm.nvvm.shfl.bfly.f32(float
  __nvvm_shfl_bfly_f32(f, a, b);
  // CHECK: call i32 @llvm.nvvm.shfl.idx.i32(i32
  __nvvm_shfl_idx_i32(i, a, b);
  // CHECK: call float @llvm.nvvm.shfl.idx.f32(float
  __nvvm_shfl_idx_f32(f, a, b);
  // CHECK: ret void
}
675 
// CHECK-LABEL: nvvm_vote
// Exercises the pre-Volta (mask-less) warp vote builtins; each call must
// lower to the matching llvm.nvvm.vote.* intrinsic.
__device__ void nvvm_vote(int pred) {
  // CHECK: call i1 @llvm.nvvm.vote.all(i1
  __nvvm_vote_all(pred);
  // CHECK: call i1 @llvm.nvvm.vote.any(i1
  __nvvm_vote_any(pred);
  // CHECK: call i1 @llvm.nvvm.vote.uni(i1
  __nvvm_vote_uni(pred);
  // CHECK: call i32 @llvm.nvvm.vote.ballot(i1
  __nvvm_vote_ballot(pred);
  // CHECK: ret void
}
687 
// CHECK-LABEL: nvvm_mbarrier
// Exercises the SM80/PTX70 mbarrier builtins in both generic and shared
// (address_space(3)) address-space variants; guarded so the builtins are only
// referenced on targets where they exist.
__device__ void nvvm_mbarrier(long long* addr, __attribute__((address_space(3))) long long* sharedAddr, int count, long long state) {
  #if __CUDA_ARCH__ >= 800
  __nvvm_mbarrier_init(addr, count);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.init
  __nvvm_mbarrier_init_shared(sharedAddr, count);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.init.shared

  __nvvm_mbarrier_inval(addr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.inval
  __nvvm_mbarrier_inval_shared(sharedAddr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.mbarrier.inval.shared

  __nvvm_mbarrier_arrive(addr);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive
  __nvvm_mbarrier_arrive_shared(sharedAddr);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.shared
  __nvvm_mbarrier_arrive_noComplete(addr, count);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.noComplete
  __nvvm_mbarrier_arrive_noComplete_shared(sharedAddr, count);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.noComplete.shared

  __nvvm_mbarrier_arrive_drop(addr);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop
  __nvvm_mbarrier_arrive_drop_shared(sharedAddr);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.shared
  __nvvm_mbarrier_arrive_drop_noComplete(addr, count);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete
  __nvvm_mbarrier_arrive_drop_noComplete_shared(sharedAddr, count);
  // CHECK_PTX70_SM80: call i64 @llvm.nvvm.mbarrier.arrive.drop.noComplete.shared

  __nvvm_mbarrier_test_wait(addr, state);
  // CHECK_PTX70_SM80: call i1 @llvm.nvvm.mbarrier.test.wait
  __nvvm_mbarrier_test_wait_shared(sharedAddr, state);
  // CHECK_PTX70_SM80: call i1 @llvm.nvvm.mbarrier.test.wait.shared

  __nvvm_mbarrier_pending_count(state);
  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.mbarrier.pending.count
  #endif
  // CHECK: ret void
}
729 
// CHECK-LABEL: nvvm_async_copy
// Exercises the SM80/PTX70 cp.async builtins: mbarrier-arrive variants,
// global->shared copies (dst in address_space(3), src in address_space(1)),
// and the commit/wait group forms.
__device__ void nvvm_async_copy(__attribute__((address_space(3))) void* dst, __attribute__((address_space(1))) const void* src, long long* addr, __attribute__((address_space(3))) long long* sharedAddr) {
  #if __CUDA_ARCH__ >= 800
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive
  __nvvm_cp_async_mbarrier_arrive(addr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.shared
  __nvvm_cp_async_mbarrier_arrive_shared(sharedAddr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc
  __nvvm_cp_async_mbarrier_arrive_noinc(addr);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared
  __nvvm_cp_async_mbarrier_arrive_noinc_shared(sharedAddr);

  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.4
  __nvvm_cp_async_ca_shared_global_4(dst, src);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.8
  __nvvm_cp_async_ca_shared_global_8(dst, src);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.ca.shared.global.16
  __nvvm_cp_async_ca_shared_global_16(dst, src);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.cg.shared.global.16
  __nvvm_cp_async_cg_shared_global_16(dst, src);

  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.commit.group
  __nvvm_cp_async_commit_group();
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 0)
  __nvvm_cp_async_wait_group(0);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 8)
  __nvvm_cp_async_wait_group(8);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.group(i32 16)
  __nvvm_cp_async_wait_group(16);
  // CHECK_PTX70_SM80: call void @llvm.nvvm.cp.async.wait.all
  __nvvm_cp_async_wait_all();
  #endif
  // CHECK: ret void
}
764 
// CHECK-LABEL: nvvm_cvt_sm80
// Exercises the SM80/PTX70 float -> bf16/f16x2/tf32 conversion builtins in
// all rounding/relu variants; constant args make the intrinsic operands
// checkable exactly.
__device__ void nvvm_cvt_sm80() {
#if __CUDA_ARCH__ >= 800
  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.ff2bf16x2.rn(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2bf16x2_rn(1, 1);
  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.ff2bf16x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2bf16x2_rn_relu(1, 1);
  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.ff2bf16x2.rz(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2bf16x2_rz(1, 1);
  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.ff2bf16x2.rz.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2bf16x2_rz_relu(1, 1);

  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rn(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2f16x2_rn(1, 1);
  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rn.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2f16x2_rn_relu(1, 1);
  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rz(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2f16x2_rz(1, 1);
  // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rz.relu(float 1.000000e+00, float 1.000000e+00)
  __nvvm_ff2f16x2_rz_relu(1, 1);

  // CHECK_PTX70_SM80: call i16 @llvm.nvvm.f2bf16.rn(float 1.000000e+00)
  __nvvm_f2bf16_rn(1);
  // CHECK_PTX70_SM80: call i16 @llvm.nvvm.f2bf16.rn.relu(float 1.000000e+00)
  __nvvm_f2bf16_rn_relu(1);
  // CHECK_PTX70_SM80: call i16 @llvm.nvvm.f2bf16.rz(float 1.000000e+00)
  __nvvm_f2bf16_rz(1);
  // CHECK_PTX70_SM80: call i16 @llvm.nvvm.f2bf16.rz.relu(float 1.000000e+00)
  __nvvm_f2bf16_rz_relu(1);

  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.f2tf32.rna(float 1.000000e+00)
  __nvvm_f2tf32_rna(1);
#endif
  // CHECK: ret void
}
800 
// CHECK-LABEL: nvvm_abs_neg_bf16_bf16x2_sm80
// Exercises the SM80 bf16/bf16x2 abs and neg builtins. The bf16 values are
// passed as raw bit patterns (i16/i32); 0xFFFF / 0xFFFFFFFF print as -1 in
// the IR, which the CHECK lines pin down.
__device__ void nvvm_abs_neg_bf16_bf16x2_sm80() {
#if __CUDA_ARCH__ >= 800

  // CHECK_PTX70_SM80: call i16 @llvm.nvvm.abs.bf16(i16 -1)
  __nvvm_abs_bf16(0xFFFF);
  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.abs.bf16x2(i32 -1)
  __nvvm_abs_bf16x2(0xFFFFFFFF);

  // CHECK_PTX70_SM80: call i16 @llvm.nvvm.neg.bf16(i16 -1)
  __nvvm_neg_bf16(0xFFFF);
  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.neg.bf16x2(i32 -1)
  __nvvm_neg_bf16x2(0xFFFFFFFF);
#endif
  // CHECK: ret void
}
817 
// Raw bit patterns used as arguments by the min/max and fma tests below.
// f32 values:
#define NAN32 0x7FBFFFFF   // an f32 NaN bit pattern
// bf16 values (16-bit patterns passed through i16 parameters):
#define NAN16 0x7FBF       // a bf16 NaN bit pattern
#define BF16 0x1234
#define BF16_2 0x4321
#define NANBF16 0xFFC1     // a bf16 NaN bit pattern
// bf16x2 values (two bf16 lanes packed into an i32):
#define BF16X2 0x12341234
#define BF16X2_2 0x32343234
#define NANBF16X2 0xFFC1FFC1 // NaN in both bf16 lanes
826 
// CHECK-LABEL: nvvm_min_max_sm80
// Exercises the SM80 NaN-propagating fmin/fmax builtins for f32 (plain and
// ftz) and for bf16/bf16x2. Some fmax_nan_f calls are intentionally repeated;
// each call has its own matching CHECK line.
__device__ void nvvm_min_max_sm80() {
#if __CUDA_ARCH__ >= 800

  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmin.nan.f
  __nvvm_fmin_nan_f(0.1f, (float)NAN32);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmin.ftz.nan.f
  __nvvm_fmin_ftz_nan_f(0.1f, (float)NAN32);

  // CHECK_PTX70_SM80: call i16 @llvm.nvvm.fmin.bf16
  __nvvm_fmin_bf16(BF16, BF16_2);
  // CHECK_PTX70_SM80: call i16 @llvm.nvvm.fmin.nan.bf16
  __nvvm_fmin_nan_bf16(BF16, NANBF16);
  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.fmin.bf16x2
  __nvvm_fmin_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.fmin.nan.bf16x2
  __nvvm_fmin_nan_bf16x2(BF16X2, NANBF16X2);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.nan.f
  __nvvm_fmax_nan_f(0.1f, 0.11f);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.ftz.nan.f
  __nvvm_fmax_ftz_nan_f(0.1f, (float)NAN32);

  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.nan.f
  __nvvm_fmax_nan_f(0.1f, (float)NAN32);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.ftz.nan.f
  __nvvm_fmax_ftz_nan_f(0.1f, (float)NAN32);
  // CHECK_PTX70_SM80: call i16 @llvm.nvvm.fmax.bf16
  __nvvm_fmax_bf16(BF16, BF16_2);
  // CHECK_PTX70_SM80: call i16 @llvm.nvvm.fmax.nan.bf16
  __nvvm_fmax_nan_bf16(BF16, NANBF16);
  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.fmax.bf16x2
  __nvvm_fmax_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.fmax.nan.bf16x2
  __nvvm_fmax_nan_bf16x2(NANBF16X2, BF16X2);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.nan.f
  __nvvm_fmax_nan_f(0.1f, (float)NAN32);
  // CHECK_PTX70_SM80: call float @llvm.nvvm.fmax.ftz.nan.f
  __nvvm_fmax_ftz_nan_f(0.1f, (float)NAN32);

#endif
  // CHECK: ret void
}
869 
// CHECK-LABEL: nvvm_fma_bf16_bf16x2_sm80
// Exercises the SM80 bf16/bf16x2 fused multiply-add builtins (round-nearest
// and round-nearest+relu variants); operands are raw bf16 bit patterns.
__device__ void nvvm_fma_bf16_bf16x2_sm80() {
#if __CUDA_ARCH__ >= 800
  // CHECK_PTX70_SM80: call i16 @llvm.nvvm.fma.rn.bf16
  __nvvm_fma_rn_bf16(0x1234, 0x7FBF, 0x1234);
  // CHECK_PTX70_SM80: call i16 @llvm.nvvm.fma.rn.relu.bf16
  __nvvm_fma_rn_relu_bf16(0x1234, 0x7FBF, 0x1234);
  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.fma.rn.bf16x2
  __nvvm_fma_rn_bf16x2(0x7FBFFFFF, 0xFFFFFFFF, 0x7FBFFFFF);
  // CHECK_PTX70_SM80: call i32 @llvm.nvvm.fma.rn.relu.bf16x2
  __nvvm_fma_rn_relu_bf16x2(0x7FBFFFFF, 0xFFFFFFFF, 0x7FBFFFFF);
#endif
  // CHECK: ret void
}
884 
// CHECK-LABEL: nvvm_min_max_sm86
// Exercises the SM86/PTX72 xorsign.abs fmin/fmax builtins for bf16, bf16x2,
// and f32 (with ftz and NaN-propagating variants).
__device__ void nvvm_min_max_sm86() {
#if __CUDA_ARCH__ >= 860

  // CHECK_PTX72_SM86: call i16 @llvm.nvvm.fmin.xorsign.abs.bf16
  __nvvm_fmin_xorsign_abs_bf16(BF16, BF16_2);
  // CHECK_PTX72_SM86: call i16 @llvm.nvvm.fmin.nan.xorsign.abs.bf16
  __nvvm_fmin_nan_xorsign_abs_bf16(BF16, NANBF16);
  // CHECK_PTX72_SM86: call i32 @llvm.nvvm.fmin.xorsign.abs.bf16x2
  __nvvm_fmin_xorsign_abs_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX72_SM86: call i32 @llvm.nvvm.fmin.nan.xorsign.abs.bf16x2
  __nvvm_fmin_nan_xorsign_abs_bf16x2(BF16X2, NANBF16X2);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.xorsign.abs.f
  __nvvm_fmin_xorsign_abs_f(-0.1f, 0.1f);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.ftz.xorsign.abs.f
  __nvvm_fmin_ftz_xorsign_abs_f(-0.1f, 0.1f);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.nan.xorsign.abs.f
  __nvvm_fmin_nan_xorsign_abs_f(-0.1f, (float)NAN32);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmin.ftz.nan.xorsign.abs.f
  __nvvm_fmin_ftz_nan_xorsign_abs_f(-0.1f, (float)NAN32);

  // CHECK_PTX72_SM86: call i16 @llvm.nvvm.fmax.xorsign.abs.bf16
  __nvvm_fmax_xorsign_abs_bf16(BF16, BF16_2);
  // CHECK_PTX72_SM86: call i16 @llvm.nvvm.fmax.nan.xorsign.abs.bf16
  __nvvm_fmax_nan_xorsign_abs_bf16(BF16, NANBF16);
  // CHECK_PTX72_SM86: call i32 @llvm.nvvm.fmax.xorsign.abs.bf16x2
  __nvvm_fmax_xorsign_abs_bf16x2(BF16X2, BF16X2_2);
  // CHECK_PTX72_SM86: call i32 @llvm.nvvm.fmax.nan.xorsign.abs.bf16x2
  __nvvm_fmax_nan_xorsign_abs_bf16x2(BF16X2, NANBF16X2);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.xorsign.abs.f
  __nvvm_fmax_xorsign_abs_f(-0.1f, 0.1f);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.ftz.xorsign.abs.f
  __nvvm_fmax_ftz_xorsign_abs_f(-0.1f, 0.1f);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.nan.xorsign.abs.f
  __nvvm_fmax_nan_xorsign_abs_f(-0.1f, (float)NAN32);
  // CHECK_PTX72_SM86: call float @llvm.nvvm.fmax.ftz.nan.xorsign.abs.f
  __nvvm_fmax_ftz_nan_xorsign_abs_f(-0.1f, (float)NAN32);
#endif
  // CHECK: ret void
}
925