// REQUIRES: x86-registered-target
// REQUIRES: amdgpu-registered-target

// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm -x hip %s -o - | FileCheck --check-prefixes=COMMON,CHECK %s
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm -x hip %s -disable-O0-optnone -o - | opt -S -O2 | FileCheck %s --check-prefixes=COMMON,OPT
// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -x hip %s -o - | FileCheck -check-prefix=HOST %s

#include "Inputs/cuda.h"

// The struct coerced from `struct S` does not have all of its generic
// pointers lowered into global ones.

// In the host-side compilation, generic pointers are not coerced.
// HOST-NOT: %struct.S.coerce
// HOST-NOT: %struct.T.coerce

// HOST: define{{.*}} void @_Z22__device_stub__kernel1Pi(i32* noundef %x)
// COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel1Pi(i32 addrspace(1)*{{.*}} %x.coerce)
// CHECK:     ={{.*}} addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
// CHECK-NOT: ={{.*}} addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
// OPT: [[VAL:%.*]] = load i32, i32 addrspace(4)* %x.coerce.const, align 4
// OPT: [[INC:%.*]] = add nsw i32 [[VAL]], 1
// OPT: store i32 [[INC]], i32 addrspace(1)* %x.coerce, align 4
// OPT: ret void
__global__ void kernel1(int *x) {
  x[0]++;
}
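
// A minimal host-side usage sketch for kernel1, kept in a comment so the IR
// checked above is unchanged. It assumes a full HIP runtime providing
// hipMalloc, hipDeviceSynchronize and the triple-chevron launch syntax, which
// the minimal Inputs/cuda.h used by this test does not supply:
//
//   int *dx = nullptr;
//   hipMalloc((void **)&dx, sizeof(int)); // device (global) allocation
//   kernel1<<<1, 1>>>(dx);                // the generic `int *` argument is
//                                         // received as `i32 addrspace(1)*`
//   hipDeviceSynchronize();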

// HOST: define{{.*}} void @_Z22__device_stub__kernel2Ri(i32* noundef nonnull align 4 dereferenceable(4) %x)
// COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel2Ri(i32 addrspace(1)*{{.*}} nonnull align 4 dereferenceable(4) %x.coerce)
// CHECK:     ={{.*}} addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
// CHECK-NOT: ={{.*}} addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
// OPT: [[VAL:%.*]] = load i32, i32 addrspace(4)* %x.coerce.const, align 4
// OPT: [[INC:%.*]] = add nsw i32 [[VAL]], 1
// OPT: store i32 [[INC]], i32 addrspace(1)* %x.coerce, align 4
// OPT: ret void
__global__ void kernel2(int &x) {
  x++;
}

// HOST: define{{.*}} void @_Z22__device_stub__kernel3PU3AS2iPU3AS1i(i32 addrspace(2)* noundef %x, i32 addrspace(1)* noundef %y)
// CHECK-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel3PU3AS2iPU3AS1i(i32 addrspace(2)*{{.*}} %x, i32 addrspace(1)*{{.*}} %y)
// CHECK-NOT: ={{.*}} addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
__global__ void kernel3(__attribute__((address_space(2))) int *x,
                        __attribute__((address_space(1))) int *y) {
  y[0] = x[0];
}

// COMMON-LABEL: define{{.*}} void @_Z4funcPi(i32*{{.*}} %x)
// CHECK-NOT: ={{.*}} addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
__device__ void func(int *x) {
  x[0]++;
}

struct S {
  int *x;
  float *y;
};
// A `by-val` struct is passed by-indirect-alias (a mix of by-ref and indirect
// by-val). However, the enhanced address space inference pass should be able
// to assume the pointers it contains are global pointers.
//
// HOST: define{{.*}} void @_Z22__device_stub__kernel41S(i32* %s.coerce0, float* %s.coerce1)
// COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel41S(%struct.S addrspace(4)*{{.*}} byref(%struct.S) align 8 %0)
// OPT: [[R0:%.*]] = getelementptr inbounds %struct.S, %struct.S addrspace(4)* %0, i64 0, i32 0
// OPT: [[P0:%.*]] = load i32*, i32* addrspace(4)* [[R0]], align 8
// OPT: [[G0:%.*]] ={{.*}} addrspacecast i32* [[P0]] to i32 addrspace(1)*
// OPT: [[R1:%.*]] = getelementptr inbounds %struct.S, %struct.S addrspace(4)* %0, i64 0, i32 1
// OPT: [[P1:%.*]] = load float*, float* addrspace(4)* [[R1]], align 8
// OPT: [[G1:%.*]] ={{.*}} addrspacecast float* [[P1]] to float addrspace(1)*
// OPT: [[G2:%.*]] ={{.*}} addrspacecast i32* [[P0]] to i32 addrspace(4)*
// OPT: [[V0:%.*]] = load i32, i32 addrspace(4)* [[G2]], align 4
// OPT: [[INC:%.*]] = add nsw i32 [[V0]], 1
// OPT: store i32 [[INC]], i32 addrspace(1)* [[G0]], align 4
// OPT: [[V1:%.*]] = load float, float addrspace(1)* [[G1]], align 4
// OPT: [[ADD:%.*]] = fadd contract float [[V1]], 1.000000e+00
// OPT: store float [[ADD]], float addrspace(1)* [[G1]], align 4
// OPT: ret void
__global__ void kernel4(struct S s) {
  s.x[0]++;
  s.y[0] += 1.f;
}
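
// An illustrative host-side sketch for kernel4 (comment only, so the checked
// IR is unchanged). The aggregate is passed by value, and the kernel receives
// a byref copy in the constant address space; the pointers stored inside it
// are expected to reference device (global) memory, which is what lets the
// address space inference above rewrite them to addrspace(1). hipMalloc is
// assumed from the HIP runtime and is not declared by Inputs/cuda.h:
//
//   struct S s;
//   hipMalloc((void **)&s.x, sizeof(int));
//   hipMalloc((void **)&s.y, sizeof(float));
//   kernel4<<<1, 1>>>(s); // `s` itself is copied by value; only its pointees
//                         // live in global memory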

// If a pointer to a struct is passed, only the pointer itself is coerced into a global one.
// HOST: define{{.*}} void @_Z22__device_stub__kernel5P1S(%struct.S* noundef %s)
// COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel5P1S(%struct.S addrspace(1)*{{.*}} %s.coerce)
__global__ void kernel5(struct S *s) {
  s->x[0]++;
  s->y[0] += 1.f;
}

struct T {
  float *x[2];
};
// A `by-val` array is passed by-indirect-alias (a mix of by-ref and indirect
// by-val). However, the enhanced address space inference pass should be able
// to assume the pointers it contains are global pointers.
//
// HOST: define{{.*}} void @_Z22__device_stub__kernel61T(float* %t.coerce0, float* %t.coerce1)
// COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel61T(%struct.T addrspace(4)*{{.*}} byref(%struct.T) align 8 %0)
// OPT: [[R0:%.*]] = getelementptr inbounds %struct.T, %struct.T addrspace(4)* %0, i64 0, i32 0, i64 0
// OPT: [[P0:%.*]] = load float*, float* addrspace(4)* [[R0]], align 8
// OPT: [[G0:%.*]] ={{.*}} addrspacecast float* [[P0]] to float addrspace(1)*
// OPT: [[R1:%.*]] = getelementptr inbounds %struct.T, %struct.T addrspace(4)* %0, i64 0, i32 0, i64 1
// OPT: [[P1:%.*]] = load float*, float* addrspace(4)* [[R1]], align 8
// OPT: [[G1:%.*]] ={{.*}} addrspacecast float* [[P1]] to float addrspace(1)*
// OPT: [[G2:%.*]] ={{.*}} addrspacecast float* [[P0]] to float addrspace(4)*
// OPT: [[V0:%.*]] = load float, float addrspace(4)* [[G2]], align 4
// OPT: [[ADD0:%.*]] = fadd contract float [[V0]], 1.000000e+00
// OPT: store float [[ADD0]], float addrspace(1)* [[G0]], align 4
// OPT: [[V1:%.*]] = load float, float addrspace(1)* [[G1]], align 4
// OPT: [[ADD1:%.*]] = fadd contract float [[V1]], 2.000000e+00
// OPT: store float [[ADD1]], float addrspace(1)* [[G1]], align 4
// OPT: ret void
__global__ void kernel6(struct T t) {
  t.x[0][0] += 1.f;
  t.x[1][0] += 2.f;
}

// Check that coerced pointers retain the noalias attribute when qualified with __restrict.
// HOST: define{{.*}} void @_Z22__device_stub__kernel7Pi(i32* noalias noundef %x)
// COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel7Pi(i32 addrspace(1)* noalias{{.*}} %x.coerce)
__global__ void kernel7(int *__restrict x) {
  x[0]++;
}
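
// A hypothetical two-pointer variant (comment only, not part of the checked
// output) makes the effect of noalias clearer: with both parameters
// __restrict-qualified, the optimizer may assume stores through `out` do not
// modify `*in`, so the load of `in[0]` can be reused for the second store:
//
//   __global__ void kernel7b(int *__restrict out, const int *__restrict in) {
//     out[0] = in[0];
//     out[1] = in[0]; // may reuse the value already loaded from in[0]
//   }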

// Single element struct.
struct SS {
  float *x;
};
// HOST: define{{.*}} void @_Z22__device_stub__kernel82SS(float* %a.coerce)
// COMMON-LABEL: define{{.*}} amdgpu_kernel void @_Z7kernel82SS(float addrspace(1)*{{.*}} %a.coerce)
// CHECK:     ={{.*}} addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
// CHECK-NOT: ={{.*}} addrspacecast [[TYPE:.*]] addrspace(1)* %{{.*}} to [[TYPE]]*
// OPT: [[VAL:%.*]] = load float, float addrspace(4)* %a.coerce.const, align 4
// OPT: [[INC:%.*]] = fadd contract float [[VAL]], 3.000000e+00
// OPT: store float [[INC]], float addrspace(1)* %a.coerce, align 4
// OPT: ret void
__global__ void kernel8(struct SS a) {
  *a.x += 3.f;
}