1; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC %s
2; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC %s
3; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC %s
4; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC %s
5; RUN: llc -march=r600 -mtriple=r600-- -mcpu=redwood < %s | FileCheck --check-prefixes=EG,FUNC %s
6; RUN: opt -S -mtriple=amdgcn-- -amdgpu-promote-alloca -sroa -instcombine < %s | FileCheck -check-prefix=OPT %s
7; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-promote-alloca,sroa,instcombine < %s | FileCheck -check-prefix=OPT %s
8target datalayout = "A5"
9
10; OPT-LABEL: @vector_read(
11; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
12; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
13
14; FUNC-LABEL: {{^}}vector_read:
15; EG: MOV
16; EG: MOV
17; EG: MOV
18; EG: MOV
19; EG: MOVA_INT
; Kernel under test: fills a [4 x i32] alloca with the constants 0, 1, 2, 3,
; then loads the element selected by the runtime %index and stores it to %out.
; The alloca is in addrspace(5), matching the "A5" datalayout declared above.
20define amdgpu_kernel void @vector_read(i32 addrspace(1)* %out, i32 %index) {
21entry:
22  %tmp = alloca [4 x i32], addrspace(5)
; Constant-index pointers to elements 0..3 of the array.
23  %x = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0
24  %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
25  %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
26  %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
; Element i is initialized to the value i.
27  store i32 0, i32 addrspace(5)* %x
28  store i32 1, i32 addrspace(5)* %y
29  store i32 2, i32 addrspace(5)* %z
30  store i32 3, i32 addrspace(5)* %w
; The only dynamically indexed access: read element %index.
31  %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index
32  %tmp2 = load i32, i32 addrspace(5)* %tmp1
33  store i32 %tmp2, i32 addrspace(1)* %out
34  ret void
35}
36
37; OPT-LABEL: @vector_write(
38; OPT: %0 = insertelement <4 x i32> zeroinitializer, i32 1, i32 %w_index
39; OPT: %1 = extractelement <4 x i32> %0, i32 %r_index
40; OPT: store i32 %1, i32 addrspace(1)* %out, align 4
41
42; FUNC-LABEL: {{^}}vector_write:
43; EG: MOV
44; EG: MOV
45; EG: MOV
46; EG: MOV
47; EG: MOVA_INT
48; EG: MOVA_INT
; Kernel under test: zero-initializes a [4 x i32] alloca in addrspace(5),
; writes the value 1 at the runtime index %w_index, then reads back the element
; at the runtime index %r_index and stores it to %out.
49define amdgpu_kernel void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
50entry:
51  %tmp = alloca [4 x i32], addrspace(5)
; Constant-index pointers to elements 0..3 of the array.
52  %x = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0
53  %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
54  %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
55  %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
; All four elements start out as 0.
56  store i32 0, i32 addrspace(5)* %x
57  store i32 0, i32 addrspace(5)* %y
58  store i32 0, i32 addrspace(5)* %z
59  store i32 0, i32 addrspace(5)* %w
; Dynamically indexed write of 1 into element %w_index.
60  %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %w_index
61  store i32 1, i32 addrspace(5)* %tmp1
; Dynamically indexed read of element %r_index, performed after the write.
62  %tmp2 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %r_index
63  %tmp3 = load i32, i32 addrspace(5)* %tmp2
64  store i32 %tmp3, i32 addrspace(1)* %out
65  ret void
66}
67
68; This test should be optimized to:
69; store i32 0, i32 addrspace(1)* %out
70
; Kernel under test: every element of a [4 x i32] alloca is set to 0, then a
; pointer to element 1 is bitcast back to an array pointer and element 0 of that
; view is loaded. The loaded value is therefore 0, and the opt pipeline is
; expected to reduce the body to a single store of 0 to %out.
71; OPT-LABEL: @bitcast_gep(
72; OPT: store i32 0, i32 addrspace(1)* %out, align 4
73
74; FUNC-LABEL: {{^}}bitcast_gep:
75; EG: STORE_RAW
; Note that %w_index and %r_index are not referenced by the body.
76define amdgpu_kernel void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
77entry:
78  %tmp = alloca [4 x i32], addrspace(5)
; Constant-index pointers to elements 0..3 of the array.
79  %x = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0
80  %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
81  %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
82  %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
; All four elements are 0, so any in-bounds read yields 0.
83  store i32 0, i32 addrspace(5)* %x
84  store i32 0, i32 addrspace(5)* %y
85  store i32 0, i32 addrspace(5)* %z
86  store i32 0, i32 addrspace(5)* %w
; GEP to element 1, reinterpret that address as the start of a [4 x i32],
; then take element 0 of the reinterpreted array (same address as element 1).
87  %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
88  %tmp2 = bitcast i32 addrspace(5)* %tmp1 to [4 x i32] addrspace(5)*
89  %tmp3 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp2, i32 0, i32 0
90  %tmp4 = load i32, i32 addrspace(5)* %tmp3
91  store i32 %tmp4, i32 addrspace(1)* %out
92  ret void
93}
94
95; OPT-LABEL: @vector_read_bitcast_gep(
96; OPT: %0 = extractelement <4 x i32> <i32 1065353216, i32 1, i32 2, i32 3>, i32 %index
97; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
; Kernel under test: like a plain dynamic read of a [4 x i32] alloca, except
; that element 0 is written through a float-typed bitcast of its pointer.
; The expected constant vector in the checks above has 1065353216 (the i32 bit
; pattern of float 1.0) in lane 0, followed by 1, 2, 3.
98define amdgpu_kernel void @vector_read_bitcast_gep(i32 addrspace(1)* %out, i32 %index) {
99entry:
100  %tmp = alloca [4 x i32], addrspace(5)
; Constant-index inbounds pointers to elements 0..3 of the array.
101  %x = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0
102  %y = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
103  %z = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
104  %w = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
; Element 0 is stored as a float through a bitcast pointer; the other three
; elements are stored as i32.
105  %bc = bitcast i32 addrspace(5)* %x to float addrspace(5)*
106  store float 1.0, float addrspace(5)* %bc
107  store i32 1, i32 addrspace(5)* %y
108  store i32 2, i32 addrspace(5)* %z
109  store i32 3, i32 addrspace(5)* %w
; Dynamically indexed i32 read of element %index.
110  %tmp1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index
111  %tmp2 = load i32, i32 addrspace(5)* %tmp1
112  store i32 %tmp2, i32 addrspace(1)* %out
113  ret void
114}
115
116; FIXME: Should be able to promote this. Instcombine should fold the
117; cast in the hasOneUse case, so it might not matter in practice.
118
119; OPT-LABEL: @vector_read_bitcast_alloca(
120; OPT: alloca [4 x float]
121; OPT: store float
122; OPT: store float
123; OPT: store float
124; OPT: store float
125; OPT: load float
; Kernel under test: the alloca is declared as [4 x i32] but is immediately
; bitcast to [4 x float], and every store and load goes through the float view.
; The checks above expect the alloca (as [4 x float]), four float stores and a
; float load to remain, i.e. this case is not currently promoted to a vector.
126define amdgpu_kernel void @vector_read_bitcast_alloca(float addrspace(1)* %out, i32 %index) {
127entry:
128  %tmp = alloca [4 x i32], addrspace(5)
; Whole-alloca bitcast: all later accesses use %tmp.bc, never %tmp directly.
129  %tmp.bc = bitcast [4 x i32] addrspace(5)* %tmp to [4 x float] addrspace(5)*
130  %x = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 0
131  %y = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 1
132  %z = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 2
133  %w = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 3
; Elements 0..3 are initialized to 0.0, 1.0, 2.0 and 4.0 respectively.
134  store float 0.0, float addrspace(5)* %x
135  store float 1.0, float addrspace(5)* %y
136  store float 2.0, float addrspace(5)* %z
137  store float 4.0, float addrspace(5)* %w
; Dynamically indexed float read of element %index.
138  %tmp1 = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 %index
139  %tmp2 = load float, float addrspace(5)* %tmp1
140  store float %tmp2, float addrspace(1)* %out
141  ret void
142}
143
144; The pointer arguments in local address space should not affect promotion to vector.
145
146; OPT-LABEL: @vector_read_with_local_arg(
147; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
148; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
; Kernel under test: same body shape as a plain dynamic read of a [4 x i32]
; alloca holding 0, 1, 2, 3, but the signature carries an extra addrspace(3)
; pointer argument, %stopper, which the body never references.
149define amdgpu_kernel void @vector_read_with_local_arg(i32 addrspace(3)* %stopper, i32 addrspace(1)* %out, i32 %index) {
150entry:
151  %tmp = alloca [4 x i32], addrspace(5)
; Constant-index pointers to elements 0..3 of the array.
152  %x = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0
153  %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
154  %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
155  %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
; Element i is initialized to the value i.
156  store i32 0, i32 addrspace(5)* %x
157  store i32 1, i32 addrspace(5)* %y
158  store i32 2, i32 addrspace(5)* %z
159  store i32 3, i32 addrspace(5)* %w
; Dynamically indexed read of element %index.
160  %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index
161  %tmp2 = load i32, i32 addrspace(5)* %tmp1
162  store i32 %tmp2, i32 addrspace(1)* %out
163  ret void
164}
165