; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC %s
; RUN: llc -march=r600 -mtriple=r600-- -mcpu=redwood < %s | FileCheck --check-prefixes=EG,FUNC %s
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-promote-alloca -sroa -instcombine < %s | FileCheck -check-prefix=OPT %s
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-promote-alloca,sroa,instcombine < %s | FileCheck -check-prefix=OPT %s

; Each kernel below builds a small four-element array with alloca and reads or
; writes it; the opt pipelines above are expected to rewrite those arrays into
; vector operations, and the llc runs check the generated code per function.

; "A5" makes address space 5 the alloca address space, matching the
; addrspace(5) allocas used by the kernels below.
target datalayout = "A5"

; Kernel @vector_read: fills a four-element i32 array on the stack with the
; constants 0, 1, 2, 3, loads the element selected by %index and stores it to
; %out.
; The opt pipelines are expected to replace the array with a constant
; <4 x i32> and the indexed load with an extractelement on %index.
; For the r600/redwood run the checks require four matches of MOV and then one
; match of MOVA_INT, in that order.

; OPT-LABEL: @vector_read(
; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
; OPT: store i32 %0, i32 addrspace(1)* %out, align 4

; FUNC-LABEL: {{^}}vector_read:
; EG: MOV
; EG: MOV
; EG: MOV
; EG: MOV
; EG: MOVA_INT
define amdgpu_kernel void @vector_read(i32 addrspace(1)* %out, i32 %index) {
entry:
  %tmp = alloca [4 x i32], addrspace(5)
  ; Constant-index pointers to each of the four elements.
  %x = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0
  %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
  %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
  %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
  ; Initialise the array to {0, 1, 2, 3}.
  store i32 0, i32 addrspace(5)* %x
  store i32 1, i32 addrspace(5)* %y
  store i32 2, i32 addrspace(5)* %z
  store i32 3, i32 addrspace(5)* %w
  ; Dynamically indexed read: the element is chosen by the %index argument.
  %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index
  %tmp2 = load i32, i32 addrspace(5)* %tmp1
  store i32 %tmp2, i32 addrspace(1)* %out
  ret void
}

; Kernel @vector_write: zero-fills a four-element i32 array on the stack,
; stores 1 at the position given by %w_index, then loads the element at
; %r_index and stores it to %out.
; The opt pipelines are expected to turn the indexed store into an
; insertelement on a zeroinitializer <4 x i32> and the indexed load into an
; extractelement from that result.
; For the r600/redwood run the checks require four matches of MOV and then two
; matches of MOVA_INT, in that order.

; OPT-LABEL: @vector_write(
; OPT: %0 = insertelement <4 x i32> zeroinitializer, i32 1, i32 %w_index
; OPT: %1 = extractelement <4 x i32> %0, i32 %r_index
; OPT: store i32 %1, i32 addrspace(1)* %out, align 4

; FUNC-LABEL: {{^}}vector_write:
; EG: MOV
; EG: MOV
; EG: MOV
; EG: MOV
; EG: MOVA_INT
; EG: MOVA_INT
define amdgpu_kernel void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
entry:
  %tmp = alloca [4 x i32], addrspace(5)
  ; Constant-index pointers to each of the four elements.
  %x = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0
  %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
  %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
  %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
  ; Initialise every element to 0.
  store i32 0, i32 addrspace(5)* %x
  store i32 0, i32 addrspace(5)* %y
  store i32 0, i32 addrspace(5)* %z
  store i32 0, i32 addrspace(5)* %w
  ; Dynamically indexed write of the value 1.
  %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %w_index
  store i32 1, i32 addrspace(5)* %tmp1
  ; Dynamically indexed read, independent of the write index.
  %tmp2 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %r_index
  %tmp3 = load i32, i32 addrspace(5)* %tmp2
  store i32 %tmp3, i32 addrspace(1)* %out
  ret void
}

; Kernel @bitcast_gep: zero-fills a four-element i32 array on the stack, takes
; the address of element 1, bitcasts that pointer back to an array pointer,
; and loads through a zero-index GEP on it. Every element holds 0, so the
; loaded value is 0 regardless of the pointer gymnastics.
;
; This test should be optimized to:
; store i32 0, i32 addrspace(1)* %out

; The store is checked with a plain check line rather than a label: label
; directives are reserved for the function name that delimits this block.
; OPT-LABEL: @bitcast_gep(
; OPT: store i32 0, i32 addrspace(1)* %out, align 4

; FUNC-LABEL: {{^}}bitcast_gep:
; EG: STORE_RAW
define amdgpu_kernel void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
entry:
  %tmp = alloca [4 x i32], addrspace(5)
  ; Constant-index pointers to each of the four elements.
  %x = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0
  %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
  %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
  %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
  ; Initialise every element to 0.
  store i32 0, i32 addrspace(5)* %x
  store i32 0, i32 addrspace(5)* %y
  store i32 0, i32 addrspace(5)* %z
  store i32 0, i32 addrspace(5)* %w
  ; Address of element 1, reinterpreted as the start of a [4 x i32] array.
  %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
  %tmp2 = bitcast i32 addrspace(5)* %tmp1 to [4 x i32] addrspace(5)*
  ; Element 0 of the reinterpreted array, i.e. element 1 of the original.
  %tmp3 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp2, i32 0, i32 0
  %tmp4 = load i32, i32 addrspace(5)* %tmp3
  store i32 %tmp4, i32 addrspace(1)* %out
  ret void
}

; Kernel @vector_read_bitcast_gep: like a plain indexed read of a four-element
; i32 array, except that element 0 is written through a float pointer obtained
; by bitcasting the element's i32 pointer.
; The opt pipelines are expected to still produce a constant <4 x i32>; its
; first lane, 1065353216 (0x3F800000), is the bit pattern of float 1.0.

; OPT-LABEL: @vector_read_bitcast_gep(
; OPT: %0 = extractelement <4 x i32> <i32 1065353216, i32 1, i32 2, i32 3>, i32 %index
; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
define amdgpu_kernel void @vector_read_bitcast_gep(i32 addrspace(1)* %out, i32 %index) {
entry:
  %tmp = alloca [4 x i32], addrspace(5)
  ; Constant-index pointers to each of the four elements.
  %x = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0
  %y = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
  %z = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
  %w = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
  ; Element 0 is stored as a float through a bitcast of its pointer.
  %bc = bitcast i32 addrspace(5)* %x to float addrspace(5)*
  store float 1.0, float addrspace(5)* %bc
  ; The remaining elements are stored as plain i32 values.
  store i32 1, i32 addrspace(5)* %y
  store i32 2, i32 addrspace(5)* %z
  store i32 3, i32 addrspace(5)* %w
  ; Dynamically indexed i32 read: the element is chosen by %index.
  %tmp1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index
  %tmp2 = load i32, i32 addrspace(5)* %tmp1
  store i32 %tmp2, i32 addrspace(1)* %out
  ret void
}

; Kernel @vector_read_bitcast_alloca: allocates a [4 x i32] array but bitcasts
; the alloca itself to a [4 x float] pointer, so every store and the indexed
; load go through float-typed pointers.
; The opt pipelines are expected to produce a constant <4 x float> holding the
; stored values and an extractelement on %index.

; OPT-LABEL: @vector_read_bitcast_alloca(
; OPT: %0 = extractelement <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, i32 %index
; OPT: store float %0, float addrspace(1)* %out, align 4
define amdgpu_kernel void @vector_read_bitcast_alloca(float addrspace(1)* %out, i32 %index) {
entry:
  %tmp = alloca [4 x i32], addrspace(5)
  ; Reinterpret the whole i32 array as an array of four floats.
  %tmp.bc = bitcast [4 x i32] addrspace(5)* %tmp to [4 x float] addrspace(5)*
  ; Constant-index pointers to each of the four float elements.
  %x = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 0
  %y = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 1
  %z = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 2
  %w = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 3
  ; Initialise the array to {0.0, 1.0, 2.0, 4.0}.
  store float 0.0, float addrspace(5)* %x
  store float 1.0, float addrspace(5)* %y
  store float 2.0, float addrspace(5)* %z
  store float 4.0, float addrspace(5)* %w
  ; Dynamically indexed float read: the element is chosen by %index.
  %tmp1 = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 %index
  %tmp2 = load float, float addrspace(5)* %tmp1
  store float %tmp2, float addrspace(1)* %out
  ret void
}

; The pointer arguments in local address space should not affect promotion to vector.
;
; Kernel @vector_read_with_local_arg: the same array fill and indexed read as
; a plain four-element i32 read, with an extra addrspace(3) pointer argument
; (%stopper) that the body never uses. The expected opt output is the same
; constant <4 x i32> plus extractelement on %index.

; OPT-LABEL: @vector_read_with_local_arg(
; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
define amdgpu_kernel void @vector_read_with_local_arg(i32 addrspace(3)* %stopper, i32 addrspace(1)* %out, i32 %index) {
entry:
  %tmp = alloca [4 x i32], addrspace(5)
  ; Constant-index pointers to each of the four elements.
  %x = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0
  %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
  %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
  %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
  ; Initialise the array to {0, 1, 2, 3}.
  store i32 0, i32 addrspace(5)* %x
  store i32 1, i32 addrspace(5)* %y
  store i32 2, i32 addrspace(5)* %z
  store i32 3, i32 addrspace(5)* %w
  ; Dynamically indexed read: the element is chosen by the %index argument.
  %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index
  %tmp2 = load i32, i32 addrspace(5)* %tmp1
  store i32 %tmp2, i32 addrspace(1)* %out
  ret void
}