; RUN: llc -march=amdgcn -mcpu=verde -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=verde -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG -check-prefix=FUNC %s
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-promote-alloca -sroa -instcombine < %s | FileCheck -check-prefix=OPT %s
7
; vector_read: fill a private [4 x i32] array with the constants 0, 1, 2, 3,
; then load one element at the run-time index %index and write it to %out.
; The opt checks below expect the array to become a constant <4 x i32> and
; the dynamic load to become an extractelement.

; OPT-LABEL: @vector_read(
; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
; OPT: store i32 %0, i32 addrspace(1)* %out, align 4

; FUNC-LABEL: {{^}}vector_read:
; EG: MOV
; EG: MOV
; EG: MOV
; EG: MOV
; EG: MOVA_INT
define void @vector_read(i32 addrspace(1)* %out, i32 %index) {
entry:
  %arr = alloca [4 x i32]
  ; Constant-index pointers to each of the four slots.
  %elt0 = getelementptr [4 x i32], [4 x i32]* %arr, i32 0, i32 0
  %elt1 = getelementptr [4 x i32], [4 x i32]* %arr, i32 0, i32 1
  %elt2 = getelementptr [4 x i32], [4 x i32]* %arr, i32 0, i32 2
  %elt3 = getelementptr [4 x i32], [4 x i32]* %arr, i32 0, i32 3
  ; Initialise slot i with the value i.
  store i32 0, i32* %elt0
  store i32 1, i32* %elt1
  store i32 2, i32* %elt2
  store i32 3, i32* %elt3
  ; Dynamically indexed read, result written to the global output pointer.
  %dyn.ptr = getelementptr [4 x i32], [4 x i32]* %arr, i32 0, i32 %index
  %val = load i32, i32* %dyn.ptr
  store i32 %val, i32 addrspace(1)* %out
  ret void
}
34
; vector_write: zero a private [4 x i32] array, store 1 at the run-time index
; %w_index, then load the element at the run-time index %r_index and write it
; to %out. The opt checks below expect an insertelement into a zero vector
; followed by an extractelement.

; OPT-LABEL: @vector_write(
; OPT: %0 = insertelement <4 x i32> zeroinitializer, i32 1, i32 %w_index
; OPT: %1 = extractelement <4 x i32> %0, i32 %r_index
; OPT: store i32 %1, i32 addrspace(1)* %out, align 4

; FUNC-LABEL: {{^}}vector_write:
; EG: MOV
; EG: MOV
; EG: MOV
; EG: MOV
; EG: MOVA_INT
; EG: MOVA_INT
define void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
entry:
  %arr = alloca [4 x i32]
  ; Constant-index pointers to each of the four slots.
  %elt0 = getelementptr [4 x i32], [4 x i32]* %arr, i32 0, i32 0
  %elt1 = getelementptr [4 x i32], [4 x i32]* %arr, i32 0, i32 1
  %elt2 = getelementptr [4 x i32], [4 x i32]* %arr, i32 0, i32 2
  %elt3 = getelementptr [4 x i32], [4 x i32]* %arr, i32 0, i32 3
  ; Zero-initialise every slot.
  store i32 0, i32* %elt0
  store i32 0, i32* %elt1
  store i32 0, i32* %elt2
  store i32 0, i32* %elt3
  ; Dynamically indexed write of the constant 1.
  %wr.ptr = getelementptr [4 x i32], [4 x i32]* %arr, i32 0, i32 %w_index
  store i32 1, i32* %wr.ptr
  ; Dynamically indexed read, result written to the global output pointer.
  %rd.ptr = getelementptr [4 x i32], [4 x i32]* %arr, i32 0, i32 %r_index
  %val = load i32, i32* %rd.ptr
  store i32 %val, i32 addrspace(1)* %out
  ret void
}
65
; bitcast_gep: zero a private [4 x i32] array, take the address of element 1,
; bitcast that i32* back to [4 x i32]*, and load through a constant-zero GEP
; on the cast pointer. Every slot holds 0, so this test should be optimized to:
; store i32 0, i32 addrspace(1)* %out

; The store is matched with a plain check rather than a label check: label
; checks are for unique block-delimiting identifiers such as the function
; name, not for instructions inside the body.
; OPT-LABEL: @bitcast_gep(
; OPT: store i32 0, i32 addrspace(1)* %out, align 4

; FUNC-LABEL: {{^}}bitcast_gep:
; EG: STORE_RAW
define void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
entry:
  %tmp = alloca [4 x i32]
  ; Constant-index pointers to each of the four slots.
  %x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0
  %y = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 1
  %z = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 2
  %w = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 3
  ; Zero-initialise every slot.
  store i32 0, i32* %x
  store i32 0, i32* %y
  store i32 0, i32* %z
  store i32 0, i32* %w
  ; Address of element 1, reinterpreted as the start of a [4 x i32] array.
  %tmp1 = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 1
  %tmp2 = bitcast i32* %tmp1 to [4 x i32]*
  ; Element 0 of the reinterpreted array, i.e. element 1 of %tmp.
  %tmp3 = getelementptr [4 x i32], [4 x i32]* %tmp2, i32 0, i32 0
  %tmp4 = load i32, i32* %tmp3
  store i32 %tmp4, i32 addrspace(1)* %out
  ret void
}
92
; vector_read_bitcast_gep: like a plain dynamic read of a [4 x i32] array,
; except that slot 0 is written through a float* bitcast of its address.
; The expected vector constant therefore carries 1065353216 (0x3F800000, the
; bit pattern of float 1.0) in lane 0.

; OPT-LABEL: @vector_read_bitcast_gep(
; OPT: %0 = extractelement <4 x i32> <i32 1065353216, i32 1, i32 2, i32 3>, i32 %index
; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
define void @vector_read_bitcast_gep(i32 addrspace(1)* %out, i32 %index) {
entry:
  %arr = alloca [4 x i32]
  ; Constant-index pointers to each of the four slots.
  %elt0 = getelementptr inbounds [4 x i32], [4 x i32]* %arr, i32 0, i32 0
  %elt1 = getelementptr inbounds [4 x i32], [4 x i32]* %arr, i32 0, i32 1
  %elt2 = getelementptr inbounds [4 x i32], [4 x i32]* %arr, i32 0, i32 2
  %elt3 = getelementptr inbounds [4 x i32], [4 x i32]* %arr, i32 0, i32 3
  ; Slot 0 is stored as a float through a bitcast of the element pointer.
  %elt0.f32 = bitcast i32* %elt0 to float*
  store float 1.0, float* %elt0.f32
  ; The remaining slots are stored as ordinary i32 values.
  store i32 1, i32* %elt1
  store i32 2, i32* %elt2
  store i32 3, i32* %elt3
  ; Dynamically indexed i32 read, result written to the global output pointer.
  %dyn.ptr = getelementptr inbounds [4 x i32], [4 x i32]* %arr, i32 0, i32 %index
  %val = load i32, i32* %dyn.ptr
  store i32 %val, i32 addrspace(1)* %out
  ret void
}
113
; vector_read_bitcast_alloca: the [4 x i32] alloca is bitcast as a whole to
; [4 x float]* and every access goes through the cast pointer.
;
; FIXME: This case ought to be promotable. Instcombine should fold the cast
; when it has a single use, so in practice this may not matter.
; The checks below record the current, unpromoted result: an alloca of
; [4 x float] with four float stores and one float load still present.

; OPT-LABEL: @vector_read_bitcast_alloca(
; OPT: alloca [4 x float]
; OPT: store float
; OPT: store float
; OPT: store float
; OPT: store float
; OPT: load float
define void @vector_read_bitcast_alloca(float addrspace(1)* %out, i32 %index) {
entry:
  %arr = alloca [4 x i32]
  ; View the whole integer array as an array of floats.
  %arr.f32 = bitcast [4 x i32]* %arr to [4 x float]*
  ; Constant-index pointers to each of the four float slots.
  %elt0 = getelementptr inbounds [4 x float], [4 x float]* %arr.f32, i32 0, i32 0
  %elt1 = getelementptr inbounds [4 x float], [4 x float]* %arr.f32, i32 0, i32 1
  %elt2 = getelementptr inbounds [4 x float], [4 x float]* %arr.f32, i32 0, i32 2
  %elt3 = getelementptr inbounds [4 x float], [4 x float]* %arr.f32, i32 0, i32 3
  store float 0.0, float* %elt0
  store float 1.0, float* %elt1
  store float 2.0, float* %elt2
  store float 4.0, float* %elt3
  ; Dynamically indexed float read, result written to the global output pointer.
  %dyn.ptr = getelementptr inbounds [4 x float], [4 x float]* %arr.f32, i32 0, i32 %index
  %val = load float, float* %dyn.ptr
  store float %val, float addrspace(1)* %out
  ret void
}
141