; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s --check-prefix=GCN
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; init.exec with an all-ones constant: a single 64-bit scalar move of -1 into
; exec is expected, and it must be emitted ahead of the floating-point add even
; though the intrinsic call follows the fadd in the IR.
; GCN-LABEL: {{^}}full_mask:
; GCN: s_mov_b64 exec, -1
; GCN: v_add_f32_e32 v0,
define amdgpu_ps float @full_mask(float %a, float %b) {
main_body:
  %s = fadd float %a, %b
  call void @llvm.amdgcn.init.exec(i64 -1)
  ret float %s
}

; init.exec with an arbitrary constant mask: 123456 (0x1e240) is expected to be
; moved into exec as a literal, again ahead of the floating-point add.
; GCN-LABEL: {{^}}partial_mask:
; GCN: s_mov_b64 exec, 0x1e240
; GCN: v_add_f32_e32 v0,
define amdgpu_ps float @partial_mask(float %a, float %b) {
main_body:
  %s = fadd float %a, %b
  call void @llvm.amdgcn.init.exec(i64 123456)
  ret float %s
}

; init.exec.from.input reading the fourth inreg argument (s3) at bit offset 8.
; Expected sequence: extract the bit field from s3, build exec from it with
; s_bfm_b64, then override exec with all ones when the extracted value is 64.
; The low bits of the s_bfe immediate (0x..08) carry the bit offset passed as
; the intrinsic's second operand.
; GCN-LABEL: {{^}}input_s3off8:
; GCN: s_bfe_u32 s0, s3, 0x70008
; GCN: s_bfm_b64 exec, s0, 0
; GCN: s_cmp_eq_u32 s0, 64
; GCN: s_cmov_b64 exec, -1
; GCN: v_add_f32_e32 v0,
define amdgpu_ps float @input_s3off8(i32 inreg, i32 inreg, i32 inreg, i32 inreg %count, float %a, float %b) {
main_body:
  %s = fadd float %a, %b
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)
  ret float %s
}

; init.exec.from.input reading the first inreg argument (s0) at bit offset 19
; (0x13 in the low bits of the s_bfe immediate). The extracted value may be
; written back over s0 here because nothing else uses the input afterwards.
; GCN-LABEL: {{^}}input_s0off19:
; GCN: s_bfe_u32 s0, s0, 0x70013
; GCN: s_bfm_b64 exec, s0, 0
; GCN: s_cmp_eq_u32 s0, 64
; GCN: s_cmov_b64 exec, -1
; GCN: v_add_f32_e32 v0,
define amdgpu_ps float @input_s0off19(i32 inreg %count, float %a, float %b) {
main_body:
  %s = fadd float %a, %b
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19)
  ret float %s
}

; The input SGPR is also consumed by ordinary code after the intrinsic call.
; The bit-field extract must therefore go to a separate register (s1) so that
; the integer add can still read the original value from s0.
; GCN-LABEL: {{^}}reuse_input:
; GCN: s_bfe_u32 s1, s0, 0x70013
; GCN: s_bfm_b64 exec, s1, 0
; GCN: s_cmp_eq_u32 s1, 64
; GCN: s_cmov_b64 exec, -1
; GCN: v_add{{(_nc)?}}_u32_e32 v0, s0, v0
define amdgpu_ps float @reuse_input(i32 inreg %count, i32 %a) {
main_body:
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19)
  %s = add i32 %a, %count
  %f = sitofp i32 %s to float
  ret float %f
}

; Same as reuse_input, except that the intrinsic call is placed after the
; arithmetic in the IR. The expected output is unchanged: exec is set up first
; and the add still reads the untouched input from s0.
; GCN-LABEL: {{^}}reuse_input2:
; GCN: s_bfe_u32 s1, s0, 0x70013
; GCN: s_bfm_b64 exec, s1, 0
; GCN: s_cmp_eq_u32 s1, 64
; GCN: s_cmov_b64 exec, -1
; GCN: v_add{{(_nc)?}}_u32_e32 v0, s0, v0
define amdgpu_ps float @reuse_input2(i32 inreg %count, i32 %a) {
main_body:
  %s = add i32 %a, %count
  %f = sitofp i32 %s to float
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 19)
  ret float %f
}

; GCN-LABEL: {{^}}init_unreachable:
;
; This used to crash. Only the function label is checked: the test passes as
; long as llc compiles a block that calls init.exec and then ends in
; 'unreachable' instead of a return.
define amdgpu_ps void @init_unreachable() {
main_body:
  call void @llvm.amdgcn.init.exec(i64 -1)
  unreachable
}

; With stack objects present, exec must be initialized before any vector
; instruction (such as the moves that feed the scratch accesses) is emitted.
;
; The negative check allows leading whitespace after the line anchor: llc
; indents every instruction, so a pattern anchored directly at column 0 could
; never match a vector instruction and would not guard anything.
; GCN-LABEL: {{^}}init_exec_before_frame_materialize:
; GCN-NOT: {{^[[:space:]]*}}v_
; GCN: s_mov_b64 exec, -1
; GCN: v_mov
; GCN: v_add
define amdgpu_ps float @init_exec_before_frame_materialize(i32 inreg %a, i32 inreg %b) {
main_body:
  %array0 = alloca [1024 x i32], align 16, addrspace(5)
  %array1 = alloca [20 x i32], align 16, addrspace(5)
  call void @llvm.amdgcn.init.exec(i64 -1)

  ; Constant-index stores into both arrays.
  %ptr0 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 1
  store i32 %a, i32 addrspace(5)* %ptr0, align 4

  %ptr1 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 1
  store i32 %a, i32 addrspace(5)* %ptr1, align 4

  %ptr2 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 2
  store i32 %b, i32 addrspace(5)* %ptr2, align 4

  ; Variable-index loads from both arrays, indexed by %b.
  %ptr3 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 %b
  %v3 = load i32, i32 addrspace(5)* %ptr3, align 4

  %ptr4 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 %b
  %v4 = load i32, i32 addrspace(5)* %ptr4, align 4

  %v5 = add i32 %v3, %v4
  %v = bitcast i32 %v5 to float
  ret float %v
}

; Same stack-object scenario, but exec comes from an input SGPR (third inreg
; argument, bit offset 8). The four-instruction exec setup must be contiguous
; and must precede every vector instruction.
;
; The negative check allows leading whitespace after the line anchor: llc
; indents every instruction, so a pattern anchored directly at column 0 could
; never match a vector instruction and would not guard anything.
; GCN-LABEL: {{^}}init_exec_input_before_frame_materialize:
; GCN-NOT: {{^[[:space:]]*}}v_
; GCN: s_bfe_u32 s2, s2, 0x70008
; GCN-NEXT: s_bfm_b64 exec, s2, 0
; GCN-NEXT: s_cmp_eq_u32 s2, 64
; GCN-NEXT: s_cmov_b64 exec, -1
; GCN: v_mov
; GCN: v_add
define amdgpu_ps float @init_exec_input_before_frame_materialize(i32 inreg %a, i32 inreg %b, i32 inreg %count) {
main_body:
  %array0 = alloca [1024 x i32], align 16, addrspace(5)
  %array1 = alloca [20 x i32], align 16, addrspace(5)
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)

  ; Constant-index stores into both arrays.
  %ptr0 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 1
  store i32 %a, i32 addrspace(5)* %ptr0, align 4

  %ptr1 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 1
  store i32 %a, i32 addrspace(5)* %ptr1, align 4

  %ptr2 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 2
  store i32 %b, i32 addrspace(5)* %ptr2, align 4

  ; Variable-index loads from both arrays, indexed by %b.
  %ptr3 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 %b
  %v3 = load i32, i32 addrspace(5)* %ptr3, align 4

  %ptr4 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 %b
  %v4 = load i32, i32 addrspace(5)* %ptr4, align 4

  %v5 = add i32 %v3, %v4
  %v = bitcast i32 %v5 to float
  ret float %v
}

; Variant where init.exec.from.input is called in a block other than the entry
; block. No vector instruction may appear before the %endif block is reached,
; and the exec setup sequence inside it must be contiguous. The input register
; s2 is used again after the call, so the extract destination is matched with
; a FileCheck variable rather than a fixed register.
;
; The negative check allows leading whitespace after the line anchor: llc
; indents every instruction, so a pattern anchored directly at column 0 could
; never match a vector instruction and would not guard anything.
; GCN-LABEL: {{^}}init_exec_input_before_frame_materialize_nonentry:
; GCN-NOT: {{^[[:space:]]*}}v_
; GCN: %endif
; GCN: s_bfe_u32 [[S:s[0-9]+]], s2, 0x70008
; GCN-NEXT: s_bfm_b64 exec, [[S]], 0
; GCN-NEXT: s_cmp_eq_u32 [[S]], 64
; GCN-NEXT: s_cmov_b64 exec, -1
; GCN: v_mov
; GCN: v_add
define amdgpu_ps float @init_exec_input_before_frame_materialize_nonentry(i32 inreg %a, i32 inreg %b, i32 inreg %count) {
main_body:
  ; Ideally these allocas would be in %endif, but this causes problems on Windows GlobalISel.
  %array0 = alloca [1024 x i32], align 16, addrspace(5)
  %array1 = alloca [20 x i32], align 16, addrspace(5)

  %cc = icmp uge i32 %count, 32
  br i1 %cc, label %endif, label %if

if:
  ; Empty side-effecting inline asm gives this block a body of its own.
  call void asm sideeffect "", ""()
  br label %endif

endif:
  call void @llvm.amdgcn.init.exec.from.input(i32 %count, i32 8)

  ; Constant-index stores into both arrays.
  %ptr0 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 1
  store i32 %a, i32 addrspace(5)* %ptr0, align 4

  %ptr1 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 1
  store i32 %a, i32 addrspace(5)* %ptr1, align 4

  %ptr2 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 2
  store i32 %b, i32 addrspace(5)* %ptr2, align 4

  ; Variable-index loads from both arrays, indexed by %b.
  %ptr3 = getelementptr inbounds [20 x i32], [20 x i32] addrspace(5)* %array1, i32 0, i32 %b
  %v3 = load i32, i32 addrspace(5)* %ptr3, align 4

  %ptr4 = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(5)* %array0, i32 0, i32 %b
  %v4 = load i32, i32 addrspace(5)* %ptr4, align 4

  ; %count is added in so the input stays live past the init.exec call.
  %v5 = add i32 %v3, %v4
  %v6 = add i32 %v5, %count
  %v = bitcast i32 %v6 to float
  ret float %v
}

; Intrinsics under test. Both share attribute group #1, which marks them
; convergent.
declare void @llvm.amdgcn.init.exec(i64) #1
declare void @llvm.amdgcn.init.exec.from.input(i32, i32) #1

attributes #1 = { convergent }
203