; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s

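; Check instruction selection for (xor (op (op a, b), c), -1) chains, with op
; in {or, and, xor}, for i32 and i64, in both divergent (VALU) and uniform
; (SALU) forms.

; Divergent i32 or3: the two ORs combine into a single v_or3_b32; the trailing
; NOT stays a separate v_not_b32_e32.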
define amdgpu_kernel void @divergent_or3_b32(<3 x i32> addrspace(1)* %arg) {
; GCN-LABEL: divergent_or3_b32:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT:    v_lshlrev_b32_e32 v3, 4, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    global_load_dwordx3 v[0:2], v3, s[0:1]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_or3_b32 v0, v1, v0, v2
; GCN-NEXT:    v_not_b32_e32 v0, v0
; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
; GCN-NEXT:    s_endpgm
bb:
  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i1 = zext i32 %i to i64
  %i2 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %arg, i64 %i1
  %i3 = load <3 x i32>, <3 x i32> addrspace(1)* %i2, align 16
  %i4 = extractelement <3 x i32> %i3, i64 0
  %i5 = extractelement <3 x i32> %i3, i64 1
  %i6 = extractelement <3 x i32> %i3, i64 2
  %i7 = or i32 %i5, %i4
  %i8 = or i32 %i7, %i6
  %i9 = xor i32 %i8, -1
  %i10 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %i2, i64 0, i64 0
  store i32 %i9, i32 addrspace(1)* %i10, align 16
  ret void
}

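; Divergent i64 or3: split into two 32-bit halves, each half using v_or3_b32
; followed by v_not_b32_e32.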
define amdgpu_kernel void @divergent_or3_b64(<3 x i64> addrspace(1)* %arg) {
; GCN-LABEL: divergent_or3_b64:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT:    v_lshlrev_b32_e32 v6, 5, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    global_load_dwordx2 v[4:5], v6, s[0:1] offset:16
; GCN-NEXT:    global_load_dwordx4 v[0:3], v6, s[0:1]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_or3_b32 v1, v3, v1, v5
; GCN-NEXT:    v_or3_b32 v0, v2, v0, v4
; GCN-NEXT:    v_not_b32_e32 v1, v1
; GCN-NEXT:    v_not_b32_e32 v0, v0
; GCN-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1]
; GCN-NEXT:    s_endpgm
bb:
  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i1 = zext i32 %i to i64
  %i2 = getelementptr inbounds <3 x i64>, <3 x i64> addrspace(1)* %arg, i64 %i1
  %i3 = load <3 x i64>, <3 x i64> addrspace(1)* %i2, align 32
  %i4 = extractelement <3 x i64> %i3, i64 0
  %i5 = extractelement <3 x i64> %i3, i64 1
  %i6 = extractelement <3 x i64> %i3, i64 2
  %i7 = or i64 %i5, %i4
  %i8 = or i64 %i7, %i6
  %i9 = xor i64 %i8, -1
  %i10 = getelementptr inbounds <3 x i64>, <3 x i64> addrspace(1)* %i2, i64 0, i64 0
  store i64 %i9, i64 addrspace(1)* %i10, align 32
  ret void
}

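; Divergent i32 and3: unlike or3, this stays as two v_and_b32_e32 (no
; three-input VALU AND on this target), plus a v_not_b32_e32.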
define amdgpu_kernel void @divergent_and3_b32(<3 x i32> addrspace(1)* %arg) {
; GCN-LABEL: divergent_and3_b32:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT:    v_lshlrev_b32_e32 v3, 4, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    global_load_dwordx3 v[0:2], v3, s[0:1]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_and_b32_e32 v0, v1, v0
; GCN-NEXT:    v_and_b32_e32 v0, v0, v2
; GCN-NEXT:    v_not_b32_e32 v0, v0
; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
; GCN-NEXT:    s_endpgm
bb:
  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i1 = zext i32 %i to i64
  %i2 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %arg, i64 %i1
  %i3 = load <3 x i32>, <3 x i32> addrspace(1)* %i2, align 16
  %i4 = extractelement <3 x i32> %i3, i64 0
  %i5 = extractelement <3 x i32> %i3, i64 1
  %i6 = extractelement <3 x i32> %i3, i64 2
  %i7 = and i32 %i5, %i4
  %i8 = and i32 %i7, %i6
  %i9 = xor i32 %i8, -1
  %i10 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %i2, i64 0, i64 0
  store i32 %i9, i32 addrspace(1)* %i10, align 16
  ret void
}

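; Divergent i64 and3: two v_and_b32_e32 per 32-bit half, then v_not_b32_e32.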
define amdgpu_kernel void @divergent_and3_b64(<3 x i64> addrspace(1)* %arg) {
; GCN-LABEL: divergent_and3_b64:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT:    v_lshlrev_b32_e32 v6, 5, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    global_load_dwordx4 v[0:3], v6, s[0:1]
; GCN-NEXT:    global_load_dwordx2 v[4:5], v6, s[0:1] offset:16
; GCN-NEXT:    s_waitcnt vmcnt(1)
; GCN-NEXT:    v_and_b32_e32 v1, v3, v1
; GCN-NEXT:    v_and_b32_e32 v0, v2, v0
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_and_b32_e32 v1, v1, v5
; GCN-NEXT:    v_and_b32_e32 v0, v0, v4
; GCN-NEXT:    v_not_b32_e32 v1, v1
; GCN-NEXT:    v_not_b32_e32 v0, v0
; GCN-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1]
; GCN-NEXT:    s_endpgm
bb:
  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i1 = zext i32 %i to i64
  %i2 = getelementptr inbounds <3 x i64>, <3 x i64> addrspace(1)* %arg, i64 %i1
  %i3 = load <3 x i64>, <3 x i64> addrspace(1)* %i2, align 32
  %i4 = extractelement <3 x i64> %i3, i64 0
  %i5 = extractelement <3 x i64> %i3, i64 1
  %i6 = extractelement <3 x i64> %i3, i64 2
  %i7 = and i64 %i5, %i4
  %i8 = and i64 %i7, %i6
  %i9 = xor i64 %i8, -1
  %i10 = getelementptr inbounds <3 x i64>, <3 x i64> addrspace(1)* %i2, i64 0, i64 0
  store i64 %i9, i64 addrspace(1)* %i10, align 32
  ret void
}

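; Divergent i32 xor3: the trailing NOT folds into the second xor, giving
; v_xor_b32_e32 followed by v_xnor_b32_e32.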
define amdgpu_kernel void @divergent_xor3_b32(<3 x i32> addrspace(1)* %arg) {
; GCN-LABEL: divergent_xor3_b32:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT:    v_lshlrev_b32_e32 v3, 4, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    global_load_dwordx3 v[0:2], v3, s[0:1]
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_xor_b32_e32 v0, v1, v0
; GCN-NEXT:    v_xnor_b32_e32 v0, v0, v2
; GCN-NEXT:    global_store_dword v3, v0, s[0:1]
; GCN-NEXT:    s_endpgm
bb:
  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i1 = zext i32 %i to i64
  %i2 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %arg, i64 %i1
  %i3 = load <3 x i32>, <3 x i32> addrspace(1)* %i2, align 16
  %i4 = extractelement <3 x i32> %i3, i64 0
  %i5 = extractelement <3 x i32> %i3, i64 1
  %i6 = extractelement <3 x i32> %i3, i64 2
  %i7 = xor i32 %i5, %i4
  %i8 = xor i32 %i7, %i6
  %i9 = xor i32 %i8, -1
  %i10 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %i2, i64 0, i64 0
  store i32 %i9, i32 addrspace(1)* %i10, align 16
  ret void
}

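; Divergent i64 xor3: per 32-bit half, v_xor_b32_e32 then v_xnor_b32_e32
; (the NOT again folds into the second xor).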
define amdgpu_kernel void @divergent_xor3_b64(<3 x i64> addrspace(1)* %arg) {
; GCN-LABEL: divergent_xor3_b64:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT:    v_lshlrev_b32_e32 v6, 5, v0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    global_load_dwordx4 v[0:3], v6, s[0:1]
; GCN-NEXT:    global_load_dwordx2 v[4:5], v6, s[0:1] offset:16
; GCN-NEXT:    s_waitcnt vmcnt(1)
; GCN-NEXT:    v_xor_b32_e32 v1, v3, v1
; GCN-NEXT:    v_xor_b32_e32 v0, v2, v0
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_xnor_b32_e32 v1, v1, v5
; GCN-NEXT:    v_xnor_b32_e32 v0, v0, v4
; GCN-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1]
; GCN-NEXT:    s_endpgm
bb:
  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i1 = zext i32 %i to i64
  %i2 = getelementptr inbounds <3 x i64>, <3 x i64> addrspace(1)* %arg, i64 %i1
  %i3 = load <3 x i64>, <3 x i64> addrspace(1)* %i2, align 32
  %i4 = extractelement <3 x i64> %i3, i64 0
  %i5 = extractelement <3 x i64> %i3, i64 1
  %i6 = extractelement <3 x i64> %i3, i64 2
  %i7 = xor i64 %i5, %i4
  %i8 = xor i64 %i7, %i6
  %i9 = xor i64 %i8, -1
  %i10 = getelementptr inbounds <3 x i64>, <3 x i64> addrspace(1)* %i2, i64 0, i64 0
  store i64 %i9, i64 addrspace(1)* %i10, align 32
  ret void
}

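; Uniform i32 or3: stays on the SALU; the final or+not folds into s_nor_b32.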
define amdgpu_kernel void @uniform_or3_b32(<3 x i32> addrspace(1)* %arg) {
; GCN-LABEL: uniform_or3_b32:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_or_b32 s0, s1, s0
; GCN-NEXT:    s_nor_b32 s0, s0, s2
; GCN-NEXT:    v_mov_b32_e32 v1, s0
; GCN-NEXT:    global_store_dword v0, v1, s[4:5]
; GCN-NEXT:    s_endpgm
bb:
  %i3 = load <3 x i32>, <3 x i32> addrspace(1)* %arg, align 16
  %i4 = extractelement <3 x i32> %i3, i64 0
  %i5 = extractelement <3 x i32> %i3, i64 1
  %i6 = extractelement <3 x i32> %i3, i64 2
  %i7 = or i32 %i5, %i4
  %i8 = or i32 %i7, %i6
  %i9 = xor i32 %i8, -1
  %i10 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %arg, i64 0, i64 0
  store i32 %i9, i32 addrspace(1)* %i10, align 16
  ret void
}

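; Uniform i64 or3: s_or_b64 then s_nor_b64 on full 64-bit SGPR pairs.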
define amdgpu_kernel void @uniform_or3_b64(<3 x i64> addrspace(1)* %arg) {
; GCN-LABEL: uniform_or3_b64:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GCN-NEXT:    v_mov_b32_e32 v2, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
; GCN-NEXT:    s_nor_b64 s[0:1], s[0:1], s[6:7]
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
; GCN-NEXT:    s_endpgm
bb:
  %i3 = load <3 x i64>, <3 x i64> addrspace(1)* %arg, align 32
  %i4 = extractelement <3 x i64> %i3, i64 0
  %i5 = extractelement <3 x i64> %i3, i64 1
  %i6 = extractelement <3 x i64> %i3, i64 2
  %i7 = or i64 %i5, %i4
  %i8 = or i64 %i7, %i6
  %i9 = xor i64 %i8, -1
  %i10 = getelementptr inbounds <3 x i64>, <3 x i64> addrspace(1)* %arg, i64 0, i64 0
  store i64 %i9, i64 addrspace(1)* %i10, align 32
  ret void
}

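; Uniform i32 and3: s_and_b32 then s_nand_b32 (the and+not folds).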
define amdgpu_kernel void @uniform_and3_b32(<3 x i32> addrspace(1)* %arg) {
; GCN-LABEL: uniform_and3_b32:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_and_b32 s0, s1, s0
; GCN-NEXT:    s_nand_b32 s0, s0, s2
; GCN-NEXT:    v_mov_b32_e32 v1, s0
; GCN-NEXT:    global_store_dword v0, v1, s[4:5]
; GCN-NEXT:    s_endpgm
bb:
  %i3 = load <3 x i32>, <3 x i32> addrspace(1)* %arg, align 16
  %i4 = extractelement <3 x i32> %i3, i64 0
  %i5 = extractelement <3 x i32> %i3, i64 1
  %i6 = extractelement <3 x i32> %i3, i64 2
  %i7 = and i32 %i5, %i4
  %i8 = and i32 %i7, %i6
  %i9 = xor i32 %i8, -1
  %i10 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %arg, i64 0, i64 0
  store i32 %i9, i32 addrspace(1)* %i10, align 16
  ret void
}

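; Uniform i64 and3: s_and_b64 then s_nand_b64.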
define amdgpu_kernel void @uniform_and3_b64(<3 x i64> addrspace(1)* %arg) {
; GCN-LABEL: uniform_and3_b64:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GCN-NEXT:    v_mov_b32_e32 v2, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
; GCN-NEXT:    s_nand_b64 s[0:1], s[0:1], s[6:7]
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
; GCN-NEXT:    s_endpgm
bb:
  %i3 = load <3 x i64>, <3 x i64> addrspace(1)* %arg, align 32
  %i4 = extractelement <3 x i64> %i3, i64 0
  %i5 = extractelement <3 x i64> %i3, i64 1
  %i6 = extractelement <3 x i64> %i3, i64 2
  %i7 = and i64 %i5, %i4
  %i8 = and i64 %i7, %i6
  %i9 = xor i64 %i8, -1
  %i10 = getelementptr inbounds <3 x i64>, <3 x i64> addrspace(1)* %arg, i64 0, i64 0
  store i64 %i9, i64 addrspace(1)* %i10, align 32
  ret void
}

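; Uniform i32 xor3: s_xor_b32 then s_xnor_b32.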
define amdgpu_kernel void @uniform_xor3_b32(<3 x i32> addrspace(1)* %arg) {
; GCN-LABEL: uniform_xor3_b32:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GCN-NEXT:    v_mov_b32_e32 v0, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_xor_b32 s0, s1, s0
; GCN-NEXT:    s_xnor_b32 s0, s0, s2
; GCN-NEXT:    v_mov_b32_e32 v1, s0
; GCN-NEXT:    global_store_dword v0, v1, s[4:5]
; GCN-NEXT:    s_endpgm
bb:
  %i3 = load <3 x i32>, <3 x i32> addrspace(1)* %arg, align 16
  %i4 = extractelement <3 x i32> %i3, i64 0
  %i5 = extractelement <3 x i32> %i3, i64 1
  %i6 = extractelement <3 x i32> %i3, i64 2
  %i7 = xor i32 %i5, %i4
  %i8 = xor i32 %i7, %i6
  %i9 = xor i32 %i8, -1
  %i10 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %arg, i64 0, i64 0
  store i32 %i9, i32 addrspace(1)* %i10, align 16
  ret void
}

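; Uniform i64 xor3: s_xor_b64 then s_xnor_b64.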
define amdgpu_kernel void @uniform_xor3_b64(<3 x i64> addrspace(1)* %arg) {
; GCN-LABEL: uniform_xor3_b64:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GCN-NEXT:    v_mov_b32_e32 v2, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
; GCN-NEXT:    s_xnor_b64 s[0:1], s[0:1], s[6:7]
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
; GCN-NEXT:    s_endpgm
bb:
  %i3 = load <3 x i64>, <3 x i64> addrspace(1)* %arg, align 32
  %i4 = extractelement <3 x i64> %i3, i64 0
  %i5 = extractelement <3 x i64> %i3, i64 1
  %i6 = extractelement <3 x i64> %i3, i64 2
  %i7 = xor i64 %i5, %i4
  %i8 = xor i64 %i7, %i6
  %i9 = xor i64 %i8, -1
  %i10 = getelementptr inbounds <3 x i64>, <3 x i64> addrspace(1)* %arg, i64 0, i64 0
  store i64 %i9, i64 addrspace(1)* %i10, align 32
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()