1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
3; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=FLAT
4; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=FLAT
5; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -global-isel -verify-machineinstrs | FileCheck %s --check-prefix=GISEL
6
; Intrinsic declarations; all of them carry attribute group #1.
; workitem.id.x supplies the per-lane index used by the wider v_* tests.
7declare i32 @llvm.amdgcn.workitem.id.x() #1
8
; Scalar bitreverse overloads.
9declare i16 @llvm.bitreverse.i16(i16) #1
10declare i32 @llvm.bitreverse.i32(i32) #1
11declare i64 @llvm.bitreverse.i64(i64) #1
12
; Vector-of-i32 overloads.
; NOTE(review): the v4i32 variant is not called anywhere in the visible part of this file.
13declare <2 x i32> @llvm.bitreverse.v2i32(<2 x i32>) #1
14declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) #1
15
; Vector-of-i64 overloads.
; NOTE(review): the v4i64 variant is not called anywhere in the visible part of this file.
16declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) #1
17declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) #1
18
; s_brev_i16: bit-reverse an i16 kernel argument and store the result to %out.
; All three prefixes expect a 32-bit s_brev_b32 followed by a right shift by 16;
; the GISEL output additionally masks the input with 0xffff before reversing.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the script rather than editing by hand.
19define amdgpu_kernel void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 {
20; SI-LABEL: s_brev_i16:
21; SI:       ; %bb.0:
22; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
23; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
24; SI-NEXT:    s_mov_b32 s3, 0xf000
25; SI-NEXT:    s_mov_b32 s2, -1
26; SI-NEXT:    s_waitcnt lgkmcnt(0)
27; SI-NEXT:    s_brev_b32 s4, s4
28; SI-NEXT:    s_lshr_b32 s4, s4, 16
29; SI-NEXT:    v_mov_b32_e32 v0, s4
30; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
31; SI-NEXT:    s_endpgm
32;
33; FLAT-LABEL: s_brev_i16:
34; FLAT:       ; %bb.0:
35; FLAT-NEXT:    s_load_dword s4, s[0:1], 0x2c
36; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
37; FLAT-NEXT:    s_mov_b32 s3, 0xf000
38; FLAT-NEXT:    s_mov_b32 s2, -1
39; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
40; FLAT-NEXT:    s_brev_b32 s4, s4
41; FLAT-NEXT:    s_lshr_b32 s4, s4, 16
42; FLAT-NEXT:    v_mov_b32_e32 v0, s4
43; FLAT-NEXT:    buffer_store_short v0, off, s[0:3], 0
44; FLAT-NEXT:    s_endpgm
45;
46; GISEL-LABEL: s_brev_i16:
47; GISEL:       ; %bb.0:
48; GISEL-NEXT:    s_load_dword s2, s[0:1], 0x2c
49; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
50; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
51; GISEL-NEXT:    s_and_b32 s2, s2, 0xffff
52; GISEL-NEXT:    s_brev_b32 s2, s2
53; GISEL-NEXT:    s_lshr_b32 s2, s2, 16
54; GISEL-NEXT:    v_mov_b32_e32 v0, s0
55; GISEL-NEXT:    v_mov_b32_e32 v2, s2
56; GISEL-NEXT:    v_mov_b32_e32 v1, s1
57; GISEL-NEXT:    flat_store_short v[0:1], v2
58; GISEL-NEXT:    s_endpgm
59  %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
60  store i16 %brev, i16 addrspace(1)* %out
61  ret void
62}
63
; v_brev_i16: load an i16 from %valptr, bit-reverse it, and store it to %out.
; All prefixes expect a 32-bit v_bfrev_b32 followed by v_lshrrev_b32 by 16.
; NOTE(review): unlike the wider v_* tests, the load address here is not
; indexed by the workitem id; the value is still reversed with VALU
; instructions in the expected output.
64define amdgpu_kernel void @v_brev_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) #0 {
65; SI-LABEL: v_brev_i16:
66; SI:       ; %bb.0:
67; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
68; SI-NEXT:    s_mov_b32 s3, 0xf000
69; SI-NEXT:    s_mov_b32 s2, -1
70; SI-NEXT:    s_mov_b32 s6, s2
71; SI-NEXT:    s_mov_b32 s7, s3
72; SI-NEXT:    s_waitcnt lgkmcnt(0)
73; SI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
74; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
75; SI-NEXT:    s_waitcnt vmcnt(0)
76; SI-NEXT:    v_bfrev_b32_e32 v0, v0
77; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
78; SI-NEXT:    s_waitcnt lgkmcnt(0)
79; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
80; SI-NEXT:    s_endpgm
81;
82; FLAT-LABEL: v_brev_i16:
83; FLAT:       ; %bb.0:
84; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
85; FLAT-NEXT:    s_mov_b32 s3, 0xf000
86; FLAT-NEXT:    s_mov_b32 s2, -1
87; FLAT-NEXT:    s_mov_b32 s6, s2
88; FLAT-NEXT:    s_mov_b32 s7, s3
89; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
90; FLAT-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
91; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
92; FLAT-NEXT:    s_waitcnt vmcnt(0)
93; FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
94; FLAT-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
95; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
96; FLAT-NEXT:    buffer_store_short v0, off, s[0:3], 0
97; FLAT-NEXT:    s_endpgm
98;
99; GISEL-LABEL: v_brev_i16:
100; GISEL:       ; %bb.0:
101; GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
102; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
103; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
104; GISEL-NEXT:    v_mov_b32_e32 v0, s2
105; GISEL-NEXT:    v_mov_b32_e32 v1, s3
106; GISEL-NEXT:    flat_load_ushort v0, v[0:1]
107; GISEL-NEXT:    s_waitcnt vmcnt(0)
108; GISEL-NEXT:    v_bfrev_b32_e32 v0, v0
109; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
110; GISEL-NEXT:    v_mov_b32_e32 v0, s0
111; GISEL-NEXT:    v_mov_b32_e32 v1, s1
112; GISEL-NEXT:    flat_store_short v[0:1], v2
113; GISEL-NEXT:    s_endpgm
114  %val = load i16, i16 addrspace(1)* %valptr
115  %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
116  store i16 %brev, i16 addrspace(1)* %out
117  ret void
118}
119
; s_brev_i32: bit-reverse an i32 kernel argument and store the result to %out.
; Every prefix expects exactly one s_brev_b32 on the loaded argument; the
; prefixes differ only in kernarg offsets, register choice, and the store
; instruction (buffer_store_dword vs. flat_store_dword).
120define amdgpu_kernel void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) #0 {
121; SI-LABEL: s_brev_i32:
122; SI:       ; %bb.0:
123; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
124; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
125; SI-NEXT:    s_mov_b32 s3, 0xf000
126; SI-NEXT:    s_mov_b32 s2, -1
127; SI-NEXT:    s_waitcnt lgkmcnt(0)
128; SI-NEXT:    s_brev_b32 s4, s4
129; SI-NEXT:    v_mov_b32_e32 v0, s4
130; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
131; SI-NEXT:    s_endpgm
132;
133; FLAT-LABEL: s_brev_i32:
134; FLAT:       ; %bb.0:
135; FLAT-NEXT:    s_load_dword s4, s[0:1], 0x2c
136; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
137; FLAT-NEXT:    s_mov_b32 s3, 0xf000
138; FLAT-NEXT:    s_mov_b32 s2, -1
139; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
140; FLAT-NEXT:    s_brev_b32 s4, s4
141; FLAT-NEXT:    v_mov_b32_e32 v0, s4
142; FLAT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
143; FLAT-NEXT:    s_endpgm
144;
145; GISEL-LABEL: s_brev_i32:
146; GISEL:       ; %bb.0:
147; GISEL-NEXT:    s_load_dword s2, s[0:1], 0x2c
148; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
149; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
150; GISEL-NEXT:    s_brev_b32 s2, s2
151; GISEL-NEXT:    v_mov_b32_e32 v0, s0
152; GISEL-NEXT:    v_mov_b32_e32 v2, s2
153; GISEL-NEXT:    v_mov_b32_e32 v1, s1
154; GISEL-NEXT:    flat_store_dword v[0:1], v2
155; GISEL-NEXT:    s_endpgm
156  %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
157  store i32 %brev, i32 addrspace(1)* %out
158  ret void
159}
160
; v_brev_i32: each workitem loads the i32 at %valptr[workitem.id.x],
; bit-reverses it, and stores the result to %out (the store address is not
; indexed by the workitem id).
; Every prefix expects a single v_bfrev_b32. The workitem id is scaled by 4
; (v_lshlrev_b32 by 2) to form the load address.
161define amdgpu_kernel void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 {
162; SI-LABEL: v_brev_i32:
163; SI:       ; %bb.0:
164; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
165; SI-NEXT:    s_mov_b32 s3, 0xf000
166; SI-NEXT:    s_mov_b32 s6, 0
167; SI-NEXT:    s_mov_b32 s7, s3
168; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
169; SI-NEXT:    v_mov_b32_e32 v1, 0
170; SI-NEXT:    s_waitcnt lgkmcnt(0)
171; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
172; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
173; SI-NEXT:    s_mov_b32 s2, -1
174; SI-NEXT:    s_waitcnt vmcnt(0)
175; SI-NEXT:    v_bfrev_b32_e32 v0, v0
176; SI-NEXT:    s_waitcnt lgkmcnt(0)
177; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
178; SI-NEXT:    s_endpgm
179;
180; FLAT-LABEL: v_brev_i32:
181; FLAT:       ; %bb.0:
182; FLAT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
183; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
184; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
185; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
186; FLAT-NEXT:    v_mov_b32_e32 v1, s3
187; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
188; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
189; FLAT-NEXT:    flat_load_dword v0, v[0:1]
190; FLAT-NEXT:    s_mov_b32 s3, 0xf000
191; FLAT-NEXT:    s_mov_b32 s2, -1
192; FLAT-NEXT:    s_waitcnt vmcnt(0)
193; FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
194; FLAT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
195; FLAT-NEXT:    s_endpgm
196;
197; GISEL-LABEL: v_brev_i32:
198; GISEL:       ; %bb.0:
199; GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
200; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
201; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
202; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
203; GISEL-NEXT:    v_mov_b32_e32 v0, s2
204; GISEL-NEXT:    v_mov_b32_e32 v1, s3
205; GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
206; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
207; GISEL-NEXT:    flat_load_dword v0, v[0:1]
208; GISEL-NEXT:    s_waitcnt vmcnt(0)
209; GISEL-NEXT:    v_bfrev_b32_e32 v2, v0
210; GISEL-NEXT:    v_mov_b32_e32 v0, s0
211; GISEL-NEXT:    v_mov_b32_e32 v1, s1
212; GISEL-NEXT:    flat_store_dword v[0:1], v2
213; GISEL-NEXT:    s_endpgm
214  %tid = call i32 @llvm.amdgcn.workitem.id.x()
215  %gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
216  %val = load i32, i32 addrspace(1)* %gep
217  %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
218  store i32 %brev, i32 addrspace(1)* %out
219  ret void
220}
221
; s_brev_v2i32: bit-reverse a <2 x i32> kernel argument element-wise and
; store the vector to %out.
; Every prefix expects two independent s_brev_b32 instructions, one per
; element, each writing back to the register it read (no lane swap).
222define amdgpu_kernel void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> %val) #0 {
223; SI-LABEL: s_brev_v2i32:
224; SI:       ; %bb.0:
225; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
226; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
227; SI-NEXT:    s_mov_b32 s3, 0xf000
228; SI-NEXT:    s_mov_b32 s2, -1
229; SI-NEXT:    s_waitcnt lgkmcnt(0)
230; SI-NEXT:    s_brev_b32 s5, s5
231; SI-NEXT:    s_brev_b32 s4, s4
232; SI-NEXT:    v_mov_b32_e32 v0, s4
233; SI-NEXT:    v_mov_b32_e32 v1, s5
234; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
235; SI-NEXT:    s_endpgm
236;
237; FLAT-LABEL: s_brev_v2i32:
238; FLAT:       ; %bb.0:
239; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
240; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
241; FLAT-NEXT:    s_mov_b32 s3, 0xf000
242; FLAT-NEXT:    s_mov_b32 s2, -1
243; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
244; FLAT-NEXT:    s_brev_b32 s5, s5
245; FLAT-NEXT:    s_brev_b32 s4, s4
246; FLAT-NEXT:    v_mov_b32_e32 v0, s4
247; FLAT-NEXT:    v_mov_b32_e32 v1, s5
248; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
249; FLAT-NEXT:    s_endpgm
250;
251; GISEL-LABEL: s_brev_v2i32:
252; GISEL:       ; %bb.0:
253; GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
254; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
255; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
256; GISEL-NEXT:    s_brev_b32 s2, s2
257; GISEL-NEXT:    s_brev_b32 s3, s3
258; GISEL-NEXT:    v_mov_b32_e32 v0, s2
259; GISEL-NEXT:    v_mov_b32_e32 v3, s1
260; GISEL-NEXT:    v_mov_b32_e32 v1, s3
261; GISEL-NEXT:    v_mov_b32_e32 v2, s0
262; GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
263; GISEL-NEXT:    s_endpgm
264  %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
265  store <2 x i32> %brev, <2 x i32> addrspace(1)* %out
266  ret void
267}
268
; v_brev_v2i32: each workitem loads the <2 x i32> at %valptr[workitem.id.x],
; bit-reverses both elements, and stores the vector to %out.
; Every prefix expects two v_bfrev_b32 instructions that keep each element in
; its own register. The workitem id is scaled by 8 (v_lshlrev_b32 by 3) to
; form the load address.
269define amdgpu_kernel void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) #0 {
270; SI-LABEL: v_brev_v2i32:
271; SI:       ; %bb.0:
272; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
273; SI-NEXT:    s_mov_b32 s3, 0xf000
274; SI-NEXT:    s_mov_b32 s6, 0
275; SI-NEXT:    s_mov_b32 s7, s3
276; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
277; SI-NEXT:    v_mov_b32_e32 v1, 0
278; SI-NEXT:    s_waitcnt lgkmcnt(0)
279; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
280; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
281; SI-NEXT:    s_mov_b32 s2, -1
282; SI-NEXT:    s_waitcnt vmcnt(0)
283; SI-NEXT:    v_bfrev_b32_e32 v1, v1
284; SI-NEXT:    v_bfrev_b32_e32 v0, v0
285; SI-NEXT:    s_waitcnt lgkmcnt(0)
286; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
287; SI-NEXT:    s_endpgm
288;
289; FLAT-LABEL: v_brev_v2i32:
290; FLAT:       ; %bb.0:
291; FLAT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
292; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
293; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
294; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
295; FLAT-NEXT:    v_mov_b32_e32 v1, s3
296; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
297; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
298; FLAT-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
299; FLAT-NEXT:    s_mov_b32 s3, 0xf000
300; FLAT-NEXT:    s_mov_b32 s2, -1
301; FLAT-NEXT:    s_waitcnt vmcnt(0)
302; FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
303; FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
304; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
305; FLAT-NEXT:    s_endpgm
306;
307; GISEL-LABEL: v_brev_v2i32:
308; GISEL:       ; %bb.0:
309; GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
310; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
311; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
312; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
313; GISEL-NEXT:    v_mov_b32_e32 v0, s2
314; GISEL-NEXT:    v_mov_b32_e32 v1, s3
315; GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
316; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
317; GISEL-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
318; GISEL-NEXT:    v_mov_b32_e32 v3, s1
319; GISEL-NEXT:    v_mov_b32_e32 v2, s0
320; GISEL-NEXT:    s_waitcnt vmcnt(0)
321; GISEL-NEXT:    v_bfrev_b32_e32 v0, v0
322; GISEL-NEXT:    v_bfrev_b32_e32 v1, v1
323; GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
324; GISEL-NEXT:    s_endpgm
325  %tid = call i32 @llvm.amdgcn.workitem.id.x()
326  %gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
327  %val = load <2 x i32>, <2 x i32> addrspace(1)* %gep
328  %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
329  store <2 x i32> %brev, <2 x i32> addrspace(1)* %out
330  ret void
331}
332
; s_brev_i64: bit-reverse an i64 kernel argument and store the result to %out.
; Every prefix expects a single 64-bit s_brev_b64 on the loaded register pair,
; rather than two 32-bit reversals.
333define amdgpu_kernel void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) #0 {
334; SI-LABEL: s_brev_i64:
335; SI:       ; %bb.0:
336; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
337; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
338; SI-NEXT:    s_mov_b32 s3, 0xf000
339; SI-NEXT:    s_mov_b32 s2, -1
340; SI-NEXT:    s_waitcnt lgkmcnt(0)
341; SI-NEXT:    s_brev_b64 s[4:5], s[4:5]
342; SI-NEXT:    v_mov_b32_e32 v0, s4
343; SI-NEXT:    v_mov_b32_e32 v1, s5
344; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
345; SI-NEXT:    s_endpgm
346;
347; FLAT-LABEL: s_brev_i64:
348; FLAT:       ; %bb.0:
349; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
350; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
351; FLAT-NEXT:    s_mov_b32 s3, 0xf000
352; FLAT-NEXT:    s_mov_b32 s2, -1
353; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
354; FLAT-NEXT:    s_brev_b64 s[4:5], s[4:5]
355; FLAT-NEXT:    v_mov_b32_e32 v0, s4
356; FLAT-NEXT:    v_mov_b32_e32 v1, s5
357; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
358; FLAT-NEXT:    s_endpgm
359;
360; GISEL-LABEL: s_brev_i64:
361; GISEL:       ; %bb.0:
362; GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
363; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
364; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
365; GISEL-NEXT:    s_brev_b64 s[2:3], s[2:3]
366; GISEL-NEXT:    v_mov_b32_e32 v0, s2
367; GISEL-NEXT:    v_mov_b32_e32 v3, s1
368; GISEL-NEXT:    v_mov_b32_e32 v1, s3
369; GISEL-NEXT:    v_mov_b32_e32 v2, s0
370; GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
371; GISEL-NEXT:    s_endpgm
372  %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
373  store i64 %brev, i64 addrspace(1)* %out
374  ret void
375}
376
; v_brev_i64: each workitem loads the i64 at %valptr[workitem.id.x],
; bit-reverses it, and stores the result to %out.
; The expected output contains no 64-bit vector reverse: every prefix uses two
; v_bfrev_b32 instructions and swaps the halves. The reversed low dword (from
; v0) lands in v2 and the reversed high dword stays in v1, so the stored pair
; v[1:2] has the two dwords exchanged.
377define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 {
378; SI-LABEL: v_brev_i64:
379; SI:       ; %bb.0:
380; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
381; SI-NEXT:    s_mov_b32 s3, 0xf000
382; SI-NEXT:    s_mov_b32 s6, 0
383; SI-NEXT:    s_mov_b32 s7, s3
384; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
385; SI-NEXT:    v_mov_b32_e32 v1, 0
386; SI-NEXT:    s_waitcnt lgkmcnt(0)
387; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64
388; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
389; SI-NEXT:    s_mov_b32 s2, -1
390; SI-NEXT:    s_waitcnt vmcnt(0)
391; SI-NEXT:    v_bfrev_b32_e32 v2, v0
392; SI-NEXT:    v_bfrev_b32_e32 v1, v1
393; SI-NEXT:    s_waitcnt lgkmcnt(0)
394; SI-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
395; SI-NEXT:    s_endpgm
396;
397; FLAT-LABEL: v_brev_i64:
398; FLAT:       ; %bb.0:
399; FLAT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
400; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
401; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
402; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
403; FLAT-NEXT:    v_mov_b32_e32 v1, s3
404; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
405; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
406; FLAT-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
407; FLAT-NEXT:    s_mov_b32 s3, 0xf000
408; FLAT-NEXT:    s_mov_b32 s2, -1
409; FLAT-NEXT:    s_waitcnt vmcnt(0)
410; FLAT-NEXT:    v_bfrev_b32_e32 v2, v0
411; FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
412; FLAT-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
413; FLAT-NEXT:    s_endpgm
414;
415; GISEL-LABEL: v_brev_i64:
416; GISEL:       ; %bb.0:
417; GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
418; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
419; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
420; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
421; GISEL-NEXT:    v_mov_b32_e32 v0, s2
422; GISEL-NEXT:    v_mov_b32_e32 v1, s3
423; GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
424; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
425; GISEL-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
426; GISEL-NEXT:    v_mov_b32_e32 v4, s1
427; GISEL-NEXT:    v_mov_b32_e32 v3, s0
428; GISEL-NEXT:    s_waitcnt vmcnt(0)
429; GISEL-NEXT:    v_bfrev_b32_e32 v1, v1
430; GISEL-NEXT:    v_bfrev_b32_e32 v2, v0
431; GISEL-NEXT:    flat_store_dwordx2 v[3:4], v[1:2]
432; GISEL-NEXT:    s_endpgm
433  %tid = call i32 @llvm.amdgcn.workitem.id.x()
434  %gep = getelementptr i64, i64 addrspace(1)* %valptr, i32 %tid
435  %val = load i64, i64 addrspace(1)* %gep
436  %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
437  store i64 %brev, i64 addrspace(1)* %out
438  ret void
439}
440
; s_brev_v2i64: bit-reverse a <2 x i64> kernel argument element-wise and
; store the vector to %out.
; Every prefix expects two s_brev_b64 instructions, one per 64-bit element.
; The vector argument is loaded with a single s_load_dwordx4 at a larger
; kernarg offset (0xd / 0x34) than the two-dword arguments of the earlier
; tests (0xb / 0x2c).
441define amdgpu_kernel void @s_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %val) #0 {
442; SI-LABEL: s_brev_v2i64:
443; SI:       ; %bb.0:
444; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
445; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
446; SI-NEXT:    s_mov_b32 s3, 0xf000
447; SI-NEXT:    s_mov_b32 s2, -1
448; SI-NEXT:    s_waitcnt lgkmcnt(0)
449; SI-NEXT:    s_brev_b64 s[6:7], s[6:7]
450; SI-NEXT:    s_brev_b64 s[4:5], s[4:5]
451; SI-NEXT:    v_mov_b32_e32 v0, s4
452; SI-NEXT:    v_mov_b32_e32 v1, s5
453; SI-NEXT:    v_mov_b32_e32 v2, s6
454; SI-NEXT:    v_mov_b32_e32 v3, s7
455; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
456; SI-NEXT:    s_endpgm
457;
458; FLAT-LABEL: s_brev_v2i64:
459; FLAT:       ; %bb.0:
460; FLAT-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
461; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
462; FLAT-NEXT:    s_mov_b32 s3, 0xf000
463; FLAT-NEXT:    s_mov_b32 s2, -1
464; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
465; FLAT-NEXT:    s_brev_b64 s[6:7], s[6:7]
466; FLAT-NEXT:    s_brev_b64 s[4:5], s[4:5]
467; FLAT-NEXT:    v_mov_b32_e32 v0, s4
468; FLAT-NEXT:    v_mov_b32_e32 v1, s5
469; FLAT-NEXT:    v_mov_b32_e32 v2, s6
470; FLAT-NEXT:    v_mov_b32_e32 v3, s7
471; FLAT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
472; FLAT-NEXT:    s_endpgm
473;
474; GISEL-LABEL: s_brev_v2i64:
475; GISEL:       ; %bb.0:
476; GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
477; GISEL-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
478; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
479; GISEL-NEXT:    s_brev_b64 s[0:1], s[4:5]
480; GISEL-NEXT:    s_brev_b64 s[2:3], s[6:7]
481; GISEL-NEXT:    v_mov_b32_e32 v0, s0
482; GISEL-NEXT:    v_mov_b32_e32 v4, s8
483; GISEL-NEXT:    v_mov_b32_e32 v1, s1
484; GISEL-NEXT:    v_mov_b32_e32 v2, s2
485; GISEL-NEXT:    v_mov_b32_e32 v3, s3
486; GISEL-NEXT:    v_mov_b32_e32 v5, s9
487; GISEL-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
488; GISEL-NEXT:    s_endpgm
489  %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
490  store <2 x i64> %brev, <2 x i64> addrspace(1)* %out
491  ret void
492}
493
; v_brev_v2i64: each workitem loads the <2 x i64> at %valptr[workitem.id.x],
; bit-reverses both elements, and stores the vector to %out.
; Every prefix expects four v_bfrev_b32 instructions. Within each 64-bit
; element the two dwords are exchanged after reversal (e.g. in the SI/FLAT
; output v0 -> v2 and v1 -> v1 for element 0, v2 -> v4 and v3 -> v3 for
; element 1, stored as v[1:4]). The workitem id is scaled by 16
; (v_lshlrev_b32 by 4) to form the load address.
494define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 {
495; SI-LABEL: v_brev_v2i64:
496; SI:       ; %bb.0:
497; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
498; SI-NEXT:    s_mov_b32 s3, 0xf000
499; SI-NEXT:    s_mov_b32 s6, 0
500; SI-NEXT:    s_mov_b32 s7, s3
501; SI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
502; SI-NEXT:    v_mov_b32_e32 v1, 0
503; SI-NEXT:    s_waitcnt lgkmcnt(0)
504; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[4:7], 0 addr64
505; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
506; SI-NEXT:    s_mov_b32 s2, -1
507; SI-NEXT:    s_waitcnt vmcnt(0)
508; SI-NEXT:    v_bfrev_b32_e32 v4, v2
509; SI-NEXT:    v_bfrev_b32_e32 v3, v3
510; SI-NEXT:    v_bfrev_b32_e32 v2, v0
511; SI-NEXT:    v_bfrev_b32_e32 v1, v1
512; SI-NEXT:    s_waitcnt lgkmcnt(0)
513; SI-NEXT:    buffer_store_dwordx4 v[1:4], off, s[0:3], 0
514; SI-NEXT:    s_endpgm
515;
516; FLAT-LABEL: v_brev_v2i64:
517; FLAT:       ; %bb.0:
518; FLAT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
519; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
520; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
521; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
522; FLAT-NEXT:    v_mov_b32_e32 v1, s3
523; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
524; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
525; FLAT-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
526; FLAT-NEXT:    s_mov_b32 s3, 0xf000
527; FLAT-NEXT:    s_mov_b32 s2, -1
528; FLAT-NEXT:    s_waitcnt vmcnt(0)
529; FLAT-NEXT:    v_bfrev_b32_e32 v4, v2
530; FLAT-NEXT:    v_bfrev_b32_e32 v3, v3
531; FLAT-NEXT:    v_bfrev_b32_e32 v2, v0
532; FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
533; FLAT-NEXT:    buffer_store_dwordx4 v[1:4], off, s[0:3], 0
534; FLAT-NEXT:    s_endpgm
535;
536; GISEL-LABEL: v_brev_v2i64:
537; GISEL:       ; %bb.0:
538; GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
539; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
540; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
541; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
542; GISEL-NEXT:    v_mov_b32_e32 v0, s2
543; GISEL-NEXT:    v_mov_b32_e32 v1, s3
544; GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
545; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
546; GISEL-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
547; GISEL-NEXT:    s_waitcnt vmcnt(0)
548; GISEL-NEXT:    v_bfrev_b32_e32 v4, v1
549; GISEL-NEXT:    v_bfrev_b32_e32 v5, v0
550; GISEL-NEXT:    v_mov_b32_e32 v0, s0
551; GISEL-NEXT:    v_bfrev_b32_e32 v6, v3
552; GISEL-NEXT:    v_bfrev_b32_e32 v7, v2
553; GISEL-NEXT:    v_mov_b32_e32 v1, s1
554; GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
555; GISEL-NEXT:    s_endpgm
556  %tid = call i32 @llvm.amdgcn.workitem.id.x()
557  %gep = getelementptr <2 x i64> , <2 x i64> addrspace(1)* %valptr, i32 %tid
558  %val = load <2 x i64>, <2 x i64> addrspace(1)* %gep
559  %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
560  store <2 x i64> %brev, <2 x i64> addrspace(1)* %out
561  ret void
562}
563
; missing_truncate_promote_bitreverse: a non-kernel function (returns via
; s_setpc_b64) that truncates an i32 to i16, bit-reverses the i16, reinterprets
; the bits as half, and extends to float.
; Every prefix expects a 32-bit v_bfrev_b32 with the upper 16 bits of the
; result feeding the f16->f32 conversion: SI uses an explicit
; v_lshrrev_b32 by 16, while FLAT and GISEL select the high word through the
; SDWA operand (src0_sel:WORD_1).
; NOTE(review): the name suggests this is a regression test for a promotion
; bug around the truncate; the originating bug is not referenced here.
564define float @missing_truncate_promote_bitreverse(i32 %arg) {
565; SI-LABEL: missing_truncate_promote_bitreverse:
566; SI:       ; %bb.0: ; %bb
567; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
568; SI-NEXT:    v_bfrev_b32_e32 v0, v0
569; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
570; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
571; SI-NEXT:    s_setpc_b64 s[30:31]
572;
573; FLAT-LABEL: missing_truncate_promote_bitreverse:
574; FLAT:       ; %bb.0: ; %bb
575; FLAT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
576; FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
577; FLAT-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
578; FLAT-NEXT:    s_setpc_b64 s[30:31]
579;
580; GISEL-LABEL: missing_truncate_promote_bitreverse:
581; GISEL:       ; %bb.0: ; %bb
582; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
583; GISEL-NEXT:    v_bfrev_b32_e32 v0, v0
584; GISEL-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
585; GISEL-NEXT:    s_setpc_b64 s[30:31]
586bb:
587  %tmp = trunc i32 %arg to i16
588  %tmp1 = call i16 @llvm.bitreverse.i16(i16 %tmp)
589  %tmp2 = bitcast i16 %tmp1 to half
590  %tmp3 = fpext half %tmp2 to float
591  ret float %tmp3
592}
593
; Attribute groups referenced above: #0 is attached to the amdgpu_kernel
; definitions; #1 is attached to the intrinsic declarations and to the
; bitreverse call sites inside the kernels.
594attributes #0 = { nounwind }
595attributes #1 = { nounwind readnone }
596