; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=FLAT
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=FLAT
; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -global-isel -verify-machineinstrs | FileCheck %s --check-prefix=GISEL

declare i32 @llvm.amdgcn.workitem.id.x() #1

declare i16 @llvm.bitreverse.i16(i16) #1
declare i32 @llvm.bitreverse.i32(i32) #1
declare i64 @llvm.bitreverse.i64(i64) #1

declare <2 x i32> @llvm.bitreverse.v2i32(<2 x i32>) #1
declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) #1

declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) #1
declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) #1

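; Scalar i16 bitreverse of a kernel argument: selected as s_brev_b32 followed by a 16-bit right shift.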
define amdgpu_kernel void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 {
; SI-LABEL: s_brev_i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_brev_b32 s0, s0
; SI-NEXT:    s_lshr_b32 s0, s0, 16
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: s_brev_i16:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT:    s_load_dword s0, s[0:1], 0x2c
; FLAT-NEXT:    s_mov_b32 s7, 0xf000
; FLAT-NEXT:    s_mov_b32 s6, -1
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    s_brev_b32 s0, s0
; FLAT-NEXT:    s_lshr_b32 s0, s0, 16
; FLAT-NEXT:    v_mov_b32_e32 v0, s0
; FLAT-NEXT:    buffer_store_short v0, off, s[4:7], 0
; FLAT-NEXT:    s_endpgm
;
; GISEL-LABEL: s_brev_i16:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-NEXT:    s_load_dword s0, s[0:1], 0x2c
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v0, s2
; GISEL-NEXT:    s_and_b32 s0, s0, 0xffff
; GISEL-NEXT:    s_brev_b32 s0, s0
; GISEL-NEXT:    s_lshr_b32 s0, s0, 16
; GISEL-NEXT:    v_mov_b32_e32 v2, s0
; GISEL-NEXT:    v_mov_b32_e32 v1, s3
; GISEL-NEXT:    flat_store_short v[0:1], v2
; GISEL-NEXT:    s_endpgm
  %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
  store i16 %brev, i16 addrspace(1)* %out
  ret void
}

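; i16 bitreverse of a loaded value: uses v_bfrev_b32_e32 followed by a 16-bit right shift.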
define amdgpu_kernel void @v_brev_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) #0 {
; SI-LABEL: v_brev_i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s2, s6
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bfrev_b32_e32 v0, v0
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: v_brev_i16:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; FLAT-NEXT:    s_mov_b32 s7, 0xf000
; FLAT-NEXT:    s_mov_b32 s6, -1
; FLAT-NEXT:    s_mov_b32 s2, s6
; FLAT-NEXT:    s_mov_b32 s3, s7
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
; FLAT-NEXT:    s_waitcnt vmcnt(0)
; FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
; FLAT-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; FLAT-NEXT:    buffer_store_short v0, off, s[4:7], 0
; FLAT-NEXT:    s_endpgm
;
; GISEL-LABEL: v_brev_i16:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v0, s0
; GISEL-NEXT:    v_mov_b32_e32 v1, s1
; GISEL-NEXT:    flat_load_ushort v0, v[0:1]
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    v_bfrev_b32_e32 v0, v0
; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
; GISEL-NEXT:    v_mov_b32_e32 v0, s2
; GISEL-NEXT:    v_mov_b32_e32 v1, s3
; GISEL-NEXT:    flat_store_short v[0:1], v2
; GISEL-NEXT:    s_endpgm
  %val = load i16, i16 addrspace(1)* %valptr
  %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
  store i16 %brev, i16 addrspace(1)* %out
  ret void
}

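; Scalar i32 bitreverse of a kernel argument: a single s_brev_b32.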
define amdgpu_kernel void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) #0 {
; SI-LABEL: s_brev_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_brev_b32 s0, s0
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: s_brev_i32:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT:    s_load_dword s0, s[0:1], 0x2c
; FLAT-NEXT:    s_mov_b32 s7, 0xf000
; FLAT-NEXT:    s_mov_b32 s6, -1
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    s_brev_b32 s0, s0
; FLAT-NEXT:    v_mov_b32_e32 v0, s0
; FLAT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; FLAT-NEXT:    s_endpgm
;
; GISEL-LABEL: s_brev_i32:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-NEXT:    s_load_dword s0, s[0:1], 0x2c
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v0, s2
; GISEL-NEXT:    s_brev_b32 s0, s0
; GISEL-NEXT:    v_mov_b32_e32 v2, s0
; GISEL-NEXT:    v_mov_b32_e32 v1, s3
; GISEL-NEXT:    flat_store_dword v[0:1], v2
; GISEL-NEXT:    s_endpgm
  %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
  store i32 %brev, i32 addrspace(1)* %out
  ret void
}

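; Per-lane i32 bitreverse of a value loaded at a workitem-indexed address: a single v_bfrev_b32_e32.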
define amdgpu_kernel void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 {
; SI-LABEL: v_brev_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bfrev_b32_e32 v0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: v_brev_i32:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; FLAT-NEXT:    s_mov_b32 s7, 0xf000
; FLAT-NEXT:    s_mov_b32 s6, -1
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    v_mov_b32_e32 v1, s1
; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; FLAT-NEXT:    flat_load_dword v0, v[0:1]
; FLAT-NEXT:    s_waitcnt vmcnt(0)
; FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
; FLAT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; FLAT-NEXT:    s_endpgm
;
; GISEL-LABEL: v_brev_i32:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v0, s0
; GISEL-NEXT:    v_mov_b32_e32 v1, s1
; GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GISEL-NEXT:    flat_load_dword v0, v[0:1]
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    v_bfrev_b32_e32 v2, v0
; GISEL-NEXT:    v_mov_b32_e32 v0, s2
; GISEL-NEXT:    v_mov_b32_e32 v1, s3
; GISEL-NEXT:    flat_store_dword v[0:1], v2
; GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
  %val = load i32, i32 addrspace(1)* %gep
  %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
  store i32 %brev, i32 addrspace(1)* %out
  ret void
}

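; Scalar <2 x i32> bitreverse: one s_brev_b32 per element.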
define amdgpu_kernel void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> %val) #0 {
; SI-LABEL: s_brev_v2i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_brev_b32 s1, s1
; SI-NEXT:    s_brev_b32 s0, s0
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: s_brev_v2i32:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; FLAT-NEXT:    s_mov_b32 s7, 0xf000
; FLAT-NEXT:    s_mov_b32 s6, -1
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    s_brev_b32 s1, s1
; FLAT-NEXT:    s_brev_b32 s0, s0
; FLAT-NEXT:    v_mov_b32_e32 v0, s0
; FLAT-NEXT:    v_mov_b32_e32 v1, s1
; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; FLAT-NEXT:    s_endpgm
;
; GISEL-LABEL: s_brev_v2i32:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v2, s2
; GISEL-NEXT:    s_brev_b32 s0, s0
; GISEL-NEXT:    s_brev_b32 s1, s1
; GISEL-NEXT:    v_mov_b32_e32 v0, s0
; GISEL-NEXT:    v_mov_b32_e32 v1, s1
; GISEL-NEXT:    v_mov_b32_e32 v3, s3
; GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-NEXT:    s_endpgm
  %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
  store <2 x i32> %brev, <2 x i32> addrspace(1)* %out
  ret void
}

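; Per-lane <2 x i32> bitreverse of a loaded vector: one v_bfrev_b32_e32 per element.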
define amdgpu_kernel void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) #0 {
; SI-LABEL: v_brev_v2i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bfrev_b32_e32 v1, v1
; SI-NEXT:    v_bfrev_b32_e32 v0, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: v_brev_v2i32:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; FLAT-NEXT:    s_mov_b32 s7, 0xf000
; FLAT-NEXT:    s_mov_b32 s6, -1
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    v_mov_b32_e32 v1, s1
; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; FLAT-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; FLAT-NEXT:    s_waitcnt vmcnt(0)
; FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
; FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; FLAT-NEXT:    s_endpgm
;
; GISEL-LABEL: v_brev_v2i32:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v0, s0
; GISEL-NEXT:    v_mov_b32_e32 v1, s1
; GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GISEL-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; GISEL-NEXT:    v_mov_b32_e32 v2, s2
; GISEL-NEXT:    v_mov_b32_e32 v3, s3
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    v_bfrev_b32_e32 v0, v0
; GISEL-NEXT:    v_bfrev_b32_e32 v1, v1
; GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
  %val = load <2 x i32>, <2 x i32> addrspace(1)* %gep
  %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
  store <2 x i32> %brev, <2 x i32> addrspace(1)* %out
  ret void
}

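; Scalar i64 bitreverse: a single s_brev_b64.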
define amdgpu_kernel void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) #0 {
; SI-LABEL: s_brev_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_brev_b64 s[0:1], s[0:1]
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: s_brev_i64:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; FLAT-NEXT:    s_mov_b32 s7, 0xf000
; FLAT-NEXT:    s_mov_b32 s6, -1
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    s_brev_b64 s[0:1], s[0:1]
; FLAT-NEXT:    v_mov_b32_e32 v0, s0
; FLAT-NEXT:    v_mov_b32_e32 v1, s1
; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; FLAT-NEXT:    s_endpgm
;
; GISEL-LABEL: s_brev_i64:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v2, s2
; GISEL-NEXT:    s_brev_b64 s[0:1], s[0:1]
; GISEL-NEXT:    v_mov_b32_e32 v0, s0
; GISEL-NEXT:    v_mov_b32_e32 v1, s1
; GISEL-NEXT:    v_mov_b32_e32 v3, s3
; GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-NEXT:    s_endpgm
  %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
  store i64 %brev, i64 addrspace(1)* %out
  ret void
}

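; i64 bitreverse of a loaded value: v_bfrev_b32_e32 on each half, with the halves swapped for the store.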
define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 {
; SI-LABEL: v_brev_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bfrev_b32_e32 v2, v0
; SI-NEXT:    v_bfrev_b32_e32 v1, v1
; SI-NEXT:    buffer_store_dwordx2 v[1:2], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: v_brev_i64:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; FLAT-NEXT:    s_mov_b32 s7, 0xf000
; FLAT-NEXT:    s_mov_b32 s6, -1
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    v_mov_b32_e32 v1, s1
; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; FLAT-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; FLAT-NEXT:    s_waitcnt vmcnt(0)
; FLAT-NEXT:    v_bfrev_b32_e32 v2, v0
; FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
; FLAT-NEXT:    buffer_store_dwordx2 v[1:2], off, s[4:7], 0
; FLAT-NEXT:    s_endpgm
;
; GISEL-LABEL: v_brev_i64:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v4, s3
; GISEL-NEXT:    v_mov_b32_e32 v0, s0
; GISEL-NEXT:    v_mov_b32_e32 v1, s1
; GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GISEL-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
; GISEL-NEXT:    v_mov_b32_e32 v3, s2
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    v_bfrev_b32_e32 v1, v1
; GISEL-NEXT:    v_bfrev_b32_e32 v2, v0
; GISEL-NEXT:    flat_store_dwordx2 v[3:4], v[1:2]
; GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr i64, i64 addrspace(1)* %valptr, i32 %tid
  %val = load i64, i64 addrspace(1)* %gep
  %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
  store i64 %brev, i64 addrspace(1)* %out
  ret void
}

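; Scalar <2 x i64> bitreverse: one s_brev_b64 per element.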
define amdgpu_kernel void @s_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %val) #0 {
; SI-LABEL: s_brev_v2i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_brev_b64 s[2:3], s[2:3]
; SI-NEXT:    s_brev_b64 s[0:1], s[0:1]
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    v_mov_b32_e32 v2, s2
; SI-NEXT:    v_mov_b32_e32 v3, s3
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: s_brev_v2i64:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; FLAT-NEXT:    s_mov_b32 s7, 0xf000
; FLAT-NEXT:    s_mov_b32 s6, -1
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    s_brev_b64 s[2:3], s[2:3]
; FLAT-NEXT:    s_brev_b64 s[0:1], s[0:1]
; FLAT-NEXT:    v_mov_b32_e32 v0, s0
; FLAT-NEXT:    v_mov_b32_e32 v1, s1
; FLAT-NEXT:    v_mov_b32_e32 v2, s2
; FLAT-NEXT:    v_mov_b32_e32 v3, s3
; FLAT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; FLAT-NEXT:    s_endpgm
;
; GISEL-LABEL: s_brev_v2i64:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v4, s4
; GISEL-NEXT:    s_brev_b64 s[0:1], s[0:1]
; GISEL-NEXT:    s_brev_b64 s[2:3], s[2:3]
; GISEL-NEXT:    v_mov_b32_e32 v0, s0
; GISEL-NEXT:    v_mov_b32_e32 v1, s1
; GISEL-NEXT:    v_mov_b32_e32 v2, s2
; GISEL-NEXT:    v_mov_b32_e32 v3, s3
; GISEL-NEXT:    v_mov_b32_e32 v5, s5
; GISEL-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; GISEL-NEXT:    s_endpgm
  %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
  store <2 x i64> %brev, <2 x i64> addrspace(1)* %out
  ret void
}

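; <2 x i64> bitreverse of a loaded vector: v_bfrev_b32_e32 on each dword, swapping the halves of each element.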
define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 {
; SI-LABEL: v_brev_v2i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bfrev_b32_e32 v4, v2
; SI-NEXT:    v_bfrev_b32_e32 v3, v3
; SI-NEXT:    v_bfrev_b32_e32 v2, v0
; SI-NEXT:    v_bfrev_b32_e32 v1, v1
; SI-NEXT:    buffer_store_dwordx4 v[1:4], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: v_brev_v2i64:
; FLAT:       ; %bb.0:
; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
; FLAT-NEXT:    s_mov_b32 s7, 0xf000
; FLAT-NEXT:    s_mov_b32 s6, -1
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    v_mov_b32_e32 v1, s1
; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; FLAT-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; FLAT-NEXT:    s_waitcnt vmcnt(0)
; FLAT-NEXT:    v_bfrev_b32_e32 v4, v2
; FLAT-NEXT:    v_bfrev_b32_e32 v3, v3
; FLAT-NEXT:    v_bfrev_b32_e32 v2, v0
; FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
; FLAT-NEXT:    buffer_store_dwordx4 v[1:4], off, s[4:7], 0
; FLAT-NEXT:    s_endpgm
;
; GISEL-LABEL: v_brev_v2i64:
; GISEL:       ; %bb.0:
; GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; GISEL-NEXT:    v_mov_b32_e32 v0, s0
; GISEL-NEXT:    v_mov_b32_e32 v1, s1
; GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GISEL-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
; GISEL-NEXT:    s_waitcnt vmcnt(0)
; GISEL-NEXT:    v_bfrev_b32_e32 v4, v1
; GISEL-NEXT:    v_bfrev_b32_e32 v5, v0
; GISEL-NEXT:    v_mov_b32_e32 v0, s2
; GISEL-NEXT:    v_bfrev_b32_e32 v6, v3
; GISEL-NEXT:    v_bfrev_b32_e32 v7, v2
; GISEL-NEXT:    v_mov_b32_e32 v1, s3
; GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
; GISEL-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %valptr, i32 %tid
  %val = load <2 x i64>, <2 x i64> addrspace(1)* %gep
  %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
  store <2 x i64> %brev, <2 x i64> addrspace(1)* %out
  ret void
}

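; Truncate of the i32 argument feeds an i16 bitreverse whose result is bitcast to half and extended to float;
; the reverse is done in 32 bits and the high half is extracted (shift on SI, SDWA word select on VI+).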
define float @missing_truncate_promote_bitreverse(i32 %arg) {
; SI-LABEL: missing_truncate_promote_bitreverse:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_bfrev_b32_e32 v0, v0
; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; FLAT-LABEL: missing_truncate_promote_bitreverse:
; FLAT:       ; %bb.0: ; %bb
; FLAT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
; FLAT-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; FLAT-NEXT:    s_setpc_b64 s[30:31]
;
; GISEL-LABEL: missing_truncate_promote_bitreverse:
; GISEL:       ; %bb.0: ; %bb
; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT:    v_bfrev_b32_e32 v0, v0
; GISEL-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GISEL-NEXT:    s_setpc_b64 s[30:31]
bb:
  %tmp = trunc i32 %arg to i16
  %tmp1 = call i16 @llvm.bitreverse.i16(i16 %tmp)
  %tmp2 = bitcast i16 %tmp1 to half
  %tmp3 = fpext half %tmp2 to float
  ret float %tmp3
}

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }