1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,GFX89,SI
3; RUN: llc < %s -march=amdgcn -mcpu=tonga  -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,GFX89,VI
4; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,GFX89,GFX9
5; RUN: llc < %s -march=r600 -mcpu=redwood  -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,R600
6
; Declarations of the llvm.fshr (funnel shift right) intrinsic overloads:
; scalar and vector forms over i32, i16, i64 and i24 element types.
7declare i32 @llvm.fshr.i32(i32, i32, i32)
8declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)
9declare <3 x i32> @llvm.fshr.v3i32(<3 x i32>, <3 x i32>, <3 x i32>)
10declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
11declare i16 @llvm.fshr.i16(i16, i16, i16)
12declare <2 x i16> @llvm.fshr.v2i16(<2 x i16>, <2 x i16>, <2 x i16>)
13declare <3 x i16> @llvm.fshr.v3i16(<3 x i16>, <3 x i16>, <3 x i16>)
14declare <4 x i16> @llvm.fshr.v4i16(<4 x i16>, <4 x i16>, <4 x i16>)
15declare i64 @llvm.fshr.i64(i64, i64, i64)
16declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
17declare i24 @llvm.fshr.i24(i24, i24, i24)
18declare <2 x i24> @llvm.fshr.v2i24(<2 x i24>, <2 x i24>, <2 x i24>)
19
; Kernel with scalar i32 operands: calls llvm.fshr.i32 with a variable shift
; amount (%z) and stores the result through %in.
; The amdgcn checks expect a single v_alignbit_b32; the r600 checks expect a
; single BIT_ALIGN_INT.
20define amdgpu_kernel void @fshr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %z) {
21; SI-LABEL: fshr_i32:
22; SI:       ; %bb.0: ; %entry
23; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
24; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
25; SI-NEXT:    s_mov_b32 s7, 0xf000
26; SI-NEXT:    s_mov_b32 s6, -1
27; SI-NEXT:    s_waitcnt lgkmcnt(0)
28; SI-NEXT:    v_mov_b32_e32 v0, s1
29; SI-NEXT:    v_mov_b32_e32 v1, s2
30; SI-NEXT:    v_alignbit_b32 v0, s0, v0, v1
31; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
32; SI-NEXT:    s_endpgm
33;
34; VI-LABEL: fshr_i32:
35; VI:       ; %bb.0: ; %entry
36; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
37; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
38; VI-NEXT:    s_waitcnt lgkmcnt(0)
39; VI-NEXT:    v_mov_b32_e32 v0, s1
40; VI-NEXT:    v_mov_b32_e32 v1, s2
41; VI-NEXT:    v_alignbit_b32 v2, s0, v0, v1
42; VI-NEXT:    v_mov_b32_e32 v0, s4
43; VI-NEXT:    v_mov_b32_e32 v1, s5
44; VI-NEXT:    flat_store_dword v[0:1], v2
45; VI-NEXT:    s_endpgm
46;
47; GFX9-LABEL: fshr_i32:
48; GFX9:       ; %bb.0: ; %entry
49; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
50; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x2c
51; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
52; GFX9-NEXT:    v_mov_b32_e32 v0, s1
53; GFX9-NEXT:    v_mov_b32_e32 v1, s2
54; GFX9-NEXT:    v_alignbit_b32 v2, s0, v0, v1
55; GFX9-NEXT:    v_mov_b32_e32 v0, s4
56; GFX9-NEXT:    v_mov_b32_e32 v1, s5
57; GFX9-NEXT:    global_store_dword v[0:1], v2, off
58; GFX9-NEXT:    s_endpgm
59;
60; R600-LABEL: fshr_i32:
61; R600:       ; %bb.0: ; %entry
62; R600-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
63; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
64; R600-NEXT:    CF_END
65; R600-NEXT:    PAD
66; R600-NEXT:    ALU clause starting at 4:
67; R600-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
68; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
69; R600-NEXT:     BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, KC0[3].X,
70entry:
71  %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
72  store i32 %0, i32 addrspace(1)* %in
73  ret void
74}
75
; Kernel with scalar i32 operands and a constant shift amount of 7; the result
; is stored through %in.
; The amdgcn checks expect 7 as the last operand of a single v_alignbit_b32;
; the r600 checks expect it as the literal operand of BIT_ALIGN_INT.
76define amdgpu_kernel void @fshr_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) {
77; SI-LABEL: fshr_i32_imm:
78; SI:       ; %bb.0: ; %entry
79; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
80; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
81; SI-NEXT:    s_mov_b32 s7, 0xf000
82; SI-NEXT:    s_mov_b32 s6, -1
83; SI-NEXT:    s_waitcnt lgkmcnt(0)
84; SI-NEXT:    v_mov_b32_e32 v0, s1
85; SI-NEXT:    v_alignbit_b32 v0, s0, v0, 7
86; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
87; SI-NEXT:    s_endpgm
88;
89; VI-LABEL: fshr_i32_imm:
90; VI:       ; %bb.0: ; %entry
91; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
92; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
93; VI-NEXT:    s_waitcnt lgkmcnt(0)
94; VI-NEXT:    v_mov_b32_e32 v0, s1
95; VI-NEXT:    v_alignbit_b32 v2, s0, v0, 7
96; VI-NEXT:    v_mov_b32_e32 v0, s2
97; VI-NEXT:    v_mov_b32_e32 v1, s3
98; VI-NEXT:    flat_store_dword v[0:1], v2
99; VI-NEXT:    s_endpgm
100;
101; GFX9-LABEL: fshr_i32_imm:
102; GFX9:       ; %bb.0: ; %entry
103; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
104; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
105; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
106; GFX9-NEXT:    v_mov_b32_e32 v0, s1
107; GFX9-NEXT:    v_alignbit_b32 v2, s0, v0, 7
108; GFX9-NEXT:    v_mov_b32_e32 v0, s2
109; GFX9-NEXT:    v_mov_b32_e32 v1, s3
110; GFX9-NEXT:    global_store_dword v[0:1], v2, off
111; GFX9-NEXT:    s_endpgm
112;
113; R600-LABEL: fshr_i32_imm:
114; R600:       ; %bb.0: ; %entry
115; R600-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
116; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
117; R600-NEXT:    CF_END
118; R600-NEXT:    PAD
119; R600-NEXT:    ALU clause starting at 4:
120; R600-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
121; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
122; R600-NEXT:     BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, literal.x,
123; R600-NEXT:    7(9.809089e-45), 0(0.000000e+00)
124entry:
125  %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 7)
126  store i32 %0, i32 addrspace(1)* %in
127  ret void
128}
129
; Kernel with <2 x i32> operands and a variable per-element shift amount (%z);
; the result vector is stored through %in.
; The checks expect one v_alignbit_b32 (amdgcn) or BIT_ALIGN_INT (r600) per
; element, i.e. two in total.
130define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
131; SI-LABEL: fshr_v2i32:
132; SI:       ; %bb.0: ; %entry
133; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
134; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
135; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
136; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xf
137; SI-NEXT:    s_mov_b32 s7, 0xf000
138; SI-NEXT:    s_mov_b32 s6, -1
139; SI-NEXT:    s_waitcnt lgkmcnt(0)
140; SI-NEXT:    v_mov_b32_e32 v0, s9
141; SI-NEXT:    v_mov_b32_e32 v1, s1
142; SI-NEXT:    v_alignbit_b32 v1, s3, v0, v1
143; SI-NEXT:    v_mov_b32_e32 v0, s8
144; SI-NEXT:    v_mov_b32_e32 v2, s0
145; SI-NEXT:    v_alignbit_b32 v0, s2, v0, v2
146; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
147; SI-NEXT:    s_endpgm
148;
149; VI-LABEL: fshr_v2i32:
150; VI:       ; %bb.0: ; %entry
151; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
152; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
153; VI-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
154; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
155; VI-NEXT:    s_waitcnt lgkmcnt(0)
156; VI-NEXT:    v_mov_b32_e32 v0, s7
157; VI-NEXT:    v_mov_b32_e32 v1, s1
158; VI-NEXT:    v_alignbit_b32 v1, s5, v0, v1
159; VI-NEXT:    v_mov_b32_e32 v0, s6
160; VI-NEXT:    v_mov_b32_e32 v2, s0
161; VI-NEXT:    v_alignbit_b32 v0, s4, v0, v2
162; VI-NEXT:    v_mov_b32_e32 v2, s2
163; VI-NEXT:    v_mov_b32_e32 v3, s3
164; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
165; VI-NEXT:    s_endpgm
166;
167; GFX9-LABEL: fshr_v2i32:
168; GFX9:       ; %bb.0: ; %entry
169; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
170; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
171; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
172; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x3c
173; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
174; GFX9-NEXT:    v_mov_b32_e32 v0, s7
175; GFX9-NEXT:    v_mov_b32_e32 v1, s1
176; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, v1
177; GFX9-NEXT:    v_mov_b32_e32 v0, s6
178; GFX9-NEXT:    v_mov_b32_e32 v2, s0
179; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, v2
180; GFX9-NEXT:    v_mov_b32_e32 v2, s2
181; GFX9-NEXT:    v_mov_b32_e32 v3, s3
182; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
183; GFX9-NEXT:    s_endpgm
184;
185; R600-LABEL: fshr_v2i32:
186; R600:       ; %bb.0: ; %entry
187; R600-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
188; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
189; R600-NEXT:    CF_END
190; R600-NEXT:    PAD
191; R600-NEXT:    ALU clause starting at 4:
192; R600-NEXT:     MOV * T0.W, KC0[4].X,
193; R600-NEXT:     BIT_ALIGN_INT T0.Y, KC0[3].X, KC0[3].Z, PV.W,
194; R600-NEXT:     MOV * T0.W, KC0[3].W,
195; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, PV.W,
196; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
197; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
198entry:
199  %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
200  store <2 x i32> %0, <2 x i32> addrspace(1)* %in
201  ret void
202}
203
; Kernel with <2 x i32> operands and constant per-element shift amounts
; <7, 9>; the result vector is stored through %in.
; The checks expect two v_alignbit_b32 (amdgcn) or BIT_ALIGN_INT (r600)
; instructions carrying 7 and 9 as constant operands.
204define amdgpu_kernel void @fshr_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
205; SI-LABEL: fshr_v2i32_imm:
206; SI:       ; %bb.0: ; %entry
207; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
208; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
209; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
210; SI-NEXT:    s_mov_b32 s7, 0xf000
211; SI-NEXT:    s_mov_b32 s6, -1
212; SI-NEXT:    s_waitcnt lgkmcnt(0)
213; SI-NEXT:    v_mov_b32_e32 v0, s1
214; SI-NEXT:    v_alignbit_b32 v1, s3, v0, 9
215; SI-NEXT:    v_mov_b32_e32 v0, s0
216; SI-NEXT:    v_alignbit_b32 v0, s2, v0, 7
217; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
218; SI-NEXT:    s_endpgm
219;
220; VI-LABEL: fshr_v2i32_imm:
221; VI:       ; %bb.0: ; %entry
222; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
223; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
224; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
225; VI-NEXT:    s_waitcnt lgkmcnt(0)
226; VI-NEXT:    v_mov_b32_e32 v0, s1
227; VI-NEXT:    v_mov_b32_e32 v2, s0
228; VI-NEXT:    v_alignbit_b32 v1, s5, v0, 9
229; VI-NEXT:    v_alignbit_b32 v0, s4, v2, 7
230; VI-NEXT:    v_mov_b32_e32 v2, s2
231; VI-NEXT:    v_mov_b32_e32 v3, s3
232; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
233; VI-NEXT:    s_endpgm
234;
235; GFX9-LABEL: fshr_v2i32_imm:
236; GFX9:       ; %bb.0: ; %entry
237; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
238; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
239; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
240; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
241; GFX9-NEXT:    v_mov_b32_e32 v0, s1
242; GFX9-NEXT:    v_mov_b32_e32 v2, s0
243; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, 9
244; GFX9-NEXT:    v_alignbit_b32 v0, s4, v2, 7
245; GFX9-NEXT:    v_mov_b32_e32 v2, s2
246; GFX9-NEXT:    v_mov_b32_e32 v3, s3
247; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
248; GFX9-NEXT:    s_endpgm
249;
250; R600-LABEL: fshr_v2i32_imm:
251; R600:       ; %bb.0: ; %entry
252; R600-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
253; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
254; R600-NEXT:    CF_END
255; R600-NEXT:    PAD
256; R600-NEXT:    ALU clause starting at 4:
257; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[3].X, KC0[3].Z, literal.x,
258; R600-NEXT:    9(1.261169e-44), 0(0.000000e+00)
259; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, literal.x,
260; R600-NEXT:    7(9.809089e-45), 0(0.000000e+00)
261; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
262; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
263entry:
264  %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
265  store <2 x i32> %0, <2 x i32> addrspace(1)* %in
266  ret void
267}
268
; Kernel with <4 x i32> operands and a variable per-element shift amount (%z);
; the result vector is stored through %in.
; The checks expect one v_alignbit_b32 (amdgcn) or BIT_ALIGN_INT (r600) per
; element, i.e. four in total.
269define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
270; SI-LABEL: fshr_v4i32:
271; SI:       ; %bb.0: ; %entry
272; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
273; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
274; SI-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x11
275; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x15
276; SI-NEXT:    s_mov_b32 s7, 0xf000
277; SI-NEXT:    s_mov_b32 s6, -1
278; SI-NEXT:    s_waitcnt lgkmcnt(0)
279; SI-NEXT:    v_mov_b32_e32 v0, s15
280; SI-NEXT:    v_mov_b32_e32 v1, s3
281; SI-NEXT:    v_alignbit_b32 v3, s11, v0, v1
282; SI-NEXT:    v_mov_b32_e32 v0, s14
283; SI-NEXT:    v_mov_b32_e32 v1, s2
284; SI-NEXT:    v_alignbit_b32 v2, s10, v0, v1
285; SI-NEXT:    v_mov_b32_e32 v0, s13
286; SI-NEXT:    v_mov_b32_e32 v1, s1
287; SI-NEXT:    v_alignbit_b32 v1, s9, v0, v1
288; SI-NEXT:    v_mov_b32_e32 v0, s12
289; SI-NEXT:    v_mov_b32_e32 v4, s0
290; SI-NEXT:    v_alignbit_b32 v0, s8, v0, v4
291; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
292; SI-NEXT:    s_endpgm
293;
294; VI-LABEL: fshr_v4i32:
295; VI:       ; %bb.0: ; %entry
296; VI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x24
297; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
298; VI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x44
299; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x54
300; VI-NEXT:    s_waitcnt lgkmcnt(0)
301; VI-NEXT:    v_mov_b32_e32 v0, s11
302; VI-NEXT:    v_mov_b32_e32 v1, s3
303; VI-NEXT:    v_alignbit_b32 v3, s7, v0, v1
304; VI-NEXT:    v_mov_b32_e32 v0, s10
305; VI-NEXT:    v_mov_b32_e32 v1, s2
306; VI-NEXT:    v_alignbit_b32 v2, s6, v0, v1
307; VI-NEXT:    v_mov_b32_e32 v0, s9
308; VI-NEXT:    v_mov_b32_e32 v1, s1
309; VI-NEXT:    v_alignbit_b32 v1, s5, v0, v1
310; VI-NEXT:    v_mov_b32_e32 v0, s8
311; VI-NEXT:    v_mov_b32_e32 v4, s0
312; VI-NEXT:    v_alignbit_b32 v0, s4, v0, v4
313; VI-NEXT:    v_mov_b32_e32 v4, s12
314; VI-NEXT:    v_mov_b32_e32 v5, s13
315; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
316; VI-NEXT:    s_endpgm
317;
318; GFX9-LABEL: fshr_v4i32:
319; GFX9:       ; %bb.0: ; %entry
320; GFX9-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x24
321; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
322; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x44
323; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x54
324; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
325; GFX9-NEXT:    v_mov_b32_e32 v0, s11
326; GFX9-NEXT:    v_mov_b32_e32 v1, s3
327; GFX9-NEXT:    v_alignbit_b32 v3, s7, v0, v1
328; GFX9-NEXT:    v_mov_b32_e32 v0, s10
329; GFX9-NEXT:    v_mov_b32_e32 v1, s2
330; GFX9-NEXT:    v_alignbit_b32 v2, s6, v0, v1
331; GFX9-NEXT:    v_mov_b32_e32 v0, s9
332; GFX9-NEXT:    v_mov_b32_e32 v1, s1
333; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, v1
334; GFX9-NEXT:    v_mov_b32_e32 v0, s8
335; GFX9-NEXT:    v_mov_b32_e32 v4, s0
336; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, v4
337; GFX9-NEXT:    v_mov_b32_e32 v4, s12
338; GFX9-NEXT:    v_mov_b32_e32 v5, s13
339; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
340; GFX9-NEXT:    s_endpgm
341;
342; R600-LABEL: fshr_v4i32:
343; R600:       ; %bb.0: ; %entry
344; R600-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
345; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
346; R600-NEXT:    CF_END
347; R600-NEXT:    PAD
348; R600-NEXT:    ALU clause starting at 4:
349; R600-NEXT:     MOV * T0.W, KC0[6].X,
350; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, PV.W,
351; R600-NEXT:     MOV * T1.W, KC0[5].W,
352; R600-NEXT:     BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, PV.W,
353; R600-NEXT:     MOV * T1.W, KC0[5].Z,
354; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, PV.W,
355; R600-NEXT:     MOV * T1.W, KC0[5].Y,
356; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, PV.W,
357; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
358; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
359entry:
360  %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z)
361  store <4 x i32> %0, <4 x i32> addrspace(1)* %in
362  ret void
363}
364
; Kernel with <4 x i32> operands and constant per-element shift amounts
; <1, 7, 9, 33>; the result vector is stored through %in.
; The last element uses an amount (33) larger than the 32-bit element width;
; the checks expect the constant 1 for that element (33 modulo 32), alongside
; 1, 7 and 9 for the other three.
365define amdgpu_kernel void @fshr_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
366; SI-LABEL: fshr_v4i32_imm:
367; SI:       ; %bb.0: ; %entry
368; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
369; SI-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
370; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x11
371; SI-NEXT:    s_mov_b32 s7, 0xf000
372; SI-NEXT:    s_mov_b32 s6, -1
373; SI-NEXT:    s_waitcnt lgkmcnt(0)
374; SI-NEXT:    v_mov_b32_e32 v0, s3
375; SI-NEXT:    v_alignbit_b32 v3, s11, v0, 1
376; SI-NEXT:    v_mov_b32_e32 v0, s2
377; SI-NEXT:    v_alignbit_b32 v2, s10, v0, 9
378; SI-NEXT:    v_mov_b32_e32 v0, s1
379; SI-NEXT:    v_alignbit_b32 v1, s9, v0, 7
380; SI-NEXT:    v_mov_b32_e32 v0, s0
381; SI-NEXT:    v_alignbit_b32 v0, s8, v0, 1
382; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
383; SI-NEXT:    s_endpgm
384;
385; VI-LABEL: fshr_v4i32_imm:
386; VI:       ; %bb.0: ; %entry
387; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
388; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
389; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x44
390; VI-NEXT:    s_waitcnt lgkmcnt(0)
391; VI-NEXT:    v_mov_b32_e32 v4, s8
392; VI-NEXT:    v_mov_b32_e32 v5, s9
393; VI-NEXT:    v_mov_b32_e32 v0, s3
394; VI-NEXT:    v_mov_b32_e32 v1, s2
395; VI-NEXT:    v_alignbit_b32 v3, s7, v0, 1
396; VI-NEXT:    v_mov_b32_e32 v0, s1
397; VI-NEXT:    v_alignbit_b32 v2, s6, v1, 9
398; VI-NEXT:    v_alignbit_b32 v1, s5, v0, 7
399; VI-NEXT:    v_mov_b32_e32 v0, s0
400; VI-NEXT:    v_alignbit_b32 v0, s4, v0, 1
401; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
402; VI-NEXT:    s_endpgm
403;
404; GFX9-LABEL: fshr_v4i32_imm:
405; GFX9:       ; %bb.0: ; %entry
406; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
407; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
408; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x44
409; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
410; GFX9-NEXT:    v_mov_b32_e32 v4, s8
411; GFX9-NEXT:    v_mov_b32_e32 v5, s9
412; GFX9-NEXT:    v_mov_b32_e32 v0, s3
413; GFX9-NEXT:    v_mov_b32_e32 v1, s2
414; GFX9-NEXT:    v_alignbit_b32 v3, s7, v0, 1
415; GFX9-NEXT:    v_mov_b32_e32 v0, s1
416; GFX9-NEXT:    v_alignbit_b32 v2, s6, v1, 9
417; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, 7
418; GFX9-NEXT:    v_mov_b32_e32 v0, s0
419; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, 1
420; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
421; GFX9-NEXT:    s_endpgm
422;
423; R600-LABEL: fshr_v4i32_imm:
424; R600:       ; %bb.0: ; %entry
425; R600-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
426; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
427; R600-NEXT:    CF_END
428; R600-NEXT:    PAD
429; R600-NEXT:    ALU clause starting at 4:
430; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, 1,
431; R600-NEXT:     BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, literal.x,
432; R600-NEXT:    9(1.261169e-44), 0(0.000000e+00)
433; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, literal.x,
434; R600-NEXT:    7(9.809089e-45), 0(0.000000e+00)
435; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, 1,
436; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
437; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
438entry:
439  %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 7, i32 9, i32 33>)
440  store <4 x i32> %0, <4 x i32> addrspace(1)* %in
441  ret void
442}
443
; Non-kernel function (no amdgpu_kernel calling convention) that returns
; llvm.fshr.i32 of its three arguments.
; The shared GFX89 checks expect a single v_alignbit_b32 on v0, v1, v2 followed
; by the s_setpc_b64 return; the r600 checks contain only CF_END and PAD.
444define i32 @v_fshr_i32(i32 %src0, i32 %src1, i32 %src2) {
445; GFX89-LABEL: v_fshr_i32:
446; GFX89:       ; %bb.0:
447; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
448; GFX89-NEXT:    v_alignbit_b32 v0, v0, v1, v2
449; GFX89-NEXT:    s_setpc_b64 s[30:31]
450;
451; R600-LABEL: v_fshr_i32:
452; R600:       ; %bb.0:
453; R600-NEXT:    CF_END
454; R600-NEXT:    PAD
455  %ret = call i32 @llvm.fshr.i32(i32 %src0, i32 %src1, i32 %src2)
456  ret i32 %ret
457}
458
; Non-kernel function that returns llvm.fshr.v2i32 of its three <2 x i32>
; arguments.
; The shared GFX89 checks expect one v_alignbit_b32 per element (two in total);
; the r600 checks contain only CF_END and PAD.
459define <2 x i32> @v_fshr_v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2) {
460; GFX89-LABEL: v_fshr_v2i32:
461; GFX89:       ; %bb.0:
462; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
463; GFX89-NEXT:    v_alignbit_b32 v0, v0, v2, v4
464; GFX89-NEXT:    v_alignbit_b32 v1, v1, v3, v5
465; GFX89-NEXT:    s_setpc_b64 s[30:31]
466;
467; R600-LABEL: v_fshr_v2i32:
468; R600:       ; %bb.0:
469; R600-NEXT:    CF_END
470; R600-NEXT:    PAD
471  %ret = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2)
472  ret <2 x i32> %ret
473}
474
; Non-kernel function that returns llvm.fshr.v3i32 of its three <3 x i32>
; arguments.
; The shared GFX89 checks expect one v_alignbit_b32 per element (three in
; total); the r600 checks contain only CF_END and PAD.
475define <3 x i32> @v_fshr_v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2) {
476; GFX89-LABEL: v_fshr_v3i32:
477; GFX89:       ; %bb.0:
478; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
479; GFX89-NEXT:    v_alignbit_b32 v0, v0, v3, v6
480; GFX89-NEXT:    v_alignbit_b32 v1, v1, v4, v7
481; GFX89-NEXT:    v_alignbit_b32 v2, v2, v5, v8
482; GFX89-NEXT:    s_setpc_b64 s[30:31]
483;
484; R600-LABEL: v_fshr_v3i32:
485; R600:       ; %bb.0:
486; R600-NEXT:    CF_END
487; R600-NEXT:    PAD
488  %ret = call <3 x i32> @llvm.fshr.v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2)
489  ret <3 x i32> %ret
490}
491
; Non-kernel function that returns llvm.fshr.v4i32 of its three <4 x i32>
; arguments.
; The shared GFX89 checks expect one v_alignbit_b32 per element (four in
; total); the r600 checks contain only CF_END and PAD.
492define <4 x i32> @v_fshr_v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2) {
493; GFX89-LABEL: v_fshr_v4i32:
494; GFX89:       ; %bb.0:
495; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
496; GFX89-NEXT:    v_alignbit_b32 v0, v0, v4, v8
497; GFX89-NEXT:    v_alignbit_b32 v1, v1, v5, v9
498; GFX89-NEXT:    v_alignbit_b32 v2, v2, v6, v10
499; GFX89-NEXT:    v_alignbit_b32 v3, v3, v7, v11
500; GFX89-NEXT:    s_setpc_b64 s[30:31]
501;
502; R600-LABEL: v_fshr_v4i32:
503; R600:       ; %bb.0:
504; R600-NEXT:    CF_END
505; R600-NEXT:    PAD
506  %ret = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2)
507  ret <4 x i32> %ret
508}
509
; Non-kernel function that returns llvm.fshr.i16 of its three i16 arguments.
; Expected lowering differs by target:
;  - tahiti (SI checks) reuses the 32-bit v_alignbit_b32 after OR-ing 16 into
;    v2 and shifting v1 left by 16.
;  - tonga and gfx900 (VI and GFX9 checks) expand into 16-bit shifts: v0 is
;    shifted left by 1 and then by (~v2 & 15), v1 is shifted right by
;    (v2 & 15), and the two halves are OR-ed together.
;  - r600 checks contain only CF_END and PAD.
; NOTE(review): v0/v1/v2 presumably carry %src0/%src1/%src2 - confirm against
; the calling convention.
510define i16 @v_fshr_i16(i16 %src0, i16 %src1, i16 %src2) {
511; SI-LABEL: v_fshr_i16:
512; SI:       ; %bb.0:
513; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
514; SI-NEXT:    v_or_b32_e32 v2, 16, v2
515; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
516; SI-NEXT:    v_alignbit_b32 v0, v0, v1, v2
517; SI-NEXT:    s_setpc_b64 s[30:31]
518;
519; VI-LABEL: v_fshr_i16:
520; VI:       ; %bb.0:
521; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
522; VI-NEXT:    v_xor_b32_e32 v3, -1, v2
523; VI-NEXT:    v_and_b32_e32 v2, 15, v2
524; VI-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
525; VI-NEXT:    v_and_b32_e32 v3, 15, v3
526; VI-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
527; VI-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
528; VI-NEXT:    v_or_b32_e32 v0, v0, v1
529; VI-NEXT:    s_setpc_b64 s[30:31]
530;
531; GFX9-LABEL: v_fshr_i16:
532; GFX9:       ; %bb.0:
533; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
534; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v2
535; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
536; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
537; GFX9-NEXT:    v_and_b32_e32 v3, 15, v3
538; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
539; GFX9-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
540; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
541; GFX9-NEXT:    s_setpc_b64 s[30:31]
542;
543; R600-LABEL: v_fshr_i16:
544; R600:       ; %bb.0:
545; R600-NEXT:    CF_END
546; R600-NEXT:    PAD
547  %ret = call i16 @llvm.fshr.i16(i16 %src0, i16 %src1, i16 %src2)
548  ret i16 %ret
549}
550
; Non-kernel function that returns llvm.fshr.v2i16 of its three <2 x i16>
; arguments.
; Expected lowering differs by target:
;  - tahiti (SI checks) handles each element with the OR-16 / shift-left-16 /
;    v_alignbit_b32 sequence, then combines the two results with shift, mask
;    and OR instructions.
;  - tonga (VI checks) expands each element into 16-bit shifts, using SDWA
;    forms (WORD_1 selects) for the high element.
;  - gfx900 (GFX9 checks) uses packed v_pk_lshlrev_b16 / v_pk_lshrrev_b16 with
;    the shift amounts masked by 0xf000f.
;  - r600 checks contain only CF_END and PAD.
551define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2) {
552; SI-LABEL: v_fshr_v2i16:
553; SI:       ; %bb.0:
554; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
555; SI-NEXT:    v_or_b32_e32 v5, 16, v5
556; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
557; SI-NEXT:    v_alignbit_b32 v1, v1, v3, v5
558; SI-NEXT:    v_or_b32_e32 v3, 16, v4
559; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
560; SI-NEXT:    v_alignbit_b32 v0, v0, v2, v3
561; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
562; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
563; SI-NEXT:    v_or_b32_e32 v0, v0, v1
564; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
565; SI-NEXT:    s_setpc_b64 s[30:31]
566;
567; VI-LABEL: v_fshr_v2i16:
568; VI:       ; %bb.0:
569; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
570; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
571; VI-NEXT:    v_and_b32_e32 v4, 15, v3
572; VI-NEXT:    v_mov_b32_e32 v5, 1
573; VI-NEXT:    v_xor_b32_e32 v3, -1, v3
574; VI-NEXT:    v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
575; VI-NEXT:    v_and_b32_e32 v3, 15, v3
576; VI-NEXT:    v_lshrrev_b16_sdwa v4, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
577; VI-NEXT:    v_lshlrev_b16_e32 v3, v3, v5
578; VI-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
579; VI-NEXT:    v_xor_b32_e32 v4, -1, v2
580; VI-NEXT:    v_and_b32_e32 v2, 15, v2
581; VI-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
582; VI-NEXT:    v_and_b32_e32 v4, 15, v4
583; VI-NEXT:    v_lshlrev_b16_e32 v0, v4, v0
584; VI-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
585; VI-NEXT:    v_or_b32_e32 v0, v0, v1
586; VI-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
587; VI-NEXT:    s_setpc_b64 s[30:31]
588;
589; GFX9-LABEL: v_fshr_v2i16:
590; GFX9:       ; %bb.0:
591; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
592; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v2
593; GFX9-NEXT:    s_mov_b32 s4, 0xf000f
594; GFX9-NEXT:    v_and_b32_e32 v2, s4, v2
595; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
596; GFX9-NEXT:    v_and_b32_e32 v3, s4, v3
597; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v3, v0
598; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v2, v1
599; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
600; GFX9-NEXT:    s_setpc_b64 s[30:31]
601;
602; R600-LABEL: v_fshr_v2i16:
603; R600:       ; %bb.0:
604; R600-NEXT:    CF_END
605; R600-NEXT:    PAD
606  %ret = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2)
607  ret <2 x i16> %ret
608}
609
; Non-kernel function that returns llvm.fshr.v3i16 of its three <3 x i16>
; arguments.
; Expected lowering differs by target:
;  - tahiti (SI checks) handles each of the three elements with the OR-16 /
;    shift-left-16 / v_alignbit_b32 sequence, followed by shift, mask, OR and
;    a final v_alignbit_b32 by 16.
;  - tonga (VI checks) expands each element into 16-bit shifts, using SDWA
;    forms for the high half of the first register and an SDWA OR to combine
;    the halves.
;  - gfx900 (GFX9 checks) uses the same 16-bit shift expansion but combines
;    the halves with a 0xffff mask and v_lshl_or_b32.
;  - r600 checks contain only CF_END and PAD.
610define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2) {
611; SI-LABEL: v_fshr_v3i16:
612; SI:       ; %bb.0:
613; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
614; SI-NEXT:    v_or_b32_e32 v7, 16, v7
615; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
616; SI-NEXT:    v_alignbit_b32 v1, v1, v4, v7
617; SI-NEXT:    v_or_b32_e32 v4, 16, v6
618; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
619; SI-NEXT:    v_alignbit_b32 v0, v0, v3, v4
620; SI-NEXT:    s_mov_b32 s4, 0xffff
621; SI-NEXT:    v_or_b32_e32 v3, 16, v8
622; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
623; SI-NEXT:    v_alignbit_b32 v3, v2, v4, v3
624; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
625; SI-NEXT:    v_and_b32_e32 v0, s4, v0
626; SI-NEXT:    v_or_b32_e32 v0, v0, v1
627; SI-NEXT:    v_and_b32_e32 v2, s4, v3
628; SI-NEXT:    v_alignbit_b32 v1, v3, v1, 16
629; SI-NEXT:    s_setpc_b64 s[30:31]
630;
631; VI-LABEL: v_fshr_v3i16:
632; VI:       ; %bb.0:
633; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
634; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
635; VI-NEXT:    v_and_b32_e32 v7, 15, v6
636; VI-NEXT:    v_mov_b32_e32 v8, 1
637; VI-NEXT:    v_xor_b32_e32 v6, -1, v6
638; VI-NEXT:    v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
639; VI-NEXT:    v_and_b32_e32 v6, 15, v6
640; VI-NEXT:    v_lshrrev_b16_sdwa v7, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
641; VI-NEXT:    v_lshlrev_b16_e32 v6, v6, v8
642; VI-NEXT:    v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
643; VI-NEXT:    v_xor_b32_e32 v7, -1, v5
644; VI-NEXT:    v_and_b32_e32 v5, 15, v5
645; VI-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
646; VI-NEXT:    v_and_b32_e32 v7, 15, v7
647; VI-NEXT:    v_lshlrev_b16_e32 v1, v7, v1
648; VI-NEXT:    v_lshrrev_b16_e32 v3, v5, v3
649; VI-NEXT:    v_or_b32_e32 v1, v1, v3
650; VI-NEXT:    v_xor_b32_e32 v3, -1, v4
651; VI-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
652; VI-NEXT:    v_and_b32_e32 v3, 15, v3
653; VI-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
654; VI-NEXT:    v_and_b32_e32 v3, 15, v4
655; VI-NEXT:    v_lshrrev_b16_e32 v2, v3, v2
656; VI-NEXT:    v_or_b32_e32 v0, v0, v2
657; VI-NEXT:    v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
658; VI-NEXT:    s_setpc_b64 s[30:31]
659;
660; GFX9-LABEL: v_fshr_v3i16:
661; GFX9:       ; %bb.0:
662; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
663; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
664; GFX9-NEXT:    v_and_b32_e32 v7, 15, v6
665; GFX9-NEXT:    v_mov_b32_e32 v8, 1
666; GFX9-NEXT:    v_xor_b32_e32 v6, -1, v6
667; GFX9-NEXT:    v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
668; GFX9-NEXT:    v_and_b32_e32 v6, 15, v6
669; GFX9-NEXT:    v_lshrrev_b16_sdwa v7, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
670; GFX9-NEXT:    v_lshlrev_b16_e32 v6, v6, v8
671; GFX9-NEXT:    v_or_b32_e32 v6, v6, v7
672; GFX9-NEXT:    v_xor_b32_e32 v7, -1, v5
673; GFX9-NEXT:    v_and_b32_e32 v5, 15, v5
674; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
675; GFX9-NEXT:    v_and_b32_e32 v7, 15, v7
676; GFX9-NEXT:    v_lshlrev_b16_e32 v1, v7, v1
677; GFX9-NEXT:    v_lshrrev_b16_e32 v3, v5, v3
678; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
679; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v4
680; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
681; GFX9-NEXT:    v_and_b32_e32 v3, 15, v3
682; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
683; GFX9-NEXT:    v_and_b32_e32 v3, 15, v4
684; GFX9-NEXT:    v_lshrrev_b16_e32 v2, v3, v2
685; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
686; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
687; GFX9-NEXT:    v_lshl_or_b32 v0, v6, 16, v0
688; GFX9-NEXT:    s_setpc_b64 s[30:31]
689;
690; R600-LABEL: v_fshr_v3i16:
691; R600:       ; %bb.0:
692; R600-NEXT:    CF_END
693; R600-NEXT:    PAD
694  %ret = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2)
695  ret <3 x i16> %ret
696}
697
698define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2) {
699; SI-LABEL: v_fshr_v4i16:
700; SI:       ; %bb.0:
701; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
702; SI-NEXT:    v_or_b32_e32 v9, 16, v9
703; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
704; SI-NEXT:    v_alignbit_b32 v1, v1, v5, v9
705; SI-NEXT:    v_or_b32_e32 v5, 16, v8
706; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
707; SI-NEXT:    v_alignbit_b32 v0, v0, v4, v5
708; SI-NEXT:    v_or_b32_e32 v4, 16, v11
709; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
710; SI-NEXT:    v_alignbit_b32 v3, v3, v5, v4
711; SI-NEXT:    v_or_b32_e32 v4, 16, v10
712; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
713; SI-NEXT:    s_mov_b32 s4, 0xffff
714; SI-NEXT:    v_alignbit_b32 v2, v2, v5, v4
715; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
716; SI-NEXT:    v_and_b32_e32 v2, s4, v2
717; SI-NEXT:    v_or_b32_e32 v2, v2, v3
718; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
719; SI-NEXT:    v_and_b32_e32 v0, s4, v0
720; SI-NEXT:    v_or_b32_e32 v0, v0, v1
721; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
722; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
723; SI-NEXT:    s_setpc_b64 s[30:31]
724;
725; VI-LABEL: v_fshr_v4i16:
726; VI:       ; %bb.0:
727; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
728; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
729; VI-NEXT:    v_and_b32_e32 v7, 15, v6
730; VI-NEXT:    v_xor_b32_e32 v6, -1, v6
731; VI-NEXT:    v_mov_b32_e32 v8, 1
732; VI-NEXT:    v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
733; VI-NEXT:    v_and_b32_e32 v6, 15, v6
734; VI-NEXT:    v_lshrrev_b16_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
735; VI-NEXT:    v_lshlrev_b16_e32 v6, v6, v9
736; VI-NEXT:    v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
737; VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
738; VI-NEXT:    v_and_b32_e32 v9, 15, v7
739; VI-NEXT:    v_xor_b32_e32 v7, -1, v7
740; VI-NEXT:    v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
741; VI-NEXT:    v_and_b32_e32 v7, 15, v7
742; VI-NEXT:    v_lshlrev_b16_e32 v7, v7, v8
743; VI-NEXT:    v_xor_b32_e32 v8, -1, v5
744; VI-NEXT:    v_and_b32_e32 v5, 15, v5
745; VI-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
746; VI-NEXT:    v_and_b32_e32 v8, 15, v8
747; VI-NEXT:    v_lshlrev_b16_e32 v1, v8, v1
748; VI-NEXT:    v_lshrrev_b16_e32 v3, v5, v3
749; VI-NEXT:    v_or_b32_e32 v1, v1, v3
750; VI-NEXT:    v_xor_b32_e32 v3, -1, v4
751; VI-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
752; VI-NEXT:    v_and_b32_e32 v3, 15, v3
753; VI-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
754; VI-NEXT:    v_and_b32_e32 v3, 15, v4
755; VI-NEXT:    v_lshrrev_b16_sdwa v9, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
756; VI-NEXT:    v_lshrrev_b16_e32 v2, v3, v2
757; VI-NEXT:    v_or_b32_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
758; VI-NEXT:    v_or_b32_e32 v0, v0, v2
759; VI-NEXT:    v_or_b32_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
760; VI-NEXT:    v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
761; VI-NEXT:    s_setpc_b64 s[30:31]
762;
763; GFX9-LABEL: v_fshr_v4i16:
764; GFX9:       ; %bb.0:
765; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
766; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
767; GFX9-NEXT:    v_and_b32_e32 v7, 15, v6
768; GFX9-NEXT:    v_xor_b32_e32 v6, -1, v6
769; GFX9-NEXT:    v_mov_b32_e32 v8, 1
770; GFX9-NEXT:    v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
771; GFX9-NEXT:    v_and_b32_e32 v6, 15, v6
772; GFX9-NEXT:    v_lshrrev_b16_sdwa v7, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
773; GFX9-NEXT:    v_lshlrev_b16_e32 v6, v6, v9
774; GFX9-NEXT:    v_or_b32_e32 v6, v6, v7
775; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
776; GFX9-NEXT:    v_and_b32_e32 v9, 15, v7
777; GFX9-NEXT:    v_xor_b32_e32 v7, -1, v7
778; GFX9-NEXT:    v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
779; GFX9-NEXT:    v_and_b32_e32 v7, 15, v7
780; GFX9-NEXT:    v_lshlrev_b16_e32 v7, v7, v8
781; GFX9-NEXT:    v_xor_b32_e32 v8, -1, v5
782; GFX9-NEXT:    v_and_b32_e32 v5, 15, v5
783; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
784; GFX9-NEXT:    v_and_b32_e32 v8, 15, v8
785; GFX9-NEXT:    v_lshlrev_b16_e32 v1, v8, v1
786; GFX9-NEXT:    v_lshrrev_b16_e32 v3, v5, v3
787; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
788; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v4
789; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
790; GFX9-NEXT:    v_and_b32_e32 v3, 15, v3
791; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
792; GFX9-NEXT:    v_and_b32_e32 v3, 15, v4
793; GFX9-NEXT:    v_lshrrev_b16_sdwa v9, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
794; GFX9-NEXT:    v_lshrrev_b16_e32 v2, v3, v2
795; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
796; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
797; GFX9-NEXT:    v_and_b32_e32 v1, v2, v1
798; GFX9-NEXT:    v_or_b32_e32 v7, v7, v9
799; GFX9-NEXT:    v_and_b32_e32 v0, v2, v0
800; GFX9-NEXT:    v_lshl_or_b32 v0, v7, 16, v0
801; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 16, v1
802; GFX9-NEXT:    s_setpc_b64 s[30:31]
803;
804; R600-LABEL: v_fshr_v4i16:
805; R600:       ; %bb.0:
806; R600-NEXT:    CF_END
807; R600-NEXT:    PAD
808  %ret = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2)
809  ret <4 x i16> %ret
810}
811
; Test: scalar 64-bit funnel shift right (llvm.fshr.i64) in a non-kernel
; function whose three operands are all function arguments.
;
; What the amdgcn check lines (tahiti, tonga, gfx900 runs) expect:
;   * v4 is masked with 63 and used as the right-shift amount for v[2:3];
;   * v4 is also inverted, masked with 63, and used as a left-shift amount
;     for v[0:1] after v[0:1] has first been shifted left by 1;
;   * the two 64-bit partial results are OR-ed together into v[0:1].
; Reviewer note: presumably v[0:1], v[2:3] and v4 carry %src0, %src1 and the
; low half of %src2 respectively -- confirm against the calling convention.
;
; The redwood (r600) check lines expect only CF_END followed by PAD.
;
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
812define i64 @v_fshr_i64(i64 %src0, i64 %src1, i64 %src2) {
813; SI-LABEL: v_fshr_i64:
814; SI:       ; %bb.0:
815; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
816; SI-NEXT:    v_and_b32_e32 v5, 63, v4
817; SI-NEXT:    v_not_b32_e32 v4, v4
818; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
819; SI-NEXT:    v_and_b32_e32 v4, 63, v4
820; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], v5
821; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v4
822; SI-NEXT:    v_or_b32_e32 v1, v1, v3
823; SI-NEXT:    v_or_b32_e32 v0, v0, v2
824; SI-NEXT:    s_setpc_b64 s[30:31]
825;
826; VI-LABEL: v_fshr_i64:
827; VI:       ; %bb.0:
828; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
829; VI-NEXT:    v_and_b32_e32 v5, 63, v4
830; VI-NEXT:    v_not_b32_e32 v4, v4
831; VI-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
832; VI-NEXT:    v_and_b32_e32 v4, 63, v4
833; VI-NEXT:    v_lshrrev_b64 v[2:3], v5, v[2:3]
834; VI-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
835; VI-NEXT:    v_or_b32_e32 v1, v1, v3
836; VI-NEXT:    v_or_b32_e32 v0, v0, v2
837; VI-NEXT:    s_setpc_b64 s[30:31]
838;
839; GFX9-LABEL: v_fshr_i64:
840; GFX9:       ; %bb.0:
841; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
842; GFX9-NEXT:    v_and_b32_e32 v5, 63, v4
843; GFX9-NEXT:    v_not_b32_e32 v4, v4
844; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
845; GFX9-NEXT:    v_and_b32_e32 v4, 63, v4
846; GFX9-NEXT:    v_lshrrev_b64 v[2:3], v5, v[2:3]
847; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
848; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
849; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
850; GFX9-NEXT:    s_setpc_b64 s[30:31]
851;
852; R600-LABEL: v_fshr_i64:
853; R600:       ; %bb.0:
854; R600-NEXT:    CF_END
855; R600-NEXT:    PAD
856  %ret = call i64 @llvm.fshr.i64(i64 %src0, i64 %src1, i64 %src2)
857  ret i64 %ret
858}
859
; Test: two-element 64-bit vector funnel shift right (llvm.fshr.v2i64) in a
; non-kernel function whose three operands are all function arguments.
;
; What the amdgcn check lines (tahiti, tonga, gfx900 runs) expect, once per
; vector element:
;   * the amount register (v8 for the first element, v10 for the second) is
;     masked with 63 and used to shift v[4:5] / v[6:7] right;
;   * the inverted amount, masked with 63, shifts v[0:1] / v[2:3] left after
;     each has first been shifted left by 1;
;   * the partial results are OR-ed into v[0:1] and v[2:3].
; The instructions for the two elements are interleaved in the expected
; output, and the second right shift writes its result to v[5:6].
; Reviewer note: the mapping of these registers to %src0/%src1/%src2 is
; presumed from the argument order -- confirm against the calling convention.
;
; The redwood (r600) check lines expect only CF_END followed by PAD.
;
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
860define <2 x i64> @v_fshr_v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2) {
861; SI-LABEL: v_fshr_v2i64:
862; SI:       ; %bb.0:
863; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
864; SI-NEXT:    v_and_b32_e32 v9, 63, v8
865; SI-NEXT:    v_not_b32_e32 v8, v8
866; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
867; SI-NEXT:    v_and_b32_e32 v8, 63, v8
868; SI-NEXT:    v_lshr_b64 v[4:5], v[4:5], v9
869; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v8
870; SI-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
871; SI-NEXT:    v_or_b32_e32 v1, v1, v5
872; SI-NEXT:    v_and_b32_e32 v5, 63, v10
873; SI-NEXT:    v_lshr_b64 v[5:6], v[6:7], v5
874; SI-NEXT:    v_not_b32_e32 v7, v10
875; SI-NEXT:    v_and_b32_e32 v7, 63, v7
876; SI-NEXT:    v_lshl_b64 v[2:3], v[2:3], v7
877; SI-NEXT:    v_or_b32_e32 v0, v0, v4
878; SI-NEXT:    v_or_b32_e32 v3, v3, v6
879; SI-NEXT:    v_or_b32_e32 v2, v2, v5
880; SI-NEXT:    s_setpc_b64 s[30:31]
881;
882; VI-LABEL: v_fshr_v2i64:
883; VI:       ; %bb.0:
884; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
885; VI-NEXT:    v_and_b32_e32 v9, 63, v8
886; VI-NEXT:    v_not_b32_e32 v8, v8
887; VI-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
888; VI-NEXT:    v_and_b32_e32 v8, 63, v8
889; VI-NEXT:    v_lshrrev_b64 v[4:5], v9, v[4:5]
890; VI-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
891; VI-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
892; VI-NEXT:    v_or_b32_e32 v1, v1, v5
893; VI-NEXT:    v_and_b32_e32 v5, 63, v10
894; VI-NEXT:    v_lshrrev_b64 v[5:6], v5, v[6:7]
895; VI-NEXT:    v_not_b32_e32 v7, v10
896; VI-NEXT:    v_and_b32_e32 v7, 63, v7
897; VI-NEXT:    v_lshlrev_b64 v[2:3], v7, v[2:3]
898; VI-NEXT:    v_or_b32_e32 v0, v0, v4
899; VI-NEXT:    v_or_b32_e32 v3, v3, v6
900; VI-NEXT:    v_or_b32_e32 v2, v2, v5
901; VI-NEXT:    s_setpc_b64 s[30:31]
902;
903; GFX9-LABEL: v_fshr_v2i64:
904; GFX9:       ; %bb.0:
905; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
906; GFX9-NEXT:    v_and_b32_e32 v9, 63, v8
907; GFX9-NEXT:    v_not_b32_e32 v8, v8
908; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
909; GFX9-NEXT:    v_and_b32_e32 v8, 63, v8
910; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v9, v[4:5]
911; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
912; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
913; GFX9-NEXT:    v_or_b32_e32 v1, v1, v5
914; GFX9-NEXT:    v_and_b32_e32 v5, 63, v10
915; GFX9-NEXT:    v_lshrrev_b64 v[5:6], v5, v[6:7]
916; GFX9-NEXT:    v_not_b32_e32 v7, v10
917; GFX9-NEXT:    v_and_b32_e32 v7, 63, v7
918; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v7, v[2:3]
919; GFX9-NEXT:    v_or_b32_e32 v0, v0, v4
920; GFX9-NEXT:    v_or_b32_e32 v3, v3, v6
921; GFX9-NEXT:    v_or_b32_e32 v2, v2, v5
922; GFX9-NEXT:    s_setpc_b64 s[30:31]
923;
924; R600-LABEL: v_fshr_v2i64:
925; R600:       ; %bb.0:
926; R600-NEXT:    CF_END
927; R600-NEXT:    PAD
928  %ret = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2)
929  ret <2 x i64> %ret
930}
931
; Test: funnel shift right on the non-power-of-two width i24 (llvm.fshr.i24)
; in a non-kernel function whose three operands are all function arguments.
;
; What the amdgcn check lines (tahiti, tonga, gfx900 runs) expect:
;   * v2 is multiplied (high half) by 0xaaaaaaab, the product is shifted
;     right by 4, multiplied by 24, and subtracted from v2;
;   * 8 is added to that difference;
;   * v1 is shifted left by 8;
;   * a single v_alignbit_b32 combines v0, the shifted v1 and the adjusted
;     amount into v0.
; Reviewer note: the multiply/shift/multiply/subtract sequence looks like an
; unsigned remainder of the shift amount by 24 -- this reading is inferred
; from the constants, confirm before relying on it.
; The runs differ only in the add/sub mnemonics: v_sub_i32/v_add_i32 with
; vcc on tahiti, v_sub_u32/v_add_u32 with vcc on tonga, and v_sub_u32/
; v_add_u32 without vcc on gfx900.
;
; The redwood (r600) check lines expect only CF_END followed by PAD.
;
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
932define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) {
933; SI-LABEL: v_fshr_i24:
934; SI:       ; %bb.0:
935; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
936; SI-NEXT:    s_mov_b32 s4, 0xaaaaaaab
937; SI-NEXT:    v_mul_hi_u32 v3, v2, s4
938; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
939; SI-NEXT:    v_lshrrev_b32_e32 v3, 4, v3
940; SI-NEXT:    v_mul_lo_u32 v3, v3, 24
941; SI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
942; SI-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
943; SI-NEXT:    v_alignbit_b32 v0, v0, v1, v2
944; SI-NEXT:    s_setpc_b64 s[30:31]
945;
946; VI-LABEL: v_fshr_i24:
947; VI:       ; %bb.0:
948; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
949; VI-NEXT:    s_mov_b32 s4, 0xaaaaaaab
950; VI-NEXT:    v_mul_hi_u32 v3, v2, s4
951; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
952; VI-NEXT:    v_lshrrev_b32_e32 v3, 4, v3
953; VI-NEXT:    v_mul_lo_u32 v3, v3, 24
954; VI-NEXT:    v_sub_u32_e32 v2, vcc, v2, v3
955; VI-NEXT:    v_add_u32_e32 v2, vcc, 8, v2
956; VI-NEXT:    v_alignbit_b32 v0, v0, v1, v2
957; VI-NEXT:    s_setpc_b64 s[30:31]
958;
959; GFX9-LABEL: v_fshr_i24:
960; GFX9:       ; %bb.0:
961; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
962; GFX9-NEXT:    s_mov_b32 s4, 0xaaaaaaab
963; GFX9-NEXT:    v_mul_hi_u32 v3, v2, s4
964; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
965; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 4, v3
966; GFX9-NEXT:    v_mul_lo_u32 v3, v3, 24
967; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v3
968; GFX9-NEXT:    v_add_u32_e32 v2, 8, v2
969; GFX9-NEXT:    v_alignbit_b32 v0, v0, v1, v2
970; GFX9-NEXT:    s_setpc_b64 s[30:31]
971;
972; R600-LABEL: v_fshr_i24:
973; R600:       ; %bb.0:
974; R600-NEXT:    CF_END
975; R600-NEXT:    PAD
976  %ret = call i24 @llvm.fshr.i24(i24 %src0, i24 %src1, i24 %src2)
977  ret i24 %ret
978}
979
; Test: two-element i24 vector funnel shift right (llvm.fshr.v2i24) in a
; non-kernel function whose three operands are all function arguments.
;
; What the amdgcn check lines (tahiti, tonga, gfx900 runs) expect:
;   * six buffer_load_dword instructions reading from s32 at offsets 0, 4,
;     8, 12, 16 and 20;
;   * for each of the two elements, the same sequence as the scalar i24
;     case: high multiply by 0xaaaaaaab, shift right by 4, multiply by 24,
;     subtract, add 8, shift the second source left by 8, then
;     v_alignbit_b32;
;   * the two results written out through v0 with one buffer_store_short
;     (offset 0) and byte stores covering offsets 2 through 5.
; Reviewer note: this suggests the operands arrive on the stack and the
; result is returned through memory addressed by v0 -- inferred from the
; expected instructions, confirm against the calling convention.
;
; Differences between the runs that are visible below:
;   * tahiti and tonga materialise the store addresses v0+2 .. v0+5 with
;     explicit adds (v_add_i32 / v_add_u32 with vcc), while gfx900 uses the
;     immediate "offset:" field and buffer_store_byte_d16_hi for two of the
;     byte stores;
;   * tahiti additionally expects s_waitcnt expcnt(...) before registers
;     used by earlier stores are overwritten;
;   * gfx900 loads the dword at offset 0 last and waits for it only just
;     before the second v_alignbit_b32.
;
; The redwood (r600) check lines expect only CF_END followed by PAD.
;
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
980define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2) {
981; SI-LABEL: v_fshr_v2i24:
982; SI:       ; %bb.0:
983; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
984; SI-NEXT:    buffer_load_dword v1, off, s[0:3], s32
985; SI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:16
986; SI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:20
987; SI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:12
988; SI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4
989; SI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
990; SI-NEXT:    s_mov_b32 s4, 0xaaaaaaab
991; SI-NEXT:    v_add_i32_e32 v7, vcc, 3, v0
992; SI-NEXT:    v_add_i32_e32 v8, vcc, 4, v0
993; SI-NEXT:    v_add_i32_e32 v9, vcc, 5, v0
994; SI-NEXT:    v_add_i32_e32 v10, vcc, 2, v0
995; SI-NEXT:    s_waitcnt vmcnt(4)
996; SI-NEXT:    v_mul_hi_u32 v11, v2, s4
997; SI-NEXT:    s_waitcnt vmcnt(3)
998; SI-NEXT:    v_mul_hi_u32 v12, v3, s4
999; SI-NEXT:    s_waitcnt vmcnt(2)
1000; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
1001; SI-NEXT:    v_lshrrev_b32_e32 v11, 4, v11
1002; SI-NEXT:    v_lshrrev_b32_e32 v12, 4, v12
1003; SI-NEXT:    v_mul_lo_u32 v11, v11, 24
1004; SI-NEXT:    v_mul_lo_u32 v12, v12, 24
1005; SI-NEXT:    s_waitcnt vmcnt(0)
1006; SI-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
1007; SI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v11
1008; SI-NEXT:    v_sub_i32_e32 v3, vcc, v3, v12
1009; SI-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
1010; SI-NEXT:    v_add_i32_e32 v3, vcc, 8, v3
1011; SI-NEXT:    v_alignbit_b32 v1, v1, v6, v2
1012; SI-NEXT:    v_alignbit_b32 v2, v5, v4, v3
1013; SI-NEXT:    buffer_store_byte v2, v7, s[0:3], 0 offen
1014; SI-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
1015; SI-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
1016; SI-NEXT:    s_waitcnt expcnt(1)
1017; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1018; SI-NEXT:    s_waitcnt expcnt(0)
1019; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1020; SI-NEXT:    buffer_store_byte v0, v8, s[0:3], 0 offen
1021; SI-NEXT:    buffer_store_byte v2, v9, s[0:3], 0 offen
1022; SI-NEXT:    buffer_store_byte v1, v10, s[0:3], 0 offen
1023; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
1024; SI-NEXT:    s_setpc_b64 s[30:31]
1025;
1026; VI-LABEL: v_fshr_v2i24:
1027; VI:       ; %bb.0:
1028; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1029; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32
1030; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:16
1031; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:20
1032; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:12
1033; VI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4
1034; VI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
1035; VI-NEXT:    s_mov_b32 s4, 0xaaaaaaab
1036; VI-NEXT:    v_add_u32_e32 v7, vcc, 3, v0
1037; VI-NEXT:    v_add_u32_e32 v8, vcc, 4, v0
1038; VI-NEXT:    v_add_u32_e32 v9, vcc, 5, v0
1039; VI-NEXT:    v_add_u32_e32 v10, vcc, 2, v0
1040; VI-NEXT:    s_waitcnt vmcnt(4)
1041; VI-NEXT:    v_mul_hi_u32 v11, v2, s4
1042; VI-NEXT:    s_waitcnt vmcnt(3)
1043; VI-NEXT:    v_mul_hi_u32 v12, v3, s4
1044; VI-NEXT:    s_waitcnt vmcnt(2)
1045; VI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
1046; VI-NEXT:    v_lshrrev_b32_e32 v11, 4, v11
1047; VI-NEXT:    v_lshrrev_b32_e32 v12, 4, v12
1048; VI-NEXT:    v_mul_lo_u32 v11, v11, 24
1049; VI-NEXT:    v_mul_lo_u32 v12, v12, 24
1050; VI-NEXT:    s_waitcnt vmcnt(0)
1051; VI-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
1052; VI-NEXT:    v_sub_u32_e32 v2, vcc, v2, v11
1053; VI-NEXT:    v_sub_u32_e32 v3, vcc, v3, v12
1054; VI-NEXT:    v_add_u32_e32 v2, vcc, 8, v2
1055; VI-NEXT:    v_add_u32_e32 v3, vcc, 8, v3
1056; VI-NEXT:    v_alignbit_b32 v1, v1, v6, v2
1057; VI-NEXT:    v_alignbit_b32 v2, v5, v4, v3
1058; VI-NEXT:    buffer_store_byte v2, v7, s[0:3], 0 offen
1059; VI-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
1060; VI-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
1061; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
1062; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
1063; VI-NEXT:    buffer_store_byte v0, v8, s[0:3], 0 offen
1064; VI-NEXT:    buffer_store_byte v2, v9, s[0:3], 0 offen
1065; VI-NEXT:    buffer_store_byte v1, v10, s[0:3], 0 offen
1066; VI-NEXT:    s_waitcnt vmcnt(0)
1067; VI-NEXT:    s_setpc_b64 s[30:31]
1068;
1069; GFX9-LABEL: v_fshr_v2i24:
1070; GFX9:       ; %bb.0:
1071; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1072; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:16
1073; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:20
1074; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:12
1075; GFX9-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:4
1076; GFX9-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:8
1077; GFX9-NEXT:    buffer_load_dword v8, off, s[0:3], s32
1078; GFX9-NEXT:    s_mov_b32 s4, 0xaaaaaaab
1079; GFX9-NEXT:    s_waitcnt vmcnt(5)
1080; GFX9-NEXT:    v_mul_hi_u32 v6, v1, s4
1081; GFX9-NEXT:    s_waitcnt vmcnt(4)
1082; GFX9-NEXT:    v_mul_hi_u32 v7, v2, s4
1083; GFX9-NEXT:    s_waitcnt vmcnt(3)
1084; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
1085; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 4, v6
1086; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 4, v7
1087; GFX9-NEXT:    v_mul_lo_u32 v7, v7, 24
1088; GFX9-NEXT:    v_mul_lo_u32 v6, v6, 24
1089; GFX9-NEXT:    s_waitcnt vmcnt(1)
1090; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
1091; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v7
1092; GFX9-NEXT:    v_sub_u32_e32 v1, v1, v6
1093; GFX9-NEXT:    v_add_u32_e32 v2, 8, v2
1094; GFX9-NEXT:    v_add_u32_e32 v1, 8, v1
1095; GFX9-NEXT:    v_alignbit_b32 v2, v4, v3, v2
1096; GFX9-NEXT:    s_waitcnt vmcnt(0)
1097; GFX9-NEXT:    v_alignbit_b32 v1, v8, v5, v1
1098; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
1099; GFX9-NEXT:    buffer_store_byte v2, v0, s[0:3], 0 offen offset:3
1100; GFX9-NEXT:    buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen offset:2
1101; GFX9-NEXT:    buffer_store_short v1, v0, s[0:3], 0 offen
1102; GFX9-NEXT:    buffer_store_byte_d16_hi v2, v0, s[0:3], 0 offen offset:5
1103; GFX9-NEXT:    buffer_store_byte v3, v0, s[0:3], 0 offen offset:4
1104; GFX9-NEXT:    s_waitcnt vmcnt(0)
1105; GFX9-NEXT:    s_setpc_b64 s[30:31]
1106;
1107; R600-LABEL: v_fshr_v2i24:
1108; R600:       ; %bb.0:
1109; R600-NEXT:    CF_END
1110; R600-NEXT:    PAD
1111  %ret = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2)
1112  ret <2 x i24> %ret
1113}
1114