1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,SI
3; RUN: llc < %s -march=amdgcn -mcpu=tonga  -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,VI
4; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,GFX9
5; RUN: llc < %s -march=r600 -mcpu=redwood  -verify-machineinstrs | FileCheck %s --check-prefix=R600
6; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX10
7
; Declarations of the llvm.fshr intrinsic overloads made available to the
; functions in this file. They cover i32, i16, i64 and i24 scalars plus the
; vector widths listed below. Every overload takes three operands of the same
; type and returns a value of that type.
8declare i32 @llvm.fshr.i32(i32, i32, i32)
9declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)
10declare <3 x i32> @llvm.fshr.v3i32(<3 x i32>, <3 x i32>, <3 x i32>)
11declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
12declare i16 @llvm.fshr.i16(i16, i16, i16)
13declare <2 x i16> @llvm.fshr.v2i16(<2 x i16>, <2 x i16>, <2 x i16>)
14declare <3 x i16> @llvm.fshr.v3i16(<3 x i16>, <3 x i16>, <3 x i16>)
15declare <4 x i16> @llvm.fshr.v4i16(<4 x i16>, <4 x i16>, <4 x i16>)
16declare i64 @llvm.fshr.i64(i64, i64, i64)
17declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
18declare i24 @llvm.fshr.i24(i24, i24, i24)
19declare <2 x i24> @llvm.fshr.v2i24(<2 x i24>, <2 x i24>, <2 x i24>)
20
; Kernel test for a scalar i32 funnel shift right with a variable amount.
; The kernel arguments %x, %y and %z are passed to llvm.fshr.i32 and the result
; is stored through the global pointer %in. The amdgcn checks below expect a
; single v_alignbit_b32, and the r600 checks expect a single BIT_ALIGN_INT.
21define amdgpu_kernel void @fshr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %z) {
22; SI-LABEL: fshr_i32:
23; SI:       ; %bb.0: ; %entry
24; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
25; SI-NEXT:    s_load_dword s6, s[0:1], 0xd
26; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
27; SI-NEXT:    s_mov_b32 s3, 0xf000
28; SI-NEXT:    s_mov_b32 s2, -1
29; SI-NEXT:    s_waitcnt lgkmcnt(0)
30; SI-NEXT:    v_mov_b32_e32 v0, s5
31; SI-NEXT:    v_mov_b32_e32 v1, s6
32; SI-NEXT:    v_alignbit_b32 v0, s4, v0, v1
33; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
34; SI-NEXT:    s_endpgm
35;
36; VI-LABEL: fshr_i32:
37; VI:       ; %bb.0: ; %entry
38; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
39; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
40; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
41; VI-NEXT:    s_waitcnt lgkmcnt(0)
42; VI-NEXT:    v_mov_b32_e32 v0, s3
43; VI-NEXT:    v_mov_b32_e32 v1, s4
44; VI-NEXT:    v_alignbit_b32 v2, s2, v0, v1
45; VI-NEXT:    v_mov_b32_e32 v0, s0
46; VI-NEXT:    v_mov_b32_e32 v1, s1
47; VI-NEXT:    flat_store_dword v[0:1], v2
48; VI-NEXT:    s_endpgm
49;
50; GFX9-LABEL: fshr_i32:
51; GFX9:       ; %bb.0: ; %entry
52; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
53; GFX9-NEXT:    s_load_dword s6, s[0:1], 0x34
54; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
55; GFX9-NEXT:    v_mov_b32_e32 v0, 0
56; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
57; GFX9-NEXT:    v_mov_b32_e32 v1, s3
58; GFX9-NEXT:    v_mov_b32_e32 v2, s6
59; GFX9-NEXT:    v_alignbit_b32 v1, s2, v1, v2
60; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
61; GFX9-NEXT:    s_endpgm
62;
63; R600-LABEL: fshr_i32:
64; R600:       ; %bb.0: ; %entry
65; R600-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
66; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
67; R600-NEXT:    CF_END
68; R600-NEXT:    PAD
69; R600-NEXT:    ALU clause starting at 4:
70; R600-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
71; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
72; R600-NEXT:     BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, KC0[3].X,
73;
74; GFX10-LABEL: fshr_i32:
75; GFX10:       ; %bb.0: ; %entry
76; GFX10-NEXT:    s_clause 0x2
77; GFX10-NEXT:    s_load_dword s6, s[0:1], 0x34
78; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
79; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
80; GFX10-NEXT:    v_mov_b32_e32 v1, 0
81; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
82; GFX10-NEXT:    v_mov_b32_e32 v0, s6
83; GFX10-NEXT:    v_alignbit_b32 v0, s2, s3, v0
84; GFX10-NEXT:    global_store_dword v1, v0, s[4:5]
85; GFX10-NEXT:    s_endpgm
86entry:
87  %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
88  store i32 %0, i32 addrspace(1)* %in
89  ret void
90}
91
; Kernel test for a scalar i32 funnel shift right by the constant 7.
; %x and %y are kernel arguments and the result is stored through %in.
; The amdgcn checks expect the constant as the last operand of v_alignbit_b32.
; The r600 checks expect it as a literal operand of BIT_ALIGN_INT.
92define amdgpu_kernel void @fshr_i32_imm(i32 addrspace(1)* %in, i32 %x, i32 %y) {
93; SI-LABEL: fshr_i32_imm:
94; SI:       ; %bb.0: ; %entry
95; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
96; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
97; SI-NEXT:    s_mov_b32 s3, 0xf000
98; SI-NEXT:    s_mov_b32 s2, -1
99; SI-NEXT:    s_waitcnt lgkmcnt(0)
100; SI-NEXT:    v_mov_b32_e32 v0, s5
101; SI-NEXT:    v_alignbit_b32 v0, s4, v0, 7
102; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
103; SI-NEXT:    s_endpgm
104;
105; VI-LABEL: fshr_i32_imm:
106; VI:       ; %bb.0: ; %entry
107; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
108; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
109; VI-NEXT:    s_waitcnt lgkmcnt(0)
110; VI-NEXT:    v_mov_b32_e32 v0, s3
111; VI-NEXT:    v_alignbit_b32 v2, s2, v0, 7
112; VI-NEXT:    v_mov_b32_e32 v0, s0
113; VI-NEXT:    v_mov_b32_e32 v1, s1
114; VI-NEXT:    flat_store_dword v[0:1], v2
115; VI-NEXT:    s_endpgm
116;
117; GFX9-LABEL: fshr_i32_imm:
118; GFX9:       ; %bb.0: ; %entry
119; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
120; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
121; GFX9-NEXT:    v_mov_b32_e32 v0, 0
122; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
123; GFX9-NEXT:    v_mov_b32_e32 v1, s3
124; GFX9-NEXT:    v_alignbit_b32 v1, s2, v1, 7
125; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
126; GFX9-NEXT:    s_endpgm
127;
128; R600-LABEL: fshr_i32_imm:
129; R600:       ; %bb.0: ; %entry
130; R600-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
131; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
132; R600-NEXT:    CF_END
133; R600-NEXT:    PAD
134; R600-NEXT:    ALU clause starting at 4:
135; R600-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
136; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
137; R600-NEXT:     BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, literal.x,
138; R600-NEXT:    7(9.809089e-45), 0(0.000000e+00)
139;
140; GFX10-LABEL: fshr_i32_imm:
141; GFX10:       ; %bb.0: ; %entry
142; GFX10-NEXT:    s_clause 0x1
143; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
144; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
145; GFX10-NEXT:    v_mov_b32_e32 v0, 0
146; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
147; GFX10-NEXT:    v_alignbit_b32 v1, s2, s3, 7
148; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
149; GFX10-NEXT:    s_endpgm
150entry:
151  %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 7)
152  store i32 %0, i32 addrspace(1)* %in
153  ret void
154}
155
; Kernel test for a <2 x i32> funnel shift right with a variable per-element
; amount. %x, %y and %z are kernel arguments and the vector result is stored
; through %in. The checks expect one v_alignbit_b32 per element on amdgcn and
; one BIT_ALIGN_INT per element on r600.
156define amdgpu_kernel void @fshr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
157; SI-LABEL: fshr_v2i32:
158; SI:       ; %bb.0: ; %entry
159; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
160; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xf
161; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
162; SI-NEXT:    s_mov_b32 s3, 0xf000
163; SI-NEXT:    s_mov_b32 s2, -1
164; SI-NEXT:    s_waitcnt lgkmcnt(0)
165; SI-NEXT:    v_mov_b32_e32 v0, s7
166; SI-NEXT:    v_mov_b32_e32 v1, s9
167; SI-NEXT:    v_alignbit_b32 v1, s5, v0, v1
168; SI-NEXT:    v_mov_b32_e32 v0, s6
169; SI-NEXT:    v_mov_b32_e32 v2, s8
170; SI-NEXT:    v_alignbit_b32 v0, s4, v0, v2
171; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
172; SI-NEXT:    s_endpgm
173;
174; VI-LABEL: fshr_v2i32:
175; VI:       ; %bb.0: ; %entry
176; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
177; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
178; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
179; VI-NEXT:    s_waitcnt lgkmcnt(0)
180; VI-NEXT:    v_mov_b32_e32 v0, s7
181; VI-NEXT:    v_mov_b32_e32 v1, s3
182; VI-NEXT:    v_mov_b32_e32 v2, s6
183; VI-NEXT:    v_alignbit_b32 v1, s5, v0, v1
184; VI-NEXT:    v_mov_b32_e32 v0, s2
185; VI-NEXT:    v_alignbit_b32 v0, s4, v2, v0
186; VI-NEXT:    v_mov_b32_e32 v3, s1
187; VI-NEXT:    v_mov_b32_e32 v2, s0
188; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
189; VI-NEXT:    s_endpgm
190;
191; GFX9-LABEL: fshr_v2i32:
192; GFX9:       ; %bb.0: ; %entry
193; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
194; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
195; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
196; GFX9-NEXT:    v_mov_b32_e32 v2, 0
197; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
198; GFX9-NEXT:    v_mov_b32_e32 v0, s7
199; GFX9-NEXT:    v_mov_b32_e32 v1, s3
200; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, v1
201; GFX9-NEXT:    v_mov_b32_e32 v0, s6
202; GFX9-NEXT:    v_mov_b32_e32 v3, s2
203; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, v3
204; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
205; GFX9-NEXT:    s_endpgm
206;
207; R600-LABEL: fshr_v2i32:
208; R600:       ; %bb.0: ; %entry
209; R600-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
210; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
211; R600-NEXT:    CF_END
212; R600-NEXT:    PAD
213; R600-NEXT:    ALU clause starting at 4:
214; R600-NEXT:     MOV * T0.W, KC0[4].X,
215; R600-NEXT:     BIT_ALIGN_INT T0.Y, KC0[3].X, KC0[3].Z, PV.W,
216; R600-NEXT:     MOV * T0.W, KC0[3].W,
217; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, PV.W,
218; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
219; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
220;
221; GFX10-LABEL: fshr_v2i32:
222; GFX10:       ; %bb.0: ; %entry
223; GFX10-NEXT:    s_clause 0x2
224; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x3c
225; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
226; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
227; GFX10-NEXT:    v_mov_b32_e32 v3, 0
228; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
229; GFX10-NEXT:    v_mov_b32_e32 v0, s3
230; GFX10-NEXT:    v_mov_b32_e32 v2, s2
231; GFX10-NEXT:    v_alignbit_b32 v1, s5, s7, v0
232; GFX10-NEXT:    v_alignbit_b32 v0, s4, s6, v2
233; GFX10-NEXT:    global_store_dwordx2 v3, v[0:1], s[8:9]
234; GFX10-NEXT:    s_endpgm
235entry:
236  %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
237  store <2 x i32> %0, <2 x i32> addrspace(1)* %in
238  ret void
239}
240
; Kernel test for a <2 x i32> funnel shift right by the constant vector
; <7, 9>. %x and %y are kernel arguments and the result is stored through %in.
; The checks expect shift amount 7 for element 0 and 9 for element 1, each
; folded into its own v_alignbit_b32 or BIT_ALIGN_INT.
241define amdgpu_kernel void @fshr_v2i32_imm(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
242; SI-LABEL: fshr_v2i32_imm:
243; SI:       ; %bb.0: ; %entry
244; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
245; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
246; SI-NEXT:    s_mov_b32 s3, 0xf000
247; SI-NEXT:    s_mov_b32 s2, -1
248; SI-NEXT:    s_waitcnt lgkmcnt(0)
249; SI-NEXT:    v_mov_b32_e32 v0, s7
250; SI-NEXT:    v_mov_b32_e32 v2, s6
251; SI-NEXT:    v_alignbit_b32 v1, s5, v0, 9
252; SI-NEXT:    v_alignbit_b32 v0, s4, v2, 7
253; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
254; SI-NEXT:    s_endpgm
255;
256; VI-LABEL: fshr_v2i32_imm:
257; VI:       ; %bb.0: ; %entry
258; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
259; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
260; VI-NEXT:    s_waitcnt lgkmcnt(0)
261; VI-NEXT:    v_mov_b32_e32 v0, s7
262; VI-NEXT:    v_mov_b32_e32 v2, s6
263; VI-NEXT:    v_alignbit_b32 v1, s5, v0, 9
264; VI-NEXT:    v_alignbit_b32 v0, s4, v2, 7
265; VI-NEXT:    v_mov_b32_e32 v3, s1
266; VI-NEXT:    v_mov_b32_e32 v2, s0
267; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
268; VI-NEXT:    s_endpgm
269;
270; GFX9-LABEL: fshr_v2i32_imm:
271; GFX9:       ; %bb.0: ; %entry
272; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
273; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
274; GFX9-NEXT:    v_mov_b32_e32 v2, 0
275; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
276; GFX9-NEXT:    v_mov_b32_e32 v0, s7
277; GFX9-NEXT:    v_mov_b32_e32 v3, s6
278; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, 9
279; GFX9-NEXT:    v_alignbit_b32 v0, s4, v3, 7
280; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
281; GFX9-NEXT:    s_endpgm
282;
283; R600-LABEL: fshr_v2i32_imm:
284; R600:       ; %bb.0: ; %entry
285; R600-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
286; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
287; R600-NEXT:    CF_END
288; R600-NEXT:    PAD
289; R600-NEXT:    ALU clause starting at 4:
290; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[3].X, KC0[3].Z, literal.x,
291; R600-NEXT:    9(1.261169e-44), 0(0.000000e+00)
292; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, literal.x,
293; R600-NEXT:    7(9.809089e-45), 0(0.000000e+00)
294; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
295; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
296;
297; GFX10-LABEL: fshr_v2i32_imm:
298; GFX10:       ; %bb.0: ; %entry
299; GFX10-NEXT:    s_clause 0x1
300; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
301; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
302; GFX10-NEXT:    v_mov_b32_e32 v2, 0
303; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
304; GFX10-NEXT:    v_alignbit_b32 v1, s5, s7, 9
305; GFX10-NEXT:    v_alignbit_b32 v0, s4, s6, 7
306; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
307; GFX10-NEXT:    s_endpgm
308entry:
309  %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
310  store <2 x i32> %0, <2 x i32> addrspace(1)* %in
311  ret void
312}
313
; Kernel test for a <4 x i32> funnel shift right with a variable per-element
; amount. %x, %y and %z are kernel arguments and the vector result is stored
; through %in. The checks expect four v_alignbit_b32 instructions on amdgcn
; and four BIT_ALIGN_INT instructions on r600, one per element.
314define amdgpu_kernel void @fshr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
315; SI-LABEL: fshr_v4i32:
316; SI:       ; %bb.0: ; %entry
317; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
318; SI-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x15
319; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
320; SI-NEXT:    s_mov_b32 s3, 0xf000
321; SI-NEXT:    s_mov_b32 s2, -1
322; SI-NEXT:    s_waitcnt lgkmcnt(0)
323; SI-NEXT:    v_mov_b32_e32 v0, s11
324; SI-NEXT:    v_mov_b32_e32 v1, s15
325; SI-NEXT:    v_alignbit_b32 v3, s7, v0, v1
326; SI-NEXT:    v_mov_b32_e32 v0, s10
327; SI-NEXT:    v_mov_b32_e32 v1, s14
328; SI-NEXT:    v_alignbit_b32 v2, s6, v0, v1
329; SI-NEXT:    v_mov_b32_e32 v0, s9
330; SI-NEXT:    v_mov_b32_e32 v1, s13
331; SI-NEXT:    v_alignbit_b32 v1, s5, v0, v1
332; SI-NEXT:    v_mov_b32_e32 v0, s8
333; SI-NEXT:    v_mov_b32_e32 v4, s12
334; SI-NEXT:    v_alignbit_b32 v0, s4, v0, v4
335; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
336; SI-NEXT:    s_endpgm
337;
338; VI-LABEL: fshr_v4i32:
339; VI:       ; %bb.0: ; %entry
340; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
341; VI-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
342; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
343; VI-NEXT:    s_waitcnt lgkmcnt(0)
344; VI-NEXT:    v_mov_b32_e32 v0, s11
345; VI-NEXT:    v_mov_b32_e32 v1, s15
346; VI-NEXT:    v_mov_b32_e32 v2, s10
347; VI-NEXT:    v_alignbit_b32 v3, s7, v0, v1
348; VI-NEXT:    v_mov_b32_e32 v0, s14
349; VI-NEXT:    v_alignbit_b32 v2, s6, v2, v0
350; VI-NEXT:    v_mov_b32_e32 v0, s9
351; VI-NEXT:    v_mov_b32_e32 v1, s13
352; VI-NEXT:    v_alignbit_b32 v1, s5, v0, v1
353; VI-NEXT:    v_mov_b32_e32 v0, s8
354; VI-NEXT:    v_mov_b32_e32 v4, s12
355; VI-NEXT:    v_alignbit_b32 v0, s4, v0, v4
356; VI-NEXT:    v_mov_b32_e32 v5, s1
357; VI-NEXT:    v_mov_b32_e32 v4, s0
358; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
359; VI-NEXT:    s_endpgm
360;
361; GFX9-LABEL: fshr_v4i32:
362; GFX9:       ; %bb.0: ; %entry
363; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
364; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
365; GFX9-NEXT:    v_mov_b32_e32 v4, 0
366; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
367; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
368; GFX9-NEXT:    v_mov_b32_e32 v0, s11
369; GFX9-NEXT:    v_mov_b32_e32 v1, s15
370; GFX9-NEXT:    v_alignbit_b32 v3, s7, v0, v1
371; GFX9-NEXT:    v_mov_b32_e32 v0, s10
372; GFX9-NEXT:    v_mov_b32_e32 v1, s14
373; GFX9-NEXT:    v_alignbit_b32 v2, s6, v0, v1
374; GFX9-NEXT:    v_mov_b32_e32 v0, s9
375; GFX9-NEXT:    v_mov_b32_e32 v1, s13
376; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, v1
377; GFX9-NEXT:    v_mov_b32_e32 v0, s8
378; GFX9-NEXT:    v_mov_b32_e32 v5, s12
379; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, v5
380; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
381; GFX9-NEXT:    s_endpgm
382;
383; R600-LABEL: fshr_v4i32:
384; R600:       ; %bb.0: ; %entry
385; R600-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
386; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
387; R600-NEXT:    CF_END
388; R600-NEXT:    PAD
389; R600-NEXT:    ALU clause starting at 4:
390; R600-NEXT:     MOV * T0.W, KC0[6].X,
391; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, PV.W,
392; R600-NEXT:     MOV * T1.W, KC0[5].W,
393; R600-NEXT:     BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, PV.W,
394; R600-NEXT:     MOV * T1.W, KC0[5].Z,
395; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, PV.W,
396; R600-NEXT:     MOV * T1.W, KC0[5].Y,
397; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, PV.W,
398; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
399; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
400;
401; GFX10-LABEL: fshr_v4i32:
402; GFX10:       ; %bb.0: ; %entry
403; GFX10-NEXT:    s_clause 0x2
404; GFX10-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x54
405; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
406; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
407; GFX10-NEXT:    v_mov_b32_e32 v6, 0
408; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
409; GFX10-NEXT:    v_mov_b32_e32 v0, s15
410; GFX10-NEXT:    v_mov_b32_e32 v1, s14
411; GFX10-NEXT:    v_mov_b32_e32 v4, s13
412; GFX10-NEXT:    v_mov_b32_e32 v5, s12
413; GFX10-NEXT:    v_alignbit_b32 v3, s7, s11, v0
414; GFX10-NEXT:    v_alignbit_b32 v2, s6, s10, v1
415; GFX10-NEXT:    v_alignbit_b32 v1, s5, s9, v4
416; GFX10-NEXT:    v_alignbit_b32 v0, s4, s8, v5
417; GFX10-NEXT:    global_store_dwordx4 v6, v[0:3], s[2:3]
418; GFX10-NEXT:    s_endpgm
419entry:
420  %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z)
421  store <4 x i32> %0, <4 x i32> addrspace(1)* %in
422  ret void
423}
424
; Kernel test for a <4 x i32> funnel shift right by the constant vector
; <1, 7, 9, 33>. %x and %y are kernel arguments and the result is stored
; through %in. The last amount, 33, is larger than the 32-bit element width.
; The checks for element 3 expect a shift amount of 1, the same value that is
; expected for element 0.
425define amdgpu_kernel void @fshr_v4i32_imm(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
426; SI-LABEL: fshr_v4i32_imm:
427; SI:       ; %bb.0: ; %entry
428; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
429; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
430; SI-NEXT:    s_mov_b32 s3, 0xf000
431; SI-NEXT:    s_mov_b32 s2, -1
432; SI-NEXT:    s_waitcnt lgkmcnt(0)
433; SI-NEXT:    v_mov_b32_e32 v0, s11
434; SI-NEXT:    v_mov_b32_e32 v1, s10
435; SI-NEXT:    v_alignbit_b32 v3, s7, v0, 1
436; SI-NEXT:    v_mov_b32_e32 v0, s9
437; SI-NEXT:    v_alignbit_b32 v2, s6, v1, 9
438; SI-NEXT:    v_alignbit_b32 v1, s5, v0, 7
439; SI-NEXT:    v_mov_b32_e32 v0, s8
440; SI-NEXT:    v_alignbit_b32 v0, s4, v0, 1
441; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
442; SI-NEXT:    s_endpgm
443;
444; VI-LABEL: fshr_v4i32_imm:
445; VI:       ; %bb.0: ; %entry
446; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
447; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
448; VI-NEXT:    s_waitcnt lgkmcnt(0)
449; VI-NEXT:    v_mov_b32_e32 v0, s11
450; VI-NEXT:    v_mov_b32_e32 v1, s10
451; VI-NEXT:    v_mov_b32_e32 v4, s9
452; VI-NEXT:    v_alignbit_b32 v3, s7, v0, 1
453; VI-NEXT:    v_alignbit_b32 v2, s6, v1, 9
454; VI-NEXT:    v_alignbit_b32 v1, s5, v4, 7
455; VI-NEXT:    v_mov_b32_e32 v0, s8
456; VI-NEXT:    v_mov_b32_e32 v5, s1
457; VI-NEXT:    v_alignbit_b32 v0, s4, v0, 1
458; VI-NEXT:    v_mov_b32_e32 v4, s0
459; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
460; VI-NEXT:    s_endpgm
461;
462; GFX9-LABEL: fshr_v4i32_imm:
463; GFX9:       ; %bb.0: ; %entry
464; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
465; GFX9-NEXT:    v_mov_b32_e32 v4, 0
466; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
467; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
468; GFX9-NEXT:    v_mov_b32_e32 v0, s11
469; GFX9-NEXT:    v_mov_b32_e32 v1, s10
470; GFX9-NEXT:    v_alignbit_b32 v3, s7, v0, 1
471; GFX9-NEXT:    v_mov_b32_e32 v0, s9
472; GFX9-NEXT:    v_alignbit_b32 v2, s6, v1, 9
473; GFX9-NEXT:    v_alignbit_b32 v1, s5, v0, 7
474; GFX9-NEXT:    v_mov_b32_e32 v0, s8
475; GFX9-NEXT:    v_alignbit_b32 v0, s4, v0, 1
476; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
477; GFX9-NEXT:    s_endpgm
478;
479; R600-LABEL: fshr_v4i32_imm:
480; R600:       ; %bb.0: ; %entry
481; R600-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
482; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
483; R600-NEXT:    CF_END
484; R600-NEXT:    PAD
485; R600-NEXT:    ALU clause starting at 4:
486; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, 1,
487; R600-NEXT:     BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, literal.x,
488; R600-NEXT:    9(1.261169e-44), 0(0.000000e+00)
489; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, literal.x,
490; R600-NEXT:    7(9.809089e-45), 0(0.000000e+00)
491; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, 1,
492; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
493; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
494;
495; GFX10-LABEL: fshr_v4i32_imm:
496; GFX10:       ; %bb.0: ; %entry
497; GFX10-NEXT:    s_clause 0x1
498; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
499; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
500; GFX10-NEXT:    v_mov_b32_e32 v4, 0
501; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
502; GFX10-NEXT:    v_alignbit_b32 v3, s7, s11, 1
503; GFX10-NEXT:    v_alignbit_b32 v2, s6, s10, 9
504; GFX10-NEXT:    v_alignbit_b32 v1, s5, s9, 7
505; GFX10-NEXT:    v_alignbit_b32 v0, s4, s8, 1
506; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
507; GFX10-NEXT:    s_endpgm
508entry:
509  %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 7, i32 9, i32 33>)
510  store <4 x i32> %0, <4 x i32> addrspace(1)* %in
511  ret void
512}
513
; Non-kernel function test for a scalar i32 funnel shift right. The three
; operands arrive as function arguments and the result is returned directly.
; The shared GFX89 prefix and the GFX10 prefix both expect a single
; v_alignbit_b32 on v0, v1 and v2. The r600 checks only expect CF_END and PAD.
514define i32 @v_fshr_i32(i32 %src0, i32 %src1, i32 %src2) {
515; GFX89-LABEL: v_fshr_i32:
516; GFX89:       ; %bb.0:
517; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
518; GFX89-NEXT:    v_alignbit_b32 v0, v0, v1, v2
519; GFX89-NEXT:    s_setpc_b64 s[30:31]
520;
521; R600-LABEL: v_fshr_i32:
522; R600:       ; %bb.0:
523; R600-NEXT:    CF_END
524; R600-NEXT:    PAD
525;
526; GFX10-LABEL: v_fshr_i32:
527; GFX10:       ; %bb.0:
528; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
529; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
530; GFX10-NEXT:    v_alignbit_b32 v0, v0, v1, v2
531; GFX10-NEXT:    s_setpc_b64 s[30:31]
532  %ret = call i32 @llvm.fshr.i32(i32 %src0, i32 %src1, i32 %src2)
533  ret i32 %ret
534}
535
; Non-kernel function test for a <2 x i32> funnel shift right. The operands
; are function arguments and the vector result is returned. The amdgcn checks
; expect one v_alignbit_b32 per element, writing v0 and v1. The r600 checks
; only expect CF_END and PAD.
536define <2 x i32> @v_fshr_v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2) {
537; GFX89-LABEL: v_fshr_v2i32:
538; GFX89:       ; %bb.0:
539; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
540; GFX89-NEXT:    v_alignbit_b32 v0, v0, v2, v4
541; GFX89-NEXT:    v_alignbit_b32 v1, v1, v3, v5
542; GFX89-NEXT:    s_setpc_b64 s[30:31]
543;
544; R600-LABEL: v_fshr_v2i32:
545; R600:       ; %bb.0:
546; R600-NEXT:    CF_END
547; R600-NEXT:    PAD
548;
549; GFX10-LABEL: v_fshr_v2i32:
550; GFX10:       ; %bb.0:
551; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
552; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
553; GFX10-NEXT:    v_alignbit_b32 v0, v0, v2, v4
554; GFX10-NEXT:    v_alignbit_b32 v1, v1, v3, v5
555; GFX10-NEXT:    s_setpc_b64 s[30:31]
556  %ret = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2)
557  ret <2 x i32> %ret
558}
559
; Non-kernel function test for a <3 x i32> funnel shift right. The operands
; are function arguments and the vector result is returned. The amdgcn checks
; expect three v_alignbit_b32 instructions, writing v0, v1 and v2. The r600
; checks only expect CF_END and PAD.
560define <3 x i32> @v_fshr_v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2) {
561; GFX89-LABEL: v_fshr_v3i32:
562; GFX89:       ; %bb.0:
563; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
564; GFX89-NEXT:    v_alignbit_b32 v0, v0, v3, v6
565; GFX89-NEXT:    v_alignbit_b32 v1, v1, v4, v7
566; GFX89-NEXT:    v_alignbit_b32 v2, v2, v5, v8
567; GFX89-NEXT:    s_setpc_b64 s[30:31]
568;
569; R600-LABEL: v_fshr_v3i32:
570; R600:       ; %bb.0:
571; R600-NEXT:    CF_END
572; R600-NEXT:    PAD
573;
574; GFX10-LABEL: v_fshr_v3i32:
575; GFX10:       ; %bb.0:
576; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
577; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
578; GFX10-NEXT:    v_alignbit_b32 v0, v0, v3, v6
579; GFX10-NEXT:    v_alignbit_b32 v1, v1, v4, v7
580; GFX10-NEXT:    v_alignbit_b32 v2, v2, v5, v8
581; GFX10-NEXT:    s_setpc_b64 s[30:31]
582  %ret = call <3 x i32> @llvm.fshr.v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2)
583  ret <3 x i32> %ret
584}
585
; Non-kernel function test for a <4 x i32> funnel shift right. The operands
; are function arguments and the vector result is returned. The amdgcn checks
; expect four v_alignbit_b32 instructions, writing v0 through v3. The r600
; checks only expect CF_END and PAD.
586define <4 x i32> @v_fshr_v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2) {
587; GFX89-LABEL: v_fshr_v4i32:
588; GFX89:       ; %bb.0:
589; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
590; GFX89-NEXT:    v_alignbit_b32 v0, v0, v4, v8
591; GFX89-NEXT:    v_alignbit_b32 v1, v1, v5, v9
592; GFX89-NEXT:    v_alignbit_b32 v2, v2, v6, v10
593; GFX89-NEXT:    v_alignbit_b32 v3, v3, v7, v11
594; GFX89-NEXT:    s_setpc_b64 s[30:31]
595;
596; R600-LABEL: v_fshr_v4i32:
597; R600:       ; %bb.0:
598; R600-NEXT:    CF_END
599; R600-NEXT:    PAD
600;
601; GFX10-LABEL: v_fshr_v4i32:
602; GFX10:       ; %bb.0:
603; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
604; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
605; GFX10-NEXT:    v_alignbit_b32 v0, v0, v4, v8
606; GFX10-NEXT:    v_alignbit_b32 v1, v1, v5, v9
607; GFX10-NEXT:    v_alignbit_b32 v2, v2, v6, v10
608; GFX10-NEXT:    v_alignbit_b32 v3, v3, v7, v11
609; GFX10-NEXT:    s_setpc_b64 s[30:31]
610  %ret = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2)
611  ret <4 x i32> %ret
612}
613
; Non-kernel function test for a scalar i16 funnel shift right.
; The tahiti checks expect the operation to be done with a 32-bit
; v_alignbit_b32 after OR-ing 16 into the amount and shifting %src1 left by 16.
; The tonga, gfx900 and gfx1010 checks expect a sequence of 16-bit shifts
; combined with v_xor_b32 and v_or_b32 instead of v_alignbit_b32.
; The r600 checks only expect CF_END and PAD.
614define i16 @v_fshr_i16(i16 %src0, i16 %src1, i16 %src2) {
615; SI-LABEL: v_fshr_i16:
616; SI:       ; %bb.0:
617; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
618; SI-NEXT:    v_or_b32_e32 v2, 16, v2
619; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
620; SI-NEXT:    v_alignbit_b32 v0, v0, v1, v2
621; SI-NEXT:    s_setpc_b64 s[30:31]
622;
623; VI-LABEL: v_fshr_i16:
624; VI:       ; %bb.0:
625; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
626; VI-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
627; VI-NEXT:    v_xor_b32_e32 v3, -1, v2
628; VI-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
629; VI-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
630; VI-NEXT:    v_or_b32_e32 v0, v0, v1
631; VI-NEXT:    s_setpc_b64 s[30:31]
632;
633; GFX9-LABEL: v_fshr_i16:
634; GFX9:       ; %bb.0:
635; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
636; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
637; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v2
638; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
639; GFX9-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
640; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
641; GFX9-NEXT:    s_setpc_b64 s[30:31]
642;
643; R600-LABEL: v_fshr_i16:
644; R600:       ; %bb.0:
645; R600-NEXT:    CF_END
646; R600-NEXT:    PAD
647;
648; GFX10-LABEL: v_fshr_i16:
649; GFX10:       ; %bb.0:
650; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
651; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
652; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
653; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v2
654; GFX10-NEXT:    v_lshrrev_b16 v1, v2, v1
655; GFX10-NEXT:    v_lshlrev_b16 v0, v3, v0
656; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
657; GFX10-NEXT:    s_setpc_b64 s[30:31]
658  %ret = call i16 @llvm.fshr.i16(i16 %src0, i16 %src1, i16 %src2)
659  ret i16 %ret
660}
661
; Non-kernel function test for a <2 x i16> funnel shift right.
; The tahiti checks expect one 32-bit v_alignbit_b32 per element followed by
; shift, and and or instructions on the two results.
; The tonga checks expect 16-bit shifts, with SDWA forms selecting WORD_1 for
; the high element.
; The gfx900 and gfx1010 checks expect packed v_pk_lshlrev_b16 and
; v_pk_lshrrev_b16 with both shift amounts masked by 0xf000f.
; The r600 checks only expect CF_END and PAD.
662define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2) {
663; SI-LABEL: v_fshr_v2i16:
664; SI:       ; %bb.0:
665; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
666; SI-NEXT:    v_or_b32_e32 v5, 16, v5
667; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
668; SI-NEXT:    v_alignbit_b32 v1, v1, v3, v5
669; SI-NEXT:    v_or_b32_e32 v3, 16, v4
670; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
671; SI-NEXT:    v_alignbit_b32 v0, v0, v2, v3
672; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
673; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
674; SI-NEXT:    v_or_b32_e32 v0, v0, v1
675; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
676; SI-NEXT:    s_setpc_b64 s[30:31]
677;
678; VI-LABEL: v_fshr_v2i16:
679; VI:       ; %bb.0:
680; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
681; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
682; VI-NEXT:    v_mov_b32_e32 v5, 1
683; VI-NEXT:    v_lshrrev_b16_sdwa v4, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
684; VI-NEXT:    v_lshlrev_b16_sdwa v5, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
685; VI-NEXT:    v_xor_b32_e32 v3, -1, v3
686; VI-NEXT:    v_lshlrev_b16_e32 v3, v3, v5
687; VI-NEXT:    v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
688; VI-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
689; VI-NEXT:    v_xor_b32_e32 v4, -1, v2
690; VI-NEXT:    v_lshlrev_b16_e32 v0, v4, v0
691; VI-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
692; VI-NEXT:    v_or_b32_e32 v0, v0, v1
693; VI-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
694; VI-NEXT:    s_setpc_b64 s[30:31]
695;
696; GFX9-LABEL: v_fshr_v2i16:
697; GFX9:       ; %bb.0:
698; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
699; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v2
700; GFX9-NEXT:    s_mov_b32 s4, 0xf000f
701; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
702; GFX9-NEXT:    v_and_b32_e32 v3, s4, v3
703; GFX9-NEXT:    v_and_b32_e32 v2, s4, v2
704; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v3, v0
705; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v2, v1
706; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
707; GFX9-NEXT:    s_setpc_b64 s[30:31]
708;
709; R600-LABEL: v_fshr_v2i16:
710; R600:       ; %bb.0:
711; R600-NEXT:    CF_END
712; R600-NEXT:    PAD
713;
714; GFX10-LABEL: v_fshr_v2i16:
715; GFX10:       ; %bb.0:
716; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
717; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
718; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v2
719; GFX10-NEXT:    s_mov_b32 s4, 0xf000f
720; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
721; GFX10-NEXT:    v_and_b32_e32 v2, s4, v2
722; GFX10-NEXT:    v_and_b32_e32 v3, s4, v3
723; GFX10-NEXT:    v_pk_lshrrev_b16 v1, v2, v1
724; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v3, v0
725; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
726; GFX10-NEXT:    s_setpc_b64 s[30:31]
727  %ret = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2)
728  ret <2 x i16> %ret
729}
730
731define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2) {
732; SI-LABEL: v_fshr_v3i16:
733; SI:       ; %bb.0:
734; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
735; SI-NEXT:    v_or_b32_e32 v7, 16, v7
736; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
737; SI-NEXT:    v_alignbit_b32 v1, v1, v4, v7
738; SI-NEXT:    v_or_b32_e32 v4, 16, v6
739; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
740; SI-NEXT:    v_alignbit_b32 v0, v0, v3, v4
741; SI-NEXT:    s_mov_b32 s4, 0xffff
742; SI-NEXT:    v_or_b32_e32 v3, 16, v8
743; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
744; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
745; SI-NEXT:    v_and_b32_e32 v0, s4, v0
746; SI-NEXT:    v_alignbit_b32 v3, v2, v4, v3
747; SI-NEXT:    v_or_b32_e32 v0, v0, v1
748; SI-NEXT:    v_and_b32_e32 v2, s4, v3
749; SI-NEXT:    v_alignbit_b32 v1, v3, v1, 16
750; SI-NEXT:    s_setpc_b64 s[30:31]
751;
752; VI-LABEL: v_fshr_v3i16:
753; VI:       ; %bb.0:
754; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
755; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
756; VI-NEXT:    v_mov_b32_e32 v8, 1
757; VI-NEXT:    v_lshrrev_b16_sdwa v7, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
758; VI-NEXT:    v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
759; VI-NEXT:    v_xor_b32_e32 v6, -1, v6
760; VI-NEXT:    v_lshlrev_b16_e32 v6, v6, v8
761; VI-NEXT:    v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
762; VI-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
763; VI-NEXT:    v_xor_b32_e32 v7, -1, v5
764; VI-NEXT:    v_lshlrev_b16_e32 v1, v7, v1
765; VI-NEXT:    v_lshrrev_b16_e32 v3, v5, v3
766; VI-NEXT:    v_or_b32_e32 v1, v1, v3
767; VI-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
768; VI-NEXT:    v_xor_b32_e32 v3, -1, v4
769; VI-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
770; VI-NEXT:    v_lshrrev_b16_e32 v2, v4, v2
771; VI-NEXT:    v_or_b32_e32 v0, v0, v2
772; VI-NEXT:    v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
773; VI-NEXT:    s_setpc_b64 s[30:31]
774;
775; GFX9-LABEL: v_fshr_v3i16:
776; GFX9:       ; %bb.0:
777; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
778; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
779; GFX9-NEXT:    v_mov_b32_e32 v8, 1
780; GFX9-NEXT:    v_lshrrev_b16_sdwa v7, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
781; GFX9-NEXT:    v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
782; GFX9-NEXT:    v_xor_b32_e32 v6, -1, v6
783; GFX9-NEXT:    v_lshlrev_b16_e32 v6, v6, v8
784; GFX9-NEXT:    v_or_b32_e32 v6, v6, v7
785; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
786; GFX9-NEXT:    v_xor_b32_e32 v7, -1, v5
787; GFX9-NEXT:    v_lshlrev_b16_e32 v1, v7, v1
788; GFX9-NEXT:    v_lshrrev_b16_e32 v3, v5, v3
789; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
790; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
791; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v4
792; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
793; GFX9-NEXT:    v_lshrrev_b16_e32 v2, v4, v2
794; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
795; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
796; GFX9-NEXT:    v_lshl_or_b32 v0, v6, 16, v0
797; GFX9-NEXT:    s_setpc_b64 s[30:31]
798;
799; R600-LABEL: v_fshr_v3i16:
800; R600:       ; %bb.0:
801; R600-NEXT:    CF_END
802; R600-NEXT:    PAD
803;
804; GFX10-LABEL: v_fshr_v3i16:
805; GFX10:       ; %bb.0:
806; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
807; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
808; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
809; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
810; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
811; GFX10-NEXT:    v_xor_b32_e32 v8, -1, v4
812; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
813; GFX10-NEXT:    v_xor_b32_e32 v10, -1, v6
814; GFX10-NEXT:    v_lshlrev_b16 v7, 1, v7
815; GFX10-NEXT:    v_lshrrev_b16 v2, v4, v2
816; GFX10-NEXT:    v_lshlrev_b16 v0, v8, v0
817; GFX10-NEXT:    v_lshrrev_b16 v4, v6, v9
818; GFX10-NEXT:    v_lshlrev_b16 v1, 1, v1
819; GFX10-NEXT:    v_lshlrev_b16 v6, v10, v7
820; GFX10-NEXT:    v_lshrrev_b16 v3, v5, v3
821; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
822; GFX10-NEXT:    v_xor_b32_e32 v2, -1, v5
823; GFX10-NEXT:    v_or_b32_e32 v4, v6, v4
824; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
825; GFX10-NEXT:    v_lshlrev_b16 v1, v2, v1
826; GFX10-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
827; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
828; GFX10-NEXT:    s_setpc_b64 s[30:31]
829  %ret = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2)
830  ret <3 x i16> %ret
831}
832
; Funnel shift right on <4 x i16>: calls llvm.fshr.v4i16 with %src0, %src1 and
; %src2 and returns the result unchanged.
; The assertion lines in this function are autogenerated (see the NOTE on the
; first line of the file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
; What the expected code shows, per check prefix -
;   SI lines    - one v_alignbit_b32 per element, fed with one operand shifted
;                 left by 16 and the amount OR'ed with 16, followed by
;                 repacking of the 16-bit results.
;   VI, GFX9 and GFX10 lines - per element, a 16-bit shift left by 1 and then
;                 by the inverted amount (xor with -1), a 16-bit shift right
;                 by the amount, and an OR of the two.
;   R600 lines  - only CF_END and PAD are expected.
833define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2) {
834; SI-LABEL: v_fshr_v4i16:
835; SI:       ; %bb.0:
836; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
837; SI-NEXT:    v_or_b32_e32 v9, 16, v9
838; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
839; SI-NEXT:    v_alignbit_b32 v1, v1, v5, v9
840; SI-NEXT:    v_or_b32_e32 v5, 16, v8
841; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
842; SI-NEXT:    v_alignbit_b32 v0, v0, v4, v5
843; SI-NEXT:    v_or_b32_e32 v4, 16, v11
844; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
845; SI-NEXT:    v_alignbit_b32 v3, v3, v5, v4
846; SI-NEXT:    v_or_b32_e32 v4, 16, v10
847; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
848; SI-NEXT:    s_mov_b32 s4, 0xffff
849; SI-NEXT:    v_alignbit_b32 v2, v2, v5, v4
850; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
851; SI-NEXT:    v_and_b32_e32 v2, s4, v2
852; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
853; SI-NEXT:    v_and_b32_e32 v0, s4, v0
854; SI-NEXT:    v_or_b32_e32 v2, v2, v3
855; SI-NEXT:    v_or_b32_e32 v0, v0, v1
856; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
857; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
858; SI-NEXT:    s_setpc_b64 s[30:31]
859;
860; VI-LABEL: v_fshr_v4i16:
861; VI:       ; %bb.0:
862; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
863; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
864; VI-NEXT:    v_mov_b32_e32 v8, 1
865; VI-NEXT:    v_lshrrev_b16_sdwa v7, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
866; VI-NEXT:    v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
867; VI-NEXT:    v_xor_b32_e32 v6, -1, v6
868; VI-NEXT:    v_lshlrev_b16_e32 v6, v6, v9
869; VI-NEXT:    v_or_b32_sdwa v6, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
870; VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
871; VI-NEXT:    v_lshrrev_b16_sdwa v9, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
872; VI-NEXT:    v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
873; VI-NEXT:    v_xor_b32_e32 v7, -1, v7
874; VI-NEXT:    v_lshlrev_b16_e32 v7, v7, v8
875; VI-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
876; VI-NEXT:    v_xor_b32_e32 v8, -1, v5
877; VI-NEXT:    v_lshlrev_b16_e32 v1, v8, v1
878; VI-NEXT:    v_lshrrev_b16_e32 v3, v5, v3
879; VI-NEXT:    v_or_b32_e32 v1, v1, v3
880; VI-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
881; VI-NEXT:    v_xor_b32_e32 v3, -1, v4
882; VI-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
883; VI-NEXT:    v_lshrrev_b16_e32 v2, v4, v2
884; VI-NEXT:    v_or_b32_sdwa v7, v7, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
885; VI-NEXT:    v_or_b32_e32 v0, v0, v2
886; VI-NEXT:    v_or_b32_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
887; VI-NEXT:    v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
888; VI-NEXT:    s_setpc_b64 s[30:31]
889;
890; GFX9-LABEL: v_fshr_v4i16:
891; GFX9:       ; %bb.0:
892; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
893; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
894; GFX9-NEXT:    v_mov_b32_e32 v8, 1
895; GFX9-NEXT:    v_lshrrev_b16_sdwa v7, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
896; GFX9-NEXT:    v_lshlrev_b16_sdwa v9, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
897; GFX9-NEXT:    v_xor_b32_e32 v6, -1, v6
898; GFX9-NEXT:    v_lshlrev_b16_e32 v6, v6, v9
899; GFX9-NEXT:    v_or_b32_e32 v6, v6, v7
900; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
901; GFX9-NEXT:    v_lshrrev_b16_sdwa v9, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
902; GFX9-NEXT:    v_lshlrev_b16_sdwa v8, v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
903; GFX9-NEXT:    v_xor_b32_e32 v7, -1, v7
904; GFX9-NEXT:    v_lshlrev_b16_e32 v7, v7, v8
905; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
906; GFX9-NEXT:    v_xor_b32_e32 v8, -1, v5
907; GFX9-NEXT:    v_lshlrev_b16_e32 v1, v8, v1
908; GFX9-NEXT:    v_lshrrev_b16_e32 v3, v5, v3
909; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
910; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
911; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v4
912; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
913; GFX9-NEXT:    v_lshrrev_b16_e32 v2, v4, v2
914; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
915; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
916; GFX9-NEXT:    v_or_b32_e32 v7, v7, v9
917; GFX9-NEXT:    v_and_b32_e32 v0, v2, v0
918; GFX9-NEXT:    v_and_b32_e32 v1, v2, v1
919; GFX9-NEXT:    v_lshl_or_b32 v0, v7, 16, v0
920; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 16, v1
921; GFX9-NEXT:    s_setpc_b64 s[30:31]
922;
923; R600-LABEL: v_fshr_v4i16:
924; R600:       ; %bb.0:
925; R600-NEXT:    CF_END
926; R600-NEXT:    PAD
927;
928; GFX10-LABEL: v_fshr_v4i16:
929; GFX10:       ; %bb.0:
930; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
931; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
932; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
933; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
934; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v3
935; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v4
936; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 16, v0
937; GFX10-NEXT:    v_lshlrev_b16 v6, 1, v6
938; GFX10-NEXT:    v_xor_b32_e32 v9, -1, v7
939; GFX10-NEXT:    v_lshrrev_b16 v7, v7, v8
940; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
941; GFX10-NEXT:    v_xor_b32_e32 v8, -1, v4
942; GFX10-NEXT:    v_lshlrev_b16 v1, 1, v1
943; GFX10-NEXT:    v_lshlrev_b16 v6, v9, v6
944; GFX10-NEXT:    v_xor_b32_e32 v9, -1, v5
945; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
946; GFX10-NEXT:    v_lshlrev_b16 v12, 1, v12
947; GFX10-NEXT:    v_xor_b32_e32 v13, -1, v11
948; GFX10-NEXT:    v_lshlrev_b16 v0, v8, v0
949; GFX10-NEXT:    v_lshrrev_b16 v2, v4, v2
950; GFX10-NEXT:    v_lshlrev_b16 v1, v9, v1
951; GFX10-NEXT:    v_lshrrev_b16 v3, v5, v3
952; GFX10-NEXT:    v_lshrrev_b16 v4, v11, v10
953; GFX10-NEXT:    v_lshlrev_b16 v5, v13, v12
954; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
955; GFX10-NEXT:    v_mov_b32_e32 v2, 0xffff
956; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
957; GFX10-NEXT:    v_or_b32_e32 v3, v6, v7
958; GFX10-NEXT:    v_or_b32_e32 v4, v5, v4
959; GFX10-NEXT:    v_and_b32_e32 v0, v2, v0
960; GFX10-NEXT:    v_and_b32_e32 v1, v2, v1
961; GFX10-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
962; GFX10-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
963; GFX10-NEXT:    s_setpc_b64 s[30:31]
964  %ret = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2)
965  ret <4 x i16> %ret
966}
967
; Funnel shift right on scalar i64: calls llvm.fshr.i64 with %src0, %src1 and
; %src2 and returns the result unchanged.
; The assertion lines in this function are autogenerated (see the NOTE on the
; first line of the file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
; What the expected amdgcn code shows - v[0:1] is shifted left by 1 and then by
; the bitwise NOT of v4, v[2:3] is shifted right by v4, and the two 64-bit
; values are OR'ed one 32-bit half at a time. (Presumably v[0:1], v[2:3] and v4
; carry %src0, %src1 and the low half of %src2 in argument order - confirm
; against the calling convention.)
; The R600 lines only expect CF_END and PAD.
968define i64 @v_fshr_i64(i64 %src0, i64 %src1, i64 %src2) {
969; SI-LABEL: v_fshr_i64:
970; SI:       ; %bb.0:
971; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
972; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
973; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], v4
974; SI-NEXT:    v_not_b32_e32 v4, v4
975; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v4
976; SI-NEXT:    v_or_b32_e32 v1, v1, v3
977; SI-NEXT:    v_or_b32_e32 v0, v0, v2
978; SI-NEXT:    s_setpc_b64 s[30:31]
979;
980; VI-LABEL: v_fshr_i64:
981; VI:       ; %bb.0:
982; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
983; VI-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
984; VI-NEXT:    v_lshrrev_b64 v[2:3], v4, v[2:3]
985; VI-NEXT:    v_not_b32_e32 v4, v4
986; VI-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
987; VI-NEXT:    v_or_b32_e32 v1, v1, v3
988; VI-NEXT:    v_or_b32_e32 v0, v0, v2
989; VI-NEXT:    s_setpc_b64 s[30:31]
990;
991; GFX9-LABEL: v_fshr_i64:
992; GFX9:       ; %bb.0:
993; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
994; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
995; GFX9-NEXT:    v_lshrrev_b64 v[2:3], v4, v[2:3]
996; GFX9-NEXT:    v_not_b32_e32 v4, v4
997; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
998; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
999; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
1000; GFX9-NEXT:    s_setpc_b64 s[30:31]
1001;
1002; R600-LABEL: v_fshr_i64:
1003; R600:       ; %bb.0:
1004; R600-NEXT:    CF_END
1005; R600-NEXT:    PAD
1006;
1007; GFX10-LABEL: v_fshr_i64:
1008; GFX10:       ; %bb.0:
1009; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1010; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1011; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
1012; GFX10-NEXT:    v_not_b32_e32 v5, v4
1013; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v4, v[2:3]
1014; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v5, v[0:1]
1015; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
1016; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
1017; GFX10-NEXT:    s_setpc_b64 s[30:31]
1018  %ret = call i64 @llvm.fshr.i64(i64 %src0, i64 %src1, i64 %src2)
1019  ret i64 %ret
1020}
1021
; Funnel shift right on <2 x i64>: calls llvm.fshr.v2i64 with %src0, %src1 and
; %src2 and returns the result unchanged.
; The assertion lines in this function are autogenerated (see the NOTE on the
; first line of the file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
; What the expected amdgcn code shows - the same three-shift-plus-OR sequence
; is emitted twice, once per 64-bit element. v[0:1] and v[2:3] are shifted left
; by 1 and then by the bitwise NOT of v8 and v10 respectively, v[4:5] and
; v[6:7] are shifted right by v8 and v10, and the results are OR'ed one 32-bit
; half at a time.
; The R600 lines only expect CF_END and PAD.
1022define <2 x i64> @v_fshr_v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2) {
1023; SI-LABEL: v_fshr_v2i64:
1024; SI:       ; %bb.0:
1025; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1026; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
1027; SI-NEXT:    v_lshr_b64 v[4:5], v[4:5], v8
1028; SI-NEXT:    v_not_b32_e32 v8, v8
1029; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v8
1030; SI-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
1031; SI-NEXT:    v_or_b32_e32 v1, v1, v5
1032; SI-NEXT:    v_lshr_b64 v[5:6], v[6:7], v10
1033; SI-NEXT:    v_not_b32_e32 v7, v10
1034; SI-NEXT:    v_lshl_b64 v[2:3], v[2:3], v7
1035; SI-NEXT:    v_or_b32_e32 v0, v0, v4
1036; SI-NEXT:    v_or_b32_e32 v3, v3, v6
1037; SI-NEXT:    v_or_b32_e32 v2, v2, v5
1038; SI-NEXT:    s_setpc_b64 s[30:31]
1039;
1040; VI-LABEL: v_fshr_v2i64:
1041; VI:       ; %bb.0:
1042; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1043; VI-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
1044; VI-NEXT:    v_lshrrev_b64 v[4:5], v8, v[4:5]
1045; VI-NEXT:    v_not_b32_e32 v8, v8
1046; VI-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
1047; VI-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
1048; VI-NEXT:    v_or_b32_e32 v1, v1, v5
1049; VI-NEXT:    v_lshrrev_b64 v[5:6], v10, v[6:7]
1050; VI-NEXT:    v_not_b32_e32 v7, v10
1051; VI-NEXT:    v_lshlrev_b64 v[2:3], v7, v[2:3]
1052; VI-NEXT:    v_or_b32_e32 v0, v0, v4
1053; VI-NEXT:    v_or_b32_e32 v3, v3, v6
1054; VI-NEXT:    v_or_b32_e32 v2, v2, v5
1055; VI-NEXT:    s_setpc_b64 s[30:31]
1056;
1057; GFX9-LABEL: v_fshr_v2i64:
1058; GFX9:       ; %bb.0:
1059; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1060; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
1061; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v8, v[4:5]
1062; GFX9-NEXT:    v_not_b32_e32 v8, v8
1063; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
1064; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
1065; GFX9-NEXT:    v_or_b32_e32 v1, v1, v5
1066; GFX9-NEXT:    v_lshrrev_b64 v[5:6], v10, v[6:7]
1067; GFX9-NEXT:    v_not_b32_e32 v7, v10
1068; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v7, v[2:3]
1069; GFX9-NEXT:    v_or_b32_e32 v0, v0, v4
1070; GFX9-NEXT:    v_or_b32_e32 v3, v3, v6
1071; GFX9-NEXT:    v_or_b32_e32 v2, v2, v5
1072; GFX9-NEXT:    s_setpc_b64 s[30:31]
1073;
1074; R600-LABEL: v_fshr_v2i64:
1075; R600:       ; %bb.0:
1076; R600-NEXT:    CF_END
1077; R600-NEXT:    PAD
1078;
1079; GFX10-LABEL: v_fshr_v2i64:
1080; GFX10:       ; %bb.0:
1081; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1082; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1083; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
1084; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
1085; GFX10-NEXT:    v_not_b32_e32 v9, v8
1086; GFX10-NEXT:    v_not_b32_e32 v11, v10
1087; GFX10-NEXT:    v_lshrrev_b64 v[4:5], v8, v[4:5]
1088; GFX10-NEXT:    v_lshrrev_b64 v[6:7], v10, v[6:7]
1089; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v9, v[0:1]
1090; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v11, v[2:3]
1091; GFX10-NEXT:    v_or_b32_e32 v0, v0, v4
1092; GFX10-NEXT:    v_or_b32_e32 v1, v1, v5
1093; GFX10-NEXT:    v_or_b32_e32 v2, v2, v6
1094; GFX10-NEXT:    v_or_b32_e32 v3, v3, v7
1095; GFX10-NEXT:    s_setpc_b64 s[30:31]
1096  %ret = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2)
1097  ret <2 x i64> %ret
1098}
1099
; Funnel shift right on the non-power-of-two type i24: calls llvm.fshr.i24 with
; %src0, %src1 and %src2 and returns the result unchanged.
; The assertion lines in this function are autogenerated (see the NOTE on the
; first line of the file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
; What the expected amdgcn code shows - the amount in v2 is reduced modulo 24
; (v_mul_hi_u32 by 0xaaaaaaab, shift right by 4, multiply by 24, subtract from
; v2), 8 is added to the reduced amount, v1 is shifted left by 8, and a single
; v_alignbit_b32 of v0, v1 and v2 produces the result.
; The GFX10 lines pass 0xaaaaaaab as a literal operand of v_mul_hi_u32; the
; other amdgcn prefixes first move it into s4.
; The R600 lines only expect CF_END and PAD.
1100define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) {
1101; SI-LABEL: v_fshr_i24:
1102; SI:       ; %bb.0:
1103; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1104; SI-NEXT:    s_mov_b32 s4, 0xaaaaaaab
1105; SI-NEXT:    v_mul_hi_u32 v3, v2, s4
1106; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1107; SI-NEXT:    v_lshrrev_b32_e32 v3, 4, v3
1108; SI-NEXT:    v_mul_lo_u32 v3, v3, 24
1109; SI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
1110; SI-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
1111; SI-NEXT:    v_alignbit_b32 v0, v0, v1, v2
1112; SI-NEXT:    s_setpc_b64 s[30:31]
1113;
1114; VI-LABEL: v_fshr_i24:
1115; VI:       ; %bb.0:
1116; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1117; VI-NEXT:    s_mov_b32 s4, 0xaaaaaaab
1118; VI-NEXT:    v_mul_hi_u32 v3, v2, s4
1119; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1120; VI-NEXT:    v_lshrrev_b32_e32 v3, 4, v3
1121; VI-NEXT:    v_mul_lo_u32 v3, v3, 24
1122; VI-NEXT:    v_sub_u32_e32 v2, vcc, v2, v3
1123; VI-NEXT:    v_add_u32_e32 v2, vcc, 8, v2
1124; VI-NEXT:    v_alignbit_b32 v0, v0, v1, v2
1125; VI-NEXT:    s_setpc_b64 s[30:31]
1126;
1127; GFX9-LABEL: v_fshr_i24:
1128; GFX9:       ; %bb.0:
1129; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1130; GFX9-NEXT:    s_mov_b32 s4, 0xaaaaaaab
1131; GFX9-NEXT:    v_mul_hi_u32 v3, v2, s4
1132; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1133; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 4, v3
1134; GFX9-NEXT:    v_mul_lo_u32 v3, v3, 24
1135; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v3
1136; GFX9-NEXT:    v_add_u32_e32 v2, 8, v2
1137; GFX9-NEXT:    v_alignbit_b32 v0, v0, v1, v2
1138; GFX9-NEXT:    s_setpc_b64 s[30:31]
1139;
1140; R600-LABEL: v_fshr_i24:
1141; R600:       ; %bb.0:
1142; R600-NEXT:    CF_END
1143; R600-NEXT:    PAD
1144;
1145; GFX10-LABEL: v_fshr_i24:
1146; GFX10:       ; %bb.0:
1147; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1148; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1149; GFX10-NEXT:    v_mul_hi_u32 v3, 0xaaaaaaab, v2
1150; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1151; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 4, v3
1152; GFX10-NEXT:    v_mul_lo_u32 v3, v3, 24
1153; GFX10-NEXT:    v_sub_nc_u32_e32 v2, v2, v3
1154; GFX10-NEXT:    v_add_nc_u32_e32 v2, 8, v2
1155; GFX10-NEXT:    v_alignbit_b32 v0, v0, v1, v2
1156; GFX10-NEXT:    s_setpc_b64 s[30:31]
1157  %ret = call i24 @llvm.fshr.i24(i24 %src0, i24 %src1, i24 %src2)
1158  ret i24 %ret
1159}
1160
; Funnel shift right on <2 x i24>: calls llvm.fshr.v2i24 with %src0, %src1 and
; %src2 and returns the result unchanged.
; The assertion lines in this function are autogenerated (see the NOTE on the
; first line of the file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
; What the expected amdgcn code shows - each of the two amounts (v4 and v5) is
; reduced modulo 24 (v_mul_hi_u32 by 0xaaaaaaab held in s4, shift right by 4,
; multiply by 24, subtract), 8 is added to each reduced amount, v2 and v3 are
; shifted left by 8, and one v_alignbit_b32 per element writes v0 and v1.
; The R600 lines only expect CF_END and PAD.
1161define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2) {
1162; SI-LABEL: v_fshr_v2i24:
1163; SI:       ; %bb.0:
1164; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1165; SI-NEXT:    s_mov_b32 s4, 0xaaaaaaab
1166; SI-NEXT:    v_mul_hi_u32 v6, v4, s4
1167; SI-NEXT:    v_mul_hi_u32 v7, v5, s4
1168; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
1169; SI-NEXT:    v_lshrrev_b32_e32 v6, 4, v6
1170; SI-NEXT:    v_mul_lo_u32 v6, v6, 24
1171; SI-NEXT:    v_sub_i32_e32 v4, vcc, v4, v6
1172; SI-NEXT:    v_lshrrev_b32_e32 v6, 4, v7
1173; SI-NEXT:    v_mul_lo_u32 v6, v6, 24
1174; SI-NEXT:    v_add_i32_e32 v4, vcc, 8, v4
1175; SI-NEXT:    v_alignbit_b32 v0, v0, v2, v4
1176; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
1177; SI-NEXT:    v_sub_i32_e32 v3, vcc, v5, v6
1178; SI-NEXT:    v_add_i32_e32 v3, vcc, 8, v3
1179; SI-NEXT:    v_alignbit_b32 v1, v1, v2, v3
1180; SI-NEXT:    s_setpc_b64 s[30:31]
1181;
1182; VI-LABEL: v_fshr_v2i24:
1183; VI:       ; %bb.0:
1184; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1185; VI-NEXT:    s_mov_b32 s4, 0xaaaaaaab
1186; VI-NEXT:    v_mul_hi_u32 v6, v4, s4
1187; VI-NEXT:    v_mul_hi_u32 v7, v5, s4
1188; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
1189; VI-NEXT:    v_lshrrev_b32_e32 v6, 4, v6
1190; VI-NEXT:    v_mul_lo_u32 v6, v6, 24
1191; VI-NEXT:    v_sub_u32_e32 v4, vcc, v4, v6
1192; VI-NEXT:    v_lshrrev_b32_e32 v6, 4, v7
1193; VI-NEXT:    v_mul_lo_u32 v6, v6, 24
1194; VI-NEXT:    v_add_u32_e32 v4, vcc, 8, v4
1195; VI-NEXT:    v_alignbit_b32 v0, v0, v2, v4
1196; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
1197; VI-NEXT:    v_sub_u32_e32 v3, vcc, v5, v6
1198; VI-NEXT:    v_add_u32_e32 v3, vcc, 8, v3
1199; VI-NEXT:    v_alignbit_b32 v1, v1, v2, v3
1200; VI-NEXT:    s_setpc_b64 s[30:31]
1201;
1202; GFX9-LABEL: v_fshr_v2i24:
1203; GFX9:       ; %bb.0:
1204; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1205; GFX9-NEXT:    s_mov_b32 s4, 0xaaaaaaab
1206; GFX9-NEXT:    v_mul_hi_u32 v6, v4, s4
1207; GFX9-NEXT:    v_mul_hi_u32 v7, v5, s4
1208; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
1209; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 4, v6
1210; GFX9-NEXT:    v_mul_lo_u32 v6, v6, 24
1211; GFX9-NEXT:    v_sub_u32_e32 v4, v4, v6
1212; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 4, v7
1213; GFX9-NEXT:    v_mul_lo_u32 v6, v6, 24
1214; GFX9-NEXT:    v_add_u32_e32 v4, 8, v4
1215; GFX9-NEXT:    v_alignbit_b32 v0, v0, v2, v4
1216; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
1217; GFX9-NEXT:    v_sub_u32_e32 v3, v5, v6
1218; GFX9-NEXT:    v_add_u32_e32 v3, 8, v3
1219; GFX9-NEXT:    v_alignbit_b32 v1, v1, v2, v3
1220; GFX9-NEXT:    s_setpc_b64 s[30:31]
1221;
1222; R600-LABEL: v_fshr_v2i24:
1223; R600:       ; %bb.0:
1224; R600-NEXT:    CF_END
1225; R600-NEXT:    PAD
1226;
1227; GFX10-LABEL: v_fshr_v2i24:
1228; GFX10:       ; %bb.0:
1229; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1230; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1231; GFX10-NEXT:    s_mov_b32 s4, 0xaaaaaaab
1232; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
1233; GFX10-NEXT:    v_mul_hi_u32 v6, v4, s4
1234; GFX10-NEXT:    v_mul_hi_u32 v7, v5, s4
1235; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
1236; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 4, v6
1237; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 4, v7
1238; GFX10-NEXT:    v_mul_lo_u32 v6, v6, 24
1239; GFX10-NEXT:    v_mul_lo_u32 v7, v7, 24
1240; GFX10-NEXT:    v_sub_nc_u32_e32 v4, v4, v6
1241; GFX10-NEXT:    v_sub_nc_u32_e32 v5, v5, v7
1242; GFX10-NEXT:    v_add_nc_u32_e32 v4, 8, v4
1243; GFX10-NEXT:    v_add_nc_u32_e32 v5, 8, v5
1244; GFX10-NEXT:    v_alignbit_b32 v0, v0, v2, v4
1245; GFX10-NEXT:    v_alignbit_b32 v1, v1, v3, v5
1246; GFX10-NEXT:    s_setpc_b64 s[30:31]
1247  %ret = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2)
1248  ret <2 x i24> %ret
1249}
1250