1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -verify-machineinstrs < %s | FileCheck %s -check-prefixes=SI
3; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -check-prefixes=VI
4; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mtriple=r600-- -mcpu=redwood -verify-machineinstrs < %s | FileCheck %s -check-prefixes=EG
5
6declare i32 @llvm.amdgcn.workitem.id.x() #0
7
; Vector ashr on <2 x i32>. The value is loaded from %in and the per-element
; shift amounts from the <2 x i32> slot that follows it; the shifted vector is
; stored to %out. The expected code for all three targets fetches both
; operands with a single 128-bit load and issues one 32-bit arithmetic shift
; per element.
8define amdgpu_kernel void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
9; SI-LABEL: ashr_v2i32:
10; SI:       ; %bb.0:
11; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
12; SI-NEXT:    s_mov_b32 s7, 0xf000
13; SI-NEXT:    s_mov_b32 s6, -1
14; SI-NEXT:    s_mov_b32 s10, s6
15; SI-NEXT:    s_mov_b32 s11, s7
16; SI-NEXT:    s_waitcnt lgkmcnt(0)
17; SI-NEXT:    s_mov_b32 s8, s2
18; SI-NEXT:    s_mov_b32 s9, s3
19; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
20; SI-NEXT:    s_mov_b32 s4, s0
21; SI-NEXT:    s_mov_b32 s5, s1
22; SI-NEXT:    s_waitcnt vmcnt(0)
23; SI-NEXT:    v_ashr_i32_e32 v1, v1, v3
24; SI-NEXT:    v_ashr_i32_e32 v0, v0, v2
25; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
26; SI-NEXT:    s_endpgm
27;
28; VI-LABEL: ashr_v2i32:
29; VI:       ; %bb.0:
30; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
31; VI-NEXT:    s_mov_b32 s7, 0xf000
32; VI-NEXT:    s_mov_b32 s6, -1
33; VI-NEXT:    s_mov_b32 s10, s6
34; VI-NEXT:    s_mov_b32 s11, s7
35; VI-NEXT:    s_waitcnt lgkmcnt(0)
36; VI-NEXT:    s_mov_b32 s8, s2
37; VI-NEXT:    s_mov_b32 s9, s3
38; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
39; VI-NEXT:    s_mov_b32 s4, s0
40; VI-NEXT:    s_mov_b32 s5, s1
41; VI-NEXT:    s_waitcnt vmcnt(0)
42; VI-NEXT:    v_ashrrev_i32_e32 v1, v3, v1
43; VI-NEXT:    v_ashrrev_i32_e32 v0, v2, v0
44; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
45; VI-NEXT:    s_endpgm
46;
47; EG-LABEL: ashr_v2i32:
48; EG:       ; %bb.0:
49; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
50; EG-NEXT:    TEX 0 @6
51; EG-NEXT:    ALU 3, @9, KC0[CB0:0-32], KC1[]
52; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
53; EG-NEXT:    CF_END
54; EG-NEXT:    PAD
55; EG-NEXT:    Fetch clause starting at 6:
56; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
57; EG-NEXT:    ALU clause starting at 8:
58; EG-NEXT:     MOV * T0.X, KC0[2].Z,
59; EG-NEXT:    ALU clause starting at 9:
60; EG-NEXT:     ASHR * T0.Y, T0.Y, T0.W,
61; EG-NEXT:     ASHR T0.X, T0.X, T0.Z,
62; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
63; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
64  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 ; next <2 x i32> after %in
65  %a = load <2 x i32>, <2 x i32> addrspace(1)* %in ; values to shift
66  %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr ; per-element shift amounts
67  %result = ashr <2 x i32> %a, %b
68  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
69  ret void
70}
71
; Vector ashr on <4 x i32>. The value comes from %in and the shift amounts
; from the next <4 x i32> slot (byte offset 16 in the expected loads). The
; expected code uses two 128-bit loads, four 32-bit arithmetic shifts and a
; single four-dword store.
72define amdgpu_kernel void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
73; SI-LABEL: ashr_v4i32:
74; SI:       ; %bb.0:
75; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
76; SI-NEXT:    s_mov_b32 s7, 0xf000
77; SI-NEXT:    s_mov_b32 s6, -1
78; SI-NEXT:    s_mov_b32 s10, s6
79; SI-NEXT:    s_mov_b32 s11, s7
80; SI-NEXT:    s_waitcnt lgkmcnt(0)
81; SI-NEXT:    s_mov_b32 s8, s2
82; SI-NEXT:    s_mov_b32 s9, s3
83; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
84; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
85; SI-NEXT:    s_mov_b32 s4, s0
86; SI-NEXT:    s_mov_b32 s5, s1
87; SI-NEXT:    s_waitcnt vmcnt(0)
88; SI-NEXT:    v_ashr_i32_e32 v3, v3, v7
89; SI-NEXT:    v_ashr_i32_e32 v2, v2, v6
90; SI-NEXT:    v_ashr_i32_e32 v1, v1, v5
91; SI-NEXT:    v_ashr_i32_e32 v0, v0, v4
92; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
93; SI-NEXT:    s_endpgm
94;
95; VI-LABEL: ashr_v4i32:
96; VI:       ; %bb.0:
97; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
98; VI-NEXT:    s_mov_b32 s7, 0xf000
99; VI-NEXT:    s_mov_b32 s6, -1
100; VI-NEXT:    s_mov_b32 s10, s6
101; VI-NEXT:    s_mov_b32 s11, s7
102; VI-NEXT:    s_waitcnt lgkmcnt(0)
103; VI-NEXT:    s_mov_b32 s8, s2
104; VI-NEXT:    s_mov_b32 s9, s3
105; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
106; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
107; VI-NEXT:    s_mov_b32 s4, s0
108; VI-NEXT:    s_mov_b32 s5, s1
109; VI-NEXT:    s_waitcnt vmcnt(0)
110; VI-NEXT:    v_ashrrev_i32_e32 v3, v7, v3
111; VI-NEXT:    v_ashrrev_i32_e32 v2, v6, v2
112; VI-NEXT:    v_ashrrev_i32_e32 v1, v5, v1
113; VI-NEXT:    v_ashrrev_i32_e32 v0, v4, v0
114; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
115; VI-NEXT:    s_endpgm
116;
117; EG-LABEL: ashr_v4i32:
118; EG:       ; %bb.0:
119; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
120; EG-NEXT:    TEX 1 @6
121; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
122; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
123; EG-NEXT:    CF_END
124; EG-NEXT:    PAD
125; EG-NEXT:    Fetch clause starting at 6:
126; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
127; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
128; EG-NEXT:    ALU clause starting at 10:
129; EG-NEXT:     MOV * T0.X, KC0[2].Z,
130; EG-NEXT:    ALU clause starting at 11:
131; EG-NEXT:     ASHR * T0.W, T0.W, T1.W,
132; EG-NEXT:     ASHR * T0.Z, T0.Z, T1.Z,
133; EG-NEXT:     ASHR * T0.Y, T0.Y, T1.Y,
134; EG-NEXT:     ASHR T0.X, T0.X, T1.X,
135; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
136; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
137  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 ; next <4 x i32> after %in
138  %a = load <4 x i32>, <4 x i32> addrspace(1)* %in ; values to shift
139  %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr ; per-element shift amounts
140  %result = ashr <4 x i32> %a, %b
141  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
142  ret void
143}
144
145; FIXME: The ashr operation is uniform, but because its operands come from a
146; global load we end up with the vector instructions rather than scalar.
; Vector ashr on <2 x i16>. Value and shift amounts are two consecutive
; <2 x i16> slots starting at %in; both are fetched by one 64-bit load in the
; expected code. In the verde checks each 16-bit half is sign-extended to
; 32 bits (v_bfe_i32 for the low half, a shift by 16 for the high half),
; shifted as a 32-bit value, and the halves are repacked with shift/and/or.
; The tonga checks use SDWA shifts with sext() word selects instead, and the
; redwood checks use BFE_INT/LSHR/AND_INT to split and repack the halves.
147define amdgpu_kernel void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
148; SI-LABEL: ashr_v2i16:
149; SI:       ; %bb.0:
150; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
151; SI-NEXT:    s_mov_b32 s7, 0xf000
152; SI-NEXT:    s_mov_b32 s6, -1
153; SI-NEXT:    s_mov_b32 s10, s6
154; SI-NEXT:    s_mov_b32 s11, s7
155; SI-NEXT:    s_waitcnt lgkmcnt(0)
156; SI-NEXT:    s_mov_b32 s8, s2
157; SI-NEXT:    s_mov_b32 s9, s3
158; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
159; SI-NEXT:    s_mov_b32 s4, s0
160; SI-NEXT:    s_mov_b32 s5, s1
161; SI-NEXT:    s_waitcnt vmcnt(0)
162; SI-NEXT:    v_bfe_i32 v2, v0, 0, 16
163; SI-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
164; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
165; SI-NEXT:    v_ashrrev_i32_e32 v0, v3, v0
166; SI-NEXT:    v_ashrrev_i32_e32 v1, v1, v2
167; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
168; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
169; SI-NEXT:    v_or_b32_e32 v0, v1, v0
170; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
171; SI-NEXT:    s_endpgm
172;
173; VI-LABEL: ashr_v2i16:
174; VI:       ; %bb.0:
175; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
176; VI-NEXT:    s_mov_b32 s7, 0xf000
177; VI-NEXT:    s_mov_b32 s6, -1
178; VI-NEXT:    s_mov_b32 s10, s6
179; VI-NEXT:    s_mov_b32 s11, s7
180; VI-NEXT:    s_waitcnt lgkmcnt(0)
181; VI-NEXT:    s_mov_b32 s8, s2
182; VI-NEXT:    s_mov_b32 s9, s3
183; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
184; VI-NEXT:    s_mov_b32 s4, s0
185; VI-NEXT:    s_mov_b32 s5, s1
186; VI-NEXT:    s_waitcnt vmcnt(0)
187; VI-NEXT:    v_ashrrev_i32_sdwa v2, sext(v1), sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
188; VI-NEXT:    v_ashrrev_i32_sdwa v0, sext(v1), sext(v0) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
189; VI-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
190; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
191; VI-NEXT:    s_endpgm
192;
193; EG-LABEL: ashr_v2i16:
194; EG:       ; %bb.0:
195; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
196; EG-NEXT:    TEX 0 @6
197; EG-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
198; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 1
199; EG-NEXT:    CF_END
200; EG-NEXT:    PAD
201; EG-NEXT:    Fetch clause starting at 6:
202; EG-NEXT:     VTX_READ_64 T6.XY, T6.X, 0, #1
203; EG-NEXT:    ALU clause starting at 8:
204; EG-NEXT:     MOV * T6.X, KC0[2].Z,
205; EG-NEXT:    ALU clause starting at 9:
206; EG-NEXT:     LSHR * T0.W, T6.X, literal.x,
207; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
208; EG-NEXT:     BFE_INT T0.Y, PV.W, 0.0, literal.x,
209; EG-NEXT:     LSHR T0.Z, T6.Y, literal.x,
210; EG-NEXT:     BFE_INT T0.W, T6.X, 0.0, literal.x,
211; EG-NEXT:     AND_INT * T1.W, T6.Y, literal.y,
212; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
213; EG-NEXT:     ASHR T0.W, PV.W, PS,
214; EG-NEXT:     ASHR * T1.W, PV.Y, PV.Z,
215; EG-NEXT:     LSHL T1.W, PS, literal.x,
216; EG-NEXT:     AND_INT * T0.W, PV.W, literal.y,
217; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
218; EG-NEXT:     OR_INT T6.X, PS, PV.W,
219; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
220; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
221  %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1 ; next <2 x i16>; note the i16-typed index
222  %a = load <2 x i16>, <2 x i16> addrspace(1)* %in ; values to shift
223  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr ; per-element shift amounts
224  %result = ashr <2 x i16> %a, %b
225  store <2 x i16> %result, <2 x i16> addrspace(1)* %out
226  ret void
227}
228
229; FIXME: The ashr operation is uniform, but because its operands come from a
230; global load we end up with the vector instructions rather than scalar.
; Vector ashr on <4 x i16>. Value and shift amounts are two consecutive
; <4 x i16> slots starting at %in, fetched together by one 128-bit load in the
; expected code. The verde checks sign-extend every 16-bit lane to 32 bits,
; shift it, and repack pairs of lanes with shift/and/or (mask 0xffff held in
; s2). The tonga checks use four SDWA shifts plus two SDWA ORs. The redwood
; checks contain a long per-lane sequence with one ASHR per 16-bit lane.
231define amdgpu_kernel void @ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
232; SI-LABEL: ashr_v4i16:
233; SI:       ; %bb.0:
234; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
235; SI-NEXT:    s_mov_b32 s7, 0xf000
236; SI-NEXT:    s_mov_b32 s6, -1
237; SI-NEXT:    s_mov_b32 s10, s6
238; SI-NEXT:    s_mov_b32 s11, s7
239; SI-NEXT:    s_waitcnt lgkmcnt(0)
240; SI-NEXT:    s_mov_b32 s8, s2
241; SI-NEXT:    s_mov_b32 s9, s3
242; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
243; SI-NEXT:    s_mov_b32 s2, 0xffff
244; SI-NEXT:    s_mov_b32 s4, s0
245; SI-NEXT:    s_mov_b32 s5, s1
246; SI-NEXT:    s_waitcnt vmcnt(0)
247; SI-NEXT:    v_bfe_i32 v4, v0, 0, 16
248; SI-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
249; SI-NEXT:    v_bfe_i32 v5, v1, 0, 16
250; SI-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
251; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
252; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
253; SI-NEXT:    v_ashr_i32_e32 v1, v1, v7
254; SI-NEXT:    v_ashr_i32_e32 v3, v5, v3
255; SI-NEXT:    v_ashr_i32_e32 v0, v0, v6
256; SI-NEXT:    v_ashr_i32_e32 v2, v4, v2
257; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
258; SI-NEXT:    v_and_b32_e32 v3, s2, v3
259; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
260; SI-NEXT:    v_and_b32_e32 v2, s2, v2
261; SI-NEXT:    v_or_b32_e32 v1, v3, v1
262; SI-NEXT:    v_or_b32_e32 v0, v2, v0
263; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
264; SI-NEXT:    s_endpgm
265;
266; VI-LABEL: ashr_v4i16:
267; VI:       ; %bb.0:
268; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
269; VI-NEXT:    s_mov_b32 s7, 0xf000
270; VI-NEXT:    s_mov_b32 s6, -1
271; VI-NEXT:    s_mov_b32 s10, s6
272; VI-NEXT:    s_mov_b32 s11, s7
273; VI-NEXT:    s_waitcnt lgkmcnt(0)
274; VI-NEXT:    s_mov_b32 s8, s2
275; VI-NEXT:    s_mov_b32 s9, s3
276; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
277; VI-NEXT:    s_mov_b32 s4, s0
278; VI-NEXT:    s_mov_b32 s5, s1
279; VI-NEXT:    s_waitcnt vmcnt(0)
280; VI-NEXT:    v_ashrrev_i32_sdwa v4, sext(v2), sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
281; VI-NEXT:    v_ashrrev_i32_sdwa v0, sext(v2), sext(v0) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
282; VI-NEXT:    v_ashrrev_i32_sdwa v2, sext(v3), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
283; VI-NEXT:    v_ashrrev_i32_sdwa v1, sext(v3), sext(v1) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
284; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
285; VI-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
286; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
287; VI-NEXT:    s_endpgm
288;
289; EG-LABEL: ashr_v4i16:
290; EG:       ; %bb.0:
291; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
292; EG-NEXT:    TEX 0 @6
293; EG-NEXT:    ALU 58, @9, KC0[CB0:0-32], KC1[]
294; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XY, T9.X, 1
295; EG-NEXT:    CF_END
296; EG-NEXT:    PAD
297; EG-NEXT:    Fetch clause starting at 6:
298; EG-NEXT:     VTX_READ_128 T9.XYZW, T9.X, 0, #1
299; EG-NEXT:    ALU clause starting at 8:
300; EG-NEXT:     MOV * T9.X, KC0[2].Z,
301; EG-NEXT:    ALU clause starting at 9:
302; EG-NEXT:     MOV T4.X, T9.X,
303; EG-NEXT:     MOV * T5.X, T9.Y,
304; EG-NEXT:     MOV T0.Y, PV.X,
305; EG-NEXT:     MOV * T0.Z, PS,
306; EG-NEXT:     MOV T2.X, T9.Z,
307; EG-NEXT:     MOV * T3.X, T9.W,
308; EG-NEXT:     MOV * T0.W, T6.X,
309; EG-NEXT:     MOV T1.Y, T2.X,
310; EG-NEXT:     BFE_INT * T1.W, T0.Y, 0.0, literal.x,
311; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
312; EG-NEXT:     AND_INT * T2.W, PV.Y, literal.x,
313; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
314; EG-NEXT:     ASHR * T1.W, T1.W, PV.W,
315; EG-NEXT:     AND_INT T1.W, PV.W, literal.x,
316; EG-NEXT:     AND_INT * T0.W, T0.W, literal.y,
317; EG-NEXT:    65535(9.183409e-41), -65536(nan)
318; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
319; EG-NEXT:     MOV * T1.Z, T3.X,
320; EG-NEXT:     MOV * T6.X, T0.W,
321; EG-NEXT:     MOV T0.W, PV.X,
322; EG-NEXT:     LSHR * T1.W, T0.Y, literal.x,
323; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
324; EG-NEXT:     BFE_INT T1.W, PS, 0.0, literal.x,
325; EG-NEXT:     LSHR * T2.W, T1.Y, literal.x,
326; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
327; EG-NEXT:     ASHR T1.W, PV.W, PS,
328; EG-NEXT:     AND_INT * T0.W, T0.W, literal.x,
329; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
330; EG-NEXT:     LSHL * T1.W, PV.W, literal.x,
331; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
332; EG-NEXT:     OR_INT * T0.W, T0.W, PV.W,
333; EG-NEXT:     MOV T6.X, PV.W,
334; EG-NEXT:     MOV T0.Y, T7.X,
335; EG-NEXT:     BFE_INT T0.W, T0.Z, 0.0, literal.x,
336; EG-NEXT:     AND_INT * T1.W, T1.Z, literal.y,
337; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
338; EG-NEXT:     ASHR T0.W, PV.W, PS,
339; EG-NEXT:     AND_INT * T1.W, PV.Y, literal.x,
340; EG-NEXT:    -65536(nan), 0(0.000000e+00)
341; EG-NEXT:     AND_INT * T0.W, PV.W, literal.x,
342; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
343; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
344; EG-NEXT:     MOV * T7.X, PV.W,
345; EG-NEXT:     MOV T0.Y, PV.X,
346; EG-NEXT:     LSHR * T0.W, T0.Z, literal.x,
347; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
348; EG-NEXT:     BFE_INT T0.W, PV.W, 0.0, literal.x,
349; EG-NEXT:     LSHR * T1.W, T1.Z, literal.x,
350; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
351; EG-NEXT:     ASHR T0.W, PV.W, PS,
352; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.x,
353; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
354; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
355; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
356; EG-NEXT:     LSHR T9.X, KC0[2].Y, literal.x,
357; EG-NEXT:     OR_INT * T10.Y, T1.W, PV.W,
358; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
359; EG-NEXT:     MOV T7.X, PV.Y,
360; EG-NEXT:     MOV * T10.X, T6.X,
361  %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1 ; next <4 x i16>; note the i16-typed index
362  %a = load <4 x i16>, <4 x i16> addrspace(1)* %in ; values to shift
363  %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr ; per-element shift amounts
364  %result = ashr <4 x i16> %a, %b
365  store <4 x i16> %result, <4 x i16> addrspace(1)* %out
366  ret void
367}
368
; Scalar i64 ashr by a constant. The i32 kernel argument %in is sign-extended
; to i64 and shifted right arithmetically by 8; the result is stored to %out.
; The expected GCN code stays on the scalar unit: s_ashr_i32 by 31 builds the
; upper dword and s_ashr_i64 performs the shift by 8. The expected redwood
; code uses ASHR by 31 for the upper dword and BIT_ALIGN_INT by 8 for the
; lower one.
369define amdgpu_kernel void @s_ashr_i64(i64 addrspace(1)* %out, i32 %in) {
370; SI-LABEL: s_ashr_i64:
371; SI:       ; %bb.0: ; %entry
372; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
373; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
374; SI-NEXT:    s_mov_b32 s3, 0xf000
375; SI-NEXT:    s_mov_b32 s2, -1
376; SI-NEXT:    s_waitcnt lgkmcnt(0)
377; SI-NEXT:    s_ashr_i32 s5, s4, 31
378; SI-NEXT:    s_ashr_i64 s[4:5], s[4:5], 8
379; SI-NEXT:    v_mov_b32_e32 v0, s4
380; SI-NEXT:    v_mov_b32_e32 v1, s5
381; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
382; SI-NEXT:    s_endpgm
383;
384; VI-LABEL: s_ashr_i64:
385; VI:       ; %bb.0: ; %entry
386; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
387; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
388; VI-NEXT:    s_mov_b32 s3, 0xf000
389; VI-NEXT:    s_mov_b32 s2, -1
390; VI-NEXT:    s_waitcnt lgkmcnt(0)
391; VI-NEXT:    s_ashr_i32 s5, s4, 31
392; VI-NEXT:    s_ashr_i64 s[4:5], s[4:5], 8
393; VI-NEXT:    v_mov_b32_e32 v0, s4
394; VI-NEXT:    v_mov_b32_e32 v1, s5
395; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
396; VI-NEXT:    s_endpgm
397;
398; EG-LABEL: s_ashr_i64:
399; EG:       ; %bb.0: ; %entry
400; EG-NEXT:    ALU 4, @4, KC0[CB0:0-32], KC1[]
401; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
402; EG-NEXT:    CF_END
403; EG-NEXT:    PAD
404; EG-NEXT:    ALU clause starting at 4:
405; EG-NEXT:     ASHR * T0.Y, KC0[2].Z, literal.x,
406; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
407; EG-NEXT:     BIT_ALIGN_INT T0.X, PV.Y, KC0[2].Z, literal.x,
408; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
409; EG-NEXT:    8(1.121039e-44), 2(2.802597e-45)
410entry:
411  %in.ext = sext i32 %in to i64 ; widen the 32-bit argument to 64 bits
412  %ashr = ashr i64 %in.ext, 8 ; constant shift amount
413  store i64 %ashr, i64 addrspace(1)* %out
414  ret void
415}
416
; i64 ashr with a variable shift amount. The value is loaded from %in and the
; amount from the i64 that follows it; one 128-bit load covers both in the
; expected code. The expected GCN code uses a single 64-bit shift instruction
; that reads only v2 of the loaded v[0:3] as its shift amount. The expected
; redwood code expands the shift into 32-bit ASHR / BIT_ALIGN_INT operations
; whose results are combined with CNDE_INT selects.
417define amdgpu_kernel void @ashr_i64_2(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
418; SI-LABEL: ashr_i64_2:
419; SI:       ; %bb.0: ; %entry
420; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
421; SI-NEXT:    s_mov_b32 s7, 0xf000
422; SI-NEXT:    s_mov_b32 s6, -1
423; SI-NEXT:    s_mov_b32 s10, s6
424; SI-NEXT:    s_mov_b32 s11, s7
425; SI-NEXT:    s_waitcnt lgkmcnt(0)
426; SI-NEXT:    s_mov_b32 s8, s2
427; SI-NEXT:    s_mov_b32 s9, s3
428; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
429; SI-NEXT:    s_mov_b32 s4, s0
430; SI-NEXT:    s_mov_b32 s5, s1
431; SI-NEXT:    s_waitcnt vmcnt(0)
432; SI-NEXT:    v_ashr_i64 v[0:1], v[0:1], v2
433; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
434; SI-NEXT:    s_endpgm
435;
436; VI-LABEL: ashr_i64_2:
437; VI:       ; %bb.0: ; %entry
438; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
439; VI-NEXT:    s_mov_b32 s7, 0xf000
440; VI-NEXT:    s_mov_b32 s6, -1
441; VI-NEXT:    s_mov_b32 s10, s6
442; VI-NEXT:    s_mov_b32 s11, s7
443; VI-NEXT:    s_waitcnt lgkmcnt(0)
444; VI-NEXT:    s_mov_b32 s8, s2
445; VI-NEXT:    s_mov_b32 s9, s3
446; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
447; VI-NEXT:    s_mov_b32 s4, s0
448; VI-NEXT:    s_mov_b32 s5, s1
449; VI-NEXT:    s_waitcnt vmcnt(0)
450; VI-NEXT:    v_ashrrev_i64 v[0:1], v2, v[0:1]
451; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
452; VI-NEXT:    s_endpgm
453;
454; EG-LABEL: ashr_i64_2:
455; EG:       ; %bb.0: ; %entry
456; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
457; EG-NEXT:    TEX 0 @6
458; EG-NEXT:    ALU 10, @9, KC0[CB0:0-32], KC1[]
459; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
460; EG-NEXT:    CF_END
461; EG-NEXT:    PAD
462; EG-NEXT:    Fetch clause starting at 6:
463; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
464; EG-NEXT:    ALU clause starting at 8:
465; EG-NEXT:     MOV * T0.X, KC0[2].Z,
466; EG-NEXT:    ALU clause starting at 9:
467; EG-NEXT:     AND_INT * T0.W, T0.Z, literal.x,
468; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
469; EG-NEXT:     ASHR T1.Z, T0.Y, PV.W,
470; EG-NEXT:     BIT_ALIGN_INT T0.W, T0.Y, T0.X, T0.Z,
471; EG-NEXT:     AND_INT * T1.W, T0.Z, literal.x,
472; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
473; EG-NEXT:     CNDE_INT T0.X, PS, PV.W, PV.Z,
474; EG-NEXT:     ASHR T0.W, T0.Y, literal.x,
475; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
476; EG-NEXT:    31(4.344025e-44), 2(2.802597e-45)
477; EG-NEXT:     CNDE_INT * T0.Y, T1.W, T1.Z, PV.W,
478entry:
479  %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 ; next i64 after %in
480  %a = load i64, i64 addrspace(1)* %in ; value to shift
481  %b = load i64, i64 addrspace(1)* %b_ptr ; shift amount
482  %result = ashr i64 %a, %b
483  store i64 %result, i64 addrspace(1)* %out
484  ret void
485}
486
; Vector ashr on <2 x i64>. The value is loaded from %in and the shift amounts
; from the next <2 x i64> slot (byte offset 16 in the expected loads). The
; expected GCN code issues one 64-bit shift per element, reading v4 and v6 of
; the second load as the shift amounts. The expected redwood code expands each
; element into 32-bit ASHR / BIT_ALIGN_INT operations combined with CNDE_INT
; selects.
487define amdgpu_kernel void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
488; SI-LABEL: ashr_v2i64:
489; SI:       ; %bb.0:
490; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
491; SI-NEXT:    s_mov_b32 s7, 0xf000
492; SI-NEXT:    s_mov_b32 s6, -1
493; SI-NEXT:    s_mov_b32 s10, s6
494; SI-NEXT:    s_mov_b32 s11, s7
495; SI-NEXT:    s_waitcnt lgkmcnt(0)
496; SI-NEXT:    s_mov_b32 s8, s2
497; SI-NEXT:    s_mov_b32 s9, s3
498; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
499; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
500; SI-NEXT:    s_mov_b32 s4, s0
501; SI-NEXT:    s_mov_b32 s5, s1
502; SI-NEXT:    s_waitcnt vmcnt(0)
503; SI-NEXT:    v_ashr_i64 v[2:3], v[2:3], v6
504; SI-NEXT:    v_ashr_i64 v[0:1], v[0:1], v4
505; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
506; SI-NEXT:    s_endpgm
507;
508; VI-LABEL: ashr_v2i64:
509; VI:       ; %bb.0:
510; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
511; VI-NEXT:    s_mov_b32 s7, 0xf000
512; VI-NEXT:    s_mov_b32 s6, -1
513; VI-NEXT:    s_mov_b32 s10, s6
514; VI-NEXT:    s_mov_b32 s11, s7
515; VI-NEXT:    s_waitcnt lgkmcnt(0)
516; VI-NEXT:    s_mov_b32 s8, s2
517; VI-NEXT:    s_mov_b32 s9, s3
518; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
519; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
520; VI-NEXT:    s_mov_b32 s4, s0
521; VI-NEXT:    s_mov_b32 s5, s1
522; VI-NEXT:    s_waitcnt vmcnt(0)
523; VI-NEXT:    v_ashrrev_i64 v[2:3], v6, v[2:3]
524; VI-NEXT:    v_ashrrev_i64 v[0:1], v4, v[0:1]
525; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
526; VI-NEXT:    s_endpgm
527;
528; EG-LABEL: ashr_v2i64:
529; EG:       ; %bb.0:
530; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
531; EG-NEXT:    TEX 1 @6
532; EG-NEXT:    ALU 19, @11, KC0[CB0:0-32], KC1[]
533; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
534; EG-NEXT:    CF_END
535; EG-NEXT:    PAD
536; EG-NEXT:    Fetch clause starting at 6:
537; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
538; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
539; EG-NEXT:    ALU clause starting at 10:
540; EG-NEXT:     MOV * T0.X, KC0[2].Z,
541; EG-NEXT:    ALU clause starting at 11:
542; EG-NEXT:     AND_INT * T1.W, T1.Z, literal.x,
543; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
544; EG-NEXT:     ASHR T1.Y, T0.W, PV.W,
545; EG-NEXT:     AND_INT T2.Z, T1.Z, literal.x,
546; EG-NEXT:     BIT_ALIGN_INT T1.W, T0.W, T0.Z, T1.Z,
547; EG-NEXT:     AND_INT * T2.W, T1.X, literal.y,
548; EG-NEXT:    32(4.484155e-44), 31(4.344025e-44)
549; EG-NEXT:     ASHR T2.Y, T0.Y, PS,
550; EG-NEXT:     CNDE_INT T0.Z, PV.Z, PV.W, PV.Y,
551; EG-NEXT:     BIT_ALIGN_INT T1.W, T0.Y, T0.X, T1.X,
552; EG-NEXT:     AND_INT * T2.W, T1.X, literal.x,
553; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
554; EG-NEXT:     CNDE_INT T0.X, PS, PV.W, PV.Y,
555; EG-NEXT:     ASHR T0.W, T0.W, literal.x,
556; EG-NEXT:     ASHR * T1.W, T0.Y, literal.x,
557; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
558; EG-NEXT:     CNDE_INT * T0.W, T2.Z, T1.Y, PV.W,
559; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
560; EG-NEXT:     CNDE_INT * T0.Y, T2.W, T2.Y, T1.W,
561; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
562  %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 ; next <2 x i64> after %in
563  %a = load <2 x i64>, <2 x i64> addrspace(1)* %in ; values to shift
564  %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr ; per-element shift amounts
565  %result = ashr <2 x i64> %a, %b
566  store <2 x i64> %result, <2 x i64> addrspace(1)* %out
567  ret void
568}
569
570; FIXME: Broken on r600
; Vector ashr on <4 x i64>. The value is loaded from %in and the shift amounts
; from the next <4 x i64> slot. The expected GCN code uses four 128-bit loads
; (offsets 16, 48, 0 and 32), waits for them in two steps (vmcnt(2), then
; vmcnt(0)), issues four 64-bit shifts and writes the result with two 128-bit
; stores. The expected redwood code likewise ends in two stores.
; NOTE(review): in the GCN checks the load destinations v[4:7] and v[7:10]
; overlap in v7; the shifts that consume the first of those loads read only v4
; and v6. Presumably this is intentional register reuse by the allocator --
; confirm when regenerating the checks.
571define amdgpu_kernel void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
572; SI-LABEL: ashr_v4i64:
573; SI:       ; %bb.0:
574; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
575; SI-NEXT:    s_mov_b32 s3, 0xf000
576; SI-NEXT:    s_mov_b32 s2, -1
577; SI-NEXT:    s_mov_b32 s10, s2
578; SI-NEXT:    s_mov_b32 s11, s3
579; SI-NEXT:    s_waitcnt lgkmcnt(0)
580; SI-NEXT:    s_mov_b32 s8, s6
581; SI-NEXT:    s_mov_b32 s9, s7
582; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
583; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
584; SI-NEXT:    buffer_load_dwordx4 v[7:10], off, s[8:11], 0
585; SI-NEXT:    buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
586; SI-NEXT:    s_mov_b32 s0, s4
587; SI-NEXT:    s_mov_b32 s1, s5
588; SI-NEXT:    s_waitcnt vmcnt(2)
589; SI-NEXT:    v_ashr_i64 v[2:3], v[2:3], v6
590; SI-NEXT:    v_ashr_i64 v[0:1], v[0:1], v4
591; SI-NEXT:    s_waitcnt vmcnt(0)
592; SI-NEXT:    v_ashr_i64 v[9:10], v[9:10], v13
593; SI-NEXT:    v_ashr_i64 v[7:8], v[7:8], v11
594; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
595; SI-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0
596; SI-NEXT:    s_endpgm
597;
598; VI-LABEL: ashr_v4i64:
599; VI:       ; %bb.0:
600; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
601; VI-NEXT:    s_mov_b32 s3, 0xf000
602; VI-NEXT:    s_mov_b32 s2, -1
603; VI-NEXT:    s_mov_b32 s10, s2
604; VI-NEXT:    s_mov_b32 s11, s3
605; VI-NEXT:    s_waitcnt lgkmcnt(0)
606; VI-NEXT:    s_mov_b32 s8, s6
607; VI-NEXT:    s_mov_b32 s9, s7
608; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
609; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
610; VI-NEXT:    buffer_load_dwordx4 v[7:10], off, s[8:11], 0
611; VI-NEXT:    buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
612; VI-NEXT:    s_mov_b32 s0, s4
613; VI-NEXT:    s_mov_b32 s1, s5
614; VI-NEXT:    s_waitcnt vmcnt(2)
615; VI-NEXT:    v_ashrrev_i64 v[2:3], v6, v[2:3]
616; VI-NEXT:    v_ashrrev_i64 v[0:1], v4, v[0:1]
617; VI-NEXT:    s_waitcnt vmcnt(0)
618; VI-NEXT:    v_ashrrev_i64 v[9:10], v13, v[9:10]
619; VI-NEXT:    v_ashrrev_i64 v[7:8], v11, v[7:8]
620; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
621; VI-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0
622; VI-NEXT:    s_endpgm
623;
624; EG-LABEL: ashr_v4i64:
625; EG:       ; %bb.0:
626; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
627; EG-NEXT:    TEX 3 @6
628; EG-NEXT:    ALU 39, @15, KC0[CB0:0-32], KC1[]
629; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T3.X, 0
630; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
631; EG-NEXT:    CF_END
632; EG-NEXT:    Fetch clause starting at 6:
633; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 32, #1
634; EG-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 48, #1
635; EG-NEXT:     VTX_READ_128 T3.XYZW, T0.X, 0, #1
636; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 16, #1
637; EG-NEXT:    ALU clause starting at 14:
638; EG-NEXT:     MOV * T0.X, KC0[2].Z,
639; EG-NEXT:    ALU clause starting at 15:
640; EG-NEXT:     AND_INT * T1.W, T1.Z, literal.x,
641; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
642; EG-NEXT:     ASHR T1.Y, T0.W, literal.x,
643; EG-NEXT:     ASHR T4.Z, T3.W, PV.W, BS:VEC_120/SCL_212
644; EG-NEXT:     AND_INT T1.W, T1.Z, literal.y,
645; EG-NEXT:     AND_INT * T2.W, T2.Z, literal.x,
646; EG-NEXT:    31(4.344025e-44), 32(4.484155e-44)
647; EG-NEXT:     BIT_ALIGN_INT T4.X, T3.W, T3.Z, T1.Z,
648; EG-NEXT:     ASHR T2.Y, T0.W, PS, BS:VEC_120/SCL_212
649; EG-NEXT:     AND_INT * T1.Z, T2.Z, literal.x,
650; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
651; EG-NEXT:     BIT_ALIGN_INT T0.W, T0.W, T0.Z, T2.Z,
652; EG-NEXT:     AND_INT * T2.W, T2.X, literal.x,
653; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
654; EG-NEXT:     AND_INT T5.X, T1.X, literal.x,
655; EG-NEXT:     ASHR T4.Y, T0.Y, PS,
656; EG-NEXT:     CNDE_INT T0.Z, T1.Z, PV.W, T2.Y,
657; EG-NEXT:     BIT_ALIGN_INT T0.W, T0.Y, T0.X, T2.X,
658; EG-NEXT:     AND_INT * T2.W, T2.X, literal.y,
659; EG-NEXT:    31(4.344025e-44), 32(4.484155e-44)
660; EG-NEXT:     CNDE_INT T0.X, PS, PV.W, PV.Y,
661; EG-NEXT:     ASHR T5.Y, T3.Y, PV.X,
662; EG-NEXT:     CNDE_INT T2.Z, T1.W, T4.X, T4.Z,
663; EG-NEXT:     BIT_ALIGN_INT T0.W, T3.Y, T3.X, T1.X, BS:VEC_102/SCL_221
664; EG-NEXT:     AND_INT * T4.W, T1.X, literal.x,
665; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
666; EG-NEXT:     CNDE_INT T2.X, PS, PV.W, PV.Y,
667; EG-NEXT:     ASHR T6.Y, T3.W, literal.x,
668; EG-NEXT:     ASHR T3.Z, T0.Y, literal.x, BS:VEC_201
669; EG-NEXT:     ADD_INT T3.W, KC0[2].Y, literal.y,
670; EG-NEXT:     CNDE_INT * T0.W, T1.Z, T2.Y, T1.Y,
671; EG-NEXT:    31(4.344025e-44), 16(2.242078e-44)
672; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
673; EG-NEXT:     CNDE_INT T0.Y, T2.W, T4.Y, PV.Z,
674; EG-NEXT:     ASHR T3.W, T3.Y, literal.y,
675; EG-NEXT:     CNDE_INT * T2.W, T1.W, T4.Z, PV.Y,
676; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
677; EG-NEXT:     LSHR T3.X, KC0[2].Y, literal.x,
678; EG-NEXT:     CNDE_INT * T2.Y, T4.W, T5.Y, PV.W,
679; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
680  %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 ; next <4 x i64> after %in
681  %a = load <4 x i64>, <4 x i64> addrspace(1)* %in ; values to shift
682  %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr ; per-element shift amounts
683  %result = ashr <4 x i64> %a, %b
684  store <4 x i64> %result, <4 x i64> addrspace(1)* %out
685  ret void
686}
687
; Scalar i64 ashr by exactly 32, followed by a 64-bit add of %b. The two
; unnamed [8 x i32] parameters sit between %out, %a and %b in the argument
; list. The expected GCN code loads only a single dword for the shifted
; operand (presumably the upper half of %a -- confirm against the kernarg
; layout), sign-extends it with s_ashr_i32 by 31 to form the upper half of the
; shifted value, and performs the add with s_add_u32 / s_addc_u32. No 64-bit
; shift instruction appears in the expected code for any target.
688define amdgpu_kernel void @s_ashr_32_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
689; SI-LABEL: s_ashr_32_i64:
690; SI:       ; %bb.0:
691; SI-NEXT:    s_load_dword s6, s[0:1], 0x14
692; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x1d
693; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
694; SI-NEXT:    s_mov_b32 s3, 0xf000
695; SI-NEXT:    s_mov_b32 s2, -1
696; SI-NEXT:    s_waitcnt lgkmcnt(0)
697; SI-NEXT:    s_ashr_i32 s7, s6, 31
698; SI-NEXT:    s_add_u32 s4, s6, s4
699; SI-NEXT:    s_addc_u32 s5, s7, s5
700; SI-NEXT:    v_mov_b32_e32 v0, s4
701; SI-NEXT:    v_mov_b32_e32 v1, s5
702; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
703; SI-NEXT:    s_endpgm
704;
705; VI-LABEL: s_ashr_32_i64:
706; VI:       ; %bb.0:
707; VI-NEXT:    s_load_dword s6, s[0:1], 0x50
708; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x74
709; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
710; VI-NEXT:    s_mov_b32 s3, 0xf000
711; VI-NEXT:    s_mov_b32 s2, -1
712; VI-NEXT:    s_waitcnt lgkmcnt(0)
713; VI-NEXT:    s_ashr_i32 s7, s6, 31
714; VI-NEXT:    s_add_u32 s4, s6, s4
715; VI-NEXT:    s_addc_u32 s5, s7, s5
716; VI-NEXT:    v_mov_b32_e32 v0, s4
717; VI-NEXT:    v_mov_b32_e32 v1, s5
718; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
719; VI-NEXT:    s_endpgm
720;
721; EG-LABEL: s_ashr_32_i64:
722; EG:       ; %bb.0:
723; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
724; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
725; EG-NEXT:    CF_END
726; EG-NEXT:    PAD
727; EG-NEXT:    ALU clause starting at 4:
728; EG-NEXT:     ASHR * T0.W, KC0[5].X, literal.x,
729; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
730; EG-NEXT:     ADD_INT * T0.W, PV.W, KC0[7].Z,
731; EG-NEXT:     ADDC_UINT * T1.W, KC0[5].X, KC0[7].Y,
732; EG-NEXT:     ADD_INT * T0.Y, T0.W, PV.W,
733; EG-NEXT:     ADD_INT * T0.X, KC0[5].X, KC0[7].Y,
734; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
735; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
736  %result = ashr i64 %a, 32 ; shift by the full width of the low half
737  %add = add i64 %result, %b ; shifted value is consumed by a 64-bit add
738  store i64 %add, i64 addrspace(1)* %out
739  ret void
740}
741
; v_ashr_32_i64: arithmetic shift right of an i64 by the constant 32, on a
; value loaded from memory. Each work-item uses llvm.amdgcn.workitem.id.x as
; the element index into both %in and %out, loads one i64, shifts it and
; stores the i64 result.
;
; The autogenerated checks below pin the expected lowering: on every target
; only a single 32-bit load at offset 4 from the element address is expected
; (presumably the high half of the i64 -- confirm against the target's
; endianness), followed by one 32-bit arithmetic shift by 31 whose result is
; stored as the second dword next to the loaded value.
; These check lines come from utils/update_llc_test_checks.py; regenerate them
; instead of editing by hand.
742define amdgpu_kernel void @v_ashr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
743; SI-LABEL: v_ashr_32_i64:
744; SI:       ; %bb.0:
745; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
746; SI-NEXT:    s_mov_b32 s7, 0xf000
747; SI-NEXT:    s_mov_b32 s6, 0
748; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
749; SI-NEXT:    v_mov_b32_e32 v1, 0
750; SI-NEXT:    s_waitcnt lgkmcnt(0)
751; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
752; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
753; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
754; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
755; SI-NEXT:    s_waitcnt vmcnt(0)
756; SI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
757; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
758; SI-NEXT:    s_endpgm
759;
760; VI-LABEL: v_ashr_32_i64:
761; VI:       ; %bb.0:
762; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
763; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
764; VI-NEXT:    s_waitcnt lgkmcnt(0)
765; VI-NEXT:    v_mov_b32_e32 v0, s3
766; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v2
767; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v0, vcc
768; VI-NEXT:    v_add_u32_e32 v0, vcc, 4, v1
769; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
770; VI-NEXT:    flat_load_dword v0, v[0:1]
771; VI-NEXT:    v_mov_b32_e32 v1, s1
772; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
773; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
774; VI-NEXT:    s_waitcnt vmcnt(0)
775; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
776; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
777; VI-NEXT:    s_endpgm
778;
779; EG-LABEL: v_ashr_32_i64:
780; EG:       ; %bb.0:
781; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
782; EG-NEXT:    TEX 0 @6
783; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
784; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
785; EG-NEXT:    CF_END
786; EG-NEXT:    PAD
787; EG-NEXT:    Fetch clause starting at 6:
788; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 4, #1
789; EG-NEXT:    ALU clause starting at 8:
790; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
791; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
792; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
793; EG-NEXT:    ALU clause starting at 11:
794; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
795; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
796; EG-NEXT:     ASHR * T0.Y, T0.X, literal.y,
797; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
798  %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
799  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
800  %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
801  %a = load i64, i64 addrspace(1)* %gep.in
802  %result = ashr i64 %a, 32
803  store i64 %result, i64 addrspace(1)* %gep.out
804  ret void
805}
806
; s_ashr_63_i64: arithmetic shift right of an i64 kernel argument by the
; constant 63. The kernel takes %a and %b as i64 arguments (with [8 x i32]
; arguments placed before each of them), computes (%a ashr 63) + %b and stores
; the i64 sum to %out.
;
; The autogenerated checks below pin the expected lowering: the SI and VI
; checks expect a single s_load_dword for the shifted operand, one s_ashr_i32
; by 31, and that one 32-bit result reused as the addend for both the
; s_add_u32 and the s_addc_u32 of the 64-bit add. The EG checks likewise
; expect one ASHR by 31 feeding the add sequence.
; These check lines come from utils/update_llc_test_checks.py; regenerate them
; instead of editing by hand.
807define amdgpu_kernel void @s_ashr_63_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
808; SI-LABEL: s_ashr_63_i64:
809; SI:       ; %bb.0:
810; SI-NEXT:    s_load_dword s6, s[0:1], 0x14
811; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x1d
812; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
813; SI-NEXT:    s_mov_b32 s3, 0xf000
814; SI-NEXT:    s_mov_b32 s2, -1
815; SI-NEXT:    s_waitcnt lgkmcnt(0)
816; SI-NEXT:    s_ashr_i32 s6, s6, 31
817; SI-NEXT:    s_add_u32 s4, s6, s4
818; SI-NEXT:    s_addc_u32 s5, s6, s5
819; SI-NEXT:    v_mov_b32_e32 v0, s4
820; SI-NEXT:    v_mov_b32_e32 v1, s5
821; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
822; SI-NEXT:    s_endpgm
823;
824; VI-LABEL: s_ashr_63_i64:
825; VI:       ; %bb.0:
826; VI-NEXT:    s_load_dword s6, s[0:1], 0x50
827; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x74
828; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
829; VI-NEXT:    s_mov_b32 s3, 0xf000
830; VI-NEXT:    s_mov_b32 s2, -1
831; VI-NEXT:    s_waitcnt lgkmcnt(0)
832; VI-NEXT:    s_ashr_i32 s6, s6, 31
833; VI-NEXT:    s_add_u32 s4, s6, s4
834; VI-NEXT:    s_addc_u32 s5, s6, s5
835; VI-NEXT:    v_mov_b32_e32 v0, s4
836; VI-NEXT:    v_mov_b32_e32 v1, s5
837; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
838; VI-NEXT:    s_endpgm
839;
840; EG-LABEL: s_ashr_63_i64:
841; EG:       ; %bb.0:
842; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
843; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
844; EG-NEXT:    CF_END
845; EG-NEXT:    PAD
846; EG-NEXT:    ALU clause starting at 4:
847; EG-NEXT:     ASHR * T0.W, KC0[5].X, literal.x,
848; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
849; EG-NEXT:     ADD_INT T1.W, PV.W, KC0[7].Z,
850; EG-NEXT:     ADDC_UINT * T2.W, PV.W, KC0[7].Y,
851; EG-NEXT:     ADD_INT * T0.Y, PV.W, PS,
852; EG-NEXT:     ADD_INT T0.X, T0.W, KC0[7].Y,
853; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
854; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
855  %result = ashr i64 %a, 63
856  %add = add i64 %result, %b
857  store i64 %add, i64 addrspace(1)* %out
858  ret void
859}
860
; v_ashr_63_i64: arithmetic shift right of an i64 by the constant 63, on a
; value loaded from memory. Each work-item uses llvm.amdgcn.workitem.id.x as
; the element index into both %in and %out, loads one i64, shifts it and
; stores the i64 result.
;
; The autogenerated checks below pin the expected lowering: on every target
; only a single 32-bit load at offset 4 from the element address is expected
; (presumably the high half of the i64 -- confirm against the target's
; endianness), followed by one 32-bit arithmetic shift by 31. That single
; result is then copied into a second register (v_mov_b32 on SI and VI, MOV on
; EG) so the same value fills both dwords of the 64-bit store.
; These check lines come from utils/update_llc_test_checks.py; regenerate them
; instead of editing by hand.
861define amdgpu_kernel void @v_ashr_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
862; SI-LABEL: v_ashr_63_i64:
863; SI:       ; %bb.0:
864; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
865; SI-NEXT:    s_mov_b32 s7, 0xf000
866; SI-NEXT:    s_mov_b32 s6, 0
867; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
868; SI-NEXT:    v_mov_b32_e32 v1, 0
869; SI-NEXT:    s_waitcnt lgkmcnt(0)
870; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
871; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
872; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
873; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
874; SI-NEXT:    s_waitcnt vmcnt(0)
875; SI-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
876; SI-NEXT:    v_mov_b32_e32 v3, v2
877; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
878; SI-NEXT:    s_endpgm
879;
880; VI-LABEL: v_ashr_63_i64:
881; VI:       ; %bb.0:
882; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
883; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
884; VI-NEXT:    s_waitcnt lgkmcnt(0)
885; VI-NEXT:    v_mov_b32_e32 v0, s3
886; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v2
887; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v0, vcc
888; VI-NEXT:    v_add_u32_e32 v0, vcc, 4, v1
889; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
890; VI-NEXT:    flat_load_dword v3, v[0:1]
891; VI-NEXT:    v_mov_b32_e32 v1, s1
892; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
893; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
894; VI-NEXT:    s_waitcnt vmcnt(0)
895; VI-NEXT:    v_ashrrev_i32_e32 v2, 31, v3
896; VI-NEXT:    v_mov_b32_e32 v3, v2
897; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
898; VI-NEXT:    s_endpgm
899;
900; EG-LABEL: v_ashr_63_i64:
901; EG:       ; %bb.0:
902; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
903; EG-NEXT:    TEX 0 @6
904; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
905; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
906; EG-NEXT:    CF_END
907; EG-NEXT:    PAD
908; EG-NEXT:    Fetch clause starting at 6:
909; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 4, #1
910; EG-NEXT:    ALU clause starting at 8:
911; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
912; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
913; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
914; EG-NEXT:    ALU clause starting at 11:
915; EG-NEXT:     ASHR T0.X, T0.X, literal.x,
916; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
917; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
918; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
919; EG-NEXT:     MOV * T0.Y, PV.X,
920; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
921  %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
922  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
923  %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
924  %a = load i64, i64 addrspace(1)* %gep.in
925  %result = ashr i64 %a, 63
926  store i64 %result, i64 addrspace(1)* %gep.out
927  ret void
928}
929
; Attribute group #0 (nounwind readnone) is the one referenced by the
; @llvm.amdgcn.workitem.id.x declaration and by its call sites in this file.
930attributes #0 = { nounwind readnone }
931