; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -verify-machineinstrs < %s | FileCheck %s -check-prefixes=SI
; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -check-prefixes=VI
; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mtriple=r600-- -mcpu=redwood -verify-machineinstrs < %s | FileCheck %s -check-prefixes=EG

declare i32 @llvm.amdgcn.workitem.id.x() #0

define amdgpu_kernel void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
; SI-LABEL: ashr_v2i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashr_i32_e32 v1, v1, v3
; SI-NEXT:    v_ashr_i32_e32 v0, v0, v2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v1, v3, v1
; VI-NEXT:    v_ashrrev_i32_e32 v0, v2, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_v2i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_64 T1.XY, T0.X, 8, #1
; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     ASHR * T0.Y, T0.Y, T1.Y,
; EG-NEXT:     ASHR T0.X, T0.X, T1.X,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
  %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
  %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
  %result = ashr <2 x i32> %a, %b
  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
; SI-LABEL: ashr_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashr_i32_e32 v3, v3, v7
; SI-NEXT:    v_ashr_i32_e32 v2, v2, v6
; SI-NEXT:    v_ashr_i32_e32 v1, v1, v5
; SI-NEXT:    v_ashr_i32_e32 v0, v0, v4
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v3, v7, v3
; VI-NEXT:    v_ashrrev_i32_e32 v2, v6, v2
; VI-NEXT:    v_ashrrev_i32_e32 v1, v5, v1
; VI-NEXT:    v_ashrrev_i32_e32 v0, v4, v0
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_v4i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     ASHR * T0.W, T0.W, T1.W,
; EG-NEXT:     ASHR * T0.Z, T0.Z, T1.Z,
; EG-NEXT:     ASHR * T0.Y, T0.Y, T1.Y,
; EG-NEXT:     ASHR T0.X, T0.X, T1.X,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
  %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
  %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
  %result = ashr <4 x i32> %a, %b
  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
  ret void
}

; FIXME: The ashr operation is uniform, but because its operands come from a
; global load we end up with the vector instructions rather than scalar.
define amdgpu_kernel void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
; SI-LABEL: ashr_v2i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bfe_i32 v2, v0, 0, 16
; SI-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
; SI-NEXT:    v_ashrrev_i32_e32 v0, v3, v0
; SI-NEXT:    v_ashrrev_i32_e32 v1, v1, v2
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT:    v_or_b32_e32 v0, v1, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_sdwa v2, sext(v1), sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; VI-NEXT:    v_ashrrev_i32_sdwa v0, sext(v1), sext(v0) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_v2i16:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 14, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T7.X, T7.X, 0, #1
; EG-NEXT:     VTX_READ_32 T6.X, T6.X, 4, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:     MOV * T6.X, KC0[2].Z,
; EG-NEXT:     MOV * T7.X, PV.X,
; EG-NEXT:    ALU clause starting at 12:
; EG-NEXT:     LSHR * T0.W, T7.X, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     BFE_INT T0.Y, PV.W, 0.0, literal.x,
; EG-NEXT:     LSHR T0.Z, T6.X, literal.x,
; EG-NEXT:     BFE_INT T0.W, T7.X, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT:     AND_INT * T1.W, T6.X, literal.y,
; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT:     ASHR T0.W, PV.W, PS,
; EG-NEXT:     ASHR * T1.W, PV.Y, PV.Z,
; EG-NEXT:     LSHL T1.W, PS, literal.x,
; EG-NEXT:     AND_INT * T0.W, PV.W, literal.y,
; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT:     OR_INT T6.X, PS, PV.W,
; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1
  %a = load <2 x i16>, <2 x i16> addrspace(1)* %in
  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
  %result = ashr <2 x i16> %a, %b
  store <2 x i16> %result, <2 x i16> addrspace(1)* %out
  ret void
}

; FIXME: The ashr operation is uniform, but because its operands come from a
; global load we end up with the vector instructions rather than scalar.
define amdgpu_kernel void @ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
; SI-LABEL: ashr_v4i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s2, 0xffff
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bfe_i32 v4, v0, 0, 16
; SI-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
; SI-NEXT:    v_bfe_i32 v5, v1, 0, 16
; SI-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
; SI-NEXT:    v_ashrrev_i32_e32 v1, v7, v1
; SI-NEXT:    v_ashrrev_i32_e32 v3, v3, v5
; SI-NEXT:    v_ashrrev_i32_e32 v0, v6, v0
; SI-NEXT:    v_ashrrev_i32_e32 v2, v2, v4
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_and_b32_e32 v3, s2, v3
; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT:    v_and_b32_e32 v2, s2, v2
; SI-NEXT:    v_or_b32_e32 v1, v3, v1
; SI-NEXT:    v_or_b32_e32 v0, v2, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_v4i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_sdwa v4, sext(v2), sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; VI-NEXT:    v_ashrrev_i32_sdwa v0, sext(v2), sext(v0) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_ashrrev_i32_sdwa v2, sext(v3), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; VI-NEXT:    v_ashrrev_i32_sdwa v1, sext(v3), sext(v1) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_v4i16:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @8
; EG-NEXT:    ALU 3, @13, KC0[], KC1[]
; EG-NEXT:    TEX 0 @10
; EG-NEXT:    ALU 54, @17, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XY, T9.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 8:
; EG-NEXT:     VTX_READ_64 T10.XY, T9.X, 0, #1
; EG-NEXT:    Fetch clause starting at 10:
; EG-NEXT:     VTX_READ_64 T9.XY, T9.X, 8, #1
; EG-NEXT:    ALU clause starting at 12:
; EG-NEXT:     MOV * T9.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 13:
; EG-NEXT:     MOV T4.X, T10.X,
; EG-NEXT:     MOV * T5.X, T10.Y,
; EG-NEXT:     MOV T0.Y, PV.X,
; EG-NEXT:     MOV * T0.Z, PS,
; EG-NEXT:    ALU clause starting at 17:
; EG-NEXT:     MOV T2.X, T9.X,
; EG-NEXT:     MOV * T3.X, T9.Y,
; EG-NEXT:     MOV * T0.W, T6.X,
; EG-NEXT:     MOV T1.Y, T2.X,
; EG-NEXT:     BFE_INT * T1.W, T0.Y, 0.0, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     AND_INT * T2.W, PV.Y, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:     ASHR * T1.W, T1.W, PV.W,
; EG-NEXT:     AND_INT T1.W, PV.W, literal.x,
; EG-NEXT:     AND_INT * T0.W, T0.W, literal.y,
; EG-NEXT:    65535(9.183409e-41), -65536(nan)
; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
; EG-NEXT:     MOV * T1.Z, T3.X,
; EG-NEXT:     MOV * T6.X, T0.W,
; EG-NEXT:     MOV T0.W, PV.X,
; EG-NEXT:     LSHR * T1.W, T0.Y, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     BFE_INT T1.W, PS, 0.0, literal.x,
; EG-NEXT:     LSHR * T2.W, T1.Y, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     ASHR T1.W, PV.W, PS,
; EG-NEXT:     AND_INT * T0.W, T0.W, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:     LSHL * T1.W, PV.W, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     OR_INT * T0.W, T0.W, PV.W,
; EG-NEXT:     MOV T6.X, PV.W,
; EG-NEXT:     MOV T0.Y, T7.X,
; EG-NEXT:     BFE_INT T0.W, T0.Z, 0.0, literal.x,
; EG-NEXT:     AND_INT * T1.W, T1.Z, literal.y,
; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT:     ASHR T0.W, PV.W, PS,
; EG-NEXT:     AND_INT * T1.W, PV.Y, literal.x,
; EG-NEXT:    -65536(nan), 0(0.000000e+00)
; EG-NEXT:     AND_INT * T0.W, PV.W, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT:     MOV * T7.X, PV.W,
; EG-NEXT:     MOV T0.Y, PV.X,
; EG-NEXT:     LSHR * T0.W, T0.Z, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     BFE_INT T0.W, PV.W, 0.0, literal.x,
; EG-NEXT:     LSHR * T1.W, T1.Z, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     ASHR T0.W, PV.W, PS,
; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     LSHR T9.X, KC0[2].Y, literal.x,
; EG-NEXT:     OR_INT * T10.Y, T1.W, PV.W,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:     MOV T7.X, PV.Y,
; EG-NEXT:     MOV * T10.X, T6.X,
  %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1
  %a = load <4 x i16>, <4 x i16> addrspace(1)* %in
  %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
  %result = ashr <4 x i16> %a, %b
  store <4 x i16> %result, <4 x i16> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @s_ashr_i64(i64 addrspace(1)* %out, i32 %in) {
; SI-LABEL: s_ashr_i64:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ashr_i32 s5, s4, 31
; SI-NEXT:    s_ashr_i64 s[4:5], s[4:5], 8
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_ashr_i64:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_ashr_i32 s5, s4, 31
; VI-NEXT:    s_ashr_i64 s[4:5], s[4:5], 8
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_ashr_i64:
; EG:       ; %bb.0: ; %entry
; EG-NEXT:    ALU 4, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     ASHR * T0.Y, KC0[2].Z, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:     BIT_ALIGN_INT T0.X, PV.Y, KC0[2].Z, literal.x,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    8(1.121039e-44), 2(2.802597e-45)
entry:
  %in.ext = sext i32 %in to i64
  %ashr = ashr i64 %in.ext, 8
  store i64 %ashr, i64 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @ashr_i64_2(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
; SI-LABEL: ashr_i64_2:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashr_i64 v[0:1], v[0:1], v2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_i64_2:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i64 v[0:1], v2, v[0:1]
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_i64_2:
; EG:       ; %bb.0: ; %entry
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 10, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:     AND_INT * T0.W, T0.Z, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:     ASHR T1.Z, T0.Y, PV.W,
; EG-NEXT:     BIT_ALIGN_INT T0.W, T0.Y, T0.X, T0.Z,
; EG-NEXT:     AND_INT * T1.W, T0.Z, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.X, PS, PV.W, PV.Z,
; EG-NEXT:     ASHR T0.W, T0.Y, literal.x,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    31(4.344025e-44), 2(2.802597e-45)
; EG-NEXT:     CNDE_INT * T0.Y, T1.W, T1.Z, PV.W,
entry:
  %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
  %a = load i64, i64 addrspace(1)* %in
  %b = load i64, i64 addrspace(1)* %b_ptr
  %result = ashr i64 %a, %b
  store i64 %result, i64 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
; SI-LABEL: ashr_v2i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashr_i64 v[2:3], v[2:3], v6
; SI-NEXT:    v_ashr_i64 v[0:1], v[0:1], v4
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_v2i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i64 v[2:3], v6, v[2:3]
; VI-NEXT:    v_ashrrev_i64 v[0:1], v4, v[0:1]
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_v2i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 19, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     AND_INT * T1.W, T1.Z, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:     ASHR T1.Y, T0.W, PV.W,
; EG-NEXT:     AND_INT T2.Z, T1.Z, literal.x,
; EG-NEXT:     BIT_ALIGN_INT T1.W, T0.W, T0.Z, T1.Z,
; EG-NEXT:     AND_INT * T2.W, T1.X, literal.y,
; EG-NEXT:    32(4.484155e-44), 31(4.344025e-44)
; EG-NEXT:     ASHR T2.Y, T0.Y, PS,
; EG-NEXT:     CNDE_INT T0.Z, PV.Z, PV.W, PV.Y,
; EG-NEXT:     BIT_ALIGN_INT T1.W, T0.Y, T0.X, T1.X,
; EG-NEXT:     AND_INT * T2.W, T1.X, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.X, PS, PV.W, PV.Y,
; EG-NEXT:     ASHR T0.W, T0.W, literal.x,
; EG-NEXT:     ASHR * T1.W, T0.Y, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT * T0.W, T2.Z, T1.Y, PV.W,
; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
; EG-NEXT:     CNDE_INT * T0.Y, T2.W, T2.Y, T1.W,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
  %a = load <2 x i64>, <2 x i64> addrspace(1)* %in
  %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
  %result = ashr <2 x i64> %a, %b
  store <2 x i64> %result, <2 x i64> addrspace(1)* %out
  ret void
}

; FIXME: Broken on r600
define amdgpu_kernel void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
; SI-LABEL: ashr_v4i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s6
; SI-NEXT:    s_mov_b32 s9, s7
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
; SI-NEXT:    buffer_load_dwordx4 v[7:10], off, s[8:11], 0
; SI-NEXT:    buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_ashr_i64 v[2:3], v[2:3], v6
; SI-NEXT:    v_ashr_i64 v[0:1], v[0:1], v4
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashr_i64 v[9:10], v[9:10], v13
; SI-NEXT:    v_ashr_i64 v[7:8], v[7:8], v11
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_v4i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s10, s2
; VI-NEXT:    s_mov_b32 s11, s3
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s6
; VI-NEXT:    s_mov_b32 s9, s7
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
; VI-NEXT:    buffer_load_dwordx4 v[7:10], off, s[8:11], 0
; VI-NEXT:    buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_ashrrev_i64 v[2:3], v6, v[2:3]
; VI-NEXT:    v_ashrrev_i64 v[0:1], v4, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i64 v[9:10], v13, v[9:10]
; VI-NEXT:    v_ashrrev_i64 v[7:8], v11, v[7:8]
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_v4i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 3 @6
; EG-NEXT:    ALU 39, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T3.X, 0
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 32, #1
; EG-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 48, #1
; EG-NEXT:     VTX_READ_128 T3.XYZW, T0.X, 0, #1
; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 16, #1
; EG-NEXT:    ALU clause starting at 14:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 15:
; EG-NEXT:     AND_INT * T1.W, T1.Z, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:     ASHR T1.Y, T0.W, literal.x,
; EG-NEXT:     ASHR T4.Z, T3.W, PV.W, BS:VEC_120/SCL_212
; EG-NEXT:     AND_INT T1.W, T1.Z, literal.y,
; EG-NEXT:     AND_INT * T2.W, T2.Z, literal.x,
; EG-NEXT:    31(4.344025e-44), 32(4.484155e-44)
; EG-NEXT:     BIT_ALIGN_INT T4.X, T3.W, T3.Z, T1.Z,
; EG-NEXT:     ASHR T2.Y, T0.W, PS, BS:VEC_120/SCL_212
; EG-NEXT:     AND_INT * T1.Z, T2.Z, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     BIT_ALIGN_INT T0.W, T0.W, T0.Z, T2.Z,
; EG-NEXT:     AND_INT * T2.W, T2.X, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:     AND_INT T5.X, T1.X, literal.x,
; EG-NEXT:     ASHR T4.Y, T0.Y, PS,
; EG-NEXT:     CNDE_INT T0.Z, T1.Z, PV.W, T2.Y,
; EG-NEXT:     BIT_ALIGN_INT T0.W, T0.Y, T0.X, T2.X,
; EG-NEXT:     AND_INT * T2.W, T2.X, literal.y,
; EG-NEXT:    31(4.344025e-44), 32(4.484155e-44)
; EG-NEXT:     CNDE_INT T0.X, PS, PV.W, PV.Y,
; EG-NEXT:     ASHR T5.Y, T3.Y, PV.X,
; EG-NEXT:     CNDE_INT T2.Z, T1.W, T4.X, T4.Z,
; EG-NEXT:     BIT_ALIGN_INT T0.W, T3.Y, T3.X, T1.X, BS:VEC_102/SCL_221
; EG-NEXT:     AND_INT * T4.W, T1.X, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T2.X, PS, PV.W, PV.Y,
; EG-NEXT:     ASHR T6.Y, T3.W, literal.x,
; EG-NEXT:     ASHR T3.Z, T0.Y, literal.x, BS:VEC_201
; EG-NEXT:     ADD_INT T3.W, KC0[2].Y, literal.y,
; EG-NEXT:     CNDE_INT * T0.W, T1.Z, T2.Y, T1.Y,
; EG-NEXT:    31(4.344025e-44), 16(2.242078e-44)
; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
; EG-NEXT:     CNDE_INT T0.Y, T2.W, T4.Y, PV.Z,
; EG-NEXT:     ASHR T3.W, T3.Y, literal.y,
; EG-NEXT:     CNDE_INT * T2.W, T1.W, T4.Z, PV.Y,
; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
; EG-NEXT:     LSHR T3.X, KC0[2].Y, literal.x,
; EG-NEXT:     CNDE_INT * T2.Y, T4.W, T5.Y, PV.W,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
  %a = load <4 x i64>, <4 x i64> addrspace(1)* %in
  %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
  %result = ashr <4 x i64> %a, %b
  store <4 x i64> %result, <4 x i64> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @s_ashr_32_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
; SI-LABEL: s_ashr_32_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[0:1], 0x14
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x1d
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ashr_i32 s7, s6, 31
; SI-NEXT:    s_add_u32 s4, s6, s4
; SI-NEXT:    s_addc_u32 s5, s7, s5
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_ashr_32_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s6, s[0:1], 0x50
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x74
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_ashr_i32 s7, s6, 31
; VI-NEXT:    s_add_u32 s4, s6, s4
; VI-NEXT:    s_addc_u32 s5, s7, s5
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_ashr_32_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     ASHR * T0.W, KC0[5].X, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.W, PV.W, KC0[7].Z,
; EG-NEXT:     ADDC_UINT * T1.W, KC0[5].X, KC0[7].Y,
; EG-NEXT:     ADD_INT * T0.Y, T0.W, PV.W,
; EG-NEXT:     ADD_INT * T0.X, KC0[5].X, KC0[7].Y,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %result = ashr i64 %a, 32
  %add = add i64 %result, %b
  store i64 %add, i64 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @v_ashr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
; SI-LABEL: v_ashr_32_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ashr_32_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v0, vcc
; VI-NEXT:    v_add_u32_e32 v0, vcc, 4, v1
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ashr_32_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 4, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
; EG-NEXT:     ASHR * T0.Y, T0.X, literal.y,
; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
  %a = load i64, i64 addrspace(1)* %gep.in
  %result = ashr i64 %a, 32
  store i64 %result, i64 addrspace(1)* %gep.out
  ret void
}

define amdgpu_kernel void @s_ashr_63_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
; SI-LABEL: s_ashr_63_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[0:1], 0x14
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x1d
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ashr_i32 s6, s6, 31
; SI-NEXT:    s_add_u32 s4, s6, s4
; SI-NEXT:    s_addc_u32 s5, s6, s5
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_ashr_63_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s6, s[0:1], 0x50
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x74
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_ashr_i32 s6, s6, 31
; VI-NEXT:    s_add_u32 s4, s6, s4
; VI-NEXT:    s_addc_u32 s5, s6, s5
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_ashr_63_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     ASHR * T0.W, KC0[5].X, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:     ADD_INT T1.W, PV.W, KC0[7].Z,
; EG-NEXT:     ADDC_UINT * T2.W, PV.W, KC0[7].Y,
; EG-NEXT:     ADD_INT * T0.Y, PV.W, PS,
; EG-NEXT:     ADD_INT T0.X, T0.W, KC0[7].Y,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %result = ashr i64 %a, 63
  %add = add i64 %result, %b
  store i64 %add, i64 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @v_ashr_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
; SI-LABEL: v_ashr_63_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
; SI-NEXT:    v_mov_b32_e32 v3, v2
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ashr_63_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v0, vcc
; VI-NEXT:    v_add_u32_e32 v0, vcc, 4, v1
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v2, 31, v3
; VI-NEXT:    v_mov_b32_e32 v3, v2
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ashr_63_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 4, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     ASHR T0.X, T0.X, literal.x,
; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
; EG-NEXT:     MOV * T0.Y, PV.X,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
  %a = load i64, i64 addrspace(1)* %gep.in
  %result = ashr i64 %a, 63
  store i64 %result, i64 addrspace(1)* %gep.out
  ret void
}

attributes #0 = { nounwind readnone }