1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -verify-machineinstrs < %s | FileCheck %s -check-prefixes=SI
3; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -check-prefixes=VI
4; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mtriple=r600-- -mcpu=redwood -verify-machineinstrs < %s | FileCheck %s -check-prefixes=EG
5
6declare i32 @llvm.amdgcn.workitem.id.x() #0
7
8define amdgpu_kernel void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
; Per-element arithmetic shift right of <2 x i32>: %a = in[0] shifted by %b = in[1].
; SI selects v_ashr_i32 (shift amount in src1); VI selects the reversed-operand
; v_ashrrev_i32 (shift amount in src0); R600/EG selects the ASHR ALU op.
9; SI-LABEL: ashr_v2i32:
10; SI:       ; %bb.0:
11; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
12; SI-NEXT:    s_mov_b32 s7, 0xf000
13; SI-NEXT:    s_mov_b32 s6, -1
14; SI-NEXT:    s_mov_b32 s10, s6
15; SI-NEXT:    s_mov_b32 s11, s7
16; SI-NEXT:    s_waitcnt lgkmcnt(0)
17; SI-NEXT:    s_mov_b32 s8, s2
18; SI-NEXT:    s_mov_b32 s9, s3
19; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
20; SI-NEXT:    s_mov_b32 s4, s0
21; SI-NEXT:    s_mov_b32 s5, s1
22; SI-NEXT:    s_waitcnt vmcnt(0)
23; SI-NEXT:    v_ashr_i32_e32 v1, v1, v3
24; SI-NEXT:    v_ashr_i32_e32 v0, v0, v2
25; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
26; SI-NEXT:    s_endpgm
27;
28; VI-LABEL: ashr_v2i32:
29; VI:       ; %bb.0:
30; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
31; VI-NEXT:    s_mov_b32 s7, 0xf000
32; VI-NEXT:    s_mov_b32 s6, -1
33; VI-NEXT:    s_mov_b32 s10, s6
34; VI-NEXT:    s_mov_b32 s11, s7
35; VI-NEXT:    s_waitcnt lgkmcnt(0)
36; VI-NEXT:    s_mov_b32 s8, s2
37; VI-NEXT:    s_mov_b32 s9, s3
38; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
39; VI-NEXT:    s_mov_b32 s4, s0
40; VI-NEXT:    s_mov_b32 s5, s1
41; VI-NEXT:    s_waitcnt vmcnt(0)
42; VI-NEXT:    v_ashrrev_i32_e32 v1, v3, v1
43; VI-NEXT:    v_ashrrev_i32_e32 v0, v2, v0
44; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
45; VI-NEXT:    s_endpgm
46;
47; EG-LABEL: ashr_v2i32:
48; EG:       ; %bb.0:
49; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
50; EG-NEXT:    TEX 0 @6
51; EG-NEXT:    ALU 3, @9, KC0[CB0:0-32], KC1[]
52; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
53; EG-NEXT:    CF_END
54; EG-NEXT:    PAD
55; EG-NEXT:    Fetch clause starting at 6:
56; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
57; EG-NEXT:    ALU clause starting at 8:
58; EG-NEXT:     MOV * T0.X, KC0[2].Z,
59; EG-NEXT:    ALU clause starting at 9:
60; EG-NEXT:     ASHR * T0.Y, T0.Y, T0.W,
61; EG-NEXT:     ASHR T0.X, T0.X, T0.Z,
62; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
63; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
64  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
65  %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
66  %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
67  %result = ashr <2 x i32> %a, %b
68  store <2 x i32> %result, <2 x i32> addrspace(1)* %out
69  ret void
70}
71
72define amdgpu_kernel void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
; Per-element arithmetic shift right of <4 x i32> (%a = in[0], %b = in[1]).
; NOTE(review): the SI checks for elements 1-3 were the VI-style reversed form
; (v_ashrrev_i32_e32 vD, vS, vD), inconsistent with element 0 on the line below
; them and with the SI checks in @ashr_v2i32. Restored the non-reversed
; v_ashr_i32_e32 form with the shift amount in src1, as SI (verde) emits.
73; SI-LABEL: ashr_v4i32:
74; SI:       ; %bb.0:
75; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
76; SI-NEXT:    s_mov_b32 s7, 0xf000
77; SI-NEXT:    s_mov_b32 s6, -1
78; SI-NEXT:    s_mov_b32 s10, s6
79; SI-NEXT:    s_mov_b32 s11, s7
80; SI-NEXT:    s_waitcnt lgkmcnt(0)
81; SI-NEXT:    s_mov_b32 s8, s2
82; SI-NEXT:    s_mov_b32 s9, s3
83; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
84; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
85; SI-NEXT:    s_mov_b32 s4, s0
86; SI-NEXT:    s_mov_b32 s5, s1
87; SI-NEXT:    s_waitcnt vmcnt(0)
88; SI-NEXT:    v_ashr_i32_e32 v3, v3, v7
89; SI-NEXT:    v_ashr_i32_e32 v2, v2, v6
90; SI-NEXT:    v_ashr_i32_e32 v1, v1, v5
91; SI-NEXT:    v_ashr_i32_e32 v0, v0, v4
92; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
93; SI-NEXT:    s_endpgm
94;
95; VI-LABEL: ashr_v4i32:
96; VI:       ; %bb.0:
97; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
98; VI-NEXT:    s_mov_b32 s7, 0xf000
99; VI-NEXT:    s_mov_b32 s6, -1
100; VI-NEXT:    s_mov_b32 s10, s6
101; VI-NEXT:    s_mov_b32 s11, s7
102; VI-NEXT:    s_waitcnt lgkmcnt(0)
103; VI-NEXT:    s_mov_b32 s8, s2
104; VI-NEXT:    s_mov_b32 s9, s3
105; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
106; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
107; VI-NEXT:    s_mov_b32 s4, s0
108; VI-NEXT:    s_mov_b32 s5, s1
109; VI-NEXT:    s_waitcnt vmcnt(0)
110; VI-NEXT:    v_ashrrev_i32_e32 v3, v7, v3
111; VI-NEXT:    v_ashrrev_i32_e32 v2, v6, v2
112; VI-NEXT:    v_ashrrev_i32_e32 v1, v5, v1
113; VI-NEXT:    v_ashrrev_i32_e32 v0, v4, v0
114; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
115; VI-NEXT:    s_endpgm
116;
117; EG-LABEL: ashr_v4i32:
118; EG:       ; %bb.0:
119; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
120; EG-NEXT:    TEX 1 @6
121; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
122; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
123; EG-NEXT:    CF_END
124; EG-NEXT:    PAD
125; EG-NEXT:    Fetch clause starting at 6:
126; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
127; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
128; EG-NEXT:    ALU clause starting at 10:
129; EG-NEXT:     MOV * T0.X, KC0[2].Z,
130; EG-NEXT:    ALU clause starting at 11:
131; EG-NEXT:     ASHR * T0.W, T0.W, T1.W,
132; EG-NEXT:     ASHR * T0.Z, T0.Z, T1.Z,
133; EG-NEXT:     ASHR * T0.Y, T0.Y, T1.Y,
134; EG-NEXT:     ASHR T0.X, T0.X, T1.X,
135; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
136; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
137  %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
138  %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
139  %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
140  %result = ashr <4 x i32> %a, %b
141  store <4 x i32> %result, <4 x i32> addrspace(1)* %out
142  ret void
143}
144
145; FIXME: The ashr operation is uniform, but because its operands come from a
146; global load we end up with the vector instructions rather than scalar.
147define amdgpu_kernel void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
; Per-element arithmetic shift right of <2 x i16>. Both GCN targets unpack the
; packed halves (sign-extend the value half, isolate the shift half), shift as
; i32 scalars, then repack with lshl/and/or. EG does the equivalent with
; BFE_INT/LSHR/ASHR and repacks with LSHL/AND_INT/OR_INT.
148; SI-LABEL: ashr_v2i16:
149; SI:       ; %bb.0:
150; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
151; SI-NEXT:    s_mov_b32 s7, 0xf000
152; SI-NEXT:    s_mov_b32 s6, -1
153; SI-NEXT:    s_mov_b32 s10, s6
154; SI-NEXT:    s_mov_b32 s11, s7
155; SI-NEXT:    s_waitcnt lgkmcnt(0)
156; SI-NEXT:    s_mov_b32 s8, s2
157; SI-NEXT:    s_mov_b32 s9, s3
158; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
159; SI-NEXT:    s_mov_b32 s4, s0
160; SI-NEXT:    s_mov_b32 s5, s1
161; SI-NEXT:    s_waitcnt vmcnt(0)
162; SI-NEXT:    v_readfirstlane_b32 s0, v0
163; SI-NEXT:    v_readfirstlane_b32 s1, v1
164; SI-NEXT:    s_sext_i32_i16 s2, s0
165; SI-NEXT:    s_ashr_i32 s0, s0, 16
166; SI-NEXT:    s_lshr_b32 s3, s1, 16
167; SI-NEXT:    s_ashr_i32 s0, s0, s3
168; SI-NEXT:    s_ashr_i32 s1, s2, s1
169; SI-NEXT:    s_lshl_b32 s0, s0, 16
170; SI-NEXT:    s_and_b32 s1, s1, 0xffff
171; SI-NEXT:    s_or_b32 s0, s1, s0
172; SI-NEXT:    v_mov_b32_e32 v0, s0
173; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
174; SI-NEXT:    s_endpgm
175;
176; VI-LABEL: ashr_v2i16:
177; VI:       ; %bb.0:
178; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
179; VI-NEXT:    s_mov_b32 s7, 0xf000
180; VI-NEXT:    s_mov_b32 s6, -1
181; VI-NEXT:    s_mov_b32 s10, s6
182; VI-NEXT:    s_mov_b32 s11, s7
183; VI-NEXT:    s_waitcnt lgkmcnt(0)
184; VI-NEXT:    s_mov_b32 s8, s2
185; VI-NEXT:    s_mov_b32 s9, s3
186; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
187; VI-NEXT:    s_mov_b32 s4, s0
188; VI-NEXT:    s_mov_b32 s5, s1
189; VI-NEXT:    s_waitcnt vmcnt(0)
190; VI-NEXT:    v_readfirstlane_b32 s0, v0
191; VI-NEXT:    v_readfirstlane_b32 s1, v1
192; VI-NEXT:    s_ashr_i32 s2, s0, 16
193; VI-NEXT:    s_sext_i32_i16 s0, s0
194; VI-NEXT:    s_ashr_i32 s3, s1, 16
195; VI-NEXT:    s_sext_i32_i16 s1, s1
196; VI-NEXT:    s_ashr_i32 s0, s0, s1
197; VI-NEXT:    s_ashr_i32 s1, s2, s3
198; VI-NEXT:    s_lshl_b32 s1, s1, 16
199; VI-NEXT:    s_and_b32 s0, s0, 0xffff
200; VI-NEXT:    s_or_b32 s0, s0, s1
201; VI-NEXT:    v_mov_b32_e32 v0, s0
202; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
203; VI-NEXT:    s_endpgm
204;
205; EG-LABEL: ashr_v2i16:
206; EG:       ; %bb.0:
207; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
208; EG-NEXT:    TEX 0 @6
209; EG-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
210; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 1
211; EG-NEXT:    CF_END
212; EG-NEXT:    PAD
213; EG-NEXT:    Fetch clause starting at 6:
214; EG-NEXT:     VTX_READ_64 T6.XY, T6.X, 0, #1
215; EG-NEXT:    ALU clause starting at 8:
216; EG-NEXT:     MOV * T6.X, KC0[2].Z,
217; EG-NEXT:    ALU clause starting at 9:
218; EG-NEXT:     LSHR * T0.W, T6.X, literal.x,
219; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
220; EG-NEXT:     BFE_INT T0.Y, PV.W, 0.0, literal.x,
221; EG-NEXT:     LSHR T0.Z, T6.Y, literal.x,
222; EG-NEXT:     BFE_INT T0.W, T6.X, 0.0, literal.x,
223; EG-NEXT:     AND_INT * T1.W, T6.Y, literal.y,
224; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
225; EG-NEXT:     ASHR T0.W, PV.W, PS,
226; EG-NEXT:     ASHR * T1.W, PV.Y, PV.Z,
227; EG-NEXT:     LSHL T1.W, PS, literal.x,
228; EG-NEXT:     AND_INT * T0.W, PV.W, literal.y,
229; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
230; EG-NEXT:     OR_INT T6.X, PS, PV.W,
231; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
232; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
233  %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1
234  %a = load <2 x i16>, <2 x i16> addrspace(1)* %in
235  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
236  %result = ashr <2 x i16> %a, %b
237  store <2 x i16> %result, <2 x i16> addrspace(1)* %out
238  ret void
239}
240
241; FIXME: The ashr operation is uniform, but because its operands come from a
242; global load we end up with the vector instructions rather than scalar.
243define amdgpu_kernel void @ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
; Per-element arithmetic shift right of <4 x i16>: same unpack / i32-shift /
; repack pattern as @ashr_v2i16, applied to two packed dwords. The EG lowering
; additionally shuffles the sub-dword values through temporary registers,
; producing the long ALU clause below.
244; SI-LABEL: ashr_v4i16:
245; SI:       ; %bb.0:
246; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
247; SI-NEXT:    s_mov_b32 s7, 0xf000
248; SI-NEXT:    s_mov_b32 s6, -1
249; SI-NEXT:    s_mov_b32 s10, s6
250; SI-NEXT:    s_mov_b32 s11, s7
251; SI-NEXT:    s_waitcnt lgkmcnt(0)
252; SI-NEXT:    s_mov_b32 s8, s2
253; SI-NEXT:    s_mov_b32 s9, s3
254; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
255; SI-NEXT:    s_mov_b32 s4, s0
256; SI-NEXT:    s_mov_b32 s5, s1
257; SI-NEXT:    s_waitcnt vmcnt(0)
258; SI-NEXT:    v_readfirstlane_b32 s0, v3
259; SI-NEXT:    v_readfirstlane_b32 s1, v2
260; SI-NEXT:    v_readfirstlane_b32 s2, v1
261; SI-NEXT:    v_readfirstlane_b32 s3, v0
262; SI-NEXT:    s_sext_i32_i16 s8, s3
263; SI-NEXT:    s_ashr_i32 s3, s3, 16
264; SI-NEXT:    s_sext_i32_i16 s9, s2
265; SI-NEXT:    s_ashr_i32 s2, s2, 16
266; SI-NEXT:    s_lshr_b32 s10, s1, 16
267; SI-NEXT:    s_lshr_b32 s11, s0, 16
268; SI-NEXT:    s_ashr_i32 s2, s2, s11
269; SI-NEXT:    s_ashr_i32 s0, s9, s0
270; SI-NEXT:    s_ashr_i32 s3, s3, s10
271; SI-NEXT:    s_ashr_i32 s1, s8, s1
272; SI-NEXT:    s_lshl_b32 s2, s2, 16
273; SI-NEXT:    s_and_b32 s0, s0, 0xffff
274; SI-NEXT:    s_lshl_b32 s3, s3, 16
275; SI-NEXT:    s_and_b32 s1, s1, 0xffff
276; SI-NEXT:    s_or_b32 s0, s0, s2
277; SI-NEXT:    s_or_b32 s1, s1, s3
278; SI-NEXT:    v_mov_b32_e32 v0, s1
279; SI-NEXT:    v_mov_b32_e32 v1, s0
280; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
281; SI-NEXT:    s_endpgm
282;
283; VI-LABEL: ashr_v4i16:
284; VI:       ; %bb.0:
285; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
286; VI-NEXT:    s_mov_b32 s7, 0xf000
287; VI-NEXT:    s_mov_b32 s6, -1
288; VI-NEXT:    s_mov_b32 s10, s6
289; VI-NEXT:    s_mov_b32 s11, s7
290; VI-NEXT:    s_waitcnt lgkmcnt(0)
291; VI-NEXT:    s_mov_b32 s8, s2
292; VI-NEXT:    s_mov_b32 s9, s3
293; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
294; VI-NEXT:    s_mov_b32 s4, s0
295; VI-NEXT:    s_mov_b32 s5, s1
296; VI-NEXT:    s_waitcnt vmcnt(0)
297; VI-NEXT:    v_readfirstlane_b32 s0, v2
298; VI-NEXT:    v_readfirstlane_b32 s1, v3
299; VI-NEXT:    v_readfirstlane_b32 s2, v0
300; VI-NEXT:    v_readfirstlane_b32 s3, v1
301; VI-NEXT:    s_ashr_i32 s8, s3, 16
302; VI-NEXT:    s_sext_i32_i16 s3, s3
303; VI-NEXT:    s_ashr_i32 s9, s2, 16
304; VI-NEXT:    s_sext_i32_i16 s2, s2
305; VI-NEXT:    s_ashr_i32 s10, s1, 16
306; VI-NEXT:    s_sext_i32_i16 s1, s1
307; VI-NEXT:    s_ashr_i32 s11, s0, 16
308; VI-NEXT:    s_sext_i32_i16 s0, s0
309; VI-NEXT:    s_ashr_i32 s0, s2, s0
310; VI-NEXT:    s_ashr_i32 s2, s9, s11
311; VI-NEXT:    s_ashr_i32 s1, s3, s1
312; VI-NEXT:    s_ashr_i32 s3, s8, s10
313; VI-NEXT:    s_lshl_b32 s3, s3, 16
314; VI-NEXT:    s_and_b32 s1, s1, 0xffff
315; VI-NEXT:    s_lshl_b32 s2, s2, 16
316; VI-NEXT:    s_and_b32 s0, s0, 0xffff
317; VI-NEXT:    s_or_b32 s1, s1, s3
318; VI-NEXT:    s_or_b32 s0, s0, s2
319; VI-NEXT:    v_mov_b32_e32 v0, s0
320; VI-NEXT:    v_mov_b32_e32 v1, s1
321; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
322; VI-NEXT:    s_endpgm
323;
324; EG-LABEL: ashr_v4i16:
325; EG:       ; %bb.0:
326; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
327; EG-NEXT:    TEX 0 @6
328; EG-NEXT:    ALU 58, @9, KC0[CB0:0-32], KC1[]
329; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XY, T9.X, 1
330; EG-NEXT:    CF_END
331; EG-NEXT:    PAD
332; EG-NEXT:    Fetch clause starting at 6:
333; EG-NEXT:     VTX_READ_128 T9.XYZW, T9.X, 0, #1
334; EG-NEXT:    ALU clause starting at 8:
335; EG-NEXT:     MOV * T9.X, KC0[2].Z,
336; EG-NEXT:    ALU clause starting at 9:
337; EG-NEXT:     MOV T4.X, T9.X,
338; EG-NEXT:     MOV * T5.X, T9.Y,
339; EG-NEXT:     MOV T0.Y, PV.X,
340; EG-NEXT:     MOV * T0.Z, PS,
341; EG-NEXT:     MOV T2.X, T9.Z,
342; EG-NEXT:     MOV * T3.X, T9.W,
343; EG-NEXT:     MOV * T0.W, T6.X,
344; EG-NEXT:     MOV T1.Y, T2.X,
345; EG-NEXT:     BFE_INT * T1.W, T0.Y, 0.0, literal.x,
346; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
347; EG-NEXT:     AND_INT * T2.W, PV.Y, literal.x,
348; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
349; EG-NEXT:     ASHR * T1.W, T1.W, PV.W,
350; EG-NEXT:     AND_INT T1.W, PV.W, literal.x,
351; EG-NEXT:     AND_INT * T0.W, T0.W, literal.y,
352; EG-NEXT:    65535(9.183409e-41), -65536(nan)
353; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
354; EG-NEXT:     MOV * T1.Z, T3.X,
355; EG-NEXT:     MOV * T6.X, T0.W,
356; EG-NEXT:     MOV T0.W, PV.X,
357; EG-NEXT:     LSHR * T1.W, T0.Y, literal.x,
358; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
359; EG-NEXT:     BFE_INT T1.W, PS, 0.0, literal.x,
360; EG-NEXT:     LSHR * T2.W, T1.Y, literal.x,
361; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
362; EG-NEXT:     ASHR T1.W, PV.W, PS,
363; EG-NEXT:     AND_INT * T0.W, T0.W, literal.x,
364; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
365; EG-NEXT:     LSHL * T1.W, PV.W, literal.x,
366; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
367; EG-NEXT:     OR_INT * T0.W, T0.W, PV.W,
368; EG-NEXT:     MOV T6.X, PV.W,
369; EG-NEXT:     MOV T0.Y, T7.X,
370; EG-NEXT:     BFE_INT T0.W, T0.Z, 0.0, literal.x,
371; EG-NEXT:     AND_INT * T1.W, T1.Z, literal.y,
372; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
373; EG-NEXT:     ASHR T0.W, PV.W, PS,
374; EG-NEXT:     AND_INT * T1.W, PV.Y, literal.x,
375; EG-NEXT:    -65536(nan), 0(0.000000e+00)
376; EG-NEXT:     AND_INT * T0.W, PV.W, literal.x,
377; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
378; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
379; EG-NEXT:     MOV * T7.X, PV.W,
380; EG-NEXT:     MOV T0.Y, PV.X,
381; EG-NEXT:     LSHR * T0.W, T0.Z, literal.x,
382; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
383; EG-NEXT:     BFE_INT T0.W, PV.W, 0.0, literal.x,
384; EG-NEXT:     LSHR * T1.W, T1.Z, literal.x,
385; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
386; EG-NEXT:     ASHR T0.W, PV.W, PS,
387; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.x,
388; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
389; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
390; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
391; EG-NEXT:     LSHR T9.X, KC0[2].Y, literal.x,
392; EG-NEXT:     OR_INT * T10.Y, T1.W, PV.W,
393; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
394; EG-NEXT:     MOV T7.X, PV.Y,
395; EG-NEXT:     MOV * T10.X, T6.X,
396  %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1
397  %a = load <4 x i16>, <4 x i16> addrspace(1)* %in
398  %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
399  %result = ashr <4 x i16> %a, %b
400  store <4 x i16> %result, <4 x i16> addrspace(1)* %out
401  ret void
402}
403
404define amdgpu_kernel void @s_ashr_i64(i64 addrspace(1)* %out, i32 %in) {
; sext i32 -> i64 followed by a constant ashr by 8. GCN materializes the high
; half with s_ashr_i32 ..., 31 then uses the 64-bit scalar shift s_ashr_i64;
; EG uses ASHR for the sign word and BIT_ALIGN_INT (funnel shift) for the low.
405; SI-LABEL: s_ashr_i64:
406; SI:       ; %bb.0: ; %entry
407; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
408; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
409; SI-NEXT:    s_mov_b32 s3, 0xf000
410; SI-NEXT:    s_mov_b32 s2, -1
411; SI-NEXT:    s_waitcnt lgkmcnt(0)
412; SI-NEXT:    s_ashr_i32 s5, s4, 31
413; SI-NEXT:    s_ashr_i64 s[4:5], s[4:5], 8
414; SI-NEXT:    v_mov_b32_e32 v0, s4
415; SI-NEXT:    v_mov_b32_e32 v1, s5
416; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
417; SI-NEXT:    s_endpgm
418;
419; VI-LABEL: s_ashr_i64:
420; VI:       ; %bb.0: ; %entry
421; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
422; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
423; VI-NEXT:    s_mov_b32 s3, 0xf000
424; VI-NEXT:    s_mov_b32 s2, -1
425; VI-NEXT:    s_waitcnt lgkmcnt(0)
426; VI-NEXT:    s_ashr_i32 s5, s4, 31
427; VI-NEXT:    s_ashr_i64 s[4:5], s[4:5], 8
428; VI-NEXT:    v_mov_b32_e32 v0, s4
429; VI-NEXT:    v_mov_b32_e32 v1, s5
430; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
431; VI-NEXT:    s_endpgm
432;
433; EG-LABEL: s_ashr_i64:
434; EG:       ; %bb.0: ; %entry
435; EG-NEXT:    ALU 4, @4, KC0[CB0:0-32], KC1[]
436; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
437; EG-NEXT:    CF_END
438; EG-NEXT:    PAD
439; EG-NEXT:    ALU clause starting at 4:
440; EG-NEXT:     ASHR * T0.Y, KC0[2].Z, literal.x,
441; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
442; EG-NEXT:     BIT_ALIGN_INT T0.X, PV.Y, KC0[2].Z, literal.x,
443; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
444; EG-NEXT:    8(1.121039e-44), 2(2.802597e-45)
445entry:
446  %in.ext = sext i32 %in to i64
447  %ashr = ashr i64 %in.ext, 8
448  store i64 %ashr, i64 addrspace(1)* %out
449  ret void
450}
451
452define amdgpu_kernel void @ashr_i64_2(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
; Variable 64-bit ashr of loaded values. GCN has a single 64-bit shift
; (v_ashr_i64 on SI, reversed v_ashrrev_i64 on VI); EG expands it into
; BIT_ALIGN_INT/ASHR plus CNDE_INT selects on the shift-amount bit 5
; (the amount & 32 test distinguishing shifts >= 32).
453; SI-LABEL: ashr_i64_2:
454; SI:       ; %bb.0: ; %entry
455; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
456; SI-NEXT:    s_mov_b32 s7, 0xf000
457; SI-NEXT:    s_mov_b32 s6, -1
458; SI-NEXT:    s_mov_b32 s10, s6
459; SI-NEXT:    s_mov_b32 s11, s7
460; SI-NEXT:    s_waitcnt lgkmcnt(0)
461; SI-NEXT:    s_mov_b32 s8, s2
462; SI-NEXT:    s_mov_b32 s9, s3
463; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
464; SI-NEXT:    s_mov_b32 s4, s0
465; SI-NEXT:    s_mov_b32 s5, s1
466; SI-NEXT:    s_waitcnt vmcnt(0)
467; SI-NEXT:    v_ashr_i64 v[0:1], v[0:1], v2
468; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
469; SI-NEXT:    s_endpgm
470;
471; VI-LABEL: ashr_i64_2:
472; VI:       ; %bb.0: ; %entry
473; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
474; VI-NEXT:    s_mov_b32 s7, 0xf000
475; VI-NEXT:    s_mov_b32 s6, -1
476; VI-NEXT:    s_mov_b32 s10, s6
477; VI-NEXT:    s_mov_b32 s11, s7
478; VI-NEXT:    s_waitcnt lgkmcnt(0)
479; VI-NEXT:    s_mov_b32 s8, s2
480; VI-NEXT:    s_mov_b32 s9, s3
481; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
482; VI-NEXT:    s_mov_b32 s4, s0
483; VI-NEXT:    s_mov_b32 s5, s1
484; VI-NEXT:    s_waitcnt vmcnt(0)
485; VI-NEXT:    v_ashrrev_i64 v[0:1], v2, v[0:1]
486; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
487; VI-NEXT:    s_endpgm
488;
489; EG-LABEL: ashr_i64_2:
490; EG:       ; %bb.0: ; %entry
491; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
492; EG-NEXT:    TEX 0 @6
493; EG-NEXT:    ALU 10, @9, KC0[CB0:0-32], KC1[]
494; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
495; EG-NEXT:    CF_END
496; EG-NEXT:    PAD
497; EG-NEXT:    Fetch clause starting at 6:
498; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
499; EG-NEXT:    ALU clause starting at 8:
500; EG-NEXT:     MOV * T0.X, KC0[2].Z,
501; EG-NEXT:    ALU clause starting at 9:
502; EG-NEXT:     AND_INT * T0.W, T0.Z, literal.x,
503; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
504; EG-NEXT:     ASHR T1.Z, T0.Y, PV.W,
505; EG-NEXT:     BIT_ALIGN_INT T0.W, T0.Y, T0.X, T0.Z,
506; EG-NEXT:     AND_INT * T1.W, T0.Z, literal.x,
507; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
508; EG-NEXT:     CNDE_INT T0.X, PS, PV.W, PV.Z,
509; EG-NEXT:     ASHR T0.W, T0.Y, literal.x,
510; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
511; EG-NEXT:    31(4.344025e-44), 2(2.802597e-45)
512; EG-NEXT:     CNDE_INT * T0.Y, T1.W, T1.Z, PV.W,
513entry:
514  %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
515  %a = load i64, i64 addrspace(1)* %in
516  %b = load i64, i64 addrspace(1)* %b_ptr
517  %result = ashr i64 %a, %b
518  store i64 %result, i64 addrspace(1)* %out
519  ret void
520}
521
522define amdgpu_kernel void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
; Per-element ashr of <2 x i64>: GCN emits one 64-bit shift per element
; (v_ashr_i64 on SI, v_ashrrev_i64 on VI); EG repeats the i64 expansion from
; @ashr_i64_2 (BIT_ALIGN_INT/ASHR/CNDE_INT) once per element.
523; SI-LABEL: ashr_v2i64:
524; SI:       ; %bb.0:
525; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
526; SI-NEXT:    s_mov_b32 s7, 0xf000
527; SI-NEXT:    s_mov_b32 s6, -1
528; SI-NEXT:    s_mov_b32 s10, s6
529; SI-NEXT:    s_mov_b32 s11, s7
530; SI-NEXT:    s_waitcnt lgkmcnt(0)
531; SI-NEXT:    s_mov_b32 s8, s2
532; SI-NEXT:    s_mov_b32 s9, s3
533; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
534; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
535; SI-NEXT:    s_mov_b32 s4, s0
536; SI-NEXT:    s_mov_b32 s5, s1
537; SI-NEXT:    s_waitcnt vmcnt(0)
538; SI-NEXT:    v_ashr_i64 v[2:3], v[2:3], v6
539; SI-NEXT:    v_ashr_i64 v[0:1], v[0:1], v4
540; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
541; SI-NEXT:    s_endpgm
542;
543; VI-LABEL: ashr_v2i64:
544; VI:       ; %bb.0:
545; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
546; VI-NEXT:    s_mov_b32 s7, 0xf000
547; VI-NEXT:    s_mov_b32 s6, -1
548; VI-NEXT:    s_mov_b32 s10, s6
549; VI-NEXT:    s_mov_b32 s11, s7
550; VI-NEXT:    s_waitcnt lgkmcnt(0)
551; VI-NEXT:    s_mov_b32 s8, s2
552; VI-NEXT:    s_mov_b32 s9, s3
553; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
554; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
555; VI-NEXT:    s_mov_b32 s4, s0
556; VI-NEXT:    s_mov_b32 s5, s1
557; VI-NEXT:    s_waitcnt vmcnt(0)
558; VI-NEXT:    v_ashrrev_i64 v[2:3], v6, v[2:3]
559; VI-NEXT:    v_ashrrev_i64 v[0:1], v4, v[0:1]
560; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
561; VI-NEXT:    s_endpgm
562;
563; EG-LABEL: ashr_v2i64:
564; EG:       ; %bb.0:
565; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
566; EG-NEXT:    TEX 1 @6
567; EG-NEXT:    ALU 19, @11, KC0[CB0:0-32], KC1[]
568; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
569; EG-NEXT:    CF_END
570; EG-NEXT:    PAD
571; EG-NEXT:    Fetch clause starting at 6:
572; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
573; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
574; EG-NEXT:    ALU clause starting at 10:
575; EG-NEXT:     MOV * T0.X, KC0[2].Z,
576; EG-NEXT:    ALU clause starting at 11:
577; EG-NEXT:     AND_INT * T1.W, T1.Z, literal.x,
578; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
579; EG-NEXT:     ASHR T1.Y, T0.W, PV.W,
580; EG-NEXT:     AND_INT T2.Z, T1.Z, literal.x,
581; EG-NEXT:     BIT_ALIGN_INT T1.W, T0.W, T0.Z, T1.Z,
582; EG-NEXT:     AND_INT * T2.W, T1.X, literal.y,
583; EG-NEXT:    32(4.484155e-44), 31(4.344025e-44)
584; EG-NEXT:     ASHR T2.Y, T0.Y, PS,
585; EG-NEXT:     CNDE_INT T0.Z, PV.Z, PV.W, PV.Y,
586; EG-NEXT:     BIT_ALIGN_INT T1.W, T0.Y, T0.X, T1.X,
587; EG-NEXT:     AND_INT * T2.W, T1.X, literal.x,
588; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
589; EG-NEXT:     CNDE_INT T0.X, PS, PV.W, PV.Y,
590; EG-NEXT:     ASHR T0.W, T0.W, literal.x,
591; EG-NEXT:     ASHR * T1.W, T0.Y, literal.x,
592; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
593; EG-NEXT:     CNDE_INT * T0.W, T2.Z, T1.Y, PV.W,
594; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
595; EG-NEXT:     CNDE_INT * T0.Y, T2.W, T2.Y, T1.W,
596; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
597  %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
598  %a = load <2 x i64>, <2 x i64> addrspace(1)* %in
599  %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
600  %result = ashr <2 x i64> %a, %b
601  store <2 x i64> %result, <2 x i64> addrspace(1)* %out
602  ret void
603}
604
605; FIXME: Broken on r600
606define amdgpu_kernel void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
; Per-element ashr of <4 x i64>: four 64-bit loads (a/b halves at offsets
; 0/16 and 32/48), four 64-bit shifts on GCN, two 16-byte stores. The EG
; lowering interleaves four copies of the i64 shift expansion.
607; SI-LABEL: ashr_v4i64:
608; SI:       ; %bb.0:
609; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
610; SI-NEXT:    s_mov_b32 s3, 0xf000
611; SI-NEXT:    s_mov_b32 s2, -1
612; SI-NEXT:    s_mov_b32 s10, s2
613; SI-NEXT:    s_mov_b32 s11, s3
614; SI-NEXT:    s_waitcnt lgkmcnt(0)
615; SI-NEXT:    s_mov_b32 s8, s6
616; SI-NEXT:    s_mov_b32 s9, s7
617; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
618; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
619; SI-NEXT:    buffer_load_dwordx4 v[7:10], off, s[8:11], 0
620; SI-NEXT:    buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
621; SI-NEXT:    s_mov_b32 s0, s4
622; SI-NEXT:    s_mov_b32 s1, s5
623; SI-NEXT:    s_waitcnt vmcnt(2)
624; SI-NEXT:    v_ashr_i64 v[2:3], v[2:3], v6
625; SI-NEXT:    v_ashr_i64 v[0:1], v[0:1], v4
626; SI-NEXT:    s_waitcnt vmcnt(0)
627; SI-NEXT:    v_ashr_i64 v[9:10], v[9:10], v13
628; SI-NEXT:    v_ashr_i64 v[7:8], v[7:8], v11
629; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
630; SI-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0
631; SI-NEXT:    s_endpgm
632;
633; VI-LABEL: ashr_v4i64:
634; VI:       ; %bb.0:
635; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
636; VI-NEXT:    s_mov_b32 s3, 0xf000
637; VI-NEXT:    s_mov_b32 s2, -1
638; VI-NEXT:    s_mov_b32 s10, s2
639; VI-NEXT:    s_mov_b32 s11, s3
640; VI-NEXT:    s_waitcnt lgkmcnt(0)
641; VI-NEXT:    s_mov_b32 s8, s6
642; VI-NEXT:    s_mov_b32 s9, s7
643; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
644; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
645; VI-NEXT:    buffer_load_dwordx4 v[7:10], off, s[8:11], 0
646; VI-NEXT:    buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
647; VI-NEXT:    s_mov_b32 s0, s4
648; VI-NEXT:    s_mov_b32 s1, s5
649; VI-NEXT:    s_waitcnt vmcnt(2)
650; VI-NEXT:    v_ashrrev_i64 v[2:3], v6, v[2:3]
651; VI-NEXT:    v_ashrrev_i64 v[0:1], v4, v[0:1]
652; VI-NEXT:    s_waitcnt vmcnt(0)
653; VI-NEXT:    v_ashrrev_i64 v[9:10], v13, v[9:10]
654; VI-NEXT:    v_ashrrev_i64 v[7:8], v11, v[7:8]
655; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
656; VI-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0
657; VI-NEXT:    s_endpgm
658;
659; EG-LABEL: ashr_v4i64:
660; EG:       ; %bb.0:
661; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
662; EG-NEXT:    TEX 3 @6
663; EG-NEXT:    ALU 39, @15, KC0[CB0:0-32], KC1[]
664; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T3.X, 0
665; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
666; EG-NEXT:    CF_END
667; EG-NEXT:    Fetch clause starting at 6:
668; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 32, #1
669; EG-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 48, #1
670; EG-NEXT:     VTX_READ_128 T3.XYZW, T0.X, 0, #1
671; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 16, #1
672; EG-NEXT:    ALU clause starting at 14:
673; EG-NEXT:     MOV * T0.X, KC0[2].Z,
674; EG-NEXT:    ALU clause starting at 15:
675; EG-NEXT:     AND_INT * T1.W, T1.Z, literal.x,
676; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
677; EG-NEXT:     ASHR T1.Y, T0.W, literal.x,
678; EG-NEXT:     ASHR T4.Z, T3.W, PV.W, BS:VEC_120/SCL_212
679; EG-NEXT:     AND_INT T1.W, T1.Z, literal.y,
680; EG-NEXT:     AND_INT * T2.W, T2.Z, literal.x,
681; EG-NEXT:    31(4.344025e-44), 32(4.484155e-44)
682; EG-NEXT:     BIT_ALIGN_INT T4.X, T3.W, T3.Z, T1.Z,
683; EG-NEXT:     ASHR T2.Y, T0.W, PS, BS:VEC_120/SCL_212
684; EG-NEXT:     AND_INT * T1.Z, T2.Z, literal.x,
685; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
686; EG-NEXT:     BIT_ALIGN_INT T0.W, T0.W, T0.Z, T2.Z,
687; EG-NEXT:     AND_INT * T2.W, T2.X, literal.x,
688; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
689; EG-NEXT:     AND_INT T5.X, T1.X, literal.x,
690; EG-NEXT:     ASHR T4.Y, T0.Y, PS,
691; EG-NEXT:     CNDE_INT T0.Z, T1.Z, PV.W, T2.Y,
692; EG-NEXT:     BIT_ALIGN_INT T0.W, T0.Y, T0.X, T2.X,
693; EG-NEXT:     AND_INT * T2.W, T2.X, literal.y,
694; EG-NEXT:    31(4.344025e-44), 32(4.484155e-44)
695; EG-NEXT:     CNDE_INT T0.X, PS, PV.W, PV.Y,
696; EG-NEXT:     ASHR T5.Y, T3.Y, PV.X,
697; EG-NEXT:     CNDE_INT T2.Z, T1.W, T4.X, T4.Z,
698; EG-NEXT:     BIT_ALIGN_INT T0.W, T3.Y, T3.X, T1.X, BS:VEC_102/SCL_221
699; EG-NEXT:     AND_INT * T4.W, T1.X, literal.x,
700; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
701; EG-NEXT:     CNDE_INT T2.X, PS, PV.W, PV.Y,
702; EG-NEXT:     ASHR T6.Y, T3.W, literal.x,
703; EG-NEXT:     ASHR T3.Z, T0.Y, literal.x, BS:VEC_201
704; EG-NEXT:     ADD_INT T3.W, KC0[2].Y, literal.y,
705; EG-NEXT:     CNDE_INT * T0.W, T1.Z, T2.Y, T1.Y,
706; EG-NEXT:    31(4.344025e-44), 16(2.242078e-44)
707; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
708; EG-NEXT:     CNDE_INT T0.Y, T2.W, T4.Y, PV.Z,
709; EG-NEXT:     ASHR T3.W, T3.Y, literal.y,
710; EG-NEXT:     CNDE_INT * T2.W, T1.W, T4.Z, PV.Y,
711; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
712; EG-NEXT:     LSHR T3.X, KC0[2].Y, literal.x,
713; EG-NEXT:     CNDE_INT * T2.Y, T4.W, T5.Y, PV.W,
714; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
715  %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
716  %a = load <4 x i64>, <4 x i64> addrspace(1)* %in
717  %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
718  %result = ashr <4 x i64> %a, %b
719  store <4 x i64> %result, <4 x i64> addrspace(1)* %out
720  ret void
721}
722
723define amdgpu_kernel void @s_ashr_32_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
; ashr i64 by exactly 32 folds away: only the high dword of %a is loaded
; (s_load_dword), its sign is materialized with an ashr-by-31 for the new high
; half, and no 64-bit shift instruction is emitted; the result feeds the add.
724; SI-LABEL: s_ashr_32_i64:
725; SI:       ; %bb.0:
726; SI-NEXT:    s_load_dword s6, s[0:1], 0x14
727; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x1d
728; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
729; SI-NEXT:    s_mov_b32 s3, 0xf000
730; SI-NEXT:    s_mov_b32 s2, -1
731; SI-NEXT:    s_waitcnt lgkmcnt(0)
732; SI-NEXT:    s_ashr_i32 s7, s6, 31
733; SI-NEXT:    s_add_u32 s4, s6, s4
734; SI-NEXT:    s_addc_u32 s5, s7, s5
735; SI-NEXT:    v_mov_b32_e32 v0, s4
736; SI-NEXT:    v_mov_b32_e32 v1, s5
737; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
738; SI-NEXT:    s_endpgm
739;
740; VI-LABEL: s_ashr_32_i64:
741; VI:       ; %bb.0:
742; VI-NEXT:    s_load_dword s6, s[0:1], 0x50
743; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x74
744; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
745; VI-NEXT:    s_mov_b32 s3, 0xf000
746; VI-NEXT:    s_mov_b32 s2, -1
747; VI-NEXT:    s_waitcnt lgkmcnt(0)
748; VI-NEXT:    s_ashr_i32 s7, s6, 31
749; VI-NEXT:    s_add_u32 s4, s6, s4
750; VI-NEXT:    s_addc_u32 s5, s7, s5
751; VI-NEXT:    v_mov_b32_e32 v0, s4
752; VI-NEXT:    v_mov_b32_e32 v1, s5
753; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
754; VI-NEXT:    s_endpgm
755;
756; EG-LABEL: s_ashr_32_i64:
757; EG:       ; %bb.0:
758; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
759; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
760; EG-NEXT:    CF_END
761; EG-NEXT:    PAD
762; EG-NEXT:    ALU clause starting at 4:
763; EG-NEXT:     ASHR * T0.W, KC0[5].X, literal.x,
764; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
765; EG-NEXT:     ADD_INT * T0.W, PV.W, KC0[7].Z,
766; EG-NEXT:     ADDC_UINT * T1.W, KC0[5].X, KC0[7].Y,
767; EG-NEXT:     ADD_INT * T0.Y, T0.W, PV.W,
768; EG-NEXT:     ADD_INT * T0.X, KC0[5].X, KC0[7].Y,
769; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
770; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
771  %result = ashr i64 %a, 32
772  %add = add i64 %result, %b
773  store i64 %add, i64 addrspace(1)* %out
774  ret void
775}
776
; Vector (per-workitem VGPR) path: ashr i64 by 32 on a loaded value.
; SI/VI load only the high dword of the i64 (note the offset:4 / +4
; address) and reconstruct the result as {hi, hi >> 31} via a single
; 32-bit v_ashrrev_i32 — the low dword of the input is never read.
define amdgpu_kernel void @v_ashr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
; SI-LABEL: v_ashr_32_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ashr_32_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v0, vcc
; VI-NEXT:    v_add_u32_e32 v0, vcc, 4, v1
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ashr_32_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 4, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
; EG-NEXT:     ASHR * T0.Y, T0.X, literal.y,
; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
  ; Per-thread GEPs force the operand into VGPRs so the VALU (not SALU)
  ; lowering of the shift is exercised.
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
  %a = load i64, i64 addrspace(1)* %gep.in
  %result = ashr i64 %a, 32
  store i64 %result, i64 addrspace(1)* %gep.out
  ret void
}
841
; Scalar path: ashr i64 by 63 broadcasts the sign bit to all 64 bits.
; SI/VI compute s_ashr_i32 s6, s6, 31 once and feed the same register
; into both halves of the s_add_u32/s_addc_u32 pair — compare with
; @s_ashr_32_i64 above, where low and high halves differ.
define amdgpu_kernel void @s_ashr_63_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
; SI-LABEL: s_ashr_63_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[0:1], 0x14
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x1d
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ashr_i32 s6, s6, 31
; SI-NEXT:    s_add_u32 s4, s6, s4
; SI-NEXT:    s_addc_u32 s5, s6, s5
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_ashr_63_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s6, s[0:1], 0x50
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x74
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_ashr_i32 s6, s6, 31
; VI-NEXT:    s_add_u32 s4, s6, s4
; VI-NEXT:    s_addc_u32 s5, s6, s5
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_ashr_63_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     ASHR * T0.W, KC0[5].X, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:     ADD_INT T1.W, PV.W, KC0[7].Z,
; EG-NEXT:     ADDC_UINT * T2.W, PV.W, KC0[7].Y,
; EG-NEXT:     ADD_INT * T0.Y, PV.W, PS,
; EG-NEXT:     ADD_INT T0.X, T0.W, KC0[7].Y,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; The add of %b keeps the 64-bit result live so the shift cannot be
  ; narrowed away by the store alone.
  %result = ashr i64 %a, 63
  %add = add i64 %result, %b
  store i64 %add, i64 addrspace(1)* %out
  ret void
}
895
; Vector path: ashr i64 by 63 on a loaded value. Only the high dword is
; loaded (offset:4 / +4); one v_ashrrev_i32 31 produces the sign mask and
; a v_mov copies it into the second result dword, so both halves of the
; stored i64 are the replicated sign bit.
define amdgpu_kernel void @v_ashr_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
; SI-LABEL: v_ashr_63_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
; SI-NEXT:    v_mov_b32_e32 v3, v2
; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_ashr_63_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v2
; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v0, vcc
; VI-NEXT:    v_add_u32_e32 v0, vcc, 4, v1
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT:    flat_load_dword v3, v[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v2, 31, v3
; VI-NEXT:    v_mov_b32_e32 v3, v2
; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT:    s_endpgm
;
; EG-LABEL: v_ashr_63_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 4, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     ASHR T0.X, T0.X, literal.x,
; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
; EG-NEXT:     MOV * T0.Y, PV.X,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; Per-thread GEPs force the operand into VGPRs so the VALU (not SALU)
  ; lowering of the shift is exercised.
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
  %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
  %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
  %a = load i64, i64 addrspace(1)* %gep.in
  %result = ashr i64 %a, 63
  store i64 %result, i64 addrspace(1)* %gep.out
  ret void
}
964
965attributes #0 = { nounwind readnone }
966