1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope --check-prefix=SI
3; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope --check-prefix=VI
4
; sext i1 -> i32 of a scalar compare result: both targets select a scalar
; s_cmp_eq_u32 + s_cselect_b64 for the icmp, then materialize the 0/-1 mask
; into a VGPR with v_cndmask_b32 for the dword store.
5define amdgpu_kernel void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
6; SI-LABEL: s_sext_i1_to_i32:
7; SI:       ; %bb.0:
8; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
9; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
10; SI-NEXT:    s_mov_b32 s3, 0xf000
11; SI-NEXT:    s_mov_b32 s2, -1
12; SI-NEXT:    s_waitcnt lgkmcnt(0)
13; SI-NEXT:    s_cmp_eq_u32 s4, s5
14; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
15; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
16; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
17; SI-NEXT:    s_endpgm
18;
19; VI-LABEL: s_sext_i1_to_i32:
20; VI:       ; %bb.0:
21; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
22; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
23; VI-NEXT:    s_mov_b32 s3, 0xf000
24; VI-NEXT:    s_mov_b32 s2, -1
25; VI-NEXT:    s_waitcnt lgkmcnt(0)
26; VI-NEXT:    s_cmp_eq_u32 s4, s5
27; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
28; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
29; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
30; VI-NEXT:    s_endpgm
31  %cmp = icmp eq i32 %a, %b
32  %sext = sext i1 %cmp to i32
33  store i32 %sext, i32 addrspace(1)* %out, align 4
34  ret void
35}
36
; sext i32 -> i64 of a computed scalar value (mul + add): the high half is
; produced with a single s_ashr_i32 by 31 (broadcast of the sign bit), and the
; pair is stored as one dwordx2.
37define amdgpu_kernel void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
38; SI-LABEL: test_s_sext_i32_to_i64:
39; SI:       ; %bb.0: ; %entry
40; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
41; SI-NEXT:    s_load_dword s6, s[0:1], 0xd
42; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
43; SI-NEXT:    s_mov_b32 s3, 0xf000
44; SI-NEXT:    s_mov_b32 s2, -1
45; SI-NEXT:    s_waitcnt lgkmcnt(0)
46; SI-NEXT:    s_mul_i32 s4, s4, s5
47; SI-NEXT:    s_add_i32 s4, s4, s6
48; SI-NEXT:    s_ashr_i32 s5, s4, 31
49; SI-NEXT:    v_mov_b32_e32 v0, s4
50; SI-NEXT:    v_mov_b32_e32 v1, s5
51; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
52; SI-NEXT:    s_endpgm
53;
54; VI-LABEL: test_s_sext_i32_to_i64:
55; VI:       ; %bb.0: ; %entry
56; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
57; VI-NEXT:    s_load_dword s6, s[0:1], 0x34
58; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
59; VI-NEXT:    s_mov_b32 s3, 0xf000
60; VI-NEXT:    s_mov_b32 s2, -1
61; VI-NEXT:    s_waitcnt lgkmcnt(0)
62; VI-NEXT:    s_mul_i32 s4, s4, s5
63; VI-NEXT:    s_add_i32 s4, s4, s6
64; VI-NEXT:    s_ashr_i32 s5, s4, 31
65; VI-NEXT:    v_mov_b32_e32 v0, s4
66; VI-NEXT:    v_mov_b32_e32 v1, s5
67; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
68; VI-NEXT:    s_endpgm
69entry:
70  %mul = mul i32 %a, %b
71  %add = add i32 %mul, %c
72  %sext = sext i32 %add to i64
73  store i64 %sext, i64 addrspace(1)* %out, align 8
74  ret void
75}
76
; sext i1 -> i64: the compare mask is materialized once with v_cndmask_b32
; (giving all-zeros or all-ones in v0), and the high half is just a copy of
; the low half (v_mov_b32 v1, v0) rather than a second select.
77define amdgpu_kernel void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
78; SI-LABEL: s_sext_i1_to_i64:
79; SI:       ; %bb.0:
80; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
81; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
82; SI-NEXT:    s_mov_b32 s3, 0xf000
83; SI-NEXT:    s_mov_b32 s2, -1
84; SI-NEXT:    s_waitcnt lgkmcnt(0)
85; SI-NEXT:    s_cmp_eq_u32 s4, s5
86; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
87; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
88; SI-NEXT:    v_mov_b32_e32 v1, v0
89; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
90; SI-NEXT:    s_endpgm
91;
92; VI-LABEL: s_sext_i1_to_i64:
93; VI:       ; %bb.0:
94; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
95; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
96; VI-NEXT:    s_mov_b32 s3, 0xf000
97; VI-NEXT:    s_mov_b32 s2, -1
98; VI-NEXT:    s_waitcnt lgkmcnt(0)
99; VI-NEXT:    s_cmp_eq_u32 s4, s5
100; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
101; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
102; VI-NEXT:    v_mov_b32_e32 v1, v0
103; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
104; VI-NEXT:    s_endpgm
105  %cmp = icmp eq i32 %a, %b
106  %sext = sext i1 %cmp to i64
107  store i64 %sext, i64 addrspace(1)* %out, align 8
108  ret void
109}
110
; Plain sext i32 -> i64 of a scalar kernel argument: high dword is produced
; by s_ashr_i32 by 31 on both targets.
111define amdgpu_kernel void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind {
112; SI-LABEL: s_sext_i32_to_i64:
113; SI:       ; %bb.0:
114; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
115; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
116; SI-NEXT:    s_mov_b32 s3, 0xf000
117; SI-NEXT:    s_mov_b32 s2, -1
118; SI-NEXT:    s_waitcnt lgkmcnt(0)
119; SI-NEXT:    s_ashr_i32 s5, s4, 31
120; SI-NEXT:    v_mov_b32_e32 v0, s4
121; SI-NEXT:    v_mov_b32_e32 v1, s5
122; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
123; SI-NEXT:    s_endpgm
124;
125; VI-LABEL: s_sext_i32_to_i64:
126; VI:       ; %bb.0:
127; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
128; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
129; VI-NEXT:    s_mov_b32 s3, 0xf000
130; VI-NEXT:    s_mov_b32 s2, -1
131; VI-NEXT:    s_waitcnt lgkmcnt(0)
132; VI-NEXT:    s_ashr_i32 s5, s4, 31
133; VI-NEXT:    v_mov_b32_e32 v0, s4
134; VI-NEXT:    v_mov_b32_e32 v1, s5
135; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
136; VI-NEXT:    s_endpgm
137  %sext = sext i32 %a to i64
138  store i64 %sext, i64 addrspace(1)* %out, align 8
139  ret void
140}
141
; Vector (VALU) variant of sext i32 -> i64: the value comes from a global
; load, so the high dword is produced with v_ashrrev_i32 by 31 instead of a
; scalar shift.
142define amdgpu_kernel void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
143; SI-LABEL: v_sext_i32_to_i64:
144; SI:       ; %bb.0:
145; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
146; SI-NEXT:    s_mov_b32 s7, 0xf000
147; SI-NEXT:    s_mov_b32 s6, -1
148; SI-NEXT:    s_mov_b32 s10, s6
149; SI-NEXT:    s_mov_b32 s11, s7
150; SI-NEXT:    s_waitcnt lgkmcnt(0)
151; SI-NEXT:    s_mov_b32 s8, s2
152; SI-NEXT:    s_mov_b32 s9, s3
153; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
154; SI-NEXT:    s_mov_b32 s4, s0
155; SI-NEXT:    s_mov_b32 s5, s1
156; SI-NEXT:    s_waitcnt vmcnt(0)
157; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
158; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
159; SI-NEXT:    s_endpgm
160;
161; VI-LABEL: v_sext_i32_to_i64:
162; VI:       ; %bb.0:
163; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
164; VI-NEXT:    s_mov_b32 s7, 0xf000
165; VI-NEXT:    s_mov_b32 s6, -1
166; VI-NEXT:    s_mov_b32 s10, s6
167; VI-NEXT:    s_mov_b32 s11, s7
168; VI-NEXT:    s_waitcnt lgkmcnt(0)
169; VI-NEXT:    s_mov_b32 s8, s2
170; VI-NEXT:    s_mov_b32 s9, s3
171; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
172; VI-NEXT:    s_mov_b32 s4, s0
173; VI-NEXT:    s_mov_b32 s5, s1
174; VI-NEXT:    s_waitcnt vmcnt(0)
175; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
176; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
177; VI-NEXT:    s_endpgm
178  %val = load i32, i32 addrspace(1)* %in, align 4
179  %sext = sext i32 %val to i64
180  store i64 %sext, i64 addrspace(1)* %out, align 8
181  ret void
182}
183
; sext i16 -> i64 of a scalar argument: selected as a single 64-bit scalar
; bitfield extract (s_bfe_i64 with width 16, offset 0 encoded as 0x100000)
; on both targets.
184define amdgpu_kernel void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind {
185; SI-LABEL: s_sext_i16_to_i64:
186; SI:       ; %bb.0:
187; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
188; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
189; SI-NEXT:    s_mov_b32 s3, 0xf000
190; SI-NEXT:    s_mov_b32 s2, -1
191; SI-NEXT:    s_waitcnt lgkmcnt(0)
192; SI-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x100000
193; SI-NEXT:    v_mov_b32_e32 v0, s4
194; SI-NEXT:    v_mov_b32_e32 v1, s5
195; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
196; SI-NEXT:    s_endpgm
197;
198; VI-LABEL: s_sext_i16_to_i64:
199; VI:       ; %bb.0:
200; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
201; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
202; VI-NEXT:    s_mov_b32 s3, 0xf000
203; VI-NEXT:    s_mov_b32 s2, -1
204; VI-NEXT:    s_waitcnt lgkmcnt(0)
205; VI-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x100000
206; VI-NEXT:    v_mov_b32_e32 v0, s4
207; VI-NEXT:    v_mov_b32_e32 v1, s5
208; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
209; VI-NEXT:    s_endpgm
210  %sext = sext i16 %a to i64
211  store i64 %sext, i64 addrspace(1)* %out, align 8
212  ret void
213}
214
; sext i1 -> i16: same select-of-mask pattern as the i32 case, but stored
; with buffer_store_short. (Here the sign_extend is optimized to a select
; early; see the *_with_and variant below for the late-selection path.)
215define amdgpu_kernel void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
216; SI-LABEL: s_sext_i1_to_i16:
217; SI:       ; %bb.0:
218; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
219; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
220; SI-NEXT:    s_mov_b32 s3, 0xf000
221; SI-NEXT:    s_mov_b32 s2, -1
222; SI-NEXT:    s_waitcnt lgkmcnt(0)
223; SI-NEXT:    s_cmp_eq_u32 s4, s5
224; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
225; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
226; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
227; SI-NEXT:    s_endpgm
228;
229; VI-LABEL: s_sext_i1_to_i16:
230; VI:       ; %bb.0:
231; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
232; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
233; VI-NEXT:    s_mov_b32 s3, 0xf000
234; VI-NEXT:    s_mov_b32 s2, -1
235; VI-NEXT:    s_waitcnt lgkmcnt(0)
236; VI-NEXT:    s_cmp_eq_u32 s4, s5
237; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
238; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
239; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
240; VI-NEXT:    s_endpgm
241  %cmp = icmp eq i32 %a, %b
242  %sext = sext i1 %cmp to i16
243  store i16 %sext, i16 addrspace(1)* %out
244  ret void
245}
246
247; The purpose of this test is to make sure the i16 = sign_extend i1 node
248; makes it all the way through the legalizer/optimizer to make sure
249; we select this correctly.  In s_sext_i1_to_i16, the sign_extend node
250; is optimized to a select very early.
; Two scalar compares combined with an and i1, then sign-extended to i16:
; the two s_cselect_b64 masks are and-ed in SGPRs before the single
; v_cndmask materialization and 16-bit store.
251define amdgpu_kernel void @s_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
252; SI-LABEL: s_sext_i1_to_i16_with_and:
253; SI:       ; %bb.0:
254; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xb
255; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
256; SI-NEXT:    s_mov_b32 s3, 0xf000
257; SI-NEXT:    s_mov_b32 s2, -1
258; SI-NEXT:    s_waitcnt lgkmcnt(0)
259; SI-NEXT:    s_cmp_eq_u32 s4, s5
260; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
261; SI-NEXT:    s_cmp_eq_u32 s6, s7
262; SI-NEXT:    s_cselect_b64 s[6:7], -1, 0
263; SI-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
264; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
265; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
266; SI-NEXT:    s_endpgm
267;
268; VI-LABEL: s_sext_i1_to_i16_with_and:
269; VI:       ; %bb.0:
270; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
271; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
272; VI-NEXT:    s_mov_b32 s3, 0xf000
273; VI-NEXT:    s_mov_b32 s2, -1
274; VI-NEXT:    s_waitcnt lgkmcnt(0)
275; VI-NEXT:    s_cmp_eq_u32 s4, s5
276; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
277; VI-NEXT:    s_cmp_eq_u32 s6, s7
278; VI-NEXT:    s_cselect_b64 s[6:7], -1, 0
279; VI-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
280; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
281; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
282; VI-NEXT:    s_endpgm
283  %cmp0 = icmp eq i32 %a, %b
284  %cmp1 = icmp eq i32 %c, %d
285  %cmp = and i1 %cmp0, %cmp1
286  %sext = sext i1 %cmp to i16
287  store i16 %sext, i16 addrspace(1)* %out
288  ret void
289}
290
; Mixed divergent/uniform variant: one compare involves the workitem id
; (divergent, selected to v_cmp_eq_u32 into vcc), the other stays scalar;
; the masks are combined with s_and_b64 before the final v_cndmask.
291define amdgpu_kernel void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
292; SI-LABEL: v_sext_i1_to_i16_with_and:
293; SI:       ; %bb.0:
294; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
295; SI-NEXT:    s_load_dword s6, s[0:1], 0xd
296; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
297; SI-NEXT:    s_mov_b32 s3, 0xf000
298; SI-NEXT:    s_mov_b32 s2, -1
299; SI-NEXT:    s_waitcnt lgkmcnt(0)
300; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v0
301; SI-NEXT:    s_cmp_eq_u32 s5, s6
302; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
303; SI-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
304; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
305; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
306; SI-NEXT:    s_endpgm
307;
308; VI-LABEL: v_sext_i1_to_i16_with_and:
309; VI:       ; %bb.0:
310; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
311; VI-NEXT:    s_load_dword s6, s[0:1], 0x34
312; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
313; VI-NEXT:    s_mov_b32 s3, 0xf000
314; VI-NEXT:    s_mov_b32 s2, -1
315; VI-NEXT:    s_waitcnt lgkmcnt(0)
316; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v0
317; VI-NEXT:    s_cmp_eq_u32 s5, s6
318; VI-NEXT:    s_cselect_b64 s[4:5], -1, 0
319; VI-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
320; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[4:5]
321; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
322; VI-NEXT:    s_endpgm
323  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1
324  %cmp0 = icmp eq i32 %a, %tid
325  %cmp1 = icmp eq i32 %b, %c
326  %cmp = and i1 %cmp0, %cmp1
327  %sext = sext i1 %cmp to i16
328  store i16 %sext, i16 addrspace(1)* %out
329  ret void
330}
331
332; FIXME: We end up with a v_bfe instruction, because the i16 srl
333; gets selected to a v_lshrrev_b16 instructions, so the input to
334; the bfe is a vector registers.  To fix this we need to be able to
335; optimize:
336; t29: i16 = truncate t10
337; t55: i16 = srl t29, Constant:i32<8>
338; t63: i32 = any_extend t55
339; t64: i32 = sign_extend_inreg t63, ValueType:ch:i8
; sext <4 x i8> (bitcast from i32) -> <4 x i32>, each lane stored volatile.
; tahiti extracts all four lanes with scalar ops (s_sext_i32_i8 / s_bfe_i32 /
; s_ashr_i32); tonga currently takes byte 1 through a 16-bit VALU shift plus
; v_bfe_i32 — see the FIXME above about the v_lshrrev_b16 selection.
340define amdgpu_kernel void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a) nounwind {
341; SI-LABEL: s_sext_v4i8_to_v4i32:
342; SI:       ; %bb.0:
343; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
344; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
345; SI-NEXT:    s_mov_b32 s3, 0xf000
346; SI-NEXT:    s_mov_b32 s2, -1
347; SI-NEXT:    s_waitcnt lgkmcnt(0)
348; SI-NEXT:    s_ashr_i32 s5, s4, 24
349; SI-NEXT:    s_bfe_i32 s6, s4, 0x80010
350; SI-NEXT:    s_bfe_i32 s7, s4, 0x80008
351; SI-NEXT:    s_sext_i32_i8 s4, s4
352; SI-NEXT:    v_mov_b32_e32 v0, s4
353; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
354; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
355; SI-NEXT:    v_mov_b32_e32 v0, s7
356; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
357; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
358; SI-NEXT:    v_mov_b32_e32 v0, s6
359; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
360; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
361; SI-NEXT:    v_mov_b32_e32 v0, s5
362; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
363; SI-NEXT:    s_waitcnt vmcnt(0)
364; SI-NEXT:    s_endpgm
365;
366; VI-LABEL: s_sext_v4i8_to_v4i32:
367; VI:       ; %bb.0:
368; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
369; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
370; VI-NEXT:    s_mov_b32 s3, 0xf000
371; VI-NEXT:    s_mov_b32 s2, -1
372; VI-NEXT:    s_waitcnt lgkmcnt(0)
373; VI-NEXT:    v_lshrrev_b16_e64 v0, 8, s4
374; VI-NEXT:    s_ashr_i32 s5, s4, 24
375; VI-NEXT:    s_bfe_i32 s6, s4, 0x80010
376; VI-NEXT:    s_sext_i32_i8 s4, s4
377; VI-NEXT:    v_bfe_i32 v0, v0, 0, 8
378; VI-NEXT:    v_mov_b32_e32 v1, s4
379; VI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
380; VI-NEXT:    s_waitcnt vmcnt(0)
381; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
382; VI-NEXT:    s_waitcnt vmcnt(0)
383; VI-NEXT:    v_mov_b32_e32 v0, s6
384; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
385; VI-NEXT:    s_waitcnt vmcnt(0)
386; VI-NEXT:    v_mov_b32_e32 v0, s5
387; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
388; VI-NEXT:    s_waitcnt vmcnt(0)
389; VI-NEXT:    s_endpgm
390  %cast = bitcast i32 %a to <4 x i8>
391  %ext = sext <4 x i8> %cast to <4 x i32>
392  %elt0 = extractelement <4 x i32> %ext, i32 0
393  %elt1 = extractelement <4 x i32> %ext, i32 1
394  %elt2 = extractelement <4 x i32> %ext, i32 2
395  %elt3 = extractelement <4 x i32> %ext, i32 3
396  store volatile i32 %elt0, i32 addrspace(1)* %out
397  store volatile i32 %elt1, i32 addrspace(1)* %out
398  store volatile i32 %elt2, i32 addrspace(1)* %out
399  store volatile i32 %elt3, i32 addrspace(1)* %out
400  ret void
401}
402
403; FIXME: need to optimize same sequence as above test to avoid
404; this shift.
; VALU variant of the v4i8 sext: the source i32 is loaded from memory, so
; lanes are extracted with v_bfe_i32 / v_ashrrev_i32. tonga additionally
; emits a v_lshrrev_b16 for byte 1 (the extra shift the FIXME above refers
; to), where tahiti uses a direct v_bfe_i32 at offset 8.
405define amdgpu_kernel void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
406; SI-LABEL: v_sext_v4i8_to_v4i32:
407; SI:       ; %bb.0:
408; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
409; SI-NEXT:    s_mov_b32 s7, 0xf000
410; SI-NEXT:    s_mov_b32 s6, -1
411; SI-NEXT:    s_mov_b32 s10, s6
412; SI-NEXT:    s_mov_b32 s11, s7
413; SI-NEXT:    s_waitcnt lgkmcnt(0)
414; SI-NEXT:    s_mov_b32 s8, s2
415; SI-NEXT:    s_mov_b32 s9, s3
416; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
417; SI-NEXT:    s_mov_b32 s4, s0
418; SI-NEXT:    s_mov_b32 s5, s1
419; SI-NEXT:    s_waitcnt vmcnt(0)
420; SI-NEXT:    v_ashrrev_i32_e32 v1, 24, v0
421; SI-NEXT:    v_bfe_i32 v2, v0, 16, 8
422; SI-NEXT:    v_bfe_i32 v3, v0, 8, 8
423; SI-NEXT:    v_bfe_i32 v0, v0, 0, 8
424; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
425; SI-NEXT:    s_waitcnt vmcnt(0)
426; SI-NEXT:    buffer_store_dword v3, off, s[4:7], 0
427; SI-NEXT:    s_waitcnt vmcnt(0)
428; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
429; SI-NEXT:    s_waitcnt vmcnt(0)
430; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
431; SI-NEXT:    s_waitcnt vmcnt(0)
432; SI-NEXT:    s_endpgm
433;
434; VI-LABEL: v_sext_v4i8_to_v4i32:
435; VI:       ; %bb.0:
436; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
437; VI-NEXT:    s_mov_b32 s7, 0xf000
438; VI-NEXT:    s_mov_b32 s6, -1
439; VI-NEXT:    s_mov_b32 s10, s6
440; VI-NEXT:    s_mov_b32 s11, s7
441; VI-NEXT:    s_waitcnt lgkmcnt(0)
442; VI-NEXT:    s_mov_b32 s8, s2
443; VI-NEXT:    s_mov_b32 s9, s3
444; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
445; VI-NEXT:    s_mov_b32 s4, s0
446; VI-NEXT:    s_mov_b32 s5, s1
447; VI-NEXT:    s_waitcnt vmcnt(0)
448; VI-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
449; VI-NEXT:    v_ashrrev_i32_e32 v2, 24, v0
450; VI-NEXT:    v_bfe_i32 v3, v0, 16, 8
451; VI-NEXT:    v_bfe_i32 v0, v0, 0, 8
452; VI-NEXT:    v_bfe_i32 v1, v1, 0, 8
453; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
454; VI-NEXT:    s_waitcnt vmcnt(0)
455; VI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
456; VI-NEXT:    s_waitcnt vmcnt(0)
457; VI-NEXT:    buffer_store_dword v3, off, s[4:7], 0
458; VI-NEXT:    s_waitcnt vmcnt(0)
459; VI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
460; VI-NEXT:    s_waitcnt vmcnt(0)
461; VI-NEXT:    s_endpgm
462  %a = load i32, i32 addrspace(1)* %in
463  %cast = bitcast i32 %a to <4 x i8>
464  %ext = sext <4 x i8> %cast to <4 x i32>
465  %elt0 = extractelement <4 x i32> %ext, i32 0
466  %elt1 = extractelement <4 x i32> %ext, i32 1
467  %elt2 = extractelement <4 x i32> %ext, i32 2
468  %elt3 = extractelement <4 x i32> %ext, i32 3
469  store volatile i32 %elt0, i32 addrspace(1)* %out
470  store volatile i32 %elt1, i32 addrspace(1)* %out
471  store volatile i32 %elt2, i32 addrspace(1)* %out
472  store volatile i32 %elt3, i32 addrspace(1)* %out
473  ret void
474}
475
476; FIXME: s_bfe_i64, same on SI and VI
; sext <4 x i16> (bitcast from a scalar i64) -> <4 x i32>. Lanes are
; extracted with s_sext_i32_i16 / s_ashr; tahiti uses a 64-bit s_ashr_i64
; for the top lane where tonga uses two 32-bit shifts (see the FIXME above
; about wanting s_bfe_i64 on both).
477define amdgpu_kernel void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a) nounwind {
478; SI-LABEL: s_sext_v4i16_to_v4i32:
479; SI:       ; %bb.0:
480; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
481; SI-NEXT:    s_mov_b32 s7, 0xf000
482; SI-NEXT:    s_mov_b32 s6, -1
483; SI-NEXT:    s_waitcnt lgkmcnt(0)
484; SI-NEXT:    s_mov_b32 s4, s0
485; SI-NEXT:    s_mov_b32 s5, s1
486; SI-NEXT:    s_ashr_i64 s[0:1], s[2:3], 48
487; SI-NEXT:    s_ashr_i32 s1, s2, 16
488; SI-NEXT:    s_sext_i32_i16 s2, s2
489; SI-NEXT:    v_mov_b32_e32 v0, s2
490; SI-NEXT:    s_sext_i32_i16 s3, s3
491; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
492; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
493; SI-NEXT:    v_mov_b32_e32 v0, s1
494; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
495; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
496; SI-NEXT:    v_mov_b32_e32 v0, s3
497; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
498; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
499; SI-NEXT:    v_mov_b32_e32 v0, s0
500; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
501; SI-NEXT:    s_waitcnt vmcnt(0)
502; SI-NEXT:    s_endpgm
503;
504; VI-LABEL: s_sext_v4i16_to_v4i32:
505; VI:       ; %bb.0:
506; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
507; VI-NEXT:    s_mov_b32 s7, 0xf000
508; VI-NEXT:    s_mov_b32 s6, -1
509; VI-NEXT:    s_waitcnt lgkmcnt(0)
510; VI-NEXT:    s_mov_b32 s5, s1
511; VI-NEXT:    s_ashr_i32 s1, s2, 16
512; VI-NEXT:    s_sext_i32_i16 s2, s2
513; VI-NEXT:    s_mov_b32 s4, s0
514; VI-NEXT:    v_mov_b32_e32 v0, s2
515; VI-NEXT:    s_ashr_i32 s0, s3, 16
516; VI-NEXT:    s_sext_i32_i16 s3, s3
517; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
518; VI-NEXT:    s_waitcnt vmcnt(0)
519; VI-NEXT:    v_mov_b32_e32 v0, s1
520; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
521; VI-NEXT:    s_waitcnt vmcnt(0)
522; VI-NEXT:    v_mov_b32_e32 v0, s3
523; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
524; VI-NEXT:    s_waitcnt vmcnt(0)
525; VI-NEXT:    v_mov_b32_e32 v0, s0
526; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
527; VI-NEXT:    s_waitcnt vmcnt(0)
528; VI-NEXT:    s_endpgm
529  %cast = bitcast i64 %a to <4 x i16>
530  %ext = sext <4 x i16> %cast to <4 x i32>
531  %elt0 = extractelement <4 x i32> %ext, i32 0
532  %elt1 = extractelement <4 x i32> %ext, i32 1
533  %elt2 = extractelement <4 x i32> %ext, i32 2
534  %elt3 = extractelement <4 x i32> %ext, i32 3
535  store volatile i32 %elt0, i32 addrspace(1)* %out
536  store volatile i32 %elt1, i32 addrspace(1)* %out
537  store volatile i32 %elt2, i32 addrspace(1)* %out
538  store volatile i32 %elt3, i32 addrspace(1)* %out
539  ret void
540}
541
; VALU variant of the v4i16 sext: the i64 comes from a dwordx2 load, so
; lanes are extracted with v_bfe_i32 (width 16) and v_ashrrev_i32 by 16;
; tahiti additionally uses a 64-bit v_ashr_i64 for the top lane.
542define amdgpu_kernel void @v_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
543; SI-LABEL: v_sext_v4i16_to_v4i32:
544; SI:       ; %bb.0:
545; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
546; SI-NEXT:    s_mov_b32 s7, 0xf000
547; SI-NEXT:    s_mov_b32 s6, -1
548; SI-NEXT:    s_mov_b32 s10, s6
549; SI-NEXT:    s_mov_b32 s11, s7
550; SI-NEXT:    s_waitcnt lgkmcnt(0)
551; SI-NEXT:    s_mov_b32 s8, s2
552; SI-NEXT:    s_mov_b32 s9, s3
553; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
554; SI-NEXT:    s_mov_b32 s4, s0
555; SI-NEXT:    s_mov_b32 s5, s1
556; SI-NEXT:    s_waitcnt vmcnt(0)
557; SI-NEXT:    v_ashr_i64 v[2:3], v[0:1], 48
558; SI-NEXT:    v_ashrrev_i32_e32 v3, 16, v0
559; SI-NEXT:    v_bfe_i32 v0, v0, 0, 16
560; SI-NEXT:    v_bfe_i32 v1, v1, 0, 16
561; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
562; SI-NEXT:    s_waitcnt vmcnt(0)
563; SI-NEXT:    buffer_store_dword v3, off, s[4:7], 0
564; SI-NEXT:    s_waitcnt vmcnt(0)
565; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
566; SI-NEXT:    s_waitcnt vmcnt(0)
567; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
568; SI-NEXT:    s_waitcnt vmcnt(0)
569; SI-NEXT:    s_endpgm
570;
571; VI-LABEL: v_sext_v4i16_to_v4i32:
572; VI:       ; %bb.0:
573; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
574; VI-NEXT:    s_mov_b32 s7, 0xf000
575; VI-NEXT:    s_mov_b32 s6, -1
576; VI-NEXT:    s_mov_b32 s10, s6
577; VI-NEXT:    s_mov_b32 s11, s7
578; VI-NEXT:    s_waitcnt lgkmcnt(0)
579; VI-NEXT:    s_mov_b32 s8, s2
580; VI-NEXT:    s_mov_b32 s9, s3
581; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
582; VI-NEXT:    s_mov_b32 s4, s0
583; VI-NEXT:    s_mov_b32 s5, s1
584; VI-NEXT:    s_waitcnt vmcnt(0)
585; VI-NEXT:    v_ashrrev_i32_e32 v3, 16, v0
586; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
587; VI-NEXT:    v_ashrrev_i32_e32 v2, 16, v1
588; VI-NEXT:    v_bfe_i32 v1, v1, 0, 16
589; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
590; VI-NEXT:    s_waitcnt vmcnt(0)
591; VI-NEXT:    buffer_store_dword v3, off, s[4:7], 0
592; VI-NEXT:    s_waitcnt vmcnt(0)
593; VI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
594; VI-NEXT:    s_waitcnt vmcnt(0)
595; VI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
596; VI-NEXT:    s_waitcnt vmcnt(0)
597; VI-NEXT:    s_endpgm
598  %a = load i64, i64 addrspace(1)* %in
599  %cast = bitcast i64 %a to <4 x i16>
600  %ext = sext <4 x i16> %cast to <4 x i32>
601  %elt0 = extractelement <4 x i32> %ext, i32 0
602  %elt1 = extractelement <4 x i32> %ext, i32 1
603  %elt2 = extractelement <4 x i32> %ext, i32 2
604  %elt3 = extractelement <4 x i32> %ext, i32 3
605  store volatile i32 %elt0, i32 addrspace(1)* %out
606  store volatile i32 %elt1, i32 addrspace(1)* %out
607  store volatile i32 %elt2, i32 addrspace(1)* %out
608  store volatile i32 %elt3, i32 addrspace(1)* %out
609  ret void
610}
611
612declare i32 @llvm.amdgcn.workitem.id.x() #1
613
614attributes #1 = { nounwind readnone }
615