; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM %s

; Signed 24-bit multiply is not supported on pre-Cayman GPUs.
define amdgpu_kernel void @test_smul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
; SI-LABEL: test_smul24_i32:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bfe_i32 s2, s4, 0x180000
; SI-NEXT:    s_bfe_i32 s4, s5, 0x180000
; SI-NEXT:    s_mul_i32 s4, s2, s4
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_smul24_i32:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_bfe_i32 s4, s4, 0x180000
; VI-NEXT:    s_bfe_i32 s5, s5, 0x180000
; VI-NEXT:    s_mul_i32 s4, s4, s5
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_smul24_i32:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_bfe_i32 s0, s2, 0x180000
; GFX9-NEXT:    s_bfe_i32 s1, s3, 0x180000
; GFX9-NEXT:    s_mul_i32 s0, s0, s1
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; EG-LABEL: test_smul24_i32:
; EG:       ; %bb.0: ; %entry
; EG-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     LSHL T0.W, KC0[2].Z, literal.x,
; EG-NEXT:     LSHL * T1.W, KC0[2].W, literal.x,
; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT:     ASHR T1.W, PS, literal.x,
; EG-NEXT:     ASHR * T0.W, PV.W, literal.x,
; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT:     MULLO_INT * T1.X, PS, PV.W,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: test_smul24_i32:
; CM:       ; %bb.0: ; %entry
; CM-NEXT:    ALU 10, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT:    CF_END
; CM-NEXT:    PAD
; CM-NEXT:    ALU clause starting at 4:
; CM-NEXT:     LSHL T0.Z, KC0[2].Z, literal.x,
; CM-NEXT:     LSHL * T0.W, KC0[2].W, literal.x,
; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
; CM-NEXT:     ASHR T1.Z, PV.W, literal.y,
; CM-NEXT:     ASHR * T0.W, PV.Z, literal.y,
; CM-NEXT:    2(2.802597e-45), 8(1.121039e-44)
; CM-NEXT:     MULLO_INT T1.X, T0.W, T1.Z,
; CM-NEXT:     MULLO_INT T1.Y (MASKED), T0.W, T1.Z,
; CM-NEXT:     MULLO_INT T1.Z (MASKED), T0.W, T1.Z,
; CM-NEXT:     MULLO_INT * T1.W (MASKED), T0.W, T1.Z,
entry:
  %a.shl = shl i32 %a, 8
  %a.24 = ashr i32 %a.shl, 8
  %b.shl = shl i32 %b, 8
  %b.24 = ashr i32 %b.shl, 8
  %mul24 = mul i32 %a.24, %b.24
  store i32 %mul24, i32 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @test_smulhi24_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
; SI-LABEL: test_smulhi24_i64:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s5
; SI-NEXT:    v_mul_hi_i32_i24_e32 v0, s4, v0
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_smulhi24_i64:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s5
; VI-NEXT:    v_mul_hi_i32_i24_e32 v0, s4, v0
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_smulhi24_i64:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_bfe_i32 s0, s2, 0x180000
; GFX9-NEXT:    s_bfe_i32 s1, s3, 0x180000
; GFX9-NEXT:    s_mul_hi_i32 s0, s0, s1
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; EG-LABEL: test_smulhi24_i64:
; EG:       ; %bb.0: ; %entry
; EG-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     LSHL T0.W, KC0[2].Z, literal.x,
; EG-NEXT:     LSHL * T1.W, KC0[2].W, literal.x,
; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT:     ASHR T1.W, PS, literal.x,
; EG-NEXT:     ASHR * T0.W, PV.W, literal.x,
; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT:     MULHI_INT * T1.X, PS, PV.W,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: test_smulhi24_i64:
; CM:       ; %bb.0: ; %entry
; CM-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
; CM-NEXT:    CF_END
; CM-NEXT:    PAD
; CM-NEXT:    ALU clause starting at 4:
; CM-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT:     MULHI_INT24 T1.X, KC0[2].Z, KC0[2].W,
; CM-NEXT:     MULHI_INT24 T1.Y (MASKED), KC0[2].Z, KC0[2].W,
; CM-NEXT:     MULHI_INT24 T1.Z (MASKED), KC0[2].Z, KC0[2].W,
; CM-NEXT:     MULHI_INT24 * T1.W (MASKED), KC0[2].Z, KC0[2].W,
entry:
  %a.shl = shl i32 %a, 8
  %a.24 = ashr i32 %a.shl, 8
  %b.shl = shl i32 %b, 8
  %b.24 = ashr i32 %b.shl, 8
  %a.24.i64 = sext i32 %a.24 to i64
  %b.24.i64 = sext i32 %b.24 to i64
  %mul48 = mul i64 %a.24.i64, %b.24.i64
  %mul48.hi = lshr i64 %mul48, 32
  %mul24hi = trunc i64 %mul48.hi to i32
  store i32 %mul24hi, i32 addrspace(1)* %out
  ret void
}

define i64 @test_smul48_i64(i64 %lhs, i64 %rhs) {
; SI-LABEL: test_smul48_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT:    v_ashr_i64 v[3:4], v[0:1], 40
; SI-NEXT:    v_ashr_i64 v[1:2], v[1:2], 40
; SI-NEXT:    v_mul_i32_i24_e32 v0, v3, v1
; SI-NEXT:    v_mul_hi_i32_i24_e32 v1, v3, v1
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: test_smul48_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
; VI-NEXT:    v_ashrrev_i64 v[3:4], 40, v[0:1]
; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
; VI-NEXT:    v_ashrrev_i64 v[1:2], 40, v[0:1]
; VI-NEXT:    v_mul_i32_i24_e32 v0, v3, v1
; VI-NEXT:    v_mul_hi_i32_i24_e32 v1, v3, v1
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_smul48_i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
; GFX9-NEXT:    v_ashrrev_i64 v[3:4], 40, v[0:1]
; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
; GFX9-NEXT:    v_ashrrev_i64 v[1:2], 40, v[0:1]
; GFX9-NEXT:    v_mul_i32_i24_e32 v0, v3, v1
; GFX9-NEXT:    v_mul_hi_i32_i24_e32 v1, v3, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; EG-LABEL: test_smul48_i64:
; EG:       ; %bb.0:
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
;
; CM-LABEL: test_smul48_i64:
; CM:       ; %bb.0:
; CM-NEXT:    CF_END
; CM-NEXT:    PAD
  %shl.lhs = shl i64 %lhs, 40
  %lhs24 = ashr i64 %shl.lhs, 40
  %shl.rhs = shl i64 %rhs, 40
  %rhs24 = ashr i64 %shl.rhs, 40
  %mul = mul i64 %lhs24, %rhs24
  ret i64 %mul
}

define <2 x i64> @test_smul48_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; SI-LABEL: test_smul48_v2i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v0
; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v6
; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
; SI-NEXT:    v_ashr_i64 v[5:6], v[0:1], 40
; SI-NEXT:    v_ashr_i64 v[1:2], v[1:2], 40
; SI-NEXT:    v_ashr_i64 v[6:7], v[2:3], 40
; SI-NEXT:    v_ashr_i64 v[2:3], v[3:4], 40
; SI-NEXT:    v_mul_i32_i24_e32 v0, v1, v2
; SI-NEXT:    v_mul_hi_i32_i24_e32 v1, v1, v2
; SI-NEXT:    v_mul_i32_i24_e32 v2, v5, v6
; SI-NEXT:    v_mul_hi_i32_i24_e32 v3, v5, v6
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: test_smul48_v2i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
; VI-NEXT:    v_ashrrev_i64 v[7:8], 40, v[0:1]
; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
; VI-NEXT:    v_ashrrev_i64 v[1:2], 40, v[0:1]
; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v6
; VI-NEXT:    v_lshlrev_b32_e32 v3, 8, v4
; VI-NEXT:    v_ashrrev_i64 v[3:4], 40, v[2:3]
; VI-NEXT:    v_ashrrev_i64 v[4:5], 40, v[1:2]
; VI-NEXT:    v_mul_i32_i24_e32 v0, v1, v3
; VI-NEXT:    v_mul_hi_i32_i24_e32 v1, v1, v3
; VI-NEXT:    v_mul_i32_i24_e32 v2, v7, v4
; VI-NEXT:    v_mul_hi_i32_i24_e32 v3, v7, v4
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_smul48_v2i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
; GFX9-NEXT:    v_ashrrev_i64 v[7:8], 40, v[0:1]
; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
; GFX9-NEXT:    v_ashrrev_i64 v[1:2], 40, v[0:1]
; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v6
; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 8, v4
; GFX9-NEXT:    v_ashrrev_i64 v[3:4], 40, v[2:3]
; GFX9-NEXT:    v_ashrrev_i64 v[4:5], 40, v[1:2]
; GFX9-NEXT:    v_mul_i32_i24_e32 v0, v1, v3
; GFX9-NEXT:    v_mul_hi_i32_i24_e32 v1, v1, v3
; GFX9-NEXT:    v_mul_i32_i24_e32 v2, v7, v4
; GFX9-NEXT:    v_mul_hi_i32_i24_e32 v3, v7, v4
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; EG-LABEL: test_smul48_v2i64:
; EG:       ; %bb.0:
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
;
; CM-LABEL: test_smul48_v2i64:
; CM:       ; %bb.0:
; CM-NEXT:    CF_END
; CM-NEXT:    PAD
  %shl.lhs = shl <2 x i64> %lhs, <i64 40, i64 40>
  %lhs24 = ashr <2 x i64> %shl.lhs, <i64 40, i64 40>
  %shl.rhs = shl <2 x i64> %rhs, <i64 40, i64 40>
  %rhs24 = ashr <2 x i64> %shl.rhs, <i64 40, i64 40>
  %mul = mul <2 x i64> %lhs24, %rhs24
  ret <2 x i64> %mul
}

; This requires handling of the original 64-bit mul node in order to
; eliminate the unnecessary extension instructions: after legalization,
; SimplifyDemandedBits will not remove them because they have multiple
; uses from the separate mul and mulhi.
define amdgpu_kernel void @test_smul24_i64(i64 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b) #0 {
; SI-LABEL: test_smul24_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dword s2, s[0:1], 0x13
; SI-NEXT:    s_load_dword s0, s[0:1], 0x1c
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bfe_i32 s1, s2, 0x180000
; SI-NEXT:    s_bfe_i32 s0, s0, 0x180000
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    s_mul_i32 s1, s0, s1
; SI-NEXT:    v_mul_hi_i32_i24_e32 v1, s0, v0
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_smul24_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s4, s[0:1], 0x4c
; VI-NEXT:    s_load_dword s5, s[0:1], 0x70
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_bfe_i32 s4, s4, 0x180000
; VI-NEXT:    s_bfe_i32 s5, s5, 0x180000
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mul_hi_i32_i24_e32 v1, s5, v0
; VI-NEXT:    v_mul_i32_i24_e32 v0, s5, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_smul24_i64:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x4c
; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x70
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_bfe_i32 s0, s2, 0x180000
; GFX9-NEXT:    s_bfe_i32 s1, s3, 0x180000
; GFX9-NEXT:    s_mul_hi_i32 s2, s1, s0
; GFX9-NEXT:    s_mul_i32 s1, s1, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; EG-LABEL: test_smul24_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     LSHL T0.W, KC0[4].Z, literal.x,
; EG-NEXT:     LSHL * T1.W, KC0[6].W, literal.x,
; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT:     ASHR T1.W, PS, literal.x,
; EG-NEXT:     ASHR * T0.W, PV.W, literal.x,
; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT:     MULHI_INT * T0.Y, PV.W, PS,
; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
; EG-NEXT:     MULLO_INT * T0.X, T1.W, T0.W,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: test_smul24_i64:
; CM:       ; %bb.0:
; CM-NEXT:    ALU 14, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T0.X
; CM-NEXT:    CF_END
; CM-NEXT:    PAD
; CM-NEXT:    ALU clause starting at 4:
; CM-NEXT:     LSHL T0.Z, KC0[4].Z, literal.x,
; CM-NEXT:     LSHL * T0.W, KC0[6].W, literal.x,
; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
; CM-NEXT:     ASHR T1.Z, PV.W, literal.y,
; CM-NEXT:     ASHR * T0.W, PV.Z, literal.y,
; CM-NEXT:    2(2.802597e-45), 8(1.121039e-44)
; CM-NEXT:     MULLO_INT T1.X, T1.Z, T0.W,
; CM-NEXT:     MULLO_INT T1.Y (MASKED), T1.Z, T0.W,
; CM-NEXT:     MULLO_INT T1.Z (MASKED), T1.Z, T0.W,
; CM-NEXT:     MULLO_INT * T1.W (MASKED), T1.Z, T0.W,
; CM-NEXT:     MULHI_INT24 T1.X (MASKED), KC0[6].W, KC0[4].Z,
; CM-NEXT:     MULHI_INT24 T1.Y, KC0[6].W, KC0[4].Z,
; CM-NEXT:     MULHI_INT24 T1.Z (MASKED), KC0[6].W, KC0[4].Z,
; CM-NEXT:     MULHI_INT24 * T1.W (MASKED), KC0[6].W, KC0[4].Z,
  %shl.i = shl i32 %a, 8
  %shr.i = ashr i32 %shl.i, 8
  %conv.i = sext i32 %shr.i to i64
  %shl1.i = shl i32 %b, 8
  %shr2.i = ashr i32 %shl1.i, 8
  %conv3.i = sext i32 %shr2.i to i64
  %mul.i = mul i64 %conv3.i, %conv.i
  store i64 %mul.i, i64 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @test_smul24_i64_square(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
; SI-LABEL: test_smul24_i64_square:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bfe_i32 s4, s4, 0x180000
; SI-NEXT:    s_mul_i32 s5, s4, s4
; SI-NEXT:    v_mul_hi_i32_i24_e64 v1, s4, s4
; SI-NEXT:    v_mov_b32_e32 v0, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_smul24_i64_square:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_bfe_i32 s4, s4, 0x180000
; VI-NEXT:    v_mul_hi_i32_i24_e64 v1, s4, s4
; VI-NEXT:    v_mul_i32_i24_e64 v0, s4, s4
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_smul24_i64_square:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_bfe_i32 s0, s2, 0x180000
; GFX9-NEXT:    s_mul_hi_i32 s1, s0, s0
; GFX9-NEXT:    s_mul_i32 s0, s0, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; EG-LABEL: test_smul24_i64_square:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     LSHL * T0.W, KC0[2].Z, literal.x,
; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT:     ASHR * T0.W, PV.W, literal.x,
; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT:     MULHI_INT * T0.Y, PV.W, PV.W,
; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
; EG-NEXT:     MULLO_INT * T0.X, T0.W, T0.W,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: test_smul24_i64_square:
; CM:       ; %bb.0:
; CM-NEXT:    ALU 12, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T0.X
; CM-NEXT:    CF_END
; CM-NEXT:    PAD
; CM-NEXT:    ALU clause starting at 4:
; CM-NEXT:     LSHL * T0.W, KC0[2].Z, literal.x,
; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
; CM-NEXT:     ASHR * T0.W, PV.W, literal.y,
; CM-NEXT:    2(2.802597e-45), 8(1.121039e-44)
; CM-NEXT:     MULLO_INT T1.X, T0.W, T0.W,
; CM-NEXT:     MULLO_INT T1.Y (MASKED), T0.W, T0.W,
; CM-NEXT:     MULLO_INT T1.Z (MASKED), T0.W, T0.W,
; CM-NEXT:     MULLO_INT * T1.W (MASKED), T0.W, T0.W,
; CM-NEXT:     MULHI_INT24 T1.X (MASKED), KC0[2].Z, KC0[2].Z,
; CM-NEXT:     MULHI_INT24 T1.Y, KC0[2].Z, KC0[2].Z,
; CM-NEXT:     MULHI_INT24 T1.Z (MASKED), KC0[2].Z, KC0[2].Z,
; CM-NEXT:     MULHI_INT24 * T1.W (MASKED), KC0[2].Z, KC0[2].Z,
  %shl.i = shl i32 %a, 8
  %shr.i = ashr i32 %shl.i, 8
  %conv.i = sext i32 %shr.i to i64
  %mul.i = mul i64 %conv.i, %conv.i
  store i64 %mul.i, i64 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @test_smul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) #0 {
; SI-LABEL: test_smul24_i33:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
; SI-NEXT:    s_load_dword s0, s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshl_b32 s1, s2, 8
; SI-NEXT:    s_lshl_b32 s3, s0, 8
; SI-NEXT:    s_ashr_i64 s[2:3], s[2:3], 40
; SI-NEXT:    s_ashr_i64 s[0:1], s[0:1], 40
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    s_mul_i32 s1, s0, s2
; SI-NEXT:    v_mul_hi_i32_i24_e32 v1, s0, v0
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 31
; SI-NEXT:    v_ashr_i64 v[0:1], v[0:1], 31
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_smul24_i33:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshl_b32 s3, s2, 8
; VI-NEXT:    s_lshl_b32 s5, s4, 8
; VI-NEXT:    s_ashr_i64 s[4:5], s[4:5], 40
; VI-NEXT:    s_ashr_i64 s[2:3], s[2:3], 40
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mul_hi_i32_i24_e32 v1, s2, v0
; VI-NEXT:    v_mul_i32_i24_e32 v0, s2, v0
; VI-NEXT:    v_lshlrev_b64 v[0:1], 31, v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    v_ashrrev_i64 v[0:1], 31, v[0:1]
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_smul24_i33:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x34
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_lshl_b32 s1, s2, 8
; GFX9-NEXT:    s_ashr_i64 s[0:1], s[0:1], 40
; GFX9-NEXT:    s_lshl_b32 s1, s3, 8
; GFX9-NEXT:    s_ashr_i64 s[2:3], s[0:1], 40
; GFX9-NEXT:    s_mul_hi_i32 s1, s0, s2
; GFX9-NEXT:    s_mul_i32 s0, s0, s2
; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 31
; GFX9-NEXT:    s_ashr_i64 s[0:1], s[0:1], 31
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; EG-LABEL: test_smul24_i33:
; EG:       ; %bb.0: ; %entry
; EG-NEXT:    ALU 10, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XY, T2.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     LSHL T0.W, KC0[2].W, literal.x,
; EG-NEXT:     LSHL * T1.W, KC0[3].Y, literal.x,
; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT:     ASHR T1.W, PS, literal.x,
; EG-NEXT:     ASHR * T0.W, PV.W, literal.x,
; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT:     MULHI_INT * T0.X, PS, PV.W,
; EG-NEXT:     MULLO_INT * T1.X, T0.W, T1.W,
; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
; EG-NEXT:     BFE_INT * T1.Y, T0.X, 0.0, 1,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: test_smul24_i33:
; CM:       ; %bb.0: ; %entry
; CM-NEXT:    ALU 16, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T2.X
; CM-NEXT:    CF_END
; CM-NEXT:    PAD
; CM-NEXT:    ALU clause starting at 4:
; CM-NEXT:     LSHL T0.Z, KC0[2].W, literal.x,
; CM-NEXT:     LSHL * T0.W, KC0[3].Y, literal.x,
; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT:     ASHR T1.Z, PV.W, literal.x,
; CM-NEXT:     ASHR * T0.W, PV.Z, literal.x,
; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT:     MULHI_INT24 T0.X, KC0[2].W, KC0[3].Y,
; CM-NEXT:     MULHI_INT24 T0.Y (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT:     MULHI_INT24 T0.Z (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT:     MULHI_INT24 * T0.W (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT:     MULLO_INT T1.X, T0.W, T1.Z,
; CM-NEXT:     MULLO_INT T1.Y (MASKED), T0.W, T1.Z,
; CM-NEXT:     MULLO_INT T1.Z (MASKED), T0.W, T1.Z,
; CM-NEXT:     MULLO_INT * T1.W (MASKED), T0.W, T1.Z,
; CM-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
; CM-NEXT:     BFE_INT * T1.Y, T0.X, 0.0, 1,
; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
entry:
  %a.shl = shl i33 %a, 9
  %a.24 = ashr i33 %a.shl, 9
  %b.shl = shl i33 %b, 9
  %b.24 = ashr i33 %b.shl, 9
  %mul24 = mul i33 %a.24, %b.24
  %ext = sext i33 %mul24 to i64
  store i64 %ext, i64 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @test_smulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) {
; SI-LABEL: test_smulhi24_i33:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
; SI-NEXT:    s_load_dword s5, s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mul_hi_i32_i24_e32 v0, s5, v0
; SI-NEXT:    v_and_b32_e32 v0, 1, v0
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_smulhi24_i33:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
; VI-NEXT:    s_load_dword s5, s[0:1], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mul_hi_i32_i24_e32 v0, s5, v0
; VI-NEXT:    v_and_b32_e32 v0, 1, v0
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: test_smulhi24_i33:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x34
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_lshl_b32 s1, s2, 8
; GFX9-NEXT:    s_ashr_i64 s[0:1], s[0:1], 40
; GFX9-NEXT:    s_lshl_b32 s1, s3, 8
; GFX9-NEXT:    s_ashr_i64 s[2:3], s[0:1], 40
; GFX9-NEXT:    s_mul_hi_i32 s0, s0, s2
; GFX9-NEXT:    s_and_b32 s0, s0, 1
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; EG-LABEL: test_smulhi24_i33:
; EG:       ; %bb.0: ; %entry
; EG-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     LSHL T0.W, KC0[2].W, literal.x,
; EG-NEXT:     LSHL * T1.W, KC0[3].Y, literal.x,
; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT:     ASHR T1.W, PS, literal.x,
; EG-NEXT:     ASHR * T0.W, PV.W, literal.x,
; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT:     MULHI_INT * T0.X, PS, PV.W,
; EG-NEXT:     AND_INT T0.X, PS, 1,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: test_smulhi24_i33:
; CM:       ; %bb.0: ; %entry
; CM-NEXT:    ALU 6, @4, KC0[CB0:0-32], KC1[]
; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT:    CF_END
; CM-NEXT:    PAD
; CM-NEXT:    ALU clause starting at 4:
; CM-NEXT:     MULHI_INT24 T0.X, KC0[2].W, KC0[3].Y,
; CM-NEXT:     MULHI_INT24 T0.Y (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT:     MULHI_INT24 T0.Z (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT:     MULHI_INT24 * T0.W (MASKED), KC0[2].W, KC0[3].Y,
; CM-NEXT:     AND_INT * T0.X, PV.X, 1,
; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
entry:
  %tmp0 = shl i33 %a, 9
  %a_24 = ashr i33 %tmp0, 9
  %tmp1 = shl i33 %b, 9
  %b_24 = ashr i33 %tmp1, 9
  %tmp2 = mul i33 %a_24, %b_24
  %hi = lshr i33 %tmp2, 32
  %trunc = trunc i33 %hi to i32

  store i32 %trunc, i32 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @simplify_i24_crash(<2 x i32> addrspace(1)* %out, i32 %arg0, <2 x i32> %arg1, <2 x i32> %arg2) {
; SI-LABEL: simplify_i24_crash:
; SI:       ; %bb.0: ; %bb
; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_lg_u32 s2, 0
; SI-NEXT:    s_cbranch_scc0 .LBB8_2
; SI-NEXT:  ; %bb.1: ; %bb7
; SI-NEXT:    s_endpgm
; SI-NEXT:  .LBB8_2: ; %bb11
; SI-NEXT:    s_load_dword s2, s[0:1], 0xd
; SI-NEXT:    s_load_dword s4, s[0:1], 0xf
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bfe_i32 s2, s2, 0x180000
; SI-NEXT:    s_bfe_i32 s4, s4, 0x180000
; SI-NEXT:    s_mul_i32 s4, s2, s4
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s4
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: simplify_i24_crash:
; VI:       ; %bb.0: ; %bb
; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_lg_u32 s2, 0
; VI-NEXT:    s_cbranch_scc0 .LBB8_2
; VI-NEXT:  ; %bb.1: ; %bb7
; VI-NEXT:    s_endpgm
; VI-NEXT:  .LBB8_2: ; %bb11
; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
; VI-NEXT:    s_load_dword s5, s[0:1], 0x3c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_bfe_i32 s4, s4, 0x180000
; VI-NEXT:    s_bfe_i32 s5, s5, 0x180000
; VI-NEXT:    s_mul_i32 s4, s4, s5
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s4
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; GFX9-LABEL: simplify_i24_crash:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
; GFX9-NEXT:    s_cbranch_scc0 .LBB8_2
; GFX9-NEXT:  ; %bb.1: ; %bb7
; GFX9-NEXT:    s_endpgm
; GFX9-NEXT:  .LBB8_2: ; %bb11
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x34
; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x3c
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_bfe_i32 s0, s2, 0x180000
; GFX9-NEXT:    s_bfe_i32 s1, s3, 0x180000
; GFX9-NEXT:    s_mul_i32 s0, s0, s1
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; EG-LABEL: simplify_i24_crash:
; EG:       ; %bb.0: ; %bb
; EG-NEXT:    ALU_PUSH_BEFORE 1, @6, KC0[CB0:0-32], KC1[]
; EG-NEXT:    JUMP @5 POP:1
; EG-NEXT:    ALU 10, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 0
; EG-NEXT:    POP @5 POP:1
; EG-NEXT:    CF_END
; EG-NEXT:    ALU clause starting at 6:
; EG-NEXT:     SETNE_INT * T0.W, KC0[2].Z, 0.0,
; EG-NEXT:     PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     LSHL T0.W, KC0[2].W, literal.x,
; EG-NEXT:     LSHL * T1.W, KC0[3].Y, literal.x,
; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT:     ASHR T1.W, PS, literal.x,
; EG-NEXT:     ASHR * T0.W, PV.W, literal.x,
; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT:     MOV T2.W, KC0[2].Y,
; EG-NEXT:     MULLO_INT * T0.X, PS, PV.W,
; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
; EG-NEXT:     MOV * T0.Y, PS,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: simplify_i24_crash:
; CM:       ; %bb.0: ; %bb
; CM-NEXT:    ALU_PUSH_BEFORE 1, @6, KC0[CB0:0-32], KC1[]
; CM-NEXT:    JUMP @5 POP:1
; CM-NEXT:    ALU 13, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
; CM-NEXT:    POP @5 POP:1
; CM-NEXT:    CF_END
; CM-NEXT:    ALU clause starting at 6:
; CM-NEXT:     SETNE_INT * T0.W, KC0[2].Z, 0.0,
; CM-NEXT:     PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
; CM-NEXT:    ALU clause starting at 8:
; CM-NEXT:     LSHL T0.Z, KC0[2].W, literal.x,
; CM-NEXT:     LSHL * T0.W, KC0[3].Y, literal.x,
; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT:     MOV T0.Y, KC0[2].Y,
; CM-NEXT:     ASHR T1.Z, PV.W, literal.x,
; CM-NEXT:     ASHR * T0.W, PV.Z, literal.x,
; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT:     MULLO_INT T0.X, T0.W, T1.Z,
; CM-NEXT:     MULLO_INT T0.Y (MASKED), T0.W, T1.Z,
; CM-NEXT:     MULLO_INT T0.Z (MASKED), T0.W, T1.Z,
; CM-NEXT:     MULLO_INT * T0.W (MASKED), T0.W, T1.Z,
; CM-NEXT:     LSHR T1.X, T0.Y, literal.x,
; CM-NEXT:     MOV * T0.Y, PV.X,
; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
bb:
  %cmp = icmp eq i32 %arg0, 0
  br i1 %cmp, label %bb11, label %bb7

bb11:
  %tmp14 = shufflevector <2 x i32> %arg1, <2 x i32> undef, <2 x i32> zeroinitializer
  %tmp16 = shufflevector <2 x i32> %arg2, <2 x i32> undef, <2 x i32> zeroinitializer
  %tmp17 = shl <2 x i32> %tmp14, <i32 8, i32 8>
  %tmp18 = ashr <2 x i32> %tmp17, <i32 8, i32 8>
  %tmp19 = shl <2 x i32> %tmp16, <i32 8, i32 8>
  %tmp20 = ashr <2 x i32> %tmp19, <i32 8, i32 8>
  %tmp21 = mul <2 x i32> %tmp18, %tmp20
  store <2 x i32> %tmp21, <2 x i32> addrspace(1)* %out
  br label %bb7

bb7:
  ret void

}
attributes #0 = { nounwind }