1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
3; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
4; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
5; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
6; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM %s
7
8; Signed 24-bit multiply is not supported on pre-Cayman GPUs.
; test_smul24_i32 - both i32 kernel arguments are sign-extended from their
; low 24 bits (shl 8 followed by ashr 8) and multiplied; the 32-bit product
; is stored to %out. The GCN checks below expect s_bfe_i32 + s_mul_i32, the
; R600 checks expect LSHL/ASHR followed by MULLO_INT.
9define amdgpu_kernel void @test_smul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
10; SI-LABEL: test_smul24_i32:
11; SI:       ; %bb.0: ; %entry
12; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
13; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
14; SI-NEXT:    s_mov_b32 s3, 0xf000
15; SI-NEXT:    s_waitcnt lgkmcnt(0)
16; SI-NEXT:    s_bfe_i32 s2, s4, 0x180000
17; SI-NEXT:    s_bfe_i32 s4, s5, 0x180000
18; SI-NEXT:    s_mul_i32 s4, s2, s4
19; SI-NEXT:    s_mov_b32 s2, -1
20; SI-NEXT:    v_mov_b32_e32 v0, s4
21; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
22; SI-NEXT:    s_endpgm
23;
24; VI-LABEL: test_smul24_i32:
25; VI:       ; %bb.0: ; %entry
26; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
27; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
28; VI-NEXT:    s_mov_b32 s3, 0xf000
29; VI-NEXT:    s_mov_b32 s2, -1
30; VI-NEXT:    s_waitcnt lgkmcnt(0)
31; VI-NEXT:    s_bfe_i32 s4, s4, 0x180000
32; VI-NEXT:    s_bfe_i32 s5, s5, 0x180000
33; VI-NEXT:    s_mul_i32 s4, s4, s5
34; VI-NEXT:    v_mov_b32_e32 v0, s4
35; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
36; VI-NEXT:    s_endpgm
37;
38; GFX9-LABEL: test_smul24_i32:
39; GFX9:       ; %bb.0: ; %entry
40; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
41; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
42; GFX9-NEXT:    s_mov_b32 s7, 0xf000
43; GFX9-NEXT:    s_mov_b32 s6, -1
44; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
45; GFX9-NEXT:    s_bfe_i32 s0, s2, 0x180000
46; GFX9-NEXT:    s_bfe_i32 s1, s3, 0x180000
47; GFX9-NEXT:    s_mul_i32 s0, s0, s1
48; GFX9-NEXT:    v_mov_b32_e32 v0, s0
49; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
50; GFX9-NEXT:    s_endpgm
51;
52; EG-LABEL: test_smul24_i32:
53; EG:       ; %bb.0: ; %entry
54; EG-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
55; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
56; EG-NEXT:    CF_END
57; EG-NEXT:    PAD
58; EG-NEXT:    ALU clause starting at 4:
59; EG-NEXT:     LSHL T0.W, KC0[2].Z, literal.x,
60; EG-NEXT:     LSHL * T1.W, KC0[2].W, literal.x,
61; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
62; EG-NEXT:     ASHR T1.W, PS, literal.x,
63; EG-NEXT:     ASHR * T0.W, PV.W, literal.x,
64; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
65; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
66; EG-NEXT:     MULLO_INT * T1.X, PS, PV.W,
67; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
68;
69; CM-LABEL: test_smul24_i32:
70; CM:       ; %bb.0: ; %entry
71; CM-NEXT:    ALU 10, @4, KC0[CB0:0-32], KC1[]
72; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
73; CM-NEXT:    CF_END
74; CM-NEXT:    PAD
75; CM-NEXT:    ALU clause starting at 4:
76; CM-NEXT:     LSHL T0.Z, KC0[2].Z, literal.x,
77; CM-NEXT:     LSHL * T0.W, KC0[2].W, literal.x,
78; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
79; CM-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
80; CM-NEXT:     ASHR T1.Z, PV.W, literal.y,
81; CM-NEXT:     ASHR * T0.W, PV.Z, literal.y,
82; CM-NEXT:    2(2.802597e-45), 8(1.121039e-44)
83; CM-NEXT:     MULLO_INT T1.X, T0.W, T1.Z,
84; CM-NEXT:     MULLO_INT T1.Y (MASKED), T0.W, T1.Z,
85; CM-NEXT:     MULLO_INT T1.Z (MASKED), T0.W, T1.Z,
86; CM-NEXT:     MULLO_INT * T1.W (MASKED), T0.W, T1.Z,
87entry:
  ; shl 8 / ashr 8 keeps the low 24 bits of each operand and sign-extends them.
88  %a.shl = shl i32 %a, 8
89  %a.24 = ashr i32 %a.shl, 8
90  %b.shl = shl i32 %b, 8
91  %b.24 = ashr i32 %b.shl, 8
92  %mul24 = mul i32 %a.24, %b.24
93  store i32 %mul24, i32 addrspace(1)* %out
94  ret void
95}
96
; test_smulhi24_i64 - the two i32 arguments are sign-extended from 24 bits,
; widened to i64 and multiplied; only bits [63:32] of the product are stored.
; The checks below expect v_mul_hi_i32_i24 for the SI and VI prefixes,
; s_bfe_i32 + s_mul_hi_i32 for the GFX9 prefix, shifts + MULHI_INT for the
; redwood run and a bare MULHI_INT24 for the cayman run.
97define amdgpu_kernel void @test_smulhi24_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
98; SI-LABEL: test_smulhi24_i64:
99; SI:       ; %bb.0: ; %entry
100; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
101; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
102; SI-NEXT:    s_mov_b32 s3, 0xf000
103; SI-NEXT:    s_mov_b32 s2, -1
104; SI-NEXT:    s_waitcnt lgkmcnt(0)
105; SI-NEXT:    v_mov_b32_e32 v0, s5
106; SI-NEXT:    v_mul_hi_i32_i24_e32 v0, s4, v0
107; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
108; SI-NEXT:    s_endpgm
109;
110; VI-LABEL: test_smulhi24_i64:
111; VI:       ; %bb.0: ; %entry
112; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
113; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
114; VI-NEXT:    s_mov_b32 s3, 0xf000
115; VI-NEXT:    s_mov_b32 s2, -1
116; VI-NEXT:    s_waitcnt lgkmcnt(0)
117; VI-NEXT:    v_mov_b32_e32 v0, s5
118; VI-NEXT:    v_mul_hi_i32_i24_e32 v0, s4, v0
119; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
120; VI-NEXT:    s_endpgm
121;
122; GFX9-LABEL: test_smulhi24_i64:
123; GFX9:       ; %bb.0: ; %entry
124; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
125; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
126; GFX9-NEXT:    s_mov_b32 s7, 0xf000
127; GFX9-NEXT:    s_mov_b32 s6, -1
128; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
129; GFX9-NEXT:    s_bfe_i32 s0, s2, 0x180000
130; GFX9-NEXT:    s_bfe_i32 s1, s3, 0x180000
131; GFX9-NEXT:    s_mul_hi_i32 s0, s0, s1
132; GFX9-NEXT:    v_mov_b32_e32 v0, s0
133; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
134; GFX9-NEXT:    s_endpgm
135;
136; EG-LABEL: test_smulhi24_i64:
137; EG:       ; %bb.0: ; %entry
138; EG-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
139; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
140; EG-NEXT:    CF_END
141; EG-NEXT:    PAD
142; EG-NEXT:    ALU clause starting at 4:
143; EG-NEXT:     LSHL T0.W, KC0[2].Z, literal.x,
144; EG-NEXT:     LSHL * T1.W, KC0[2].W, literal.x,
145; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
146; EG-NEXT:     ASHR T1.W, PS, literal.x,
147; EG-NEXT:     ASHR * T0.W, PV.W, literal.x,
148; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
149; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
150; EG-NEXT:     MULHI_INT * T1.X, PS, PV.W,
151; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
152;
153; CM-LABEL: test_smulhi24_i64:
154; CM:       ; %bb.0: ; %entry
155; CM-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
156; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
157; CM-NEXT:    CF_END
158; CM-NEXT:    PAD
159; CM-NEXT:    ALU clause starting at 4:
160; CM-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
161; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
162; CM-NEXT:     MULHI_INT24 T1.X, KC0[2].Z, KC0[2].W,
163; CM-NEXT:     MULHI_INT24 T1.Y (MASKED), KC0[2].Z, KC0[2].W,
164; CM-NEXT:     MULHI_INT24 T1.Z (MASKED), KC0[2].Z, KC0[2].W,
165; CM-NEXT:     MULHI_INT24 * T1.W (MASKED), KC0[2].Z, KC0[2].W,
166entry:
  ; Sign-extend both operands from 24 bits, then widen to i64 for the multiply.
167  %a.shl = shl i32 %a, 8
168  %a.24 = ashr i32 %a.shl, 8
169  %b.shl = shl i32 %b, 8
170  %b.24 = ashr i32 %b.shl, 8
171  %a.24.i64 = sext i32 %a.24 to i64
172  %b.24.i64 = sext i32 %b.24 to i64
173  %mul48 = mul i64 %a.24.i64, %b.24.i64
  ; Keep only the upper 32 bits of the 64-bit product.
174  %mul48.hi = lshr i64 %mul48, 32
175  %mul24hi = trunc i64 %mul48.hi to i32
176  store i32 %mul24hi, i32 addrspace(1)* %out
177  ret void
178}
179
; test_smul48_i64 - non-kernel function that sign-extends both i64 operands
; from their low 24 bits (shl 40 / ashr 40) and returns the i64 product.
; The GCN checks expect the pair v_mul_i32_i24 (low half) and
; v_mul_hi_i32_i24 (high half) with no explicit shifts. The R600 checks
; contain only CF_END / PAD for this function.
180define i64 @test_smul48_i64(i64 %lhs, i64 %rhs) {
181; SI-LABEL: test_smul48_i64:
182; SI:       ; %bb.0:
183; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
184; SI-NEXT:    v_mul_i32_i24_e32 v3, v0, v2
185; SI-NEXT:    v_mul_hi_i32_i24_e32 v1, v0, v2
186; SI-NEXT:    v_mov_b32_e32 v0, v3
187; SI-NEXT:    s_setpc_b64 s[30:31]
188;
189; VI-LABEL: test_smul48_i64:
190; VI:       ; %bb.0:
191; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
192; VI-NEXT:    v_mul_i32_i24_e32 v3, v0, v2
193; VI-NEXT:    v_mul_hi_i32_i24_e32 v1, v0, v2
194; VI-NEXT:    v_mov_b32_e32 v0, v3
195; VI-NEXT:    s_setpc_b64 s[30:31]
196;
197; GFX9-LABEL: test_smul48_i64:
198; GFX9:       ; %bb.0:
199; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
200; GFX9-NEXT:    v_mul_i32_i24_e32 v3, v0, v2
201; GFX9-NEXT:    v_mul_hi_i32_i24_e32 v1, v0, v2
202; GFX9-NEXT:    v_mov_b32_e32 v0, v3
203; GFX9-NEXT:    s_setpc_b64 s[30:31]
204;
205; EG-LABEL: test_smul48_i64:
206; EG:       ; %bb.0:
207; EG-NEXT:    CF_END
208; EG-NEXT:    PAD
209;
210; CM-LABEL: test_smul48_i64:
211; CM:       ; %bb.0:
212; CM-NEXT:    CF_END
213; CM-NEXT:    PAD
  ; shl 40 / ashr 40 on an i64 keeps the low 24 bits and sign-extends them.
214  %shl.lhs = shl i64 %lhs, 40
215  %lhs24 = ashr i64 %shl.lhs, 40
216  %shl.rhs = shl i64 %rhs, 40
217  %rhs24 = ashr i64 %shl.rhs, 40
218  %mul = mul i64 %lhs24, %rhs24
219  ret i64 %mul
220}
221
; test_smul48_v2i64 - <2 x i64> variant of test_smul48_i64: every lane of
; both operands is sign-extended from 24 bits (shl 40 / ashr 40) and the
; lanes are multiplied. Unlike the scalar case, the GCN checks below still
; contain the explicit shift instructions ahead of the
; v_mul_i32_i24 / v_mul_hi_i32_i24 pairs.
222define <2 x i64> @test_smul48_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
223; SI-LABEL: test_smul48_v2i64:
224; SI:       ; %bb.0:
225; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
226; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
227; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v0
228; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v6
229; SI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
230; SI-NEXT:    v_ashr_i64 v[5:6], v[0:1], 40
231; SI-NEXT:    v_ashr_i64 v[1:2], v[1:2], 40
232; SI-NEXT:    v_ashr_i64 v[6:7], v[2:3], 40
233; SI-NEXT:    v_ashr_i64 v[2:3], v[3:4], 40
234; SI-NEXT:    v_mul_i32_i24_e32 v0, v1, v2
235; SI-NEXT:    v_mul_hi_i32_i24_e32 v1, v1, v2
236; SI-NEXT:    v_mul_i32_i24_e32 v2, v5, v6
237; SI-NEXT:    v_mul_hi_i32_i24_e32 v3, v5, v6
238; SI-NEXT:    s_setpc_b64 s[30:31]
239;
240; VI-LABEL: test_smul48_v2i64:
241; VI:       ; %bb.0:
242; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
243; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
244; VI-NEXT:    v_ashrrev_i64 v[7:8], 40, v[0:1]
245; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
246; VI-NEXT:    v_ashrrev_i64 v[1:2], 40, v[0:1]
247; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v6
248; VI-NEXT:    v_lshlrev_b32_e32 v3, 8, v4
249; VI-NEXT:    v_ashrrev_i64 v[3:4], 40, v[2:3]
250; VI-NEXT:    v_ashrrev_i64 v[4:5], 40, v[1:2]
251; VI-NEXT:    v_mul_i32_i24_e32 v0, v1, v3
252; VI-NEXT:    v_mul_hi_i32_i24_e32 v1, v1, v3
253; VI-NEXT:    v_mul_i32_i24_e32 v2, v7, v4
254; VI-NEXT:    v_mul_hi_i32_i24_e32 v3, v7, v4
255; VI-NEXT:    s_setpc_b64 s[30:31]
256;
257; GFX9-LABEL: test_smul48_v2i64:
258; GFX9:       ; %bb.0:
259; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
260; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
261; GFX9-NEXT:    v_ashrrev_i64 v[7:8], 40, v[0:1]
262; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v0
263; GFX9-NEXT:    v_ashrrev_i64 v[1:2], 40, v[0:1]
264; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v6
265; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 8, v4
266; GFX9-NEXT:    v_ashrrev_i64 v[3:4], 40, v[2:3]
267; GFX9-NEXT:    v_ashrrev_i64 v[4:5], 40, v[1:2]
268; GFX9-NEXT:    v_mul_i32_i24_e32 v0, v1, v3
269; GFX9-NEXT:    v_mul_hi_i32_i24_e32 v1, v1, v3
270; GFX9-NEXT:    v_mul_i32_i24_e32 v2, v7, v4
271; GFX9-NEXT:    v_mul_hi_i32_i24_e32 v3, v7, v4
272; GFX9-NEXT:    s_setpc_b64 s[30:31]
273;
274; EG-LABEL: test_smul48_v2i64:
275; EG:       ; %bb.0:
276; EG-NEXT:    CF_END
277; EG-NEXT:    PAD
278;
279; CM-LABEL: test_smul48_v2i64:
280; CM:       ; %bb.0:
281; CM-NEXT:    CF_END
282; CM-NEXT:    PAD
  ; Per-lane sign extension from 24 bits, then a vector multiply.
283  %shl.lhs = shl <2 x i64> %lhs, <i64 40, i64 40>
284  %lhs24 = ashr <2 x i64> %shl.lhs, <i64 40, i64 40>
285  %shl.rhs = shl <2 x i64> %rhs, <i64 40, i64 40>
286  %rhs24 = ashr <2 x i64> %shl.rhs, <i64 40, i64 40>
287  %mul = mul <2 x i64> %lhs24, %rhs24
288  ret <2 x i64> %mul
289}
290
291; This requires handling the original 64-bit mul node to eliminate the
292; unnecessary extension instructions: after legalization,
293; SimplifyDemandedBits will not remove them because they have
294; multiple uses by the separate mul and mulhi.
; test_smul24_i64 - multiplies two i32 arguments that were sign-extended
; from 24 bits and then widened to i64, and stores the full 64-bit product.
; Unnamed [8 x i32] parameters sit in front of %a and %b; the checks show
; the two scalars being fetched by separate s_load_dword instructions.
; NOTE(review): the padding is presumably there to keep the two arguments
; from being loaded together - confirm.
295define amdgpu_kernel void @test_smul24_i64(i64 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b) #0 {
296; SI-LABEL: test_smul24_i64:
297; SI:       ; %bb.0:
298; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
299; SI-NEXT:    s_load_dword s2, s[0:1], 0x13
300; SI-NEXT:    s_load_dword s0, s[0:1], 0x1c
301; SI-NEXT:    s_mov_b32 s7, 0xf000
302; SI-NEXT:    s_mov_b32 s6, -1
303; SI-NEXT:    s_waitcnt lgkmcnt(0)
304; SI-NEXT:    s_bfe_i32 s1, s2, 0x180000
305; SI-NEXT:    s_bfe_i32 s0, s0, 0x180000
306; SI-NEXT:    v_mov_b32_e32 v0, s1
307; SI-NEXT:    s_mul_i32 s1, s0, s1
308; SI-NEXT:    v_mul_hi_i32_i24_e32 v1, s0, v0
309; SI-NEXT:    v_mov_b32_e32 v0, s1
310; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
311; SI-NEXT:    s_endpgm
312;
313; VI-LABEL: test_smul24_i64:
314; VI:       ; %bb.0:
315; VI-NEXT:    s_load_dword s4, s[0:1], 0x4c
316; VI-NEXT:    s_load_dword s5, s[0:1], 0x70
317; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
318; VI-NEXT:    s_mov_b32 s3, 0xf000
319; VI-NEXT:    s_mov_b32 s2, -1
320; VI-NEXT:    s_waitcnt lgkmcnt(0)
321; VI-NEXT:    s_bfe_i32 s4, s4, 0x180000
322; VI-NEXT:    s_bfe_i32 s5, s5, 0x180000
323; VI-NEXT:    v_mov_b32_e32 v0, s4
324; VI-NEXT:    v_mul_hi_i32_i24_e32 v1, s5, v0
325; VI-NEXT:    v_mul_i32_i24_e32 v0, s5, v0
326; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
327; VI-NEXT:    s_endpgm
328;
329; GFX9-LABEL: test_smul24_i64:
330; GFX9:       ; %bb.0:
331; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x4c
332; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x70
333; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
334; GFX9-NEXT:    s_mov_b32 s7, 0xf000
335; GFX9-NEXT:    s_mov_b32 s6, -1
336; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
337; GFX9-NEXT:    s_bfe_i32 s0, s2, 0x180000
338; GFX9-NEXT:    s_bfe_i32 s1, s3, 0x180000
339; GFX9-NEXT:    s_mul_hi_i32 s2, s1, s0
340; GFX9-NEXT:    s_mul_i32 s1, s1, s0
341; GFX9-NEXT:    v_mov_b32_e32 v0, s1
342; GFX9-NEXT:    v_mov_b32_e32 v1, s2
343; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
344; GFX9-NEXT:    s_endpgm
345;
346; EG-LABEL: test_smul24_i64:
347; EG:       ; %bb.0:
348; EG-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
349; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
350; EG-NEXT:    CF_END
351; EG-NEXT:    PAD
352; EG-NEXT:    ALU clause starting at 4:
353; EG-NEXT:     LSHL T0.W, KC0[4].Z, literal.x,
354; EG-NEXT:     LSHL * T1.W, KC0[6].W, literal.x,
355; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
356; EG-NEXT:     ASHR T1.W, PS, literal.x,
357; EG-NEXT:     ASHR * T0.W, PV.W, literal.x,
358; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
359; EG-NEXT:     MULHI_INT * T0.Y, PV.W, PS,
360; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
361; EG-NEXT:     MULLO_INT * T0.X, T1.W, T0.W,
362; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
363;
364; CM-LABEL: test_smul24_i64:
365; CM:       ; %bb.0:
366; CM-NEXT:    ALU 14, @4, KC0[CB0:0-32], KC1[]
367; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T0.X
368; CM-NEXT:    CF_END
369; CM-NEXT:    PAD
370; CM-NEXT:    ALU clause starting at 4:
371; CM-NEXT:     LSHL T0.Z, KC0[4].Z, literal.x,
372; CM-NEXT:     LSHL * T0.W, KC0[6].W, literal.x,
373; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
374; CM-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
375; CM-NEXT:     ASHR T1.Z, PV.W, literal.y,
376; CM-NEXT:     ASHR * T0.W, PV.Z, literal.y,
377; CM-NEXT:    2(2.802597e-45), 8(1.121039e-44)
378; CM-NEXT:     MULLO_INT T1.X, T1.Z, T0.W,
379; CM-NEXT:     MULLO_INT T1.Y (MASKED), T1.Z, T0.W,
380; CM-NEXT:     MULLO_INT T1.Z (MASKED), T1.Z, T0.W,
381; CM-NEXT:     MULLO_INT * T1.W (MASKED), T1.Z, T0.W,
382; CM-NEXT:     MULHI_INT24 T1.X (MASKED), KC0[6].W, KC0[4].Z,
383; CM-NEXT:     MULHI_INT24 T1.Y, KC0[6].W, KC0[4].Z,
384; CM-NEXT:     MULHI_INT24 T1.Z (MASKED), KC0[6].W, KC0[4].Z,
385; CM-NEXT:     MULHI_INT24 * T1.W (MASKED), KC0[6].W, KC0[4].Z,
  ; Each operand: sign-extend from 24 bits in i32, then sext to i64.
386  %shl.i = shl i32 %a, 8
387  %shr.i = ashr i32 %shl.i, 8
388  %conv.i = sext i32 %shr.i to i64
389  %shl1.i = shl i32 %b, 8
390  %shr2.i = ashr i32 %shl1.i, 8
391  %conv3.i = sext i32 %shr2.i to i64
392  %mul.i = mul i64 %conv3.i, %conv.i
393  store i64 %mul.i, i64 addrspace(1)* %out
394  ret void
395}
396
; test_smul24_i64_square - squares %a after sign-extending it from 24 bits
; and widening it to i64, so both multiply operands are the same value.
; The 64-bit result is stored to %out. %b is declared but never used in
; the body.
397define amdgpu_kernel void @test_smul24_i64_square(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
398; SI-LABEL: test_smul24_i64_square:
399; SI:       ; %bb.0:
400; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
401; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
402; SI-NEXT:    s_mov_b32 s3, 0xf000
403; SI-NEXT:    s_mov_b32 s2, -1
404; SI-NEXT:    s_waitcnt lgkmcnt(0)
405; SI-NEXT:    s_bfe_i32 s4, s4, 0x180000
406; SI-NEXT:    s_mul_i32 s5, s4, s4
407; SI-NEXT:    v_mul_hi_i32_i24_e64 v1, s4, s4
408; SI-NEXT:    v_mov_b32_e32 v0, s5
409; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
410; SI-NEXT:    s_endpgm
411;
412; VI-LABEL: test_smul24_i64_square:
413; VI:       ; %bb.0:
414; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
415; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
416; VI-NEXT:    s_mov_b32 s3, 0xf000
417; VI-NEXT:    s_mov_b32 s2, -1
418; VI-NEXT:    s_waitcnt lgkmcnt(0)
419; VI-NEXT:    s_bfe_i32 s4, s4, 0x180000
420; VI-NEXT:    v_mul_hi_i32_i24_e64 v1, s4, s4
421; VI-NEXT:    v_mul_i32_i24_e64 v0, s4, s4
422; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
423; VI-NEXT:    s_endpgm
424;
425; GFX9-LABEL: test_smul24_i64_square:
426; GFX9:       ; %bb.0:
427; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
428; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
429; GFX9-NEXT:    s_mov_b32 s7, 0xf000
430; GFX9-NEXT:    s_mov_b32 s6, -1
431; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
432; GFX9-NEXT:    s_bfe_i32 s0, s2, 0x180000
433; GFX9-NEXT:    s_mul_hi_i32 s1, s0, s0
434; GFX9-NEXT:    s_mul_i32 s0, s0, s0
435; GFX9-NEXT:    v_mov_b32_e32 v0, s0
436; GFX9-NEXT:    v_mov_b32_e32 v1, s1
437; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
438; GFX9-NEXT:    s_endpgm
439;
440; EG-LABEL: test_smul24_i64_square:
441; EG:       ; %bb.0:
442; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
443; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
444; EG-NEXT:    CF_END
445; EG-NEXT:    PAD
446; EG-NEXT:    ALU clause starting at 4:
447; EG-NEXT:     LSHL * T0.W, KC0[2].Z, literal.x,
448; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
449; EG-NEXT:     ASHR * T0.W, PV.W, literal.x,
450; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
451; EG-NEXT:     MULHI_INT * T0.Y, PV.W, PV.W,
452; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
453; EG-NEXT:     MULLO_INT * T0.X, T0.W, T0.W,
454; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
455;
456; CM-LABEL: test_smul24_i64_square:
457; CM:       ; %bb.0:
458; CM-NEXT:    ALU 12, @4, KC0[CB0:0-32], KC1[]
459; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T0.X
460; CM-NEXT:    CF_END
461; CM-NEXT:    PAD
462; CM-NEXT:    ALU clause starting at 4:
463; CM-NEXT:     LSHL * T0.W, KC0[2].Z, literal.x,
464; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
465; CM-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
466; CM-NEXT:     ASHR * T0.W, PV.W, literal.y,
467; CM-NEXT:    2(2.802597e-45), 8(1.121039e-44)
468; CM-NEXT:     MULLO_INT T1.X, T0.W, T0.W,
469; CM-NEXT:     MULLO_INT T1.Y (MASKED), T0.W, T0.W,
470; CM-NEXT:     MULLO_INT T1.Z (MASKED), T0.W, T0.W,
471; CM-NEXT:     MULLO_INT * T1.W (MASKED), T0.W, T0.W,
472; CM-NEXT:     MULHI_INT24 T1.X (MASKED), KC0[2].Z, KC0[2].Z,
473; CM-NEXT:     MULHI_INT24 T1.Y, KC0[2].Z, KC0[2].Z,
474; CM-NEXT:     MULHI_INT24 T1.Z (MASKED), KC0[2].Z, KC0[2].Z,
475; CM-NEXT:     MULHI_INT24 * T1.W (MASKED), KC0[2].Z, KC0[2].Z,
  ; Only %a is extended; the multiply uses %conv.i for both operands.
476  %shl.i = shl i32 %a, 8
477  %shr.i = ashr i32 %shl.i, 8
478  %conv.i = sext i32 %shr.i to i64
479  %mul.i = mul i64 %conv.i, %conv.i
480  store i64 %mul.i, i64 addrspace(1)* %out
481  ret void
482}
483
; test_smul24_i33 - same pattern on the non-power-of-two type i33: both
; operands are sign-extended from 24 bits (shl 9 / ashr 9 on i33), the i33
; product is sign-extended to i64 and stored. The GCN checks show the
; final extension as a 64-bit shift left by 31 followed by an arithmetic
; shift right by 31; the R600 checks show it as BFE_INT on the high word.
484define amdgpu_kernel void @test_smul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) #0 {
485; SI-LABEL: test_smul24_i33:
486; SI:       ; %bb.0: ; %entry
487; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
488; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
489; SI-NEXT:    s_load_dword s0, s[0:1], 0xd
490; SI-NEXT:    s_mov_b32 s7, 0xf000
491; SI-NEXT:    s_mov_b32 s6, -1
492; SI-NEXT:    s_waitcnt lgkmcnt(0)
493; SI-NEXT:    s_lshl_b32 s1, s2, 8
494; SI-NEXT:    s_lshl_b32 s3, s0, 8
495; SI-NEXT:    s_ashr_i64 s[2:3], s[2:3], 40
496; SI-NEXT:    s_ashr_i64 s[0:1], s[0:1], 40
497; SI-NEXT:    v_mov_b32_e32 v0, s2
498; SI-NEXT:    s_mul_i32 s1, s0, s2
499; SI-NEXT:    v_mul_hi_i32_i24_e32 v1, s0, v0
500; SI-NEXT:    v_mov_b32_e32 v0, s1
501; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 31
502; SI-NEXT:    v_ashr_i64 v[0:1], v[0:1], 31
503; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
504; SI-NEXT:    s_endpgm
505;
506; VI-LABEL: test_smul24_i33:
507; VI:       ; %bb.0: ; %entry
508; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
509; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
510; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
511; VI-NEXT:    s_waitcnt lgkmcnt(0)
512; VI-NEXT:    s_lshl_b32 s3, s2, 8
513; VI-NEXT:    s_lshl_b32 s5, s4, 8
514; VI-NEXT:    s_ashr_i64 s[4:5], s[4:5], 40
515; VI-NEXT:    s_ashr_i64 s[2:3], s[2:3], 40
516; VI-NEXT:    v_mov_b32_e32 v0, s4
517; VI-NEXT:    v_mul_hi_i32_i24_e32 v1, s2, v0
518; VI-NEXT:    v_mul_i32_i24_e32 v0, s2, v0
519; VI-NEXT:    v_lshlrev_b64 v[0:1], 31, v[0:1]
520; VI-NEXT:    s_mov_b32 s3, 0xf000
521; VI-NEXT:    v_ashrrev_i64 v[0:1], 31, v[0:1]
522; VI-NEXT:    s_mov_b32 s2, -1
523; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
524; VI-NEXT:    s_endpgm
525;
526; GFX9-LABEL: test_smul24_i33:
527; GFX9:       ; %bb.0: ; %entry
528; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
529; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
530; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x34
531; GFX9-NEXT:    s_mov_b32 s7, 0xf000
532; GFX9-NEXT:    s_mov_b32 s6, -1
533; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
534; GFX9-NEXT:    s_lshl_b32 s1, s2, 8
535; GFX9-NEXT:    s_ashr_i64 s[0:1], s[0:1], 40
536; GFX9-NEXT:    s_lshl_b32 s1, s3, 8
537; GFX9-NEXT:    s_ashr_i64 s[2:3], s[0:1], 40
538; GFX9-NEXT:    s_mul_hi_i32 s1, s0, s2
539; GFX9-NEXT:    s_mul_i32 s0, s0, s2
540; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], 31
541; GFX9-NEXT:    s_ashr_i64 s[0:1], s[0:1], 31
542; GFX9-NEXT:    v_mov_b32_e32 v0, s0
543; GFX9-NEXT:    v_mov_b32_e32 v1, s1
544; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
545; GFX9-NEXT:    s_endpgm
546;
547; EG-LABEL: test_smul24_i33:
548; EG:       ; %bb.0: ; %entry
549; EG-NEXT:    ALU 10, @4, KC0[CB0:0-32], KC1[]
550; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XY, T2.X, 1
551; EG-NEXT:    CF_END
552; EG-NEXT:    PAD
553; EG-NEXT:    ALU clause starting at 4:
554; EG-NEXT:     LSHL T0.W, KC0[2].W, literal.x,
555; EG-NEXT:     LSHL * T1.W, KC0[3].Y, literal.x,
556; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
557; EG-NEXT:     ASHR T1.W, PS, literal.x,
558; EG-NEXT:     ASHR * T0.W, PV.W, literal.x,
559; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
560; EG-NEXT:     MULHI_INT * T0.X, PS, PV.W,
561; EG-NEXT:     MULLO_INT * T1.X, T0.W, T1.W,
562; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
563; EG-NEXT:     BFE_INT * T1.Y, T0.X, 0.0, 1,
564; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
565;
566; CM-LABEL: test_smul24_i33:
567; CM:       ; %bb.0: ; %entry
568; CM-NEXT:    ALU 16, @4, KC0[CB0:0-32], KC1[]
569; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T2.X
570; CM-NEXT:    CF_END
571; CM-NEXT:    PAD
572; CM-NEXT:    ALU clause starting at 4:
573; CM-NEXT:     LSHL T0.Z, KC0[2].W, literal.x,
574; CM-NEXT:     LSHL * T0.W, KC0[3].Y, literal.x,
575; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
576; CM-NEXT:     ASHR T1.Z, PV.W, literal.x,
577; CM-NEXT:     ASHR * T0.W, PV.Z, literal.x,
578; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
579; CM-NEXT:     MULHI_INT24 T0.X, KC0[2].W, KC0[3].Y,
580; CM-NEXT:     MULHI_INT24 T0.Y (MASKED), KC0[2].W, KC0[3].Y,
581; CM-NEXT:     MULHI_INT24 T0.Z (MASKED), KC0[2].W, KC0[3].Y,
582; CM-NEXT:     MULHI_INT24 * T0.W (MASKED), KC0[2].W, KC0[3].Y,
583; CM-NEXT:     MULLO_INT T1.X, T0.W, T1.Z,
584; CM-NEXT:     MULLO_INT T1.Y (MASKED), T0.W, T1.Z,
585; CM-NEXT:     MULLO_INT T1.Z (MASKED), T0.W, T1.Z,
586; CM-NEXT:     MULLO_INT * T1.W (MASKED), T0.W, T1.Z,
587; CM-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
588; CM-NEXT:     BFE_INT * T1.Y, T0.X, 0.0, 1,
589; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
590entry:
  ; On i33, shl 9 / ashr 9 keeps the low 24 bits and sign-extends them.
591  %a.shl = shl i33 %a, 9
592  %a.24 = ashr i33 %a.shl, 9
593  %b.shl = shl i33 %b, 9
594  %b.24 = ashr i33 %b.shl, 9
595  %mul24 = mul i33 %a.24, %b.24
596  %ext = sext i33 %mul24 to i64
597  store i64 %ext, i64 addrspace(1)* %out
598  ret void
599}
600
; test_smulhi24_i33 - multiplies two i33 values that were sign-extended
; from 24 bits and stores bit 32 of the i33 product (lshr 32, then trunc to
; i32). Every target's checks end with a multiply-high followed by an AND
; with 1. This kernel does not carry the #0 attribute group.
601define amdgpu_kernel void @test_smulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) {
602; SI-LABEL: test_smulhi24_i33:
603; SI:       ; %bb.0: ; %entry
604; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
605; SI-NEXT:    s_load_dword s5, s[0:1], 0xb
606; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
607; SI-NEXT:    s_mov_b32 s3, 0xf000
608; SI-NEXT:    s_mov_b32 s2, -1
609; SI-NEXT:    s_waitcnt lgkmcnt(0)
610; SI-NEXT:    v_mov_b32_e32 v0, s4
611; SI-NEXT:    v_mul_hi_i32_i24_e32 v0, s5, v0
612; SI-NEXT:    v_and_b32_e32 v0, 1, v0
613; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
614; SI-NEXT:    s_endpgm
615;
616; VI-LABEL: test_smulhi24_i33:
617; VI:       ; %bb.0: ; %entry
618; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
619; VI-NEXT:    s_load_dword s5, s[0:1], 0x2c
620; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
621; VI-NEXT:    s_mov_b32 s3, 0xf000
622; VI-NEXT:    s_mov_b32 s2, -1
623; VI-NEXT:    s_waitcnt lgkmcnt(0)
624; VI-NEXT:    v_mov_b32_e32 v0, s4
625; VI-NEXT:    v_mul_hi_i32_i24_e32 v0, s5, v0
626; VI-NEXT:    v_and_b32_e32 v0, 1, v0
627; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
628; VI-NEXT:    s_endpgm
629;
630; GFX9-LABEL: test_smulhi24_i33:
631; GFX9:       ; %bb.0: ; %entry
632; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
633; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
634; GFX9-NEXT:    s_load_dword s3, s[0:1], 0x34
635; GFX9-NEXT:    s_mov_b32 s7, 0xf000
636; GFX9-NEXT:    s_mov_b32 s6, -1
637; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
638; GFX9-NEXT:    s_lshl_b32 s1, s2, 8
639; GFX9-NEXT:    s_ashr_i64 s[0:1], s[0:1], 40
640; GFX9-NEXT:    s_lshl_b32 s1, s3, 8
641; GFX9-NEXT:    s_ashr_i64 s[2:3], s[0:1], 40
642; GFX9-NEXT:    s_mul_hi_i32 s0, s0, s2
643; GFX9-NEXT:    s_and_b32 s0, s0, 1
644; GFX9-NEXT:    v_mov_b32_e32 v0, s0
645; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
646; GFX9-NEXT:    s_endpgm
647;
648; EG-LABEL: test_smulhi24_i33:
649; EG:       ; %bb.0: ; %entry
650; EG-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
651; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
652; EG-NEXT:    CF_END
653; EG-NEXT:    PAD
654; EG-NEXT:    ALU clause starting at 4:
655; EG-NEXT:     LSHL T0.W, KC0[2].W, literal.x,
656; EG-NEXT:     LSHL * T1.W, KC0[3].Y, literal.x,
657; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
658; EG-NEXT:     ASHR T1.W, PS, literal.x,
659; EG-NEXT:     ASHR * T0.W, PV.W, literal.x,
660; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
661; EG-NEXT:     MULHI_INT * T0.X, PS, PV.W,
662; EG-NEXT:     AND_INT T0.X, PS, 1,
663; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
664; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
665;
666; CM-LABEL: test_smulhi24_i33:
667; CM:       ; %bb.0: ; %entry
668; CM-NEXT:    ALU 6, @4, KC0[CB0:0-32], KC1[]
669; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
670; CM-NEXT:    CF_END
671; CM-NEXT:    PAD
672; CM-NEXT:    ALU clause starting at 4:
673; CM-NEXT:     MULHI_INT24 T0.X, KC0[2].W, KC0[3].Y,
674; CM-NEXT:     MULHI_INT24 T0.Y (MASKED), KC0[2].W, KC0[3].Y,
675; CM-NEXT:     MULHI_INT24 T0.Z (MASKED), KC0[2].W, KC0[3].Y,
676; CM-NEXT:     MULHI_INT24 * T0.W (MASKED), KC0[2].W, KC0[3].Y,
677; CM-NEXT:     AND_INT * T0.X, PV.X, 1,
678; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
679; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
680entry:
  ; Sign-extend both i33 operands from 24 bits.
681  %tmp0 = shl i33 %a, 9
682  %a_24 = ashr i33 %tmp0, 9
683  %tmp1 = shl i33 %b, 9
684  %b_24 = ashr i33 %tmp1, 9
685  %tmp2 = mul i33 %a_24, %b_24
  ; lshr 32 on an i33 leaves only the top bit of the product.
686  %hi = lshr i33 %tmp2, 32
687  %trunc = trunc i33 %hi to i32
688
689  store i32 %trunc, i32 addrspace(1)* %out
690  ret void
691}
692
; simplify_i24_crash - vector 24-bit multiply behind a branch. When %arg0
; is zero, block %bb11 splats element 0 of %arg1 and of %arg2, sign-extends
; every lane from 24 bits, multiplies the two vectors and stores the
; <2 x i32> result to %out; otherwise the kernel returns without storing.
; NOTE(review): judging by the name this is a regression test for a
; compiler crash in the i24 simplification code - confirm against history.
693define amdgpu_kernel void @simplify_i24_crash(<2 x i32> addrspace(1)* %out, i32 %arg0, <2 x i32> %arg1, <2 x i32> %arg2) {
694; SI-LABEL: simplify_i24_crash:
695; SI:       ; %bb.0: ; %bb
696; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
697; SI-NEXT:    s_waitcnt lgkmcnt(0)
698; SI-NEXT:    s_cmp_lg_u32 s2, 0
699; SI-NEXT:    s_cbranch_scc0 .LBB8_2
700; SI-NEXT:  ; %bb.1: ; %bb7
701; SI-NEXT:    s_endpgm
702; SI-NEXT:  .LBB8_2: ; %bb11
703; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
704; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
705; SI-NEXT:    s_mov_b32 s3, 0xf000
706; SI-NEXT:    s_waitcnt lgkmcnt(0)
707; SI-NEXT:    s_bfe_i32 s2, s4, 0x180000
708; SI-NEXT:    s_bfe_i32 s4, s6, 0x180000
709; SI-NEXT:    s_mul_i32 s4, s2, s4
710; SI-NEXT:    s_mov_b32 s2, -1
711; SI-NEXT:    v_mov_b32_e32 v0, s4
712; SI-NEXT:    v_mov_b32_e32 v1, s4
713; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
714; SI-NEXT:    s_endpgm
715;
716; VI-LABEL: simplify_i24_crash:
717; VI:       ; %bb.0: ; %bb
718; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
719; VI-NEXT:    s_waitcnt lgkmcnt(0)
720; VI-NEXT:    s_cmp_lg_u32 s2, 0
721; VI-NEXT:    s_cbranch_scc0 .LBB8_2
722; VI-NEXT:  ; %bb.1: ; %bb7
723; VI-NEXT:    s_endpgm
724; VI-NEXT:  .LBB8_2: ; %bb11
725; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
726; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
727; VI-NEXT:    s_mov_b32 s3, 0xf000
728; VI-NEXT:    s_mov_b32 s2, -1
729; VI-NEXT:    s_waitcnt lgkmcnt(0)
730; VI-NEXT:    s_bfe_i32 s4, s4, 0x180000
731; VI-NEXT:    s_bfe_i32 s5, s6, 0x180000
732; VI-NEXT:    s_mul_i32 s4, s4, s5
733; VI-NEXT:    v_mov_b32_e32 v0, s4
734; VI-NEXT:    v_mov_b32_e32 v1, s4
735; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
736; VI-NEXT:    s_endpgm
737;
738; GFX9-LABEL: simplify_i24_crash:
739; GFX9:       ; %bb.0: ; %bb
740; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
741; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
742; GFX9-NEXT:    s_cmp_lg_u32 s2, 0
743; GFX9-NEXT:    s_cbranch_scc0 .LBB8_2
744; GFX9-NEXT:  ; %bb.1: ; %bb7
745; GFX9-NEXT:    s_endpgm
746; GFX9-NEXT:  .LBB8_2: ; %bb11
747; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
748; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
749; GFX9-NEXT:    s_mov_b32 s11, 0xf000
750; GFX9-NEXT:    s_mov_b32 s10, -1
751; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
752; GFX9-NEXT:    s_bfe_i32 s0, s4, 0x180000
753; GFX9-NEXT:    s_bfe_i32 s1, s6, 0x180000
754; GFX9-NEXT:    s_mul_i32 s0, s0, s1
755; GFX9-NEXT:    v_mov_b32_e32 v0, s0
756; GFX9-NEXT:    v_mov_b32_e32 v1, s0
757; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
758; GFX9-NEXT:    s_endpgm
759;
760; EG-LABEL: simplify_i24_crash:
761; EG:       ; %bb.0: ; %bb
762; EG-NEXT:    ALU_PUSH_BEFORE 1, @6, KC0[CB0:0-32], KC1[]
763; EG-NEXT:    JUMP @5 POP:1
764; EG-NEXT:    ALU 10, @8, KC0[CB0:0-32], KC1[]
765; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 0
766; EG-NEXT:    POP @5 POP:1
767; EG-NEXT:    CF_END
768; EG-NEXT:    ALU clause starting at 6:
769; EG-NEXT:     SETNE_INT * T0.W, KC0[2].Z, 0.0,
770; EG-NEXT:     PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
771; EG-NEXT:    ALU clause starting at 8:
772; EG-NEXT:     LSHL T0.W, KC0[2].W, literal.x,
773; EG-NEXT:     LSHL * T1.W, KC0[3].Y, literal.x,
774; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
775; EG-NEXT:     ASHR T1.W, PS, literal.x,
776; EG-NEXT:     ASHR * T0.W, PV.W, literal.x,
777; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
778; EG-NEXT:     MOV T2.W, KC0[2].Y,
779; EG-NEXT:     MULLO_INT * T0.X, PS, PV.W,
780; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
781; EG-NEXT:     MOV * T0.Y, PS,
782; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
783;
784; CM-LABEL: simplify_i24_crash:
785; CM:       ; %bb.0: ; %bb
786; CM-NEXT:    ALU_PUSH_BEFORE 1, @6, KC0[CB0:0-32], KC1[]
787; CM-NEXT:    JUMP @5 POP:1
788; CM-NEXT:    ALU 13, @8, KC0[CB0:0-32], KC1[]
789; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
790; CM-NEXT:    POP @5 POP:1
791; CM-NEXT:    CF_END
792; CM-NEXT:    ALU clause starting at 6:
793; CM-NEXT:     SETNE_INT * T0.W, KC0[2].Z, 0.0,
794; CM-NEXT:     PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PV.W, 0.0,
795; CM-NEXT:    ALU clause starting at 8:
796; CM-NEXT:     LSHL T0.Z, KC0[2].W, literal.x,
797; CM-NEXT:     LSHL * T0.W, KC0[3].Y, literal.x,
798; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
799; CM-NEXT:     MOV T0.Y, KC0[2].Y,
800; CM-NEXT:     ASHR T1.Z, PV.W, literal.x,
801; CM-NEXT:     ASHR * T0.W, PV.Z, literal.x,
802; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
803; CM-NEXT:     MULLO_INT T0.X, T0.W, T1.Z,
804; CM-NEXT:     MULLO_INT T0.Y (MASKED), T0.W, T1.Z,
805; CM-NEXT:     MULLO_INT T0.Z (MASKED), T0.W, T1.Z,
806; CM-NEXT:     MULLO_INT * T0.W (MASKED), T0.W, T1.Z,
807; CM-NEXT:     LSHR T1.X, T0.Y, literal.x,
808; CM-NEXT:     MOV * T0.Y, PV.X,
809; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
810bb:
  ; The multiply-and-store block is entered only when %arg0 == 0.
811  %cmp = icmp eq i32 %arg0, 0
812  br i1 %cmp, label %bb11, label %bb7
813
814bb11:
  ; A zeroinitializer shuffle mask broadcasts element 0 into both lanes.
815  %tmp14 = shufflevector <2 x i32> %arg1, <2 x i32> undef, <2 x i32> zeroinitializer
816  %tmp16 = shufflevector <2 x i32> %arg2, <2 x i32> undef, <2 x i32> zeroinitializer
  ; Per-lane sign extension from 24 bits followed by the vector multiply.
817  %tmp17 = shl <2 x i32> %tmp14, <i32 8, i32 8>
818  %tmp18 = ashr <2 x i32> %tmp17, <i32 8, i32 8>
819  %tmp19 = shl <2 x i32> %tmp16, <i32 8, i32 8>
820  %tmp20 = ashr <2 x i32> %tmp19, <i32 8, i32 8>
821  %tmp21 = mul <2 x i32> %tmp18, %tmp20
822  store <2 x i32> %tmp21, <2 x i32> addrspace(1)* %out
823  br label %bb7
824
825bb7:
826  ret void
827
828}
829attributes #0 = { nounwind }
830