1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
3; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare -amdgpu-bypass-slow-div=0 %s | FileCheck %s
4; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX6 %s
5; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX9 %s
6
; udiv_i32: 32-bit unsigned division of two kernel arguments, stored to %out.
; The opt-level checks below expect the udiv to be replaced by this sequence:
;   1. a reciprocal estimate of %y: uitofp, llvm.amdgcn.rcp.f32, an fmul by a
;      constant, then fptoui back to i32;
;   2. one refinement of the estimate, adding the high 32 bits of
;      estimate * ((0 - %y) * estimate), computed through a 64-bit multiply;
;   3. quotient = high 32 bits of (%x * refined estimate), and
;      remainder = %x - quotient * %y;
;   4. two conditional +1 corrections of the quotient, each taken when the
;      running remainder is still >= %y (uge compare).
; The llc checks pin the matching ISA for the tahiti and gfx900 run lines.
7define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
8; CHECK-LABEL: @udiv_i32(
9; CHECK-NEXT:    [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float
10; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
11; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000
12; CHECK-NEXT:    [[TMP4:%.*]] = fptoui float [[TMP3]] to i32
13; CHECK-NEXT:    [[TMP5:%.*]] = sub i32 0, [[Y]]
14; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]]
15; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP4]] to i64
16; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
17; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]]
18; CHECK-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
19; CHECK-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP9]], 32
20; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
21; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]]
22; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64
23; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
24; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
25; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
26; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
27; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
28; CHECK-NEXT:    [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]]
29; CHECK-NEXT:    [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]]
30; CHECK-NEXT:    [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]]
31; CHECK-NEXT:    [[TMP23:%.*]] = add i32 [[TMP19]], 1
32; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP19]]
33; CHECK-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP21]], [[Y]]
34; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP22]], i32 [[TMP25]], i32 [[TMP21]]
35; CHECK-NEXT:    [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[Y]]
36; CHECK-NEXT:    [[TMP28:%.*]] = add i32 [[TMP24]], 1
37; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP24]]
38; CHECK-NEXT:    store i32 [[TMP29]], i32 addrspace(1)* [[OUT:%.*]], align 4
39; CHECK-NEXT:    ret void
40;
41; GFX6-LABEL: udiv_i32:
42; GFX6:       ; %bb.0:
43; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
44; GFX6-NEXT:    s_mov_b32 s7, 0xf000
45; GFX6-NEXT:    s_mov_b32 s6, -1
46; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
47; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
48; GFX6-NEXT:    s_sub_i32 s4, 0, s3
49; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
50; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
51; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
52; GFX6-NEXT:    v_mul_lo_u32 v1, s4, v0
53; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
54; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
55; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
56; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
57; GFX6-NEXT:    v_mul_lo_u32 v1, v0, s3
58; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
59; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s2, v1
60; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
61; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
62; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
63; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
64; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
65; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
66; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
67; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
68; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
69; GFX6-NEXT:    s_endpgm
70; GFX9-LABEL: udiv_i32:
71; GFX9:       ; %bb.0:
72; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
73; GFX9-NEXT:    v_mov_b32_e32 v2, 0
74; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
75; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
76; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
77; GFX9-NEXT:    s_sub_i32 s4, 0, s3
78; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
79; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
80; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
81; GFX9-NEXT:    v_mul_lo_u32 v1, s4, v0
82; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
83; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
84; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
85; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s3
86; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
87; GFX9-NEXT:    v_sub_u32_e32 v1, s2, v1
88; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
89; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
90; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v1
91; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
92; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
93; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
94; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
95; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
96; GFX9-NEXT:    s_endpgm
; Input IR under test: a single udiv whose result is stored through %out.
97  %r = udiv i32 %x, %y
98  store i32 %r, i32 addrspace(1)* %out
99  ret void
100}
101
; urem_i32: 32-bit unsigned remainder of two kernel arguments, stored to %out.
; The opt-level checks below expect the urem to be replaced by this sequence:
;   1. a reciprocal estimate of %y: uitofp, llvm.amdgcn.rcp.f32, an fmul by a
;      constant, then fptoui back to i32;
;   2. one refinement of the estimate, adding the high 32 bits of
;      estimate * ((0 - %y) * estimate), computed through a 64-bit multiply;
;   3. a trial quotient = high 32 bits of (%x * refined estimate), and
;      remainder = %x - quotient * %y;
;   4. two conditional subtractions of %y from the remainder, each taken when
;      the running remainder is still >= %y (uge compare).
; Only the remainder is carried through step 4; the quotient is not corrected.
; The llc checks pin the matching ISA for the tahiti and gfx900 run lines.
102define amdgpu_kernel void @urem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
103; CHECK-LABEL: @urem_i32(
104; CHECK-NEXT:    [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float
105; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
106; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000
107; CHECK-NEXT:    [[TMP4:%.*]] = fptoui float [[TMP3]] to i32
108; CHECK-NEXT:    [[TMP5:%.*]] = sub i32 0, [[Y]]
109; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]]
110; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP4]] to i64
111; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
112; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]]
113; CHECK-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
114; CHECK-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP9]], 32
115; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
116; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]]
117; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64
118; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
119; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
120; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
121; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
122; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
123; CHECK-NEXT:    [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]]
124; CHECK-NEXT:    [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]]
125; CHECK-NEXT:    [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]]
126; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP21]], [[Y]]
127; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP21]]
128; CHECK-NEXT:    [[TMP25:%.*]] = icmp uge i32 [[TMP24]], [[Y]]
129; CHECK-NEXT:    [[TMP26:%.*]] = sub i32 [[TMP24]], [[Y]]
130; CHECK-NEXT:    [[TMP27:%.*]] = select i1 [[TMP25]], i32 [[TMP26]], i32 [[TMP24]]
131; CHECK-NEXT:    store i32 [[TMP27]], i32 addrspace(1)* [[OUT:%.*]], align 4
132; CHECK-NEXT:    ret void
133;
134; GFX6-LABEL: urem_i32:
135; GFX6:       ; %bb.0:
136; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
137; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
138; GFX6-NEXT:    s_mov_b32 s3, 0xf000
139; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
140; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s5
141; GFX6-NEXT:    s_sub_i32 s2, 0, s5
142; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
143; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
144; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
145; GFX6-NEXT:    v_mul_lo_u32 v1, s2, v0
146; GFX6-NEXT:    s_mov_b32 s2, -1
147; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
148; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
149; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
150; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s5
151; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
152; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s5, v0
153; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s5, v0
154; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
155; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s5, v0
156; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s5, v0
157; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
158; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
159; GFX6-NEXT:    s_endpgm
160; GFX9-LABEL: urem_i32:
161; GFX9:       ; %bb.0:
162; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
163; GFX9-NEXT:    s_nop 0
164; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
165; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
166; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
167; GFX9-NEXT:    s_sub_i32 s4, 0, s3
168; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
169; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
170; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
171; GFX9-NEXT:    v_mul_lo_u32 v1, s4, v0
172; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
173; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
174; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
175; GFX9-NEXT:    v_mov_b32_e32 v1, 0
176; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
177; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
178; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v0
179; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
180; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
181; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v0
182; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
183; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
184; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
185; GFX9-NEXT:    s_endpgm
; Input IR under test: a single urem whose result is stored through %out.
186  %r = urem i32 %x, %y
187  store i32 %r, i32 addrspace(1)* %out
188  ret void
189}
190
; sdiv_i32: 32-bit signed division of two kernel arguments, stored to %out.
; The opt-level checks below expect the sdiv to be replaced by this sequence:
;   1. sign masks for %x and %y (ashr by 31) and their xor, which is the sign
;      mask of the result;
;   2. magnitudes of both operands via (v + mask) xor mask;
;   3. the unsigned sequence on the magnitudes: reciprocal estimate through
;      uitofp + llvm.amdgcn.rcp.f32, one refinement using the high half of a
;      64-bit product, quotient/remainder, and two conditional +1 corrections
;      of the quotient (uge compares against the divisor magnitude);
;   4. the result sign applied with (q xor mask) - mask.
; The llc checks pin the matching ISA for the tahiti and gfx900 run lines.
191define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
192; CHECK-LABEL: @sdiv_i32(
193; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
194; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
195; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
196; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[X]], [[TMP1]]
197; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[Y]], [[TMP2]]
198; CHECK-NEXT:    [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP1]]
199; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP2]]
200; CHECK-NEXT:    [[TMP8:%.*]] = uitofp i32 [[TMP7]] to float
201; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP8]])
202; CHECK-NEXT:    [[TMP10:%.*]] = fmul fast float [[TMP9]], 0x41EFFFFFC0000000
203; CHECK-NEXT:    [[TMP11:%.*]] = fptoui float [[TMP10]] to i32
204; CHECK-NEXT:    [[TMP12:%.*]] = sub i32 0, [[TMP7]]
205; CHECK-NEXT:    [[TMP13:%.*]] = mul i32 [[TMP12]], [[TMP11]]
206; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP11]] to i64
207; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
208; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
209; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
210; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
211; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
212; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP11]], [[TMP19]]
213; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP6]] to i64
214; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP20]] to i64
215; CHECK-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP21]], [[TMP22]]
216; CHECK-NEXT:    [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
217; CHECK-NEXT:    [[TMP25:%.*]] = lshr i64 [[TMP23]], 32
218; CHECK-NEXT:    [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
219; CHECK-NEXT:    [[TMP27:%.*]] = mul i32 [[TMP26]], [[TMP7]]
220; CHECK-NEXT:    [[TMP28:%.*]] = sub i32 [[TMP6]], [[TMP27]]
221; CHECK-NEXT:    [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP7]]
222; CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[TMP26]], 1
223; CHECK-NEXT:    [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]]
224; CHECK-NEXT:    [[TMP32:%.*]] = sub i32 [[TMP28]], [[TMP7]]
225; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP29]], i32 [[TMP32]], i32 [[TMP28]]
226; CHECK-NEXT:    [[TMP34:%.*]] = icmp uge i32 [[TMP33]], [[TMP7]]
227; CHECK-NEXT:    [[TMP35:%.*]] = add i32 [[TMP31]], 1
228; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP34]], i32 [[TMP35]], i32 [[TMP31]]
229; CHECK-NEXT:    [[TMP37:%.*]] = xor i32 [[TMP36]], [[TMP3]]
230; CHECK-NEXT:    [[TMP38:%.*]] = sub i32 [[TMP37]], [[TMP3]]
231; CHECK-NEXT:    store i32 [[TMP38]], i32 addrspace(1)* [[OUT:%.*]], align 4
232; CHECK-NEXT:    ret void
233;
234; GFX6-LABEL: sdiv_i32:
235; GFX6:       ; %bb.0:
236; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
237; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
238; GFX6-NEXT:    s_mov_b32 s7, 0xf000
239; GFX6-NEXT:    s_mov_b32 s6, -1
240; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
241; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
242; GFX6-NEXT:    s_add_i32 s3, s3, s8
243; GFX6-NEXT:    s_xor_b32 s9, s3, s8
244; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s9
245; GFX6-NEXT:    s_sub_i32 s3, 0, s9
246; GFX6-NEXT:    s_ashr_i32 s0, s2, 31
247; GFX6-NEXT:    s_add_i32 s1, s2, s0
248; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
249; GFX6-NEXT:    s_xor_b32 s1, s1, s0
250; GFX6-NEXT:    s_xor_b32 s2, s0, s8
251; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
252; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
253; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
254; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
255; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
256; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
257; GFX6-NEXT:    v_mul_lo_u32 v1, v0, s9
258; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
259; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
260; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v1
261; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
262; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s9, v1
263; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
264; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
265; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
266; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
267; GFX6-NEXT:    v_xor_b32_e32 v0, s2, v0
268; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
269; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
270; GFX6-NEXT:    s_endpgm
271; GFX9-LABEL: sdiv_i32:
272; GFX9:       ; %bb.0:
273; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
274; GFX9-NEXT:    v_mov_b32_e32 v2, 0
275; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
276; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
277; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
278; GFX9-NEXT:    s_add_i32 s3, s3, s4
279; GFX9-NEXT:    s_xor_b32 s5, s3, s4
280; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s5
281; GFX9-NEXT:    s_sub_i32 s3, 0, s5
282; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
283; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
284; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
285; GFX9-NEXT:    v_mul_lo_u32 v1, s3, v0
286; GFX9-NEXT:    s_ashr_i32 s3, s2, 31
287; GFX9-NEXT:    s_add_i32 s2, s2, s3
288; GFX9-NEXT:    s_xor_b32 s2, s2, s3
289; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
290; GFX9-NEXT:    s_xor_b32 s3, s3, s4
291; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
292; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
293; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s5
294; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
295; GFX9-NEXT:    v_sub_u32_e32 v1, s2, v1
296; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v1
297; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
298; GFX9-NEXT:    v_subrev_u32_e32 v3, s5, v1
299; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
300; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
301; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v1
302; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
303; GFX9-NEXT:    v_xor_b32_e32 v0, s3, v0
304; GFX9-NEXT:    v_subrev_u32_e32 v0, s3, v0
305; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
306; GFX9-NEXT:    s_endpgm
; Input IR under test: a single sdiv whose result is stored through %out.
307  %r = sdiv i32 %x, %y
308  store i32 %r, i32 addrspace(1)* %out
309  ret void
310}
311
; srem_i32: 32-bit signed remainder of two kernel arguments, stored to %out.
; The opt-level checks below expect the srem to be replaced by this sequence:
;   1. sign masks for %x and %y (ashr by 31);
;   2. magnitudes of both operands via (v + mask) xor mask;
;   3. the unsigned sequence on the magnitudes: reciprocal estimate through
;      uitofp + llvm.amdgcn.rcp.f32, one refinement using the high half of a
;      64-bit product, a trial quotient and remainder, and two conditional
;      subtractions of the divisor magnitude from the remainder;
;   4. the sign of %x applied to the remainder with (r xor mask) - mask; the
;      sign mask of %y is only used to form its magnitude.
; The llc checks pin the matching ISA for the tahiti and gfx900 run lines.
312define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
313; CHECK-LABEL: @srem_i32(
314; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
315; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
316; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[X]], [[TMP1]]
317; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[Y]], [[TMP2]]
318; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP1]]
319; CHECK-NEXT:    [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP2]]
320; CHECK-NEXT:    [[TMP7:%.*]] = uitofp i32 [[TMP6]] to float
321; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
322; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP8]], 0x41EFFFFFC0000000
323; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP9]] to i32
324; CHECK-NEXT:    [[TMP11:%.*]] = sub i32 0, [[TMP6]]
325; CHECK-NEXT:    [[TMP12:%.*]] = mul i32 [[TMP11]], [[TMP10]]
326; CHECK-NEXT:    [[TMP13:%.*]] = zext i32 [[TMP10]] to i64
327; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP12]] to i64
328; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP13]], [[TMP14]]
329; CHECK-NEXT:    [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
330; CHECK-NEXT:    [[TMP17:%.*]] = lshr i64 [[TMP15]], 32
331; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
332; CHECK-NEXT:    [[TMP19:%.*]] = add i32 [[TMP10]], [[TMP18]]
333; CHECK-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP5]] to i64
334; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP19]] to i64
335; CHECK-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP20]], [[TMP21]]
336; CHECK-NEXT:    [[TMP23:%.*]] = trunc i64 [[TMP22]] to i32
337; CHECK-NEXT:    [[TMP24:%.*]] = lshr i64 [[TMP22]], 32
338; CHECK-NEXT:    [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
339; CHECK-NEXT:    [[TMP26:%.*]] = mul i32 [[TMP25]], [[TMP6]]
340; CHECK-NEXT:    [[TMP27:%.*]] = sub i32 [[TMP5]], [[TMP26]]
341; CHECK-NEXT:    [[TMP28:%.*]] = icmp uge i32 [[TMP27]], [[TMP6]]
342; CHECK-NEXT:    [[TMP29:%.*]] = sub i32 [[TMP27]], [[TMP6]]
343; CHECK-NEXT:    [[TMP30:%.*]] = select i1 [[TMP28]], i32 [[TMP29]], i32 [[TMP27]]
344; CHECK-NEXT:    [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP6]]
345; CHECK-NEXT:    [[TMP32:%.*]] = sub i32 [[TMP30]], [[TMP6]]
346; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP30]]
347; CHECK-NEXT:    [[TMP34:%.*]] = xor i32 [[TMP33]], [[TMP1]]
348; CHECK-NEXT:    [[TMP35:%.*]] = sub i32 [[TMP34]], [[TMP1]]
349; CHECK-NEXT:    store i32 [[TMP35]], i32 addrspace(1)* [[OUT:%.*]], align 4
350; CHECK-NEXT:    ret void
351;
352; GFX6-LABEL: srem_i32:
353; GFX6:       ; %bb.0:
354; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
355; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
356; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
357; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
358; GFX6-NEXT:    s_add_i32 s3, s3, s4
359; GFX6-NEXT:    s_xor_b32 s6, s3, s4
360; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
361; GFX6-NEXT:    s_sub_i32 s3, 0, s6
362; GFX6-NEXT:    s_ashr_i32 s4, s2, 31
363; GFX6-NEXT:    s_add_i32 s2, s2, s4
364; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
365; GFX6-NEXT:    s_xor_b32 s5, s2, s4
366; GFX6-NEXT:    s_mov_b32 s2, -1
367; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
368; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
369; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
370; GFX6-NEXT:    s_mov_b32 s3, 0xf000
371; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
372; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
373; GFX6-NEXT:    v_mul_hi_u32 v0, s5, v0
374; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
375; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s5, v0
376; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s6, v0
377; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
378; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
379; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s6, v0
380; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
381; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
382; GFX6-NEXT:    v_xor_b32_e32 v0, s4, v0
383; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s4, v0
384; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
385; GFX6-NEXT:    s_endpgm
386; GFX9-LABEL: srem_i32:
387; GFX9:       ; %bb.0:
388; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
389; GFX9-NEXT:    s_nop 0
390; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
391; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
392; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
393; GFX9-NEXT:    s_add_i32 s3, s3, s4
394; GFX9-NEXT:    s_xor_b32 s3, s3, s4
395; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
396; GFX9-NEXT:    s_sub_i32 s4, 0, s3
397; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
398; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
399; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
400; GFX9-NEXT:    v_mul_lo_u32 v1, s4, v0
401; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
402; GFX9-NEXT:    s_add_i32 s2, s2, s4
403; GFX9-NEXT:    s_xor_b32 s2, s2, s4
404; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
405; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
406; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
407; GFX9-NEXT:    v_mov_b32_e32 v1, 0
408; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
409; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
410; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v0
411; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
412; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
413; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v0
414; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
415; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
416; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
417; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
418; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
419; GFX9-NEXT:    s_endpgm
; Input IR under test: a single srem whose result is stored through %out.
420  %r = srem i32 %x, %y
421  store i32 %r, i32 addrspace(1)* %out
422  ret void
423}
424
; udiv_i16: 16-bit unsigned division of two kernel arguments, stored to %out.
; The opt-level checks below expect a float-based sequence instead of the
; 32-bit integer refinement:
;   1. both operands are zero-extended to i32 and converted with uitofp;
;   2. q = llvm.trunc.f32(x * llvm.amdgcn.rcp.f32(y));
;   3. a residual is formed with llvm.amdgcn.fmad.ftz.f32(-q, y, x);
;   4. q is converted with fptoui and incremented by 1 when
;      |residual| >= |y| (fcmp oge feeding a select of 1 or 0);
;   5. the result is masked with 65535 and truncated to i16.
; The llc checks pin the matching ISA for the tahiti and gfx900 run lines.
425define amdgpu_kernel void @udiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
426; CHECK-LABEL: @udiv_i16(
427; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
428; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
429; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
430; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
431; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
432; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
433; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
434; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
435; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
436; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
437; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
438; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
439; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
440; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
441; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
442; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 65535
443; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
444; CHECK-NEXT:    store i16 [[TMP17]], i16 addrspace(1)* [[OUT:%.*]], align 2
445; CHECK-NEXT:    ret void
446;
447; GFX6-LABEL: udiv_i16:
448; GFX6:       ; %bb.0:
449; GFX6-NEXT:    s_load_dword s2, s[0:1], 0xb
450; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
451; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
452; GFX6-NEXT:    s_lshr_b32 s3, s2, 16
453; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
454; GFX6-NEXT:    s_and_b32 s2, s2, 0xffff
455; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s2
456; GFX6-NEXT:    s_mov_b32 s3, 0xf000
457; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
458; GFX6-NEXT:    s_mov_b32 s2, -1
459; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
460; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
461; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
462; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
463; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
464; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
465; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
466; GFX6-NEXT:    s_endpgm
467; GFX9-LABEL: udiv_i16:
468; GFX9:       ; %bb.0:
469; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
470; GFX9-NEXT:    v_mov_b32_e32 v3, 0
471; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
472; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
473; GFX9-NEXT:    s_lshr_b32 s3, s2, 16
474; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
475; GFX9-NEXT:    s_and_b32 s2, s2, 0xffff
476; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s2
477; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
478; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
479; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
480; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v2
481; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
482; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
483; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
484; GFX9-NEXT:    global_store_short v3, v0, s[0:1]
485; GFX9-NEXT:    s_endpgm
; Input IR under test: a single i16 udiv whose result is stored through %out.
486  %r = udiv i16 %x, %y
487  store i16 %r, i16 addrspace(1)* %out
488  ret void
489}
490
; urem_i16: 16-bit unsigned remainder of two kernel arguments, stored to %out.
; The opt-level checks below expect a float-based sequence:
;   1. both operands are zero-extended to i32 and converted with uitofp;
;   2. q = llvm.trunc.f32(x * llvm.amdgcn.rcp.f32(y));
;   3. a residual is formed with llvm.amdgcn.fmad.ftz.f32(-q, y, x);
;   4. q is converted with fptoui and incremented by 1 when
;      |residual| >= |y| (fcmp oge feeding a select of 1 or 0);
;   5. remainder = x - q * y in i32, masked with 65535 and truncated to i16.
; The llc checks pin the matching ISA for the tahiti and gfx900 run lines.
491define amdgpu_kernel void @urem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
492; CHECK-LABEL: @urem_i16(
493; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
494; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
495; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
496; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
497; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
498; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
499; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
500; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
501; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
502; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
503; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
504; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
505; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
506; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
507; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
508; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
509; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
510; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 65535
511; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
512; CHECK-NEXT:    store i16 [[TMP19]], i16 addrspace(1)* [[OUT:%.*]], align 2
513; CHECK-NEXT:    ret void
514;
515; GFX6-LABEL: urem_i16:
516; GFX6:       ; %bb.0:
517; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
518; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
519; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
520; GFX6-NEXT:    s_lshr_b32 s2, s4, 16
521; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
522; GFX6-NEXT:    s_and_b32 s3, s4, 0xffff
523; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s3
524; GFX6-NEXT:    s_mov_b32 s3, 0xf000
525; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
526; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
527; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
528; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
529; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
530; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
531; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
532; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
533; GFX6-NEXT:    s_mov_b32 s2, -1
534; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
535; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
536; GFX6-NEXT:    s_endpgm
537; GFX9-LABEL: urem_i16:
538; GFX9:       ; %bb.0:
539; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
540; GFX9-NEXT:    s_nop 0
541; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
542; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
543; GFX9-NEXT:    s_lshr_b32 s3, s2, 16
544; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
545; GFX9-NEXT:    s_and_b32 s4, s2, 0xffff
546; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s4
547; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
548; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
549; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
550; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v2
551; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
552; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
553; GFX9-NEXT:    v_mov_b32_e32 v1, 0
554; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
555; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
556; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
557; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
558; GFX9-NEXT:    s_endpgm
; Input IR under test: a single i16 urem whose result is stored through %out.
559  %r = urem i16 %x, %y
560  store i16 %r, i16 addrspace(1)* %out
561  ret void
562}
563
; sdiv_i16: 16-bit signed division of two kernel arguments, stored to %out.
; The opt-level checks below expect a float-based sequence:
;   1. both operands are sign-extended to i32;
;   2. a signed correction step is built as ((x xor y) ashr 30) or 1, i.e. +1
;      when the sign-extended operands have equal signs and -1 otherwise;
;   3. the operands are converted with sitofp and
;      q = llvm.trunc.f32(x * llvm.amdgcn.rcp.f32(y));
;   4. a residual is formed with llvm.amdgcn.fmad.ftz.f32(-q, y, x);
;   5. q is converted with fptosi and the correction step from (2) is added
;      when |residual| >= |y| (fcmp oge feeding a select), otherwise 0;
;   6. the result is sign-extended in place (shl 16 / ashr 16) and truncated
;      to i16.
; The llc checks pin the matching ISA for the tahiti and gfx900 run lines.
564define amdgpu_kernel void @sdiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
565; CHECK-LABEL: @sdiv_i16(
566; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
567; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
568; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
569; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
570; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
571; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
572; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
573; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
574; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
575; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
576; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
577; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
578; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
579; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
580; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
581; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
582; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
583; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
584; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 16
585; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 16
586; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
587; CHECK-NEXT:    store i16 [[TMP21]], i16 addrspace(1)* [[OUT:%.*]], align 2
588; CHECK-NEXT:    ret void
589;
590; GFX6-LABEL: sdiv_i16:
591; GFX6:       ; %bb.0:
592; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
593; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
594; GFX6-NEXT:    s_mov_b32 s7, 0xf000
595; GFX6-NEXT:    s_mov_b32 s6, -1
596; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
597; GFX6-NEXT:    s_ashr_i32 s1, s0, 16
598; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s1
599; GFX6-NEXT:    s_sext_i32_i16 s0, s0
600; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s0
601; GFX6-NEXT:    s_xor_b32 s0, s0, s1
602; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
603; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
604; GFX6-NEXT:    s_or_b32 s0, s0, 1
605; GFX6-NEXT:    v_mov_b32_e32 v3, s0
606; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
607; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
608; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
609; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
610; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
611; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
612; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
613; GFX6-NEXT:    buffer_store_short v0, off, s[4:7], 0
614; GFX6-NEXT:    s_endpgm
615; GFX9-LABEL: sdiv_i16:
616; GFX9:       ; %bb.0:
617; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
618; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
619; GFX9-NEXT:    v_mov_b32_e32 v1, 0
620; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
621; GFX9-NEXT:    s_ashr_i32 s0, s4, 16
622; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
623; GFX9-NEXT:    s_sext_i32_i16 s1, s4
624; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s1
625; GFX9-NEXT:    s_xor_b32 s0, s1, s0
626; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
627; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
628; GFX9-NEXT:    s_or_b32 s4, s0, 1
629; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
630; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
631; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
632; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
633; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
634; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
635; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
636; GFX9-NEXT:    v_add_u32_e32 v0, s0, v3
637; GFX9-NEXT:    global_store_short v1, v0, s[2:3]
638; GFX9-NEXT:    s_endpgm
; Input IR under test: a single i16 sdiv whose result is stored through %out.
639  %r = sdiv i16 %x, %y
640  store i16 %r, i16 addrspace(1)* %out
641  ret void
642}
643
; srem_i16: signed 16-bit remainder of two kernel arguments, stored to %out.
;
; The assertions below are autogenerated (see the NOTE lines at the top of the
; file); regenerate them with the update scripts instead of editing by hand.
;
; What the opt run (default CHECK prefix, -amdgpu-codegenprepare) is expected
; to produce in place of the srem:
;   * both operands are sign-extended to i32 and converted to float;
;   * a quotient estimate is formed as trunc(x * rcp(y)) and converted back
;     with fptosi;
;   * the estimate is adjusted by a +1/-1 value built from the xor of the
;     sign-extended operands (ashr 30, then or 1) whenever the fmad residual
;     |x - q*y| is >= |y|;
;   * the remainder is x - q * y, re-sign-extended from 16 bits (shl/ashr 16)
;     and truncated to i16 before the store.
; The GFX6 and GFX9 lines pin the llc output for tahiti and gfx900.
644define amdgpu_kernel void @srem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
645; CHECK-LABEL: @srem_i16(
646; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
647; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
648; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
649; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
650; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
651; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
652; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
653; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
654; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
655; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
656; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
657; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
658; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
659; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
660; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
661; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
662; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
663; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
664; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
665; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
666; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 16
667; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
668; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
669; CHECK-NEXT:    store i16 [[TMP23]], i16 addrspace(1)* [[OUT:%.*]], align 2
670; CHECK-NEXT:    ret void
671;
672; GFX6-LABEL: srem_i16:
673; GFX6:       ; %bb.0:
674; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
675; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
676; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
677; GFX6-NEXT:    s_ashr_i32 s2, s4, 16
678; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s2
679; GFX6-NEXT:    s_sext_i32_i16 s3, s4
680; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s3
681; GFX6-NEXT:    s_xor_b32 s3, s3, s2
682; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
683; GFX6-NEXT:    s_ashr_i32 s3, s3, 30
684; GFX6-NEXT:    s_or_b32 s3, s3, 1
685; GFX6-NEXT:    v_mov_b32_e32 v3, s3
686; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
687; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
688; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
689; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
690; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
691; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
692; GFX6-NEXT:    s_mov_b32 s3, 0xf000
693; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
694; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
695; GFX6-NEXT:    s_mov_b32 s2, -1
696; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
697; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
698; GFX6-NEXT:    s_endpgm
699; GFX9-LABEL: srem_i16:
700; GFX9:       ; %bb.0:
701; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
702; GFX9-NEXT:    s_nop 0
703; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
704; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
705; GFX9-NEXT:    s_ashr_i32 s5, s4, 16
706; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s5
707; GFX9-NEXT:    s_sext_i32_i16 s2, s4
708; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s2
709; GFX9-NEXT:    s_xor_b32 s2, s2, s5
710; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
711; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
712; GFX9-NEXT:    s_or_b32 s6, s2, 1
713; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
714; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
715; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
716; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v2
717; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
718; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
719; GFX9-NEXT:    s_cselect_b32 s2, s6, 0
720; GFX9-NEXT:    v_add_u32_e32 v0, s2, v2
721; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s5
722; GFX9-NEXT:    v_mov_b32_e32 v1, 0
723; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
724; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
725; GFX9-NEXT:    s_endpgm
; Input IR under test: a single i16 srem of the two arguments, stored to %out.
726  %r = srem i16 %x, %y
727  store i16 %r, i16 addrspace(1)* %out
728  ret void
729}
730
; udiv_i8: unsigned 8-bit division of two kernel arguments, stored to %out.
;
; The assertions below are autogenerated (see the NOTE lines at the top of the
; file); regenerate them with the update scripts instead of editing by hand.
;
; What the opt run (default CHECK prefix, -amdgpu-codegenprepare) is expected
; to produce in place of the udiv:
;   * both operands are zero-extended to i32 and converted to float (uitofp);
;   * a quotient estimate is formed as trunc(x * rcp(y)) and converted back
;     with fptoui;
;   * 1 is added to the estimate whenever the fmad residual |x - q*y| is
;     >= |y|;
;   * the result is masked with 255 and truncated to i8 before the store.
; The GFX6 and GFX9 lines pin the llc output for tahiti and gfx900; in both,
; the two operands are converted to float from one loaded dword using
; v_cvt_f32_ubyte0 (dividend) and v_cvt_f32_ubyte1 (divisor, fed to the rcp).
731define amdgpu_kernel void @udiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
732; CHECK-LABEL: @udiv_i8(
733; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
734; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
735; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
736; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
737; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
738; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
739; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
740; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
741; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
742; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
743; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
744; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
745; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
746; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
747; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
748; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 255
749; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
750; CHECK-NEXT:    store i8 [[TMP17]], i8 addrspace(1)* [[OUT:%.*]], align 1
751; CHECK-NEXT:    ret void
752;
753; GFX6-LABEL: udiv_i8:
754; GFX6:       ; %bb.0:
755; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
756; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
757; GFX6-NEXT:    s_mov_b32 s7, 0xf000
758; GFX6-NEXT:    s_mov_b32 s6, -1
759; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
760; GFX6-NEXT:    v_cvt_f32_ubyte1_e32 v0, s0
761; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v0
762; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v2, s0
763; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v1
764; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
765; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v1
766; GFX6-NEXT:    v_mad_f32 v1, -v1, v0, v2
767; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
768; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
769; GFX6-NEXT:    buffer_store_byte v0, off, s[4:7], 0
770; GFX6-NEXT:    s_endpgm
771; GFX9-LABEL: udiv_i8:
772; GFX9:       ; %bb.0:
773; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
774; GFX9-NEXT:    v_mov_b32_e32 v2, 0
775; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
776; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
777; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v0, s2
778; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
779; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, s2
780; GFX9-NEXT:    v_mul_f32_e32 v1, v3, v1
781; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
782; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v1
783; GFX9-NEXT:    v_mad_f32 v1, -v1, v0, v3
784; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
785; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
786; GFX9-NEXT:    global_store_byte v2, v0, s[0:1]
787; GFX9-NEXT:    s_endpgm
; Input IR under test: a single i8 udiv of the two arguments, stored to %out.
788  %r = udiv i8 %x, %y
789  store i8 %r, i8 addrspace(1)* %out
790  ret void
791}
792
; urem_i8: unsigned 8-bit remainder of two kernel arguments, stored to %out.
;
; The assertions below are autogenerated (see the NOTE lines at the top of the
; file); regenerate them with the update scripts instead of editing by hand.
;
; What the opt run (default CHECK prefix, -amdgpu-codegenprepare) is expected
; to produce in place of the urem:
;   * both operands are zero-extended to i32 and converted to float (uitofp);
;   * a quotient estimate is formed as trunc(x * rcp(y)), converted back with
;     fptoui, and incremented by 1 whenever the fmad residual |x - q*y| is
;     >= |y|;
;   * the remainder is x - q * y, masked with 255 and truncated to i8 before
;     the store.
; The GFX6 and GFX9 lines pin the llc output for tahiti and gfx900.
793define amdgpu_kernel void @urem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
794; CHECK-LABEL: @urem_i8(
795; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
796; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
797; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
798; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
799; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
800; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
801; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
802; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
803; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
804; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
805; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
806; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
807; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
808; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
809; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
810; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
811; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
812; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 255
813; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i8
814; CHECK-NEXT:    store i8 [[TMP19]], i8 addrspace(1)* [[OUT:%.*]], align 1
815; CHECK-NEXT:    ret void
816;
817; GFX6-LABEL: urem_i8:
818; GFX6:       ; %bb.0:
819; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
820; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
821; GFX6-NEXT:    s_mov_b32 s3, 0xf000
822; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
823; GFX6-NEXT:    v_cvt_f32_ubyte1_e32 v0, s4
824; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v0
825; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v2, s4
826; GFX6-NEXT:    s_lshr_b32 s2, s4, 8
827; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v1
828; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
829; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v1
830; GFX6-NEXT:    v_mad_f32 v1, -v1, v0, v2
831; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
832; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
833; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
834; GFX6-NEXT:    s_mov_b32 s2, -1
835; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
836; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
837; GFX6-NEXT:    s_endpgm
838; GFX9-LABEL: urem_i8:
839; GFX9:       ; %bb.0:
840; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
841; GFX9-NEXT:    s_nop 0
842; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
843; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
844; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v0, s2
845; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
846; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, s2
847; GFX9-NEXT:    s_lshr_b32 s3, s2, 8
848; GFX9-NEXT:    v_mul_f32_e32 v1, v2, v1
849; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
850; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v1
851; GFX9-NEXT:    v_mad_f32 v1, -v1, v0, v2
852; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
853; GFX9-NEXT:    v_mov_b32_e32 v1, 0
854; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
855; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
856; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
857; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
858; GFX9-NEXT:    s_endpgm
; Input IR under test: a single i8 urem of the two arguments, stored to %out.
859  %r = urem i8 %x, %y
860  store i8 %r, i8 addrspace(1)* %out
861  ret void
862}
863
; sdiv_i8: signed 8-bit division of two kernel arguments, stored to %out.
;
; The assertions below are autogenerated (see the NOTE lines at the top of the
; file); regenerate them with the update scripts instead of editing by hand.
;
; What the opt run (default CHECK prefix, -amdgpu-codegenprepare) is expected
; to produce in place of the sdiv:
;   * both operands are sign-extended to i32 and converted to float (sitofp);
;   * a quotient estimate is formed as trunc(x * rcp(y)) and converted back
;     with fptosi;
;   * the estimate is adjusted by a +1/-1 value built from the xor of the
;     sign-extended operands (ashr 30, then or 1) whenever the fmad residual
;     |x - q*y| is >= |y|;
;   * the result is re-sign-extended from 8 bits (shl/ashr 24) and truncated
;     to i8 before the store.
; The GFX6 and GFX9 lines pin the llc output for tahiti and gfx900.
864define amdgpu_kernel void @sdiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
865; CHECK-LABEL: @sdiv_i8(
866; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32
867; CHECK-NEXT:    [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32
868; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
869; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
870; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
871; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
872; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
873; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
874; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
875; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
876; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
877; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
878; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
879; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
880; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
881; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
882; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
883; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
884; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 24
885; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 24
886; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i8
887; CHECK-NEXT:    store i8 [[TMP21]], i8 addrspace(1)* [[OUT:%.*]], align 1
888; CHECK-NEXT:    ret void
889;
890; GFX6-LABEL: sdiv_i8:
891; GFX6:       ; %bb.0:
892; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
893; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
894; GFX6-NEXT:    s_mov_b32 s7, 0xf000
895; GFX6-NEXT:    s_mov_b32 s6, -1
896; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
897; GFX6-NEXT:    s_bfe_i32 s1, s0, 0x80008
898; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s1
899; GFX6-NEXT:    s_sext_i32_i8 s0, s0
900; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s0
901; GFX6-NEXT:    s_xor_b32 s0, s0, s1
902; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
903; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
904; GFX6-NEXT:    s_or_b32 s0, s0, 1
905; GFX6-NEXT:    v_mov_b32_e32 v3, s0
906; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
907; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
908; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
909; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
910; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
911; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
912; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
913; GFX6-NEXT:    buffer_store_byte v0, off, s[4:7], 0
914; GFX6-NEXT:    s_endpgm
915; GFX9-LABEL: sdiv_i8:
916; GFX9:       ; %bb.0:
917; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
918; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
919; GFX9-NEXT:    v_mov_b32_e32 v1, 0
920; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
921; GFX9-NEXT:    s_bfe_i32 s0, s4, 0x80008
922; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
923; GFX9-NEXT:    s_sext_i32_i8 s1, s4
924; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s1
925; GFX9-NEXT:    s_xor_b32 s0, s1, s0
926; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
927; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
928; GFX9-NEXT:    s_or_b32 s4, s0, 1
929; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
930; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
931; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
932; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
933; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
934; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
935; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
936; GFX9-NEXT:    v_add_u32_e32 v0, s0, v3
937; GFX9-NEXT:    global_store_byte v1, v0, s[2:3]
938; GFX9-NEXT:    s_endpgm
; Input IR under test: a single i8 sdiv of the two arguments, stored to %out.
939  %r = sdiv i8 %x, %y
940  store i8 %r, i8 addrspace(1)* %out
941  ret void
942}
943
; srem_i8: signed 8-bit remainder of two kernel arguments, stored to %out.
;
; The assertions below are autogenerated (see the NOTE lines at the top of the
; file); regenerate them with the update scripts instead of editing by hand.
;
; What the opt run (default CHECK prefix, -amdgpu-codegenprepare) is expected
; to produce in place of the srem:
;   * both operands are sign-extended to i32 and converted to float (sitofp);
;   * a quotient estimate is formed as trunc(x * rcp(y)) and converted back
;     with fptosi;
;   * the estimate is adjusted by a +1/-1 value built from the xor of the
;     sign-extended operands (ashr 30, then or 1) whenever the fmad residual
;     |x - q*y| is >= |y|;
;   * the remainder is x - q * y, re-sign-extended from 8 bits (shl/ashr 24)
;     and truncated to i8 before the store.
; The GFX6 and GFX9 lines pin the llc output for tahiti and gfx900.
944define amdgpu_kernel void @srem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
945; CHECK-LABEL: @srem_i8(
946; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32
947; CHECK-NEXT:    [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32
948; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
949; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
950; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
951; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
952; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
953; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
954; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
955; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
956; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
957; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
958; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
959; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
960; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
961; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
962; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
963; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
964; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
965; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
966; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 24
967; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 24
968; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i8
969; CHECK-NEXT:    store i8 [[TMP23]], i8 addrspace(1)* [[OUT:%.*]], align 1
970; CHECK-NEXT:    ret void
971;
972; GFX6-LABEL: srem_i8:
973; GFX6:       ; %bb.0:
974; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
975; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
976; GFX6-NEXT:    s_mov_b32 s7, 0xf000
977; GFX6-NEXT:    s_mov_b32 s6, -1
978; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
979; GFX6-NEXT:    s_bfe_i32 s1, s0, 0x80008
980; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s1
981; GFX6-NEXT:    s_sext_i32_i8 s3, s0
982; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s3
983; GFX6-NEXT:    s_xor_b32 s1, s3, s1
984; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
985; GFX6-NEXT:    s_ashr_i32 s1, s1, 30
986; GFX6-NEXT:    s_or_b32 s1, s1, 1
987; GFX6-NEXT:    v_mov_b32_e32 v3, s1
988; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
989; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
990; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
991; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
992; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
993; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
994; GFX6-NEXT:    s_lshr_b32 s2, s0, 8
995; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
996; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
997; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
998; GFX6-NEXT:    buffer_store_byte v0, off, s[4:7], 0
999; GFX6-NEXT:    s_endpgm
1000; GFX9-LABEL: srem_i8:
1001; GFX9:       ; %bb.0:
1002; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
1003; GFX9-NEXT:    s_nop 0
1004; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1005; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1006; GFX9-NEXT:    s_bfe_i32 s2, s4, 0x80008
1007; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s2
1008; GFX9-NEXT:    s_sext_i32_i8 s3, s4
1009; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s3
1010; GFX9-NEXT:    s_xor_b32 s2, s3, s2
1011; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
1012; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
1013; GFX9-NEXT:    s_lshr_b32 s5, s4, 8
1014; GFX9-NEXT:    s_or_b32 s6, s2, 1
1015; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
1016; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
1017; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
1018; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v2
1019; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
1020; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
1021; GFX9-NEXT:    s_cselect_b32 s2, s6, 0
1022; GFX9-NEXT:    v_add_u32_e32 v0, s2, v2
1023; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s5
1024; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1025; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
1026; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
1027; GFX9-NEXT:    s_endpgm
; Input IR under test: a single i8 srem of the two arguments, stored to %out.
1028  %r = srem i8 %x, %y
1029  store i8 %r, i8 addrspace(1)* %out
1030  ret void
1031}
1032
1033define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
1034; CHECK-LABEL: @udiv_v4i32(
1035; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
1036; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
1037; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
1038; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
1039; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
1040; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
1041; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[TMP2]]
1042; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
1043; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
1044; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
1045; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
1046; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
1047; CHECK-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
1048; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
1049; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
1050; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
1051; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
1052; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
1053; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
1054; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
1055; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
1056; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
1057; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
1058; CHECK-NEXT:    [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
1059; CHECK-NEXT:    [[TMP25:%.*]] = add i32 [[TMP21]], 1
1060; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]]
1061; CHECK-NEXT:    [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]]
1062; CHECK-NEXT:    [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]]
1063; CHECK-NEXT:    [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]]
1064; CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[TMP26]], 1
1065; CHECK-NEXT:    [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]]
1066; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <4 x i32> undef, i32 [[TMP31]], i64 0
1067; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <4 x i32> [[X]], i64 1
1068; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <4 x i32> [[Y]], i64 1
1069; CHECK-NEXT:    [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float
1070; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
1071; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000
1072; CHECK-NEXT:    [[TMP38:%.*]] = fptoui float [[TMP37]] to i32
1073; CHECK-NEXT:    [[TMP39:%.*]] = sub i32 0, [[TMP34]]
1074; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]]
1075; CHECK-NEXT:    [[TMP41:%.*]] = zext i32 [[TMP38]] to i64
1076; CHECK-NEXT:    [[TMP42:%.*]] = zext i32 [[TMP40]] to i64
1077; CHECK-NEXT:    [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]]
1078; CHECK-NEXT:    [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
1079; CHECK-NEXT:    [[TMP45:%.*]] = lshr i64 [[TMP43]], 32
1080; CHECK-NEXT:    [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32
1081; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]]
1082; CHECK-NEXT:    [[TMP48:%.*]] = zext i32 [[TMP33]] to i64
1083; CHECK-NEXT:    [[TMP49:%.*]] = zext i32 [[TMP47]] to i64
1084; CHECK-NEXT:    [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]]
1085; CHECK-NEXT:    [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
1086; CHECK-NEXT:    [[TMP52:%.*]] = lshr i64 [[TMP50]], 32
1087; CHECK-NEXT:    [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32
1088; CHECK-NEXT:    [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]]
1089; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]]
1090; CHECK-NEXT:    [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]]
1091; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP53]], 1
1092; CHECK-NEXT:    [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]]
1093; CHECK-NEXT:    [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]]
1094; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]]
1095; CHECK-NEXT:    [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]]
1096; CHECK-NEXT:    [[TMP62:%.*]] = add i32 [[TMP58]], 1
1097; CHECK-NEXT:    [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]]
1098; CHECK-NEXT:    [[TMP64:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP63]], i64 1
1099; CHECK-NEXT:    [[TMP65:%.*]] = extractelement <4 x i32> [[X]], i64 2
1100; CHECK-NEXT:    [[TMP66:%.*]] = extractelement <4 x i32> [[Y]], i64 2
1101; CHECK-NEXT:    [[TMP67:%.*]] = uitofp i32 [[TMP66]] to float
1102; CHECK-NEXT:    [[TMP68:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP67]])
1103; CHECK-NEXT:    [[TMP69:%.*]] = fmul fast float [[TMP68]], 0x41EFFFFFC0000000
1104; CHECK-NEXT:    [[TMP70:%.*]] = fptoui float [[TMP69]] to i32
1105; CHECK-NEXT:    [[TMP71:%.*]] = sub i32 0, [[TMP66]]
1106; CHECK-NEXT:    [[TMP72:%.*]] = mul i32 [[TMP71]], [[TMP70]]
1107; CHECK-NEXT:    [[TMP73:%.*]] = zext i32 [[TMP70]] to i64
1108; CHECK-NEXT:    [[TMP74:%.*]] = zext i32 [[TMP72]] to i64
1109; CHECK-NEXT:    [[TMP75:%.*]] = mul i64 [[TMP73]], [[TMP74]]
1110; CHECK-NEXT:    [[TMP76:%.*]] = trunc i64 [[TMP75]] to i32
1111; CHECK-NEXT:    [[TMP77:%.*]] = lshr i64 [[TMP75]], 32
1112; CHECK-NEXT:    [[TMP78:%.*]] = trunc i64 [[TMP77]] to i32
1113; CHECK-NEXT:    [[TMP79:%.*]] = add i32 [[TMP70]], [[TMP78]]
1114; CHECK-NEXT:    [[TMP80:%.*]] = zext i32 [[TMP65]] to i64
1115; CHECK-NEXT:    [[TMP81:%.*]] = zext i32 [[TMP79]] to i64
1116; CHECK-NEXT:    [[TMP82:%.*]] = mul i64 [[TMP80]], [[TMP81]]
1117; CHECK-NEXT:    [[TMP83:%.*]] = trunc i64 [[TMP82]] to i32
1118; CHECK-NEXT:    [[TMP84:%.*]] = lshr i64 [[TMP82]], 32
1119; CHECK-NEXT:    [[TMP85:%.*]] = trunc i64 [[TMP84]] to i32
1120; CHECK-NEXT:    [[TMP86:%.*]] = mul i32 [[TMP85]], [[TMP66]]
1121; CHECK-NEXT:    [[TMP87:%.*]] = sub i32 [[TMP65]], [[TMP86]]
1122; CHECK-NEXT:    [[TMP88:%.*]] = icmp uge i32 [[TMP87]], [[TMP66]]
1123; CHECK-NEXT:    [[TMP89:%.*]] = add i32 [[TMP85]], 1
1124; CHECK-NEXT:    [[TMP90:%.*]] = select i1 [[TMP88]], i32 [[TMP89]], i32 [[TMP85]]
1125; CHECK-NEXT:    [[TMP91:%.*]] = sub i32 [[TMP87]], [[TMP66]]
1126; CHECK-NEXT:    [[TMP92:%.*]] = select i1 [[TMP88]], i32 [[TMP91]], i32 [[TMP87]]
1127; CHECK-NEXT:    [[TMP93:%.*]] = icmp uge i32 [[TMP92]], [[TMP66]]
1128; CHECK-NEXT:    [[TMP94:%.*]] = add i32 [[TMP90]], 1
1129; CHECK-NEXT:    [[TMP95:%.*]] = select i1 [[TMP93]], i32 [[TMP94]], i32 [[TMP90]]
1130; CHECK-NEXT:    [[TMP96:%.*]] = insertelement <4 x i32> [[TMP64]], i32 [[TMP95]], i64 2
1131; CHECK-NEXT:    [[TMP97:%.*]] = extractelement <4 x i32> [[X]], i64 3
1132; CHECK-NEXT:    [[TMP98:%.*]] = extractelement <4 x i32> [[Y]], i64 3
1133; CHECK-NEXT:    [[TMP99:%.*]] = uitofp i32 [[TMP98]] to float
1134; CHECK-NEXT:    [[TMP100:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP99]])
1135; CHECK-NEXT:    [[TMP101:%.*]] = fmul fast float [[TMP100]], 0x41EFFFFFC0000000
1136; CHECK-NEXT:    [[TMP102:%.*]] = fptoui float [[TMP101]] to i32
1137; CHECK-NEXT:    [[TMP103:%.*]] = sub i32 0, [[TMP98]]
1138; CHECK-NEXT:    [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP102]]
1139; CHECK-NEXT:    [[TMP105:%.*]] = zext i32 [[TMP102]] to i64
1140; CHECK-NEXT:    [[TMP106:%.*]] = zext i32 [[TMP104]] to i64
1141; CHECK-NEXT:    [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]]
1142; CHECK-NEXT:    [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32
1143; CHECK-NEXT:    [[TMP109:%.*]] = lshr i64 [[TMP107]], 32
1144; CHECK-NEXT:    [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32
1145; CHECK-NEXT:    [[TMP111:%.*]] = add i32 [[TMP102]], [[TMP110]]
1146; CHECK-NEXT:    [[TMP112:%.*]] = zext i32 [[TMP97]] to i64
1147; CHECK-NEXT:    [[TMP113:%.*]] = zext i32 [[TMP111]] to i64
1148; CHECK-NEXT:    [[TMP114:%.*]] = mul i64 [[TMP112]], [[TMP113]]
1149; CHECK-NEXT:    [[TMP115:%.*]] = trunc i64 [[TMP114]] to i32
1150; CHECK-NEXT:    [[TMP116:%.*]] = lshr i64 [[TMP114]], 32
1151; CHECK-NEXT:    [[TMP117:%.*]] = trunc i64 [[TMP116]] to i32
1152; CHECK-NEXT:    [[TMP118:%.*]] = mul i32 [[TMP117]], [[TMP98]]
1153; CHECK-NEXT:    [[TMP119:%.*]] = sub i32 [[TMP97]], [[TMP118]]
1154; CHECK-NEXT:    [[TMP120:%.*]] = icmp uge i32 [[TMP119]], [[TMP98]]
1155; CHECK-NEXT:    [[TMP121:%.*]] = add i32 [[TMP117]], 1
1156; CHECK-NEXT:    [[TMP122:%.*]] = select i1 [[TMP120]], i32 [[TMP121]], i32 [[TMP117]]
1157; CHECK-NEXT:    [[TMP123:%.*]] = sub i32 [[TMP119]], [[TMP98]]
1158; CHECK-NEXT:    [[TMP124:%.*]] = select i1 [[TMP120]], i32 [[TMP123]], i32 [[TMP119]]
1159; CHECK-NEXT:    [[TMP125:%.*]] = icmp uge i32 [[TMP124]], [[TMP98]]
1160; CHECK-NEXT:    [[TMP126:%.*]] = add i32 [[TMP122]], 1
1161; CHECK-NEXT:    [[TMP127:%.*]] = select i1 [[TMP125]], i32 [[TMP126]], i32 [[TMP122]]
1162; CHECK-NEXT:    [[TMP128:%.*]] = insertelement <4 x i32> [[TMP96]], i32 [[TMP127]], i64 3
1163; CHECK-NEXT:    store <4 x i32> [[TMP128]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16
1164; CHECK-NEXT:    ret void
1165;
1166; GFX6-LABEL: udiv_v4i32:
1167; GFX6:       ; %bb.0:
1168; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
1169; GFX6-NEXT:    s_mov_b32 s3, 0x4f7ffffe
1170; GFX6-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
1171; GFX6-NEXT:    s_mov_b32 s15, 0xf000
1172; GFX6-NEXT:    s_mov_b32 s14, -1
1173; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1174; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
1175; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
1176; GFX6-NEXT:    s_sub_i32 s2, 0, s8
1177; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s10
1178; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1179; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1180; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s11
1181; GFX6-NEXT:    v_mul_f32_e32 v0, s3, v0
1182; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
1183; GFX6-NEXT:    v_mul_f32_e32 v1, s3, v1
1184; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
1185; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v0
1186; GFX6-NEXT:    s_sub_i32 s2, 0, s9
1187; GFX6-NEXT:    v_mul_lo_u32 v3, s2, v1
1188; GFX6-NEXT:    s_sub_i32 s2, 0, s10
1189; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
1190; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
1191; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
1192; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
1193; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
1194; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
1195; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s8
1196; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
1197; GFX6-NEXT:    v_mul_lo_u32 v5, v1, s9
1198; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s4, v2
1199; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v2
1200; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
1201; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v2
1202; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
1203; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
1204; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
1205; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v4
1206; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
1207; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s5, v5
1208; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
1209; GFX6-NEXT:    v_mul_f32_e32 v2, s3, v2
1210; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
1211; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v3
1212; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
1213; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s9, v3
1214; GFX6-NEXT:    v_mul_lo_u32 v4, s2, v2
1215; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
1216; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v1
1217; GFX6-NEXT:    s_sub_i32 s0, 0, s11
1218; GFX6-NEXT:    v_mul_hi_u32 v4, v2, v4
1219; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
1220; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v6
1221; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
1222; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
1223; GFX6-NEXT:    v_mul_hi_u32 v2, s6, v2
1224; GFX6-NEXT:    v_mul_f32_e32 v4, s3, v4
1225; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
1226; GFX6-NEXT:    v_mul_lo_u32 v3, v2, s10
1227; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v2
1228; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v4
1229; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s6, v3
1230; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v3
1231; GFX6-NEXT:    v_mul_hi_u32 v5, v4, v5
1232; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
1233; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, s10, v3
1234; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
1235; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
1236; GFX6-NEXT:    v_mul_hi_u32 v4, s7, v4
1237; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v2
1238; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
1239; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
1240; GFX6-NEXT:    v_mul_lo_u32 v6, v4, s11
1241; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
1242; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s7, v6
1243; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v3
1244; GFX6-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[0:1]
1245; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s11, v3
1246; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
1247; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
1248; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
1249; GFX6-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
1250; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
1251; GFX6-NEXT:    s_endpgm
1252; GFX9-LABEL: udiv_v4i32:
1253; GFX9:       ; %bb.0:
1254; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
1255; GFX9-NEXT:    s_mov_b32 s12, 0x4f7ffffe
1256; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1257; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1258; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1259; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
1260; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
1261; GFX9-NEXT:    s_sub_i32 s2, 0, s8
1262; GFX9-NEXT:    s_sub_i32 s3, 0, s9
1263; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1264; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1265; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s10
1266; GFX9-NEXT:    v_mul_f32_e32 v0, s12, v0
1267; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
1268; GFX9-NEXT:    v_mul_f32_e32 v1, s12, v1
1269; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
1270; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v5
1271; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v0
1272; GFX9-NEXT:    s_sub_i32 s2, 0, s10
1273; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v1
1274; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
1275; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
1276; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
1277; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
1278; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
1279; GFX9-NEXT:    v_mul_f32_e32 v2, s12, v5
1280; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
1281; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s8
1282; GFX9-NEXT:    v_add_u32_e32 v6, 1, v0
1283; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
1284; GFX9-NEXT:    v_sub_u32_e32 v3, s4, v3
1285; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
1286; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
1287; GFX9-NEXT:    v_subrev_u32_e32 v6, s8, v3
1288; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
1289; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
1290; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s11
1291; GFX9-NEXT:    v_add_u32_e32 v6, 1, v0
1292; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
1293; GFX9-NEXT:    v_mul_lo_u32 v6, s2, v2
1294; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
1295; GFX9-NEXT:    v_mul_lo_u32 v5, v1, s9
1296; GFX9-NEXT:    s_sub_i32 s2, 0, s11
1297; GFX9-NEXT:    v_mul_hi_u32 v6, v2, v6
1298; GFX9-NEXT:    v_mul_f32_e32 v3, s12, v3
1299; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
1300; GFX9-NEXT:    v_sub_u32_e32 v5, s5, v5
1301; GFX9-NEXT:    v_add_u32_e32 v2, v2, v6
1302; GFX9-NEXT:    v_add_u32_e32 v7, 1, v1
1303; GFX9-NEXT:    v_mul_lo_u32 v6, s2, v3
1304; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v5
1305; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
1306; GFX9-NEXT:    v_subrev_u32_e32 v7, s9, v5
1307; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
1308; GFX9-NEXT:    v_mul_hi_u32 v2, s6, v2
1309; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v5
1310; GFX9-NEXT:    v_mul_hi_u32 v5, v3, v6
1311; GFX9-NEXT:    v_add_u32_e32 v7, 1, v1
1312; GFX9-NEXT:    v_mul_lo_u32 v8, v2, s10
1313; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
1314; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
1315; GFX9-NEXT:    v_mul_hi_u32 v3, s7, v3
1316; GFX9-NEXT:    v_sub_u32_e32 v6, s6, v8
1317; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v6
1318; GFX9-NEXT:    v_subrev_u32_e32 v5, s10, v6
1319; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
1320; GFX9-NEXT:    v_mul_lo_u32 v6, v3, s11
1321; GFX9-NEXT:    v_add_u32_e32 v7, 1, v2
1322; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc
1323; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v5
1324; GFX9-NEXT:    v_add_u32_e32 v7, 1, v2
1325; GFX9-NEXT:    v_sub_u32_e32 v5, s7, v6
1326; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc
1327; GFX9-NEXT:    v_add_u32_e32 v6, 1, v3
1328; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v5
1329; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
1330; GFX9-NEXT:    v_subrev_u32_e32 v6, s11, v5
1331; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
1332; GFX9-NEXT:    v_add_u32_e32 v6, 1, v3
1333; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v5
1334; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
1335; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
1336; GFX9-NEXT:    s_endpgm
1337  %r = udiv <4 x i32> %x, %y
1338  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
1339  ret void
1340}
1341
; urem_v4i32: unsigned remainder of two <4 x i32> kernel arguments, with the
; result vector stored through %out.
;
; The assertions inside this function are autogenerated (see the NOTE lines at
; the top of the file); regenerate them with the update scripts named there
; rather than editing them by hand.
;
; What the three assertion groups pin down:
;   * CHECK prefix (opt with -amdgpu-codegenprepare): no vector urem remains.
;     Each of the four lanes is extracted from X and Y and expanded into:
;       - a reciprocal estimate of the divisor lane
;         (uitofp, llvm.amdgcn.rcp.f32, fmul by a constant, fptoui),
;       - an adjustment of that estimate using the high half of a 64-bit
;         product (zext / mul i64 / lshr 32 / trunc, then add),
;       - a trial quotient (high half of dividend * estimate) and the
;         remainder candidate "dividend - quotient * divisor",
;       - two rounds of "icmp uge / sub / select" that subtract the divisor
;         while the candidate is still >= the divisor.
;     The four lane results are re-inserted and stored as one <4 x i32>.
;   * GFX6 prefix (llc, -mcpu=tahiti) and GFX9 prefix (llc, -mcpu=gfx900):
;     the final instruction sequence for the same kernel, ending in a single
;     dwordx4 store of v[0:3].
1342define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
1343; CHECK-LABEL: @urem_v4i32(
1344; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
1345; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
1346; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
1347; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
1348; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
1349; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
1350; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[TMP2]]
1351; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
1352; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
1353; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
1354; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
1355; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
1356; CHECK-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
1357; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
1358; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
1359; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
1360; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
1361; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
1362; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
1363; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
1364; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
1365; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
1366; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
1367; CHECK-NEXT:    [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
1368; CHECK-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]]
1369; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]]
1370; CHECK-NEXT:    [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]]
1371; CHECK-NEXT:    [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]]
1372; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]]
1373; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <4 x i32> undef, i32 [[TMP29]], i64 0
1374; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <4 x i32> [[X]], i64 1
1375; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <4 x i32> [[Y]], i64 1
1376; CHECK-NEXT:    [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float
1377; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
1378; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000
1379; CHECK-NEXT:    [[TMP36:%.*]] = fptoui float [[TMP35]] to i32
1380; CHECK-NEXT:    [[TMP37:%.*]] = sub i32 0, [[TMP32]]
1381; CHECK-NEXT:    [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]]
1382; CHECK-NEXT:    [[TMP39:%.*]] = zext i32 [[TMP36]] to i64
1383; CHECK-NEXT:    [[TMP40:%.*]] = zext i32 [[TMP38]] to i64
1384; CHECK-NEXT:    [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]]
1385; CHECK-NEXT:    [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32
1386; CHECK-NEXT:    [[TMP43:%.*]] = lshr i64 [[TMP41]], 32
1387; CHECK-NEXT:    [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
1388; CHECK-NEXT:    [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]]
1389; CHECK-NEXT:    [[TMP46:%.*]] = zext i32 [[TMP31]] to i64
1390; CHECK-NEXT:    [[TMP47:%.*]] = zext i32 [[TMP45]] to i64
1391; CHECK-NEXT:    [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]]
1392; CHECK-NEXT:    [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32
1393; CHECK-NEXT:    [[TMP50:%.*]] = lshr i64 [[TMP48]], 32
1394; CHECK-NEXT:    [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
1395; CHECK-NEXT:    [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]]
1396; CHECK-NEXT:    [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]]
1397; CHECK-NEXT:    [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]]
1398; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]]
1399; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]]
1400; CHECK-NEXT:    [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]]
1401; CHECK-NEXT:    [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]]
1402; CHECK-NEXT:    [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]]
1403; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP59]], i64 1
1404; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <4 x i32> [[X]], i64 2
1405; CHECK-NEXT:    [[TMP62:%.*]] = extractelement <4 x i32> [[Y]], i64 2
1406; CHECK-NEXT:    [[TMP63:%.*]] = uitofp i32 [[TMP62]] to float
1407; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP63]])
1408; CHECK-NEXT:    [[TMP65:%.*]] = fmul fast float [[TMP64]], 0x41EFFFFFC0000000
1409; CHECK-NEXT:    [[TMP66:%.*]] = fptoui float [[TMP65]] to i32
1410; CHECK-NEXT:    [[TMP67:%.*]] = sub i32 0, [[TMP62]]
1411; CHECK-NEXT:    [[TMP68:%.*]] = mul i32 [[TMP67]], [[TMP66]]
1412; CHECK-NEXT:    [[TMP69:%.*]] = zext i32 [[TMP66]] to i64
1413; CHECK-NEXT:    [[TMP70:%.*]] = zext i32 [[TMP68]] to i64
1414; CHECK-NEXT:    [[TMP71:%.*]] = mul i64 [[TMP69]], [[TMP70]]
1415; CHECK-NEXT:    [[TMP72:%.*]] = trunc i64 [[TMP71]] to i32
1416; CHECK-NEXT:    [[TMP73:%.*]] = lshr i64 [[TMP71]], 32
1417; CHECK-NEXT:    [[TMP74:%.*]] = trunc i64 [[TMP73]] to i32
1418; CHECK-NEXT:    [[TMP75:%.*]] = add i32 [[TMP66]], [[TMP74]]
1419; CHECK-NEXT:    [[TMP76:%.*]] = zext i32 [[TMP61]] to i64
1420; CHECK-NEXT:    [[TMP77:%.*]] = zext i32 [[TMP75]] to i64
1421; CHECK-NEXT:    [[TMP78:%.*]] = mul i64 [[TMP76]], [[TMP77]]
1422; CHECK-NEXT:    [[TMP79:%.*]] = trunc i64 [[TMP78]] to i32
1423; CHECK-NEXT:    [[TMP80:%.*]] = lshr i64 [[TMP78]], 32
1424; CHECK-NEXT:    [[TMP81:%.*]] = trunc i64 [[TMP80]] to i32
1425; CHECK-NEXT:    [[TMP82:%.*]] = mul i32 [[TMP81]], [[TMP62]]
1426; CHECK-NEXT:    [[TMP83:%.*]] = sub i32 [[TMP61]], [[TMP82]]
1427; CHECK-NEXT:    [[TMP84:%.*]] = icmp uge i32 [[TMP83]], [[TMP62]]
1428; CHECK-NEXT:    [[TMP85:%.*]] = sub i32 [[TMP83]], [[TMP62]]
1429; CHECK-NEXT:    [[TMP86:%.*]] = select i1 [[TMP84]], i32 [[TMP85]], i32 [[TMP83]]
1430; CHECK-NEXT:    [[TMP87:%.*]] = icmp uge i32 [[TMP86]], [[TMP62]]
1431; CHECK-NEXT:    [[TMP88:%.*]] = sub i32 [[TMP86]], [[TMP62]]
1432; CHECK-NEXT:    [[TMP89:%.*]] = select i1 [[TMP87]], i32 [[TMP88]], i32 [[TMP86]]
1433; CHECK-NEXT:    [[TMP90:%.*]] = insertelement <4 x i32> [[TMP60]], i32 [[TMP89]], i64 2
1434; CHECK-NEXT:    [[TMP91:%.*]] = extractelement <4 x i32> [[X]], i64 3
1435; CHECK-NEXT:    [[TMP92:%.*]] = extractelement <4 x i32> [[Y]], i64 3
1436; CHECK-NEXT:    [[TMP93:%.*]] = uitofp i32 [[TMP92]] to float
1437; CHECK-NEXT:    [[TMP94:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP93]])
1438; CHECK-NEXT:    [[TMP95:%.*]] = fmul fast float [[TMP94]], 0x41EFFFFFC0000000
1439; CHECK-NEXT:    [[TMP96:%.*]] = fptoui float [[TMP95]] to i32
1440; CHECK-NEXT:    [[TMP97:%.*]] = sub i32 0, [[TMP92]]
1441; CHECK-NEXT:    [[TMP98:%.*]] = mul i32 [[TMP97]], [[TMP96]]
1442; CHECK-NEXT:    [[TMP99:%.*]] = zext i32 [[TMP96]] to i64
1443; CHECK-NEXT:    [[TMP100:%.*]] = zext i32 [[TMP98]] to i64
1444; CHECK-NEXT:    [[TMP101:%.*]] = mul i64 [[TMP99]], [[TMP100]]
1445; CHECK-NEXT:    [[TMP102:%.*]] = trunc i64 [[TMP101]] to i32
1446; CHECK-NEXT:    [[TMP103:%.*]] = lshr i64 [[TMP101]], 32
1447; CHECK-NEXT:    [[TMP104:%.*]] = trunc i64 [[TMP103]] to i32
1448; CHECK-NEXT:    [[TMP105:%.*]] = add i32 [[TMP96]], [[TMP104]]
1449; CHECK-NEXT:    [[TMP106:%.*]] = zext i32 [[TMP91]] to i64
1450; CHECK-NEXT:    [[TMP107:%.*]] = zext i32 [[TMP105]] to i64
1451; CHECK-NEXT:    [[TMP108:%.*]] = mul i64 [[TMP106]], [[TMP107]]
1452; CHECK-NEXT:    [[TMP109:%.*]] = trunc i64 [[TMP108]] to i32
1453; CHECK-NEXT:    [[TMP110:%.*]] = lshr i64 [[TMP108]], 32
1454; CHECK-NEXT:    [[TMP111:%.*]] = trunc i64 [[TMP110]] to i32
1455; CHECK-NEXT:    [[TMP112:%.*]] = mul i32 [[TMP111]], [[TMP92]]
1456; CHECK-NEXT:    [[TMP113:%.*]] = sub i32 [[TMP91]], [[TMP112]]
1457; CHECK-NEXT:    [[TMP114:%.*]] = icmp uge i32 [[TMP113]], [[TMP92]]
1458; CHECK-NEXT:    [[TMP115:%.*]] = sub i32 [[TMP113]], [[TMP92]]
1459; CHECK-NEXT:    [[TMP116:%.*]] = select i1 [[TMP114]], i32 [[TMP115]], i32 [[TMP113]]
1460; CHECK-NEXT:    [[TMP117:%.*]] = icmp uge i32 [[TMP116]], [[TMP92]]
1461; CHECK-NEXT:    [[TMP118:%.*]] = sub i32 [[TMP116]], [[TMP92]]
1462; CHECK-NEXT:    [[TMP119:%.*]] = select i1 [[TMP117]], i32 [[TMP118]], i32 [[TMP116]]
1463; CHECK-NEXT:    [[TMP120:%.*]] = insertelement <4 x i32> [[TMP90]], i32 [[TMP119]], i64 3
1464; CHECK-NEXT:    store <4 x i32> [[TMP120]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16
1465; CHECK-NEXT:    ret void
1466;
1467; GFX6-LABEL: urem_v4i32:
1468; GFX6:       ; %bb.0:
1469; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
1470; GFX6-NEXT:    s_mov_b32 s13, 0x4f7ffffe
1471; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1472; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1473; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1474; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
1475; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
1476; GFX6-NEXT:    s_sub_i32 s2, 0, s8
1477; GFX6-NEXT:    s_sub_i32 s12, 0, s9
1478; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1479; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1480; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s10
1481; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s11
1482; GFX6-NEXT:    v_mul_f32_e32 v0, s13, v0
1483; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
1484; GFX6-NEXT:    v_mul_f32_e32 v1, s13, v1
1485; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
1486; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
1487; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v0
1488; GFX6-NEXT:    s_mov_b32 s2, -1
1489; GFX6-NEXT:    v_mul_lo_u32 v4, s12, v1
1490; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
1491; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
1492; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
1493; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
1494; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
1495; GFX6-NEXT:    v_mul_hi_u32 v1, s5, v1
1496; GFX6-NEXT:    v_mul_f32_e32 v2, s13, v3
1497; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s8
1498; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
1499; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s9
1500; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
1501; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
1502; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
1503; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
1504; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
1505; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
1506; GFX6-NEXT:    s_sub_i32 s4, 0, s10
1507; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
1508; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v2
1509; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
1510; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v1
1511; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
1512; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
1513; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v3
1514; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v5
1515; GFX6-NEXT:    s_sub_i32 s4, 0, s11
1516; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
1517; GFX6-NEXT:    v_mul_f32_e32 v3, s13, v4
1518; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
1519; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v1
1520; GFX6-NEXT:    v_mul_hi_u32 v2, s6, v2
1521; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
1522; GFX6-NEXT:    v_mul_lo_u32 v5, s4, v3
1523; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
1524; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s10
1525; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v5
1526; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
1527; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s10, v2
1528; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
1529; GFX6-NEXT:    v_mul_hi_u32 v3, s7, v3
1530; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
1531; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
1532; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s10, v2
1533; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s11
1534; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
1535; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
1536; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s7, v3
1537; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s11, v3
1538; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
1539; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
1540; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s11, v3
1541; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
1542; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
1543; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1544; GFX6-NEXT:    s_endpgm
1545; GFX9-LABEL: urem_v4i32:
1546; GFX9:       ; %bb.0:
1547; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
1548; GFX9-NEXT:    s_mov_b32 s12, 0x4f7ffffe
1549; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1550; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1551; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1552; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
1553; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
1554; GFX9-NEXT:    s_sub_i32 s2, 0, s8
1555; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s10
1556; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1557; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1558; GFX9-NEXT:    s_sub_i32 s3, 0, s9
1559; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v5
1560; GFX9-NEXT:    v_mul_f32_e32 v0, s12, v0
1561; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
1562; GFX9-NEXT:    v_mul_f32_e32 v1, s12, v1
1563; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
1564; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s11
1565; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v0
1566; GFX9-NEXT:    s_sub_i32 s2, 0, s10
1567; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v1
1568; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
1569; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
1570; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
1571; GFX9-NEXT:    v_mul_f32_e32 v2, s12, v5
1572; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
1573; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
1574; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v6
1575; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
1576; GFX9-NEXT:    v_mul_lo_u32 v5, s2, v2
1577; GFX9-NEXT:    s_sub_i32 s2, 0, s11
1578; GFX9-NEXT:    v_mul_f32_e32 v3, s12, v3
1579; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
1580; GFX9-NEXT:    v_mul_hi_u32 v5, v2, v5
1581; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
1582; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s8
1583; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
1584; GFX9-NEXT:    v_mul_lo_u32 v5, s2, v3
1585; GFX9-NEXT:    v_mul_hi_u32 v2, s6, v2
1586; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s9
1587; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
1588; GFX9-NEXT:    v_mul_hi_u32 v5, v3, v5
1589; GFX9-NEXT:    v_subrev_u32_e32 v6, s8, v0
1590; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
1591; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
1592; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
1593; GFX9-NEXT:    v_mul_hi_u32 v3, s7, v3
1594; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s10
1595; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v1
1596; GFX9-NEXT:    v_subrev_u32_e32 v6, s8, v0
1597; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
1598; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
1599; GFX9-NEXT:    v_subrev_u32_e32 v6, s9, v1
1600; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
1601; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
1602; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s11
1603; GFX9-NEXT:    v_sub_u32_e32 v2, s6, v2
1604; GFX9-NEXT:    v_subrev_u32_e32 v6, s9, v1
1605; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
1606; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
1607; GFX9-NEXT:    v_subrev_u32_e32 v5, s10, v2
1608; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
1609; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
1610; GFX9-NEXT:    v_subrev_u32_e32 v5, s10, v2
1611; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
1612; GFX9-NEXT:    v_sub_u32_e32 v3, s7, v3
1613; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
1614; GFX9-NEXT:    v_subrev_u32_e32 v5, s11, v3
1615; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
1616; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
1617; GFX9-NEXT:    v_subrev_u32_e32 v5, s11, v3
1618; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
1619; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
1620; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
1621; GFX9-NEXT:    s_endpgm
; Input IR under test: one vector urem whose result is stored through %out.
1622  %r = urem <4 x i32> %x, %y
1623  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
1624  ret void
1625}
1626
1627define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
1628; CHECK-LABEL: @sdiv_v4i32(
1629; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
1630; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
1631; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
1632; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
1633; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
1634; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]]
1635; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]]
1636; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]]
1637; CHECK-NEXT:    [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]]
1638; CHECK-NEXT:    [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float
1639; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]])
1640; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000
1641; CHECK-NEXT:    [[TMP13:%.*]] = fptoui float [[TMP12]] to i32
1642; CHECK-NEXT:    [[TMP14:%.*]] = sub i32 0, [[TMP9]]
1643; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]]
1644; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP13]] to i64
1645; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
1646; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
1647; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
1648; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
1649; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
1650; CHECK-NEXT:    [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]]
1651; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP8]] to i64
1652; CHECK-NEXT:    [[TMP24:%.*]] = zext i32 [[TMP22]] to i64
1653; CHECK-NEXT:    [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]]
1654; CHECK-NEXT:    [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
1655; CHECK-NEXT:    [[TMP27:%.*]] = lshr i64 [[TMP25]], 32
1656; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
1657; CHECK-NEXT:    [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]]
1658; CHECK-NEXT:    [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]]
1659; CHECK-NEXT:    [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]]
1660; CHECK-NEXT:    [[TMP32:%.*]] = add i32 [[TMP28]], 1
1661; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]]
1662; CHECK-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]]
1663; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]]
1664; CHECK-NEXT:    [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]]
1665; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP33]], 1
1666; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]]
1667; CHECK-NEXT:    [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]]
1668; CHECK-NEXT:    [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]]
1669; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <4 x i32> undef, i32 [[TMP40]], i64 0
1670; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <4 x i32> [[X]], i64 1
1671; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <4 x i32> [[Y]], i64 1
1672; CHECK-NEXT:    [[TMP44:%.*]] = ashr i32 [[TMP42]], 31
1673; CHECK-NEXT:    [[TMP45:%.*]] = ashr i32 [[TMP43]], 31
1674; CHECK-NEXT:    [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]]
1675; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]]
1676; CHECK-NEXT:    [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]]
1677; CHECK-NEXT:    [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]]
1678; CHECK-NEXT:    [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]]
1679; CHECK-NEXT:    [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float
1680; CHECK-NEXT:    [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]])
1681; CHECK-NEXT:    [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000
1682; CHECK-NEXT:    [[TMP54:%.*]] = fptoui float [[TMP53]] to i32
1683; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 0, [[TMP50]]
1684; CHECK-NEXT:    [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]]
1685; CHECK-NEXT:    [[TMP57:%.*]] = zext i32 [[TMP54]] to i64
1686; CHECK-NEXT:    [[TMP58:%.*]] = zext i32 [[TMP56]] to i64
1687; CHECK-NEXT:    [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]]
1688; CHECK-NEXT:    [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32
1689; CHECK-NEXT:    [[TMP61:%.*]] = lshr i64 [[TMP59]], 32
1690; CHECK-NEXT:    [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32
1691; CHECK-NEXT:    [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]]
1692; CHECK-NEXT:    [[TMP64:%.*]] = zext i32 [[TMP49]] to i64
1693; CHECK-NEXT:    [[TMP65:%.*]] = zext i32 [[TMP63]] to i64
1694; CHECK-NEXT:    [[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]]
1695; CHECK-NEXT:    [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32
1696; CHECK-NEXT:    [[TMP68:%.*]] = lshr i64 [[TMP66]], 32
1697; CHECK-NEXT:    [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32
1698; CHECK-NEXT:    [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]]
1699; CHECK-NEXT:    [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]]
1700; CHECK-NEXT:    [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]]
1701; CHECK-NEXT:    [[TMP73:%.*]] = add i32 [[TMP69]], 1
1702; CHECK-NEXT:    [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]]
1703; CHECK-NEXT:    [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]]
1704; CHECK-NEXT:    [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]]
1705; CHECK-NEXT:    [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]]
1706; CHECK-NEXT:    [[TMP78:%.*]] = add i32 [[TMP74]], 1
1707; CHECK-NEXT:    [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]]
1708; CHECK-NEXT:    [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]]
1709; CHECK-NEXT:    [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]]
1710; CHECK-NEXT:    [[TMP82:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[TMP81]], i64 1
1711; CHECK-NEXT:    [[TMP83:%.*]] = extractelement <4 x i32> [[X]], i64 2
1712; CHECK-NEXT:    [[TMP84:%.*]] = extractelement <4 x i32> [[Y]], i64 2
1713; CHECK-NEXT:    [[TMP85:%.*]] = ashr i32 [[TMP83]], 31
1714; CHECK-NEXT:    [[TMP86:%.*]] = ashr i32 [[TMP84]], 31
1715; CHECK-NEXT:    [[TMP87:%.*]] = xor i32 [[TMP85]], [[TMP86]]
1716; CHECK-NEXT:    [[TMP88:%.*]] = add i32 [[TMP83]], [[TMP85]]
1717; CHECK-NEXT:    [[TMP89:%.*]] = add i32 [[TMP84]], [[TMP86]]
1718; CHECK-NEXT:    [[TMP90:%.*]] = xor i32 [[TMP88]], [[TMP85]]
1719; CHECK-NEXT:    [[TMP91:%.*]] = xor i32 [[TMP89]], [[TMP86]]
1720; CHECK-NEXT:    [[TMP92:%.*]] = uitofp i32 [[TMP91]] to float
1721; CHECK-NEXT:    [[TMP93:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP92]])
1722; CHECK-NEXT:    [[TMP94:%.*]] = fmul fast float [[TMP93]], 0x41EFFFFFC0000000
1723; CHECK-NEXT:    [[TMP95:%.*]] = fptoui float [[TMP94]] to i32
1724; CHECK-NEXT:    [[TMP96:%.*]] = sub i32 0, [[TMP91]]
1725; CHECK-NEXT:    [[TMP97:%.*]] = mul i32 [[TMP96]], [[TMP95]]
1726; CHECK-NEXT:    [[TMP98:%.*]] = zext i32 [[TMP95]] to i64
1727; CHECK-NEXT:    [[TMP99:%.*]] = zext i32 [[TMP97]] to i64
1728; CHECK-NEXT:    [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]]
1729; CHECK-NEXT:    [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32
1730; CHECK-NEXT:    [[TMP102:%.*]] = lshr i64 [[TMP100]], 32
1731; CHECK-NEXT:    [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32
1732; CHECK-NEXT:    [[TMP104:%.*]] = add i32 [[TMP95]], [[TMP103]]
1733; CHECK-NEXT:    [[TMP105:%.*]] = zext i32 [[TMP90]] to i64
1734; CHECK-NEXT:    [[TMP106:%.*]] = zext i32 [[TMP104]] to i64
1735; CHECK-NEXT:    [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]]
1736; CHECK-NEXT:    [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32
1737; CHECK-NEXT:    [[TMP109:%.*]] = lshr i64 [[TMP107]], 32
1738; CHECK-NEXT:    [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32
1739; CHECK-NEXT:    [[TMP111:%.*]] = mul i32 [[TMP110]], [[TMP91]]
1740; CHECK-NEXT:    [[TMP112:%.*]] = sub i32 [[TMP90]], [[TMP111]]
1741; CHECK-NEXT:    [[TMP113:%.*]] = icmp uge i32 [[TMP112]], [[TMP91]]
1742; CHECK-NEXT:    [[TMP114:%.*]] = add i32 [[TMP110]], 1
1743; CHECK-NEXT:    [[TMP115:%.*]] = select i1 [[TMP113]], i32 [[TMP114]], i32 [[TMP110]]
1744; CHECK-NEXT:    [[TMP116:%.*]] = sub i32 [[TMP112]], [[TMP91]]
1745; CHECK-NEXT:    [[TMP117:%.*]] = select i1 [[TMP113]], i32 [[TMP116]], i32 [[TMP112]]
1746; CHECK-NEXT:    [[TMP118:%.*]] = icmp uge i32 [[TMP117]], [[TMP91]]
1747; CHECK-NEXT:    [[TMP119:%.*]] = add i32 [[TMP115]], 1
1748; CHECK-NEXT:    [[TMP120:%.*]] = select i1 [[TMP118]], i32 [[TMP119]], i32 [[TMP115]]
1749; CHECK-NEXT:    [[TMP121:%.*]] = xor i32 [[TMP120]], [[TMP87]]
1750; CHECK-NEXT:    [[TMP122:%.*]] = sub i32 [[TMP121]], [[TMP87]]
1751; CHECK-NEXT:    [[TMP123:%.*]] = insertelement <4 x i32> [[TMP82]], i32 [[TMP122]], i64 2
1752; CHECK-NEXT:    [[TMP124:%.*]] = extractelement <4 x i32> [[X]], i64 3
1753; CHECK-NEXT:    [[TMP125:%.*]] = extractelement <4 x i32> [[Y]], i64 3
1754; CHECK-NEXT:    [[TMP126:%.*]] = ashr i32 [[TMP124]], 31
1755; CHECK-NEXT:    [[TMP127:%.*]] = ashr i32 [[TMP125]], 31
1756; CHECK-NEXT:    [[TMP128:%.*]] = xor i32 [[TMP126]], [[TMP127]]
1757; CHECK-NEXT:    [[TMP129:%.*]] = add i32 [[TMP124]], [[TMP126]]
1758; CHECK-NEXT:    [[TMP130:%.*]] = add i32 [[TMP125]], [[TMP127]]
1759; CHECK-NEXT:    [[TMP131:%.*]] = xor i32 [[TMP129]], [[TMP126]]
1760; CHECK-NEXT:    [[TMP132:%.*]] = xor i32 [[TMP130]], [[TMP127]]
1761; CHECK-NEXT:    [[TMP133:%.*]] = uitofp i32 [[TMP132]] to float
1762; CHECK-NEXT:    [[TMP134:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP133]])
1763; CHECK-NEXT:    [[TMP135:%.*]] = fmul fast float [[TMP134]], 0x41EFFFFFC0000000
1764; CHECK-NEXT:    [[TMP136:%.*]] = fptoui float [[TMP135]] to i32
1765; CHECK-NEXT:    [[TMP137:%.*]] = sub i32 0, [[TMP132]]
1766; CHECK-NEXT:    [[TMP138:%.*]] = mul i32 [[TMP137]], [[TMP136]]
1767; CHECK-NEXT:    [[TMP139:%.*]] = zext i32 [[TMP136]] to i64
1768; CHECK-NEXT:    [[TMP140:%.*]] = zext i32 [[TMP138]] to i64
1769; CHECK-NEXT:    [[TMP141:%.*]] = mul i64 [[TMP139]], [[TMP140]]
1770; CHECK-NEXT:    [[TMP142:%.*]] = trunc i64 [[TMP141]] to i32
1771; CHECK-NEXT:    [[TMP143:%.*]] = lshr i64 [[TMP141]], 32
1772; CHECK-NEXT:    [[TMP144:%.*]] = trunc i64 [[TMP143]] to i32
1773; CHECK-NEXT:    [[TMP145:%.*]] = add i32 [[TMP136]], [[TMP144]]
1774; CHECK-NEXT:    [[TMP146:%.*]] = zext i32 [[TMP131]] to i64
1775; CHECK-NEXT:    [[TMP147:%.*]] = zext i32 [[TMP145]] to i64
1776; CHECK-NEXT:    [[TMP148:%.*]] = mul i64 [[TMP146]], [[TMP147]]
1777; CHECK-NEXT:    [[TMP149:%.*]] = trunc i64 [[TMP148]] to i32
1778; CHECK-NEXT:    [[TMP150:%.*]] = lshr i64 [[TMP148]], 32
1779; CHECK-NEXT:    [[TMP151:%.*]] = trunc i64 [[TMP150]] to i32
1780; CHECK-NEXT:    [[TMP152:%.*]] = mul i32 [[TMP151]], [[TMP132]]
1781; CHECK-NEXT:    [[TMP153:%.*]] = sub i32 [[TMP131]], [[TMP152]]
1782; CHECK-NEXT:    [[TMP154:%.*]] = icmp uge i32 [[TMP153]], [[TMP132]]
1783; CHECK-NEXT:    [[TMP155:%.*]] = add i32 [[TMP151]], 1
1784; CHECK-NEXT:    [[TMP156:%.*]] = select i1 [[TMP154]], i32 [[TMP155]], i32 [[TMP151]]
1785; CHECK-NEXT:    [[TMP157:%.*]] = sub i32 [[TMP153]], [[TMP132]]
1786; CHECK-NEXT:    [[TMP158:%.*]] = select i1 [[TMP154]], i32 [[TMP157]], i32 [[TMP153]]
1787; CHECK-NEXT:    [[TMP159:%.*]] = icmp uge i32 [[TMP158]], [[TMP132]]
1788; CHECK-NEXT:    [[TMP160:%.*]] = add i32 [[TMP156]], 1
1789; CHECK-NEXT:    [[TMP161:%.*]] = select i1 [[TMP159]], i32 [[TMP160]], i32 [[TMP156]]
1790; CHECK-NEXT:    [[TMP162:%.*]] = xor i32 [[TMP161]], [[TMP128]]
1791; CHECK-NEXT:    [[TMP163:%.*]] = sub i32 [[TMP162]], [[TMP128]]
1792; CHECK-NEXT:    [[TMP164:%.*]] = insertelement <4 x i32> [[TMP123]], i32 [[TMP163]], i64 3
1793; CHECK-NEXT:    store <4 x i32> [[TMP164]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16
1794; CHECK-NEXT:    ret void
1795;
1796; GFX6-LABEL: sdiv_v4i32:
1797; GFX6:       ; %bb.0:
1798; GFX6-NEXT:    s_load_dwordx8 s[8:15], s[0:1], 0xd
1799; GFX6-NEXT:    s_mov_b32 s16, 0x4f7ffffe
1800; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1801; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1802; GFX6-NEXT:    s_mov_b32 s6, -1
1803; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1804; GFX6-NEXT:    s_ashr_i32 s2, s12, 31
1805; GFX6-NEXT:    s_add_i32 s3, s12, s2
1806; GFX6-NEXT:    s_xor_b32 s12, s3, s2
1807; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s12
1808; GFX6-NEXT:    s_ashr_i32 s3, s13, 31
1809; GFX6-NEXT:    s_add_i32 s0, s13, s3
1810; GFX6-NEXT:    s_xor_b32 s13, s0, s3
1811; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1812; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s13
1813; GFX6-NEXT:    s_sub_i32 s1, 0, s12
1814; GFX6-NEXT:    s_ashr_i32 s0, s8, 31
1815; GFX6-NEXT:    v_mul_f32_e32 v0, s16, v0
1816; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
1817; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1818; GFX6-NEXT:    s_xor_b32 s2, s0, s2
1819; GFX6-NEXT:    v_mul_lo_u32 v2, s1, v0
1820; GFX6-NEXT:    s_add_i32 s1, s8, s0
1821; GFX6-NEXT:    v_mul_f32_e32 v1, s16, v1
1822; GFX6-NEXT:    s_xor_b32 s1, s1, s0
1823; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
1824; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
1825; GFX6-NEXT:    s_sub_i32 s0, 0, s13
1826; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
1827; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
1828; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v1
1829; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s12
1830; GFX6-NEXT:    v_mul_hi_u32 v2, v1, v2
1831; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
1832; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s1, v3
1833; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v3
1834; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
1835; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s12, v3
1836; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
1837; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
1838; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
1839; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s12, v3
1840; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
1841; GFX6-NEXT:    v_xor_b32_e32 v0, s2, v0
1842; GFX6-NEXT:    s_ashr_i32 s0, s9, 31
1843; GFX6-NEXT:    s_add_i32 s1, s9, s0
1844; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
1845; GFX6-NEXT:    s_xor_b32 s2, s0, s3
1846; GFX6-NEXT:    s_ashr_i32 s3, s14, 31
1847; GFX6-NEXT:    s_xor_b32 s1, s1, s0
1848; GFX6-NEXT:    s_add_i32 s0, s14, s3
1849; GFX6-NEXT:    s_xor_b32 s9, s0, s3
1850; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s9
1851; GFX6-NEXT:    v_mul_hi_u32 v1, s1, v1
1852; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
1853; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s13
1854; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
1855; GFX6-NEXT:    v_mul_f32_e32 v3, s16, v3
1856; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s1, v2
1857; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
1858; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v2
1859; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
1860; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s13, v2
1861; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
1862; GFX6-NEXT:    s_sub_i32 s0, 0, s9
1863; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v3
1864; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
1865; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s13, v2
1866; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
1867; GFX6-NEXT:    v_mul_hi_u32 v2, v3, v5
1868; GFX6-NEXT:    v_xor_b32_e32 v1, s2, v1
1869; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s2, v1
1870; GFX6-NEXT:    s_ashr_i32 s2, s15, 31
1871; GFX6-NEXT:    s_ashr_i32 s0, s10, 31
1872; GFX6-NEXT:    s_add_i32 s8, s15, s2
1873; GFX6-NEXT:    s_add_i32 s1, s10, s0
1874; GFX6-NEXT:    s_xor_b32 s8, s8, s2
1875; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s8
1876; GFX6-NEXT:    s_xor_b32 s1, s1, s0
1877; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
1878; GFX6-NEXT:    v_mul_hi_u32 v2, s1, v2
1879; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v4
1880; GFX6-NEXT:    s_xor_b32 s3, s0, s3
1881; GFX6-NEXT:    v_mul_lo_u32 v3, v2, s9
1882; GFX6-NEXT:    v_mul_f32_e32 v4, s16, v4
1883; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
1884; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v2
1885; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s1, v3
1886; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v3
1887; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
1888; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s9, v3
1889; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
1890; GFX6-NEXT:    s_sub_i32 s0, 0, s8
1891; GFX6-NEXT:    v_mul_lo_u32 v5, s0, v4
1892; GFX6-NEXT:    s_ashr_i32 s0, s11, 31
1893; GFX6-NEXT:    s_add_i32 s1, s11, s0
1894; GFX6-NEXT:    s_xor_b32 s1, s1, s0
1895; GFX6-NEXT:    v_mul_hi_u32 v5, v4, v5
1896; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v2
1897; GFX6-NEXT:    s_xor_b32 s2, s0, s2
1898; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
1899; GFX6-NEXT:    v_mul_hi_u32 v4, s1, v4
1900; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
1901; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
1902; GFX6-NEXT:    v_xor_b32_e32 v2, s3, v2
1903; GFX6-NEXT:    v_mul_lo_u32 v3, v4, s8
1904; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
1905; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v2
1906; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s1, v3
1907; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v3
1908; GFX6-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[0:1]
1909; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s8, v3
1910; GFX6-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
1911; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
1912; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
1913; GFX6-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
1914; GFX6-NEXT:    v_xor_b32_e32 v3, s2, v3
1915; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v3
1916; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1917; GFX6-NEXT:    s_endpgm
1918; GFX9-LABEL: sdiv_v4i32:
1919; GFX9:       ; %bb.0:
1920; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
1921; GFX9-NEXT:    s_mov_b32 s13, 0x4f7ffffe
1922; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1923; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1924; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1925; GFX9-NEXT:    s_ashr_i32 s2, s8, 31
1926; GFX9-NEXT:    s_add_i32 s3, s8, s2
1927; GFX9-NEXT:    s_xor_b32 s14, s3, s2
1928; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s14
1929; GFX9-NEXT:    s_ashr_i32 s8, s9, 31
1930; GFX9-NEXT:    s_add_i32 s9, s9, s8
1931; GFX9-NEXT:    s_xor_b32 s15, s9, s8
1932; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1933; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s15
1934; GFX9-NEXT:    s_sub_i32 s12, 0, s14
1935; GFX9-NEXT:    s_ashr_i32 s3, s4, 31
1936; GFX9-NEXT:    v_mul_f32_e32 v0, s13, v0
1937; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
1938; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1939; GFX9-NEXT:    s_add_i32 s4, s4, s3
1940; GFX9-NEXT:    s_xor_b32 s4, s4, s3
1941; GFX9-NEXT:    v_mul_lo_u32 v2, s12, v0
1942; GFX9-NEXT:    v_mul_f32_e32 v1, s13, v1
1943; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
1944; GFX9-NEXT:    s_sub_i32 s12, 0, s15
1945; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
1946; GFX9-NEXT:    s_ashr_i32 s9, s5, 31
1947; GFX9-NEXT:    v_mul_lo_u32 v3, s12, v1
1948; GFX9-NEXT:    s_xor_b32 s2, s3, s2
1949; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
1950; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
1951; GFX9-NEXT:    v_mul_hi_u32 v2, v1, v3
1952; GFX9-NEXT:    s_add_i32 s3, s5, s9
1953; GFX9-NEXT:    s_xor_b32 s3, s3, s9
1954; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s14
1955; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
1956; GFX9-NEXT:    v_mul_hi_u32 v1, s3, v1
1957; GFX9-NEXT:    v_add_u32_e32 v2, 1, v0
1958; GFX9-NEXT:    v_sub_u32_e32 v3, s4, v3
1959; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s14, v3
1960; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
1961; GFX9-NEXT:    v_subrev_u32_e32 v2, s14, v3
1962; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
1963; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s14, v2
1964; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s15
1965; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
1966; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
1967; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
1968; GFX9-NEXT:    v_sub_u32_e32 v2, s3, v2
1969; GFX9-NEXT:    s_ashr_i32 s3, s10, 31
1970; GFX9-NEXT:    s_add_i32 s4, s10, s3
1971; GFX9-NEXT:    v_subrev_u32_e32 v0, s2, v0
1972; GFX9-NEXT:    s_xor_b32 s2, s9, s8
1973; GFX9-NEXT:    s_xor_b32 s9, s4, s3
1974; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, s9
1975; GFX9-NEXT:    v_add_u32_e32 v5, 1, v1
1976; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s15, v2
1977; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
1978; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v3
1979; GFX9-NEXT:    v_subrev_u32_e32 v5, s15, v2
1980; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
1981; GFX9-NEXT:    s_sub_i32 s4, 0, s9
1982; GFX9-NEXT:    v_mul_f32_e32 v3, s13, v3
1983; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
1984; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s15, v2
1985; GFX9-NEXT:    v_add_u32_e32 v5, 1, v1
1986; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
1987; GFX9-NEXT:    v_mul_lo_u32 v2, s4, v3
1988; GFX9-NEXT:    s_ashr_i32 s4, s6, 31
1989; GFX9-NEXT:    s_add_i32 s5, s6, s4
1990; GFX9-NEXT:    s_ashr_i32 s6, s11, 31
1991; GFX9-NEXT:    s_add_i32 s8, s11, s6
1992; GFX9-NEXT:    s_xor_b32 s8, s8, s6
1993; GFX9-NEXT:    v_mul_hi_u32 v2, v3, v2
1994; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s8
1995; GFX9-NEXT:    s_xor_b32 s5, s5, s4
1996; GFX9-NEXT:    v_xor_b32_e32 v1, s2, v1
1997; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
1998; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v5
1999; GFX9-NEXT:    v_mul_hi_u32 v2, s5, v2
2000; GFX9-NEXT:    v_subrev_u32_e32 v1, s2, v1
2001; GFX9-NEXT:    s_xor_b32 s2, s4, s3
2002; GFX9-NEXT:    v_mul_f32_e32 v3, s13, v3
2003; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
2004; GFX9-NEXT:    v_mul_lo_u32 v5, v2, s9
2005; GFX9-NEXT:    s_sub_i32 s3, 0, s8
2006; GFX9-NEXT:    v_add_u32_e32 v6, 1, v2
2007; GFX9-NEXT:    v_mul_lo_u32 v7, s3, v3
2008; GFX9-NEXT:    v_sub_u32_e32 v5, s5, v5
2009; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v5
2010; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
2011; GFX9-NEXT:    v_subrev_u32_e32 v6, s9, v5
2012; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
2013; GFX9-NEXT:    v_mul_hi_u32 v6, v3, v7
2014; GFX9-NEXT:    s_ashr_i32 s3, s7, 31
2015; GFX9-NEXT:    s_add_i32 s4, s7, s3
2016; GFX9-NEXT:    s_xor_b32 s4, s4, s3
2017; GFX9-NEXT:    v_add_u32_e32 v3, v3, v6
2018; GFX9-NEXT:    v_mul_hi_u32 v3, s4, v3
2019; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v5
2020; GFX9-NEXT:    v_add_u32_e32 v6, 1, v2
2021; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
2022; GFX9-NEXT:    v_mul_lo_u32 v5, v3, s8
2023; GFX9-NEXT:    v_add_u32_e32 v6, 1, v3
2024; GFX9-NEXT:    v_xor_b32_e32 v2, s2, v2
2025; GFX9-NEXT:    v_subrev_u32_e32 v2, s2, v2
2026; GFX9-NEXT:    v_sub_u32_e32 v5, s4, v5
2027; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v5
2028; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
2029; GFX9-NEXT:    v_subrev_u32_e32 v6, s8, v5
2030; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
2031; GFX9-NEXT:    v_add_u32_e32 v6, 1, v3
2032; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v5
2033; GFX9-NEXT:    s_xor_b32 s2, s3, s6
2034; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
2035; GFX9-NEXT:    v_xor_b32_e32 v3, s2, v3
2036; GFX9-NEXT:    v_subrev_u32_e32 v3, s2, v3
2037; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
2038; GFX9-NEXT:    s_endpgm
2039  %r = sdiv <4 x i32> %x, %y
2040  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
2041  ret void
2042}
2043
; srem_v4i32 - signed remainder of two <4 x i32> kernel arguments, stored to %out.
; The opt assertions below expect the vector srem to be scalarized. For each of
; the four lanes the operands are extracted and conditionally negated with their
; sign mask (ashr/add/xor), the quotient estimate is built from
; llvm.amdgcn.rcp.f32 and 64-bit multiplies, the remainder is corrected by two
; compare/subtract/select steps, and the result is given the sign of the X lane
; (xor/sub with its ashr) before being inserted back into the result vector.
; The GFX6 and GFX9 assertions pin the llc output of the tahiti and gfx900 RUN
; lines. All assertions are autogenerated (see the NOTE lines at the top of the
; file), so regenerate them with the update scripts instead of editing by hand.
2044define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
2045; CHECK-LABEL: @srem_v4i32(
2046; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
2047; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
2048; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
2049; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
2050; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]]
2051; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]]
2052; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]]
2053; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]]
2054; CHECK-NEXT:    [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float
2055; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
2056; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000
2057; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP11]] to i32
2058; CHECK-NEXT:    [[TMP13:%.*]] = sub i32 0, [[TMP8]]
2059; CHECK-NEXT:    [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]]
2060; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP12]] to i64
2061; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP14]] to i64
2062; CHECK-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]]
2063; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
2064; CHECK-NEXT:    [[TMP19:%.*]] = lshr i64 [[TMP17]], 32
2065; CHECK-NEXT:    [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32
2066; CHECK-NEXT:    [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]]
2067; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP7]] to i64
2068; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP21]] to i64
2069; CHECK-NEXT:    [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]]
2070; CHECK-NEXT:    [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
2071; CHECK-NEXT:    [[TMP26:%.*]] = lshr i64 [[TMP24]], 32
2072; CHECK-NEXT:    [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32
2073; CHECK-NEXT:    [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]]
2074; CHECK-NEXT:    [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]]
2075; CHECK-NEXT:    [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]]
2076; CHECK-NEXT:    [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]]
2077; CHECK-NEXT:    [[TMP32:%.*]] = select i1 [[TMP30]], i32 [[TMP31]], i32 [[TMP29]]
2078; CHECK-NEXT:    [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]]
2079; CHECK-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]]
2080; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]]
2081; CHECK-NEXT:    [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]]
2082; CHECK-NEXT:    [[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]]
2083; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i32> undef, i32 [[TMP37]], i64 0
2084; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <4 x i32> [[X]], i64 1
2085; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <4 x i32> [[Y]], i64 1
2086; CHECK-NEXT:    [[TMP41:%.*]] = ashr i32 [[TMP39]], 31
2087; CHECK-NEXT:    [[TMP42:%.*]] = ashr i32 [[TMP40]], 31
2088; CHECK-NEXT:    [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]]
2089; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]]
2090; CHECK-NEXT:    [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]]
2091; CHECK-NEXT:    [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]]
2092; CHECK-NEXT:    [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float
2093; CHECK-NEXT:    [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]])
2094; CHECK-NEXT:    [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000
2095; CHECK-NEXT:    [[TMP50:%.*]] = fptoui float [[TMP49]] to i32
2096; CHECK-NEXT:    [[TMP51:%.*]] = sub i32 0, [[TMP46]]
2097; CHECK-NEXT:    [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]]
2098; CHECK-NEXT:    [[TMP53:%.*]] = zext i32 [[TMP50]] to i64
2099; CHECK-NEXT:    [[TMP54:%.*]] = zext i32 [[TMP52]] to i64
2100; CHECK-NEXT:    [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]]
2101; CHECK-NEXT:    [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32
2102; CHECK-NEXT:    [[TMP57:%.*]] = lshr i64 [[TMP55]], 32
2103; CHECK-NEXT:    [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32
2104; CHECK-NEXT:    [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]]
2105; CHECK-NEXT:    [[TMP60:%.*]] = zext i32 [[TMP45]] to i64
2106; CHECK-NEXT:    [[TMP61:%.*]] = zext i32 [[TMP59]] to i64
2107; CHECK-NEXT:    [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]]
2108; CHECK-NEXT:    [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32
2109; CHECK-NEXT:    [[TMP64:%.*]] = lshr i64 [[TMP62]], 32
2110; CHECK-NEXT:    [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32
2111; CHECK-NEXT:    [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]]
2112; CHECK-NEXT:    [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]]
2113; CHECK-NEXT:    [[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]]
2114; CHECK-NEXT:    [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]]
2115; CHECK-NEXT:    [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]]
2116; CHECK-NEXT:    [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]]
2117; CHECK-NEXT:    [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]]
2118; CHECK-NEXT:    [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]]
2119; CHECK-NEXT:    [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]]
2120; CHECK-NEXT:    [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]]
2121; CHECK-NEXT:    [[TMP76:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP75]], i64 1
2122; CHECK-NEXT:    [[TMP77:%.*]] = extractelement <4 x i32> [[X]], i64 2
2123; CHECK-NEXT:    [[TMP78:%.*]] = extractelement <4 x i32> [[Y]], i64 2
2124; CHECK-NEXT:    [[TMP79:%.*]] = ashr i32 [[TMP77]], 31
2125; CHECK-NEXT:    [[TMP80:%.*]] = ashr i32 [[TMP78]], 31
2126; CHECK-NEXT:    [[TMP81:%.*]] = add i32 [[TMP77]], [[TMP79]]
2127; CHECK-NEXT:    [[TMP82:%.*]] = add i32 [[TMP78]], [[TMP80]]
2128; CHECK-NEXT:    [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP79]]
2129; CHECK-NEXT:    [[TMP84:%.*]] = xor i32 [[TMP82]], [[TMP80]]
2130; CHECK-NEXT:    [[TMP85:%.*]] = uitofp i32 [[TMP84]] to float
2131; CHECK-NEXT:    [[TMP86:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP85]])
2132; CHECK-NEXT:    [[TMP87:%.*]] = fmul fast float [[TMP86]], 0x41EFFFFFC0000000
2133; CHECK-NEXT:    [[TMP88:%.*]] = fptoui float [[TMP87]] to i32
2134; CHECK-NEXT:    [[TMP89:%.*]] = sub i32 0, [[TMP84]]
2135; CHECK-NEXT:    [[TMP90:%.*]] = mul i32 [[TMP89]], [[TMP88]]
2136; CHECK-NEXT:    [[TMP91:%.*]] = zext i32 [[TMP88]] to i64
2137; CHECK-NEXT:    [[TMP92:%.*]] = zext i32 [[TMP90]] to i64
2138; CHECK-NEXT:    [[TMP93:%.*]] = mul i64 [[TMP91]], [[TMP92]]
2139; CHECK-NEXT:    [[TMP94:%.*]] = trunc i64 [[TMP93]] to i32
2140; CHECK-NEXT:    [[TMP95:%.*]] = lshr i64 [[TMP93]], 32
2141; CHECK-NEXT:    [[TMP96:%.*]] = trunc i64 [[TMP95]] to i32
2142; CHECK-NEXT:    [[TMP97:%.*]] = add i32 [[TMP88]], [[TMP96]]
2143; CHECK-NEXT:    [[TMP98:%.*]] = zext i32 [[TMP83]] to i64
2144; CHECK-NEXT:    [[TMP99:%.*]] = zext i32 [[TMP97]] to i64
2145; CHECK-NEXT:    [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]]
2146; CHECK-NEXT:    [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32
2147; CHECK-NEXT:    [[TMP102:%.*]] = lshr i64 [[TMP100]], 32
2148; CHECK-NEXT:    [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32
2149; CHECK-NEXT:    [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP84]]
2150; CHECK-NEXT:    [[TMP105:%.*]] = sub i32 [[TMP83]], [[TMP104]]
2151; CHECK-NEXT:    [[TMP106:%.*]] = icmp uge i32 [[TMP105]], [[TMP84]]
2152; CHECK-NEXT:    [[TMP107:%.*]] = sub i32 [[TMP105]], [[TMP84]]
2153; CHECK-NEXT:    [[TMP108:%.*]] = select i1 [[TMP106]], i32 [[TMP107]], i32 [[TMP105]]
2154; CHECK-NEXT:    [[TMP109:%.*]] = icmp uge i32 [[TMP108]], [[TMP84]]
2155; CHECK-NEXT:    [[TMP110:%.*]] = sub i32 [[TMP108]], [[TMP84]]
2156; CHECK-NEXT:    [[TMP111:%.*]] = select i1 [[TMP109]], i32 [[TMP110]], i32 [[TMP108]]
2157; CHECK-NEXT:    [[TMP112:%.*]] = xor i32 [[TMP111]], [[TMP79]]
2158; CHECK-NEXT:    [[TMP113:%.*]] = sub i32 [[TMP112]], [[TMP79]]
2159; CHECK-NEXT:    [[TMP114:%.*]] = insertelement <4 x i32> [[TMP76]], i32 [[TMP113]], i64 2
2160; CHECK-NEXT:    [[TMP115:%.*]] = extractelement <4 x i32> [[X]], i64 3
2161; CHECK-NEXT:    [[TMP116:%.*]] = extractelement <4 x i32> [[Y]], i64 3
2162; CHECK-NEXT:    [[TMP117:%.*]] = ashr i32 [[TMP115]], 31
2163; CHECK-NEXT:    [[TMP118:%.*]] = ashr i32 [[TMP116]], 31
2164; CHECK-NEXT:    [[TMP119:%.*]] = add i32 [[TMP115]], [[TMP117]]
2165; CHECK-NEXT:    [[TMP120:%.*]] = add i32 [[TMP116]], [[TMP118]]
2166; CHECK-NEXT:    [[TMP121:%.*]] = xor i32 [[TMP119]], [[TMP117]]
2167; CHECK-NEXT:    [[TMP122:%.*]] = xor i32 [[TMP120]], [[TMP118]]
2168; CHECK-NEXT:    [[TMP123:%.*]] = uitofp i32 [[TMP122]] to float
2169; CHECK-NEXT:    [[TMP124:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP123]])
2170; CHECK-NEXT:    [[TMP125:%.*]] = fmul fast float [[TMP124]], 0x41EFFFFFC0000000
2171; CHECK-NEXT:    [[TMP126:%.*]] = fptoui float [[TMP125]] to i32
2172; CHECK-NEXT:    [[TMP127:%.*]] = sub i32 0, [[TMP122]]
2173; CHECK-NEXT:    [[TMP128:%.*]] = mul i32 [[TMP127]], [[TMP126]]
2174; CHECK-NEXT:    [[TMP129:%.*]] = zext i32 [[TMP126]] to i64
2175; CHECK-NEXT:    [[TMP130:%.*]] = zext i32 [[TMP128]] to i64
2176; CHECK-NEXT:    [[TMP131:%.*]] = mul i64 [[TMP129]], [[TMP130]]
2177; CHECK-NEXT:    [[TMP132:%.*]] = trunc i64 [[TMP131]] to i32
2178; CHECK-NEXT:    [[TMP133:%.*]] = lshr i64 [[TMP131]], 32
2179; CHECK-NEXT:    [[TMP134:%.*]] = trunc i64 [[TMP133]] to i32
2180; CHECK-NEXT:    [[TMP135:%.*]] = add i32 [[TMP126]], [[TMP134]]
2181; CHECK-NEXT:    [[TMP136:%.*]] = zext i32 [[TMP121]] to i64
2182; CHECK-NEXT:    [[TMP137:%.*]] = zext i32 [[TMP135]] to i64
2183; CHECK-NEXT:    [[TMP138:%.*]] = mul i64 [[TMP136]], [[TMP137]]
2184; CHECK-NEXT:    [[TMP139:%.*]] = trunc i64 [[TMP138]] to i32
2185; CHECK-NEXT:    [[TMP140:%.*]] = lshr i64 [[TMP138]], 32
2186; CHECK-NEXT:    [[TMP141:%.*]] = trunc i64 [[TMP140]] to i32
2187; CHECK-NEXT:    [[TMP142:%.*]] = mul i32 [[TMP141]], [[TMP122]]
2188; CHECK-NEXT:    [[TMP143:%.*]] = sub i32 [[TMP121]], [[TMP142]]
2189; CHECK-NEXT:    [[TMP144:%.*]] = icmp uge i32 [[TMP143]], [[TMP122]]
2190; CHECK-NEXT:    [[TMP145:%.*]] = sub i32 [[TMP143]], [[TMP122]]
2191; CHECK-NEXT:    [[TMP146:%.*]] = select i1 [[TMP144]], i32 [[TMP145]], i32 [[TMP143]]
2192; CHECK-NEXT:    [[TMP147:%.*]] = icmp uge i32 [[TMP146]], [[TMP122]]
2193; CHECK-NEXT:    [[TMP148:%.*]] = sub i32 [[TMP146]], [[TMP122]]
2194; CHECK-NEXT:    [[TMP149:%.*]] = select i1 [[TMP147]], i32 [[TMP148]], i32 [[TMP146]]
2195; CHECK-NEXT:    [[TMP150:%.*]] = xor i32 [[TMP149]], [[TMP117]]
2196; CHECK-NEXT:    [[TMP151:%.*]] = sub i32 [[TMP150]], [[TMP117]]
2197; CHECK-NEXT:    [[TMP152:%.*]] = insertelement <4 x i32> [[TMP114]], i32 [[TMP151]], i64 3
2198; CHECK-NEXT:    store <4 x i32> [[TMP152]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16
2199; CHECK-NEXT:    ret void
2200;
2201; GFX6-LABEL: srem_v4i32:
2202; GFX6:       ; %bb.0:
2203; GFX6-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
2204; GFX6-NEXT:    s_mov_b32 s13, 0x4f7ffffe
2205; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2206; GFX6-NEXT:    s_mov_b32 s3, 0xf000
2207; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2208; GFX6-NEXT:    s_ashr_i32 s2, s8, 31
2209; GFX6-NEXT:    s_add_i32 s8, s8, s2
2210; GFX6-NEXT:    s_xor_b32 s12, s8, s2
2211; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s12
2212; GFX6-NEXT:    s_ashr_i32 s8, s9, 31
2213; GFX6-NEXT:    s_add_i32 s9, s9, s8
2214; GFX6-NEXT:    s_xor_b32 s14, s9, s8
2215; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2216; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s14
2217; GFX6-NEXT:    s_sub_i32 s9, 0, s12
2218; GFX6-NEXT:    s_ashr_i32 s8, s4, 31
2219; GFX6-NEXT:    v_mul_f32_e32 v0, s13, v0
2220; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
2221; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
2222; GFX6-NEXT:    s_add_i32 s4, s4, s8
2223; GFX6-NEXT:    s_xor_b32 s4, s4, s8
2224; GFX6-NEXT:    v_mul_lo_u32 v2, s9, v0
2225; GFX6-NEXT:    v_mul_f32_e32 v1, s13, v1
2226; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
2227; GFX6-NEXT:    s_sub_i32 s9, 0, s14
2228; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
2229; GFX6-NEXT:    s_mov_b32 s2, -1
2230; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
2231; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
2232; GFX6-NEXT:    v_mul_lo_u32 v2, s9, v1
2233; GFX6-NEXT:    s_ashr_i32 s9, s5, 31
2234; GFX6-NEXT:    s_add_i32 s5, s5, s9
2235; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s12
2236; GFX6-NEXT:    v_mul_hi_u32 v2, v1, v2
2237; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
2238; GFX6-NEXT:    s_xor_b32 s4, s5, s9
2239; GFX6-NEXT:    s_ashr_i32 s5, s10, 31
2240; GFX6-NEXT:    s_add_i32 s10, s10, s5
2241; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s12, v0
2242; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
2243; GFX6-NEXT:    s_xor_b32 s10, s10, s5
2244; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
2245; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
2246; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s10
2247; GFX6-NEXT:    v_mul_hi_u32 v1, s4, v1
2248; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s12, v0
2249; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
2250; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
2251; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s14
2252; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
2253; GFX6-NEXT:    v_xor_b32_e32 v0, s8, v0
2254; GFX6-NEXT:    v_mul_f32_e32 v2, s13, v2
2255; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
2256; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s4, v1
2257; GFX6-NEXT:    s_sub_i32 s4, 0, s10
2258; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
2259; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s14, v1
2260; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s14, v1
2261; GFX6-NEXT:    v_mul_lo_u32 v4, s4, v2
2262; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
2263; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s14, v1
2264; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s14, v1
2265; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
2266; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v4
2267; GFX6-NEXT:    s_ashr_i32 s4, s6, 31
2268; GFX6-NEXT:    s_add_i32 s5, s6, s4
2269; GFX6-NEXT:    s_ashr_i32 s6, s11, 31
2270; GFX6-NEXT:    s_add_i32 s8, s11, s6
2271; GFX6-NEXT:    s_xor_b32 s8, s8, s6
2272; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
2273; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s8
2274; GFX6-NEXT:    s_xor_b32 s5, s5, s4
2275; GFX6-NEXT:    v_mul_hi_u32 v2, s5, v2
2276; GFX6-NEXT:    v_xor_b32_e32 v1, s9, v1
2277; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v3
2278; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s9, v1
2279; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s10
2280; GFX6-NEXT:    v_mul_f32_e32 v3, s13, v3
2281; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
2282; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s5, v2
2283; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s10, v2
2284; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
2285; GFX6-NEXT:    s_sub_i32 s5, 0, s8
2286; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
2287; GFX6-NEXT:    v_mul_lo_u32 v4, s5, v3
2288; GFX6-NEXT:    s_ashr_i32 s5, s7, 31
2289; GFX6-NEXT:    s_add_i32 s6, s7, s5
2290; GFX6-NEXT:    s_xor_b32 s6, s6, s5
2291; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v4
2292; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s10, v2
2293; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
2294; GFX6-NEXT:    v_mul_hi_u32 v3, s6, v3
2295; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
2296; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
2297; GFX6-NEXT:    v_xor_b32_e32 v2, s4, v2
2298; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s8
2299; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s4, v2
2300; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s6, v3
2301; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s8, v3
2302; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
2303; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
2304; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s8, v3
2305; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
2306; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
2307; GFX6-NEXT:    v_xor_b32_e32 v3, s5, v3
2308; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s5, v3
2309; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2310; GFX6-NEXT:    s_endpgm
2311; GFX9-LABEL: srem_v4i32:
2312; GFX9:       ; %bb.0:
2313; GFX9-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x34
2314; GFX9-NEXT:    s_mov_b32 s13, 0x4f7ffffe
2315; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2316; GFX9-NEXT:    v_mov_b32_e32 v4, 0
2317; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2318; GFX9-NEXT:    s_ashr_i32 s2, s8, 31
2319; GFX9-NEXT:    s_add_i32 s8, s8, s2
2320; GFX9-NEXT:    s_xor_b32 s2, s8, s2
2321; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s2
2322; GFX9-NEXT:    s_ashr_i32 s3, s9, 31
2323; GFX9-NEXT:    s_sub_i32 s12, 0, s2
2324; GFX9-NEXT:    s_add_i32 s8, s9, s3
2325; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
2326; GFX9-NEXT:    s_xor_b32 s3, s8, s3
2327; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s3
2328; GFX9-NEXT:    s_ashr_i32 s8, s4, 31
2329; GFX9-NEXT:    v_mul_f32_e32 v0, s13, v0
2330; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
2331; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
2332; GFX9-NEXT:    s_add_i32 s4, s4, s8
2333; GFX9-NEXT:    s_xor_b32 s4, s4, s8
2334; GFX9-NEXT:    v_mul_lo_u32 v2, s12, v0
2335; GFX9-NEXT:    v_mul_f32_e32 v1, s13, v1
2336; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
2337; GFX9-NEXT:    s_sub_i32 s12, 0, s3
2338; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
2339; GFX9-NEXT:    s_ashr_i32 s9, s5, 31
2340; GFX9-NEXT:    v_mul_lo_u32 v3, s12, v1
2341; GFX9-NEXT:    s_add_i32 s5, s5, s9
2342; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
2343; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
2344; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
2345; GFX9-NEXT:    s_xor_b32 s5, s5, s9
2346; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s2
2347; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
2348; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
2349; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
2350; GFX9-NEXT:    v_subrev_u32_e32 v2, s2, v0
2351; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
2352; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
2353; GFX9-NEXT:    v_subrev_u32_e32 v2, s2, v0
2354; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
2355; GFX9-NEXT:    s_ashr_i32 s2, s10, 31
2356; GFX9-NEXT:    s_add_i32 s4, s10, s2
2357; GFX9-NEXT:    s_xor_b32 s2, s4, s2
2358; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
2359; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s2
2360; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s3
2361; GFX9-NEXT:    s_sub_i32 s4, 0, s2
2362; GFX9-NEXT:    v_xor_b32_e32 v0, s8, v0
2363; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
2364; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v1
2365; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v1
2366; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
2367; GFX9-NEXT:    v_mul_f32_e32 v2, s13, v2
2368; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
2369; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
2370; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v1
2371; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
2372; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
2373; GFX9-NEXT:    v_mul_lo_u32 v3, s4, v2
2374; GFX9-NEXT:    s_ashr_i32 s4, s11, 31
2375; GFX9-NEXT:    s_add_i32 s5, s11, s4
2376; GFX9-NEXT:    s_xor_b32 s4, s5, s4
2377; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s4
2378; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v3
2379; GFX9-NEXT:    s_ashr_i32 s3, s6, 31
2380; GFX9-NEXT:    s_add_i32 s5, s6, s3
2381; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v5
2382; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
2383; GFX9-NEXT:    s_xor_b32 s5, s5, s3
2384; GFX9-NEXT:    v_mul_hi_u32 v2, s5, v2
2385; GFX9-NEXT:    v_mul_f32_e32 v3, s13, v5
2386; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
2387; GFX9-NEXT:    s_sub_i32 s6, 0, s4
2388; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s2
2389; GFX9-NEXT:    v_xor_b32_e32 v1, s9, v1
2390; GFX9-NEXT:    v_mul_lo_u32 v5, s6, v3
2391; GFX9-NEXT:    v_subrev_u32_e32 v0, s8, v0
2392; GFX9-NEXT:    v_sub_u32_e32 v2, s5, v2
2393; GFX9-NEXT:    s_ashr_i32 s5, s7, 31
2394; GFX9-NEXT:    v_mul_hi_u32 v5, v3, v5
2395; GFX9-NEXT:    s_add_i32 s6, s7, s5
2396; GFX9-NEXT:    s_xor_b32 s6, s6, s5
2397; GFX9-NEXT:    v_subrev_u32_e32 v6, s2, v2
2398; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
2399; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v3
2400; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
2401; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
2402; GFX9-NEXT:    v_subrev_u32_e32 v5, s2, v2
2403; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s4
2404; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
2405; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
2406; GFX9-NEXT:    v_xor_b32_e32 v2, s3, v2
2407; GFX9-NEXT:    v_sub_u32_e32 v3, s6, v3
2408; GFX9-NEXT:    v_subrev_u32_e32 v5, s4, v3
2409; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
2410; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
2411; GFX9-NEXT:    v_subrev_u32_e32 v5, s4, v3
2412; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
2413; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
2414; GFX9-NEXT:    v_xor_b32_e32 v3, s5, v3
2415; GFX9-NEXT:    v_subrev_u32_e32 v1, s9, v1
2416; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v2
2417; GFX9-NEXT:    v_subrev_u32_e32 v3, s5, v3
2418; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
2419; GFX9-NEXT:    s_endpgm
  ; Input under test. A single vector srem whose result is written to %out.
2420  %r = srem <4 x i32> %x, %y
2421  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
2422  ret void
2423}
2424
; udiv_v4i16 - unsigned division of two <4 x i16> kernel arguments, with the
; quotient vector stored to %out.
;
; The opt assertions (CHECK prefix) show the vector division expanded one lane
; at a time. For each of the four lanes the operands are extracted,
; zero-extended to i32 and converted to float; the dividend is multiplied by
; llvm.amdgcn.rcp.f32 of the divisor and truncated to get a candidate quotient;
; the candidate is bumped by 1 when |fmad.ftz(-q, y, x)| >= |y|; the result is
; masked to 16 bits, truncated to i16 and inserted back into the result vector.
;
; The GFX6 and GFX9 assertions cover the llc output of the two llc RUN lines.
; In both, the four 16-bit quotients end up packed into two dwords before the
; single dwordx2 store (v_lshlrev_b32 + v_or_b32 on GFX6, v_lshl_or_b32 on GFX9).
;
; All assertions in this function are autogenerated (see the NOTE lines at the
; top of the file); regenerate them with the update scripts rather than editing
; them by hand.
2425define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
2426; CHECK-LABEL: @udiv_v4i16(
2427; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
2428; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
2429; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
2430; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
2431; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
2432; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
2433; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
2434; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
2435; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
2436; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
2437; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
2438; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
2439; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
2440; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
2441; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
2442; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
2443; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
2444; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 65535
2445; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
2446; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i16> undef, i16 [[TMP19]], i64 0
2447; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i16> [[X]], i64 1
2448; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i16> [[Y]], i64 1
2449; CHECK-NEXT:    [[TMP23:%.*]] = zext i16 [[TMP21]] to i32
2450; CHECK-NEXT:    [[TMP24:%.*]] = zext i16 [[TMP22]] to i32
2451; CHECK-NEXT:    [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float
2452; CHECK-NEXT:    [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float
2453; CHECK-NEXT:    [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]])
2454; CHECK-NEXT:    [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]]
2455; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]])
2456; CHECK-NEXT:    [[TMP30:%.*]] = fneg fast float [[TMP29]]
2457; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]])
2458; CHECK-NEXT:    [[TMP32:%.*]] = fptoui float [[TMP29]] to i32
2459; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]])
2460; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]])
2461; CHECK-NEXT:    [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]]
2462; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0
2463; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]]
2464; CHECK-NEXT:    [[TMP38:%.*]] = and i32 [[TMP37]], 65535
2465; CHECK-NEXT:    [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16
2466; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <4 x i16> [[TMP20]], i16 [[TMP39]], i64 1
2467; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i16> [[X]], i64 2
2468; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <4 x i16> [[Y]], i64 2
2469; CHECK-NEXT:    [[TMP43:%.*]] = zext i16 [[TMP41]] to i32
2470; CHECK-NEXT:    [[TMP44:%.*]] = zext i16 [[TMP42]] to i32
2471; CHECK-NEXT:    [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float
2472; CHECK-NEXT:    [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float
2473; CHECK-NEXT:    [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]])
2474; CHECK-NEXT:    [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]]
2475; CHECK-NEXT:    [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]])
2476; CHECK-NEXT:    [[TMP50:%.*]] = fneg fast float [[TMP49]]
2477; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]])
2478; CHECK-NEXT:    [[TMP52:%.*]] = fptoui float [[TMP49]] to i32
2479; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]])
2480; CHECK-NEXT:    [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]])
2481; CHECK-NEXT:    [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]]
2482; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0
2483; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]]
2484; CHECK-NEXT:    [[TMP58:%.*]] = and i32 [[TMP57]], 65535
2485; CHECK-NEXT:    [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16
2486; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i16> [[TMP40]], i16 [[TMP59]], i64 2
2487; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <4 x i16> [[X]], i64 3
2488; CHECK-NEXT:    [[TMP62:%.*]] = extractelement <4 x i16> [[Y]], i64 3
2489; CHECK-NEXT:    [[TMP63:%.*]] = zext i16 [[TMP61]] to i32
2490; CHECK-NEXT:    [[TMP64:%.*]] = zext i16 [[TMP62]] to i32
2491; CHECK-NEXT:    [[TMP65:%.*]] = uitofp i32 [[TMP63]] to float
2492; CHECK-NEXT:    [[TMP66:%.*]] = uitofp i32 [[TMP64]] to float
2493; CHECK-NEXT:    [[TMP67:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP66]])
2494; CHECK-NEXT:    [[TMP68:%.*]] = fmul fast float [[TMP65]], [[TMP67]]
2495; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.trunc.f32(float [[TMP68]])
2496; CHECK-NEXT:    [[TMP70:%.*]] = fneg fast float [[TMP69]]
2497; CHECK-NEXT:    [[TMP71:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP70]], float [[TMP66]], float [[TMP65]])
2498; CHECK-NEXT:    [[TMP72:%.*]] = fptoui float [[TMP69]] to i32
2499; CHECK-NEXT:    [[TMP73:%.*]] = call fast float @llvm.fabs.f32(float [[TMP71]])
2500; CHECK-NEXT:    [[TMP74:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
2501; CHECK-NEXT:    [[TMP75:%.*]] = fcmp fast oge float [[TMP73]], [[TMP74]]
2502; CHECK-NEXT:    [[TMP76:%.*]] = select i1 [[TMP75]], i32 1, i32 0
2503; CHECK-NEXT:    [[TMP77:%.*]] = add i32 [[TMP72]], [[TMP76]]
2504; CHECK-NEXT:    [[TMP78:%.*]] = and i32 [[TMP77]], 65535
2505; CHECK-NEXT:    [[TMP79:%.*]] = trunc i32 [[TMP78]] to i16
2506; CHECK-NEXT:    [[TMP80:%.*]] = insertelement <4 x i16> [[TMP60]], i16 [[TMP79]], i64 3
2507; CHECK-NEXT:    store <4 x i16> [[TMP80]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8
2508; CHECK-NEXT:    ret void
2509;
2510; GFX6-LABEL: udiv_v4i16:
2511; GFX6:       ; %bb.0:
2512; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2513; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
2514; GFX6-NEXT:    s_mov_b32 s8, 0xffff
2515; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2516; GFX6-NEXT:    s_mov_b32 s6, -1
2517; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2518; GFX6-NEXT:    s_and_b32 s9, s2, s8
2519; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s9
2520; GFX6-NEXT:    s_lshr_b32 s9, s0, 16
2521; GFX6-NEXT:    s_and_b32 s0, s0, s8
2522; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
2523; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s2
2524; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s0
2525; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
2526; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s9
2527; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
2528; GFX6-NEXT:    s_and_b32 s2, s3, s8
2529; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
2530; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
2531; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
2532; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
2533; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
2534; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v5
2535; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
2536; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
2537; GFX6-NEXT:    v_mad_f32 v2, -v1, v3, v4
2538; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s2
2539; GFX6-NEXT:    s_lshr_b32 s0, s1, 16
2540; GFX6-NEXT:    s_and_b32 s1, s1, s8
2541; GFX6-NEXT:    s_lshr_b32 s10, s3, 16
2542; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
2543; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
2544; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s10
2545; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s1
2546; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
2547; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v1, vcc
2548; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v3
2549; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2550; GFX6-NEXT:    v_mul_f32_e32 v1, v5, v6
2551; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s0
2552; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
2553; GFX6-NEXT:    v_mad_f32 v5, -v1, v4, v5
2554; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, v4
2555; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
2556; GFX6-NEXT:    v_mul_f32_e32 v4, v6, v7
2557; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
2558; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v4
2559; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2560; GFX6-NEXT:    v_mad_f32 v4, -v4, v3, v6
2561; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v3
2562; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
2563; GFX6-NEXT:    v_and_b32_e32 v0, s8, v0
2564; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2565; GFX6-NEXT:    v_and_b32_e32 v1, s8, v1
2566; GFX6-NEXT:    v_or_b32_e32 v1, v1, v3
2567; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
2568; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2569; GFX6-NEXT:    s_endpgm
2570; GFX9-LABEL: udiv_v4i16:
2571; GFX9:       ; %bb.0:
2572; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
2573; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
2574; GFX9-NEXT:    s_mov_b32 s8, 0xffff
2575; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2576; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2577; GFX9-NEXT:    s_and_b32 s1, s6, s8
2578; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s1
2579; GFX9-NEXT:    s_lshr_b32 s0, s4, 16
2580; GFX9-NEXT:    s_and_b32 s4, s4, s8
2581; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s4
2582; GFX9-NEXT:    s_lshr_b32 s4, s6, 16
2583; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
2584; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s4
2585; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s0
2586; GFX9-NEXT:    s_and_b32 s0, s7, s8
2587; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
2588; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v4
2589; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
2590; GFX9-NEXT:    v_mad_f32 v1, -v3, v0, v1
2591; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
2592; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
2593; GFX9-NEXT:    v_mul_f32_e32 v1, v5, v6
2594; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
2595; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
2596; GFX9-NEXT:    v_mad_f32 v3, -v1, v4, v5
2597; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s0
2598; GFX9-NEXT:    s_and_b32 s0, s5, s8
2599; GFX9-NEXT:    s_lshr_b32 s6, s7, 16
2600; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
2601; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
2602; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s6
2603; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s0
2604; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v5
2605; GFX9-NEXT:    s_lshr_b32 s1, s5, 16
2606; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
2607; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v4
2608; GFX9-NEXT:    v_mul_f32_e32 v1, v6, v7
2609; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s1
2610; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
2611; GFX9-NEXT:    v_mad_f32 v6, -v1, v5, v6
2612; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, v5
2613; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
2614; GFX9-NEXT:    v_mul_f32_e32 v5, v7, v8
2615; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
2616; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v5
2617; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2618; GFX9-NEXT:    v_mad_f32 v5, -v5, v4, v7
2619; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, v4
2620; GFX9-NEXT:    v_mov_b32_e32 v5, 0xffff
2621; GFX9-NEXT:    v_and_b32_e32 v0, v5, v0
2622; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v6, vcc
2623; GFX9-NEXT:    v_and_b32_e32 v1, v5, v1
2624; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 16, v1
2625; GFX9-NEXT:    v_lshl_or_b32 v0, v3, 16, v0
2626; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
2627; GFX9-NEXT:    s_endpgm
  ; Input IR under test - a single vector udiv whose result is stored to %out.
2628  %r = udiv <4 x i16> %x, %y
2629  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
2630  ret void
2631}
2632
; urem_v4i16 - unsigned remainder of two <4 x i16> kernel arguments, with the
; remainder vector stored to %out.
;
; The opt assertions (CHECK prefix) show the vector remainder expanded one lane
; at a time. For each of the four lanes the quotient is first formed from the
; zero-extended operands via uitofp, llvm.amdgcn.rcp.f32, fmul and
; llvm.trunc.f32, with a +1 correction when |fmad.ftz(-q, y, x)| >= |y|. The
; remainder is then computed as x - q * y in i32 (the mul/sub pair), masked to
; 16 bits, truncated to i16 and inserted back into the result vector.
;
; The GFX6 and GFX9 assertions cover the llc output of the two llc RUN lines.
; In both, each lane's remainder is formed with a v_mul_lo_u32 followed by a
; subtract, and the four 16-bit results are packed into two dwords before the
; single dwordx2 store.
;
; All assertions in this function are autogenerated (see the NOTE lines at the
; top of the file); regenerate them with the update scripts rather than editing
; them by hand.
2633define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
2634; CHECK-LABEL: @urem_v4i16(
2635; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
2636; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
2637; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
2638; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
2639; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
2640; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
2641; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
2642; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
2643; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
2644; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
2645; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
2646; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
2647; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
2648; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
2649; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
2650; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
2651; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
2652; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]]
2653; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]]
2654; CHECK-NEXT:    [[TMP20:%.*]] = and i32 [[TMP19]], 65535
2655; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
2656; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i16> undef, i16 [[TMP21]], i64 0
2657; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i16> [[X]], i64 1
2658; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i16> [[Y]], i64 1
2659; CHECK-NEXT:    [[TMP25:%.*]] = zext i16 [[TMP23]] to i32
2660; CHECK-NEXT:    [[TMP26:%.*]] = zext i16 [[TMP24]] to i32
2661; CHECK-NEXT:    [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float
2662; CHECK-NEXT:    [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float
2663; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]])
2664; CHECK-NEXT:    [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]]
2665; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]])
2666; CHECK-NEXT:    [[TMP32:%.*]] = fneg fast float [[TMP31]]
2667; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]])
2668; CHECK-NEXT:    [[TMP34:%.*]] = fptoui float [[TMP31]] to i32
2669; CHECK-NEXT:    [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
2670; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]])
2671; CHECK-NEXT:    [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]]
2672; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0
2673; CHECK-NEXT:    [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]]
2674; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]]
2675; CHECK-NEXT:    [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]]
2676; CHECK-NEXT:    [[TMP42:%.*]] = and i32 [[TMP41]], 65535
2677; CHECK-NEXT:    [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16
2678; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i16> [[TMP22]], i16 [[TMP43]], i64 1
2679; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <4 x i16> [[X]], i64 2
2680; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <4 x i16> [[Y]], i64 2
2681; CHECK-NEXT:    [[TMP47:%.*]] = zext i16 [[TMP45]] to i32
2682; CHECK-NEXT:    [[TMP48:%.*]] = zext i16 [[TMP46]] to i32
2683; CHECK-NEXT:    [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float
2684; CHECK-NEXT:    [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float
2685; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]])
2686; CHECK-NEXT:    [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]]
2687; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]])
2688; CHECK-NEXT:    [[TMP54:%.*]] = fneg fast float [[TMP53]]
2689; CHECK-NEXT:    [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]])
2690; CHECK-NEXT:    [[TMP56:%.*]] = fptoui float [[TMP53]] to i32
2691; CHECK-NEXT:    [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]])
2692; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]])
2693; CHECK-NEXT:    [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]]
2694; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0
2695; CHECK-NEXT:    [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]]
2696; CHECK-NEXT:    [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]]
2697; CHECK-NEXT:    [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]]
2698; CHECK-NEXT:    [[TMP64:%.*]] = and i32 [[TMP63]], 65535
2699; CHECK-NEXT:    [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16
2700; CHECK-NEXT:    [[TMP66:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[TMP65]], i64 2
2701; CHECK-NEXT:    [[TMP67:%.*]] = extractelement <4 x i16> [[X]], i64 3
2702; CHECK-NEXT:    [[TMP68:%.*]] = extractelement <4 x i16> [[Y]], i64 3
2703; CHECK-NEXT:    [[TMP69:%.*]] = zext i16 [[TMP67]] to i32
2704; CHECK-NEXT:    [[TMP70:%.*]] = zext i16 [[TMP68]] to i32
2705; CHECK-NEXT:    [[TMP71:%.*]] = uitofp i32 [[TMP69]] to float
2706; CHECK-NEXT:    [[TMP72:%.*]] = uitofp i32 [[TMP70]] to float
2707; CHECK-NEXT:    [[TMP73:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP72]])
2708; CHECK-NEXT:    [[TMP74:%.*]] = fmul fast float [[TMP71]], [[TMP73]]
2709; CHECK-NEXT:    [[TMP75:%.*]] = call fast float @llvm.trunc.f32(float [[TMP74]])
2710; CHECK-NEXT:    [[TMP76:%.*]] = fneg fast float [[TMP75]]
2711; CHECK-NEXT:    [[TMP77:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP76]], float [[TMP72]], float [[TMP71]])
2712; CHECK-NEXT:    [[TMP78:%.*]] = fptoui float [[TMP75]] to i32
2713; CHECK-NEXT:    [[TMP79:%.*]] = call fast float @llvm.fabs.f32(float [[TMP77]])
2714; CHECK-NEXT:    [[TMP80:%.*]] = call fast float @llvm.fabs.f32(float [[TMP72]])
2715; CHECK-NEXT:    [[TMP81:%.*]] = fcmp fast oge float [[TMP79]], [[TMP80]]
2716; CHECK-NEXT:    [[TMP82:%.*]] = select i1 [[TMP81]], i32 1, i32 0
2717; CHECK-NEXT:    [[TMP83:%.*]] = add i32 [[TMP78]], [[TMP82]]
2718; CHECK-NEXT:    [[TMP84:%.*]] = mul i32 [[TMP83]], [[TMP70]]
2719; CHECK-NEXT:    [[TMP85:%.*]] = sub i32 [[TMP69]], [[TMP84]]
2720; CHECK-NEXT:    [[TMP86:%.*]] = and i32 [[TMP85]], 65535
2721; CHECK-NEXT:    [[TMP87:%.*]] = trunc i32 [[TMP86]] to i16
2722; CHECK-NEXT:    [[TMP88:%.*]] = insertelement <4 x i16> [[TMP66]], i16 [[TMP87]], i64 3
2723; CHECK-NEXT:    store <4 x i16> [[TMP88]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8
2724; CHECK-NEXT:    ret void
2725;
2726; GFX6-LABEL: urem_v4i16:
2727; GFX6:       ; %bb.0:
2728; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2729; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
2730; GFX6-NEXT:    s_mov_b32 s8, 0xffff
2731; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2732; GFX6-NEXT:    s_mov_b32 s6, -1
2733; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2734; GFX6-NEXT:    s_and_b32 s9, s2, s8
2735; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s9
2736; GFX6-NEXT:    s_and_b32 s10, s0, s8
2737; GFX6-NEXT:    s_lshr_b32 s11, s2, 16
2738; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s10
2739; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
2740; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s11
2741; GFX6-NEXT:    s_lshr_b32 s9, s0, 16
2742; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s9
2743; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
2744; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
2745; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
2746; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
2747; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
2748; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
2749; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v5
2750; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
2751; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
2752; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v1
2753; GFX6-NEXT:    v_mad_f32 v1, -v1, v3, v4
2754; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v3
2755; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
2756; GFX6-NEXT:    s_and_b32 s2, s3, s8
2757; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
2758; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s2
2759; GFX6-NEXT:    s_and_b32 s2, s1, s8
2760; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s11
2761; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s2
2762; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
2763; GFX6-NEXT:    s_lshr_b32 s12, s3, 16
2764; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s9, v1
2765; GFX6-NEXT:    s_lshr_b32 s10, s1, 16
2766; GFX6-NEXT:    v_mul_f32_e32 v1, v3, v4
2767; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s12
2768; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s10
2769; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
2770; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
2771; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v4
2772; GFX6-NEXT:    v_mad_f32 v3, -v1, v2, v3
2773; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
2774; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
2775; GFX6-NEXT:    v_mul_f32_e32 v2, v6, v7
2776; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
2777; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
2778; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2779; GFX6-NEXT:    v_mad_f32 v2, -v2, v4, v6
2780; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
2781; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
2782; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s3
2783; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s12
2784; GFX6-NEXT:    v_and_b32_e32 v0, s8, v0
2785; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
2786; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s10, v2
2787; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2788; GFX6-NEXT:    v_and_b32_e32 v1, s8, v1
2789; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
2790; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
2791; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
2792; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2793; GFX6-NEXT:    s_endpgm
2794; GFX9-LABEL: urem_v4i16:
2795; GFX9:       ; %bb.0:
2796; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
2797; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
2798; GFX9-NEXT:    s_mov_b32 s8, 0xffff
2799; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2800; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2801; GFX9-NEXT:    s_and_b32 s1, s6, s8
2802; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s1
2803; GFX9-NEXT:    s_and_b32 s9, s4, s8
2804; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
2805; GFX9-NEXT:    s_lshr_b32 s9, s6, 16
2806; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
2807; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s9
2808; GFX9-NEXT:    s_lshr_b32 s0, s4, 16
2809; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s0
2810; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
2811; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
2812; GFX9-NEXT:    v_mad_f32 v1, -v3, v0, v1
2813; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
2814; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v4
2815; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
2816; GFX9-NEXT:    s_lshr_b32 s10, s7, 16
2817; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
2818; GFX9-NEXT:    v_mul_f32_e32 v1, v5, v6
2819; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s6
2820; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
2821; GFX9-NEXT:    s_and_b32 s6, s7, s8
2822; GFX9-NEXT:    v_mad_f32 v3, -v1, v4, v5
2823; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s6
2824; GFX9-NEXT:    s_and_b32 s6, s5, s8
2825; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
2826; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s10
2827; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s6
2828; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v5
2829; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
2830; GFX9-NEXT:    s_lshr_b32 s1, s5, 16
2831; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v4
2832; GFX9-NEXT:    v_mul_f32_e32 v3, v6, v7
2833; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s1
2834; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
2835; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
2836; GFX9-NEXT:    v_mad_f32 v6, -v3, v5, v6
2837; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, v5
2838; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
2839; GFX9-NEXT:    v_mul_f32_e32 v5, v7, v8
2840; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
2841; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v5
2842; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2843; GFX9-NEXT:    v_mad_f32 v5, -v5, v4, v7
2844; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, v4
2845; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v6, vcc
2846; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s9
2847; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s7
2848; GFX9-NEXT:    v_mul_lo_u32 v4, v4, s10
2849; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
2850; GFX9-NEXT:    v_sub_u32_e32 v5, s0, v1
2851; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v3
2852; GFX9-NEXT:    v_sub_u32_e32 v3, s1, v4
2853; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
2854; GFX9-NEXT:    v_and_b32_e32 v1, v4, v1
2855; GFX9-NEXT:    v_and_b32_e32 v0, v4, v0
2856; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
2857; GFX9-NEXT:    v_lshl_or_b32 v0, v5, 16, v0
2858; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
2859; GFX9-NEXT:    s_endpgm
  ; Input IR under test - a single vector urem whose result is stored to %out.
2860  %r = urem <4 x i16> %x, %y
2861  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
2862  ret void
2863}
2864
2865define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
2866; CHECK-LABEL: @sdiv_v4i16(
2867; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
2868; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
2869; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
2870; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
2871; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
2872; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
2873; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
2874; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
2875; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
2876; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
2877; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
2878; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
2879; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
2880; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
2881; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
2882; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
2883; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
2884; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
2885; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
2886; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
2887; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 16
2888; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
2889; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
2890; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <4 x i16> undef, i16 [[TMP23]], i64 0
2891; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i16> [[X]], i64 1
2892; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i16> [[Y]], i64 1
2893; CHECK-NEXT:    [[TMP27:%.*]] = sext i16 [[TMP25]] to i32
2894; CHECK-NEXT:    [[TMP28:%.*]] = sext i16 [[TMP26]] to i32
2895; CHECK-NEXT:    [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]]
2896; CHECK-NEXT:    [[TMP30:%.*]] = ashr i32 [[TMP29]], 30
2897; CHECK-NEXT:    [[TMP31:%.*]] = or i32 [[TMP30]], 1
2898; CHECK-NEXT:    [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float
2899; CHECK-NEXT:    [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float
2900; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
2901; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]]
2902; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]])
2903; CHECK-NEXT:    [[TMP37:%.*]] = fneg fast float [[TMP36]]
2904; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]])
2905; CHECK-NEXT:    [[TMP39:%.*]] = fptosi float [[TMP36]] to i32
2906; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]])
2907; CHECK-NEXT:    [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
2908; CHECK-NEXT:    [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]]
2909; CHECK-NEXT:    [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0
2910; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]]
2911; CHECK-NEXT:    [[TMP45:%.*]] = shl i32 [[TMP44]], 16
2912; CHECK-NEXT:    [[TMP46:%.*]] = ashr i32 [[TMP45]], 16
2913; CHECK-NEXT:    [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16
2914; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <4 x i16> [[TMP24]], i16 [[TMP47]], i64 1
2915; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <4 x i16> [[X]], i64 2
2916; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <4 x i16> [[Y]], i64 2
2917; CHECK-NEXT:    [[TMP51:%.*]] = sext i16 [[TMP49]] to i32
2918; CHECK-NEXT:    [[TMP52:%.*]] = sext i16 [[TMP50]] to i32
2919; CHECK-NEXT:    [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]]
2920; CHECK-NEXT:    [[TMP54:%.*]] = ashr i32 [[TMP53]], 30
2921; CHECK-NEXT:    [[TMP55:%.*]] = or i32 [[TMP54]], 1
2922; CHECK-NEXT:    [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float
2923; CHECK-NEXT:    [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float
2924; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]])
2925; CHECK-NEXT:    [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]]
2926; CHECK-NEXT:    [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]])
2927; CHECK-NEXT:    [[TMP61:%.*]] = fneg fast float [[TMP60]]
2928; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]])
2929; CHECK-NEXT:    [[TMP63:%.*]] = fptosi float [[TMP60]] to i32
2930; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]])
2931; CHECK-NEXT:    [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]])
2932; CHECK-NEXT:    [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]]
2933; CHECK-NEXT:    [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0
2934; CHECK-NEXT:    [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]]
2935; CHECK-NEXT:    [[TMP69:%.*]] = shl i32 [[TMP68]], 16
2936; CHECK-NEXT:    [[TMP70:%.*]] = ashr i32 [[TMP69]], 16
2937; CHECK-NEXT:    [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16
2938; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <4 x i16> [[TMP48]], i16 [[TMP71]], i64 2
2939; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <4 x i16> [[X]], i64 3
2940; CHECK-NEXT:    [[TMP74:%.*]] = extractelement <4 x i16> [[Y]], i64 3
2941; CHECK-NEXT:    [[TMP75:%.*]] = sext i16 [[TMP73]] to i32
2942; CHECK-NEXT:    [[TMP76:%.*]] = sext i16 [[TMP74]] to i32
2943; CHECK-NEXT:    [[TMP77:%.*]] = xor i32 [[TMP75]], [[TMP76]]
2944; CHECK-NEXT:    [[TMP78:%.*]] = ashr i32 [[TMP77]], 30
2945; CHECK-NEXT:    [[TMP79:%.*]] = or i32 [[TMP78]], 1
2946; CHECK-NEXT:    [[TMP80:%.*]] = sitofp i32 [[TMP75]] to float
2947; CHECK-NEXT:    [[TMP81:%.*]] = sitofp i32 [[TMP76]] to float
2948; CHECK-NEXT:    [[TMP82:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP81]])
2949; CHECK-NEXT:    [[TMP83:%.*]] = fmul fast float [[TMP80]], [[TMP82]]
2950; CHECK-NEXT:    [[TMP84:%.*]] = call fast float @llvm.trunc.f32(float [[TMP83]])
2951; CHECK-NEXT:    [[TMP85:%.*]] = fneg fast float [[TMP84]]
2952; CHECK-NEXT:    [[TMP86:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP85]], float [[TMP81]], float [[TMP80]])
2953; CHECK-NEXT:    [[TMP87:%.*]] = fptosi float [[TMP84]] to i32
2954; CHECK-NEXT:    [[TMP88:%.*]] = call fast float @llvm.fabs.f32(float [[TMP86]])
2955; CHECK-NEXT:    [[TMP89:%.*]] = call fast float @llvm.fabs.f32(float [[TMP81]])
2956; CHECK-NEXT:    [[TMP90:%.*]] = fcmp fast oge float [[TMP88]], [[TMP89]]
2957; CHECK-NEXT:    [[TMP91:%.*]] = select i1 [[TMP90]], i32 [[TMP79]], i32 0
2958; CHECK-NEXT:    [[TMP92:%.*]] = add i32 [[TMP87]], [[TMP91]]
2959; CHECK-NEXT:    [[TMP93:%.*]] = shl i32 [[TMP92]], 16
2960; CHECK-NEXT:    [[TMP94:%.*]] = ashr i32 [[TMP93]], 16
2961; CHECK-NEXT:    [[TMP95:%.*]] = trunc i32 [[TMP94]] to i16
2962; CHECK-NEXT:    [[TMP96:%.*]] = insertelement <4 x i16> [[TMP72]], i16 [[TMP95]], i64 3
2963; CHECK-NEXT:    store <4 x i16> [[TMP96]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8
2964; CHECK-NEXT:    ret void
2965;
2966; GFX6-LABEL: sdiv_v4i16:
2967; GFX6:       ; %bb.0:
2968; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2969; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
2970; GFX6-NEXT:    s_mov_b32 s7, 0xf000
2971; GFX6-NEXT:    s_mov_b32 s6, -1
2972; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
2973; GFX6-NEXT:    s_sext_i32_i16 s8, s2
2974; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s8
2975; GFX6-NEXT:    s_sext_i32_i16 s9, s0
2976; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s9
2977; GFX6-NEXT:    s_xor_b32 s8, s9, s8
2978; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
2979; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
2980; GFX6-NEXT:    s_ashr_i32 s8, s8, 30
2981; GFX6-NEXT:    s_or_b32 s8, s8, 1
2982; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
2983; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
2984; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
2985; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
2986; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
2987; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s2
2988; GFX6-NEXT:    v_mov_b32_e32 v3, s8
2989; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
2990; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
2991; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
2992; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s0
2993; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v1
2994; GFX6-NEXT:    s_xor_b32 s0, s0, s2
2995; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
2996; GFX6-NEXT:    s_or_b32 s0, s0, 1
2997; GFX6-NEXT:    v_mul_f32_e32 v3, v2, v3
2998; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
2999; GFX6-NEXT:    v_mad_f32 v2, -v3, v1, v2
3000; GFX6-NEXT:    v_mov_b32_e32 v4, s0
3001; GFX6-NEXT:    s_sext_i32_i16 s0, s3
3002; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
3003; GFX6-NEXT:    v_cvt_i32_f32_e32 v3, v3
3004; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s0
3005; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
3006; GFX6-NEXT:    s_sext_i32_i16 s2, s1
3007; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v1, v3
3008; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s2
3009; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
3010; GFX6-NEXT:    s_xor_b32 s0, s2, s0
3011; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
3012; GFX6-NEXT:    s_or_b32 s0, s0, 1
3013; GFX6-NEXT:    v_mul_f32_e32 v4, v1, v4
3014; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
3015; GFX6-NEXT:    v_mad_f32 v1, -v4, v2, v1
3016; GFX6-NEXT:    v_mov_b32_e32 v5, s0
3017; GFX6-NEXT:    s_ashr_i32 s0, s3, 16
3018; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v2|
3019; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
3020; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s0
3021; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
3022; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
3023; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
3024; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s1
3025; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
3026; GFX6-NEXT:    s_xor_b32 s0, s1, s0
3027; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
3028; GFX6-NEXT:    s_or_b32 s0, s0, 1
3029; GFX6-NEXT:    v_mul_f32_e32 v5, v4, v5
3030; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
3031; GFX6-NEXT:    v_mad_f32 v4, -v5, v2, v4
3032; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
3033; GFX6-NEXT:    v_mov_b32_e32 v6, s0
3034; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v2|
3035; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v6, vcc
3036; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
3037; GFX6-NEXT:    s_mov_b32 s0, 0xffff
3038; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
3039; GFX6-NEXT:    v_and_b32_e32 v1, s0, v1
3040; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
3041; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
3042; GFX6-NEXT:    v_and_b32_e32 v0, s0, v0
3043; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
3044; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
3045; GFX6-NEXT:    s_endpgm
3046; GFX9-LABEL: sdiv_v4i16:
3047; GFX9:       ; %bb.0:
3048; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
3049; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
3050; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3051; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3052; GFX9-NEXT:    s_sext_i32_i16 s0, s6
3053; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
3054; GFX9-NEXT:    s_sext_i32_i16 s1, s4
3055; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s1
3056; GFX9-NEXT:    s_xor_b32 s0, s1, s0
3057; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
3058; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3059; GFX9-NEXT:    s_or_b32 s8, s0, 1
3060; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
3061; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
3062; GFX9-NEXT:    v_mad_f32 v1, -v3, v0, v1
3063; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
3064; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
3065; GFX9-NEXT:    s_cselect_b32 s0, s8, 0
3066; GFX9-NEXT:    s_ashr_i32 s1, s6, 16
3067; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s1
3068; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
3069; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s4
3070; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
3071; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
3072; GFX9-NEXT:    v_add_u32_e32 v3, s0, v3
3073; GFX9-NEXT:    v_mul_f32_e32 v4, v1, v4
3074; GFX9-NEXT:    s_xor_b32 s0, s4, s1
3075; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
3076; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3077; GFX9-NEXT:    v_mad_f32 v1, -v4, v0, v1
3078; GFX9-NEXT:    s_or_b32 s4, s0, 1
3079; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
3080; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
3081; GFX9-NEXT:    s_sext_i32_i16 s1, s7
3082; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
3083; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s1
3084; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
3085; GFX9-NEXT:    v_add_u32_e32 v4, s0, v4
3086; GFX9-NEXT:    s_sext_i32_i16 s0, s5
3087; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s0
3088; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v0
3089; GFX9-NEXT:    s_xor_b32 s0, s0, s1
3090; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3091; GFX9-NEXT:    s_or_b32 s4, s0, 1
3092; GFX9-NEXT:    v_mul_f32_e32 v5, v1, v5
3093; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
3094; GFX9-NEXT:    v_mad_f32 v1, -v5, v0, v1
3095; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
3096; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
3097; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
3098; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
3099; GFX9-NEXT:    s_ashr_i32 s1, s7, 16
3100; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s1
3101; GFX9-NEXT:    v_add_u32_e32 v1, s0, v5
3102; GFX9-NEXT:    s_ashr_i32 s0, s5, 16
3103; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s0
3104; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v0
3105; GFX9-NEXT:    s_xor_b32 s0, s0, s1
3106; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3107; GFX9-NEXT:    s_or_b32 s4, s0, 1
3108; GFX9-NEXT:    v_mul_f32_e32 v6, v5, v6
3109; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
3110; GFX9-NEXT:    v_mad_f32 v5, -v6, v0, v5
3111; GFX9-NEXT:    v_cvt_i32_f32_e32 v6, v6
3112; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v0|
3113; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
3114; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
3115; GFX9-NEXT:    v_mov_b32_e32 v5, 0xffff
3116; GFX9-NEXT:    v_add_u32_e32 v0, s0, v6
3117; GFX9-NEXT:    v_and_b32_e32 v1, v5, v1
3118; GFX9-NEXT:    v_lshl_or_b32 v1, v0, 16, v1
3119; GFX9-NEXT:    v_and_b32_e32 v0, v5, v3
3120; GFX9-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
3121; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
3122; GFX9-NEXT:    s_endpgm
3123  %r = sdiv <4 x i16> %x, %y
3124  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
3125  ret void
3126}
3127
; Signed remainder of two <4 x i16> kernel arguments, stored to %out.
; The assertion lines in this function are generated by the update scripts
; named at the top of the file; regenerate them rather than editing by hand.
; Expected IR after -amdgpu-codegenprepare, repeated for each of the 4 lanes:
;   * extract the lane and sign-extend both operands to i32;
;   * xor / ashr 30 / or 1 gives +1 or -1, i.e. the sign of the quotient;
;   * the quotient is estimated in float (rcp, fmul, trunc) and bumped by that
;     +/-1 when |remainder estimate| >= |divisor|;
;   * remainder = x - quotient * y, then shl/ashr 16 re-sign-extends it before
;     the trunc to i16 and the insertelement into the result vector.
; The GFX6 and GFX9 prefixes hold the llc output for tahiti and gfx900.
3128define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
3129; CHECK-LABEL: @srem_v4i16(
3130; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
3131; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
3132; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
3133; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
3134; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
3135; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
3136; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
3137; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
3138; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
3139; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
3140; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
3141; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
3142; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
3143; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
3144; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
3145; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
3146; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
3147; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
3148; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
3149; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
3150; CHECK-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]]
3151; CHECK-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]]
3152; CHECK-NEXT:    [[TMP23:%.*]] = shl i32 [[TMP22]], 16
3153; CHECK-NEXT:    [[TMP24:%.*]] = ashr i32 [[TMP23]], 16
3154; CHECK-NEXT:    [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16
3155; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <4 x i16> undef, i16 [[TMP25]], i64 0
3156; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i16> [[X]], i64 1
3157; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i16> [[Y]], i64 1
3158; CHECK-NEXT:    [[TMP29:%.*]] = sext i16 [[TMP27]] to i32
3159; CHECK-NEXT:    [[TMP30:%.*]] = sext i16 [[TMP28]] to i32
3160; CHECK-NEXT:    [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]]
3161; CHECK-NEXT:    [[TMP32:%.*]] = ashr i32 [[TMP31]], 30
3162; CHECK-NEXT:    [[TMP33:%.*]] = or i32 [[TMP32]], 1
3163; CHECK-NEXT:    [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float
3164; CHECK-NEXT:    [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float
3165; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
3166; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]]
3167; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]])
3168; CHECK-NEXT:    [[TMP39:%.*]] = fneg fast float [[TMP38]]
3169; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]])
3170; CHECK-NEXT:    [[TMP41:%.*]] = fptosi float [[TMP38]] to i32
3171; CHECK-NEXT:    [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]])
3172; CHECK-NEXT:    [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]])
3173; CHECK-NEXT:    [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]]
3174; CHECK-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0
3175; CHECK-NEXT:    [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]]
3176; CHECK-NEXT:    [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]]
3177; CHECK-NEXT:    [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]]
3178; CHECK-NEXT:    [[TMP49:%.*]] = shl i32 [[TMP48]], 16
3179; CHECK-NEXT:    [[TMP50:%.*]] = ashr i32 [[TMP49]], 16
3180; CHECK-NEXT:    [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16
3181; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i16> [[TMP26]], i16 [[TMP51]], i64 1
3182; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <4 x i16> [[X]], i64 2
3183; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <4 x i16> [[Y]], i64 2
3184; CHECK-NEXT:    [[TMP55:%.*]] = sext i16 [[TMP53]] to i32
3185; CHECK-NEXT:    [[TMP56:%.*]] = sext i16 [[TMP54]] to i32
3186; CHECK-NEXT:    [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]]
3187; CHECK-NEXT:    [[TMP58:%.*]] = ashr i32 [[TMP57]], 30
3188; CHECK-NEXT:    [[TMP59:%.*]] = or i32 [[TMP58]], 1
3189; CHECK-NEXT:    [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float
3190; CHECK-NEXT:    [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float
3191; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]])
3192; CHECK-NEXT:    [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]]
3193; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]])
3194; CHECK-NEXT:    [[TMP65:%.*]] = fneg fast float [[TMP64]]
3195; CHECK-NEXT:    [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]])
3196; CHECK-NEXT:    [[TMP67:%.*]] = fptosi float [[TMP64]] to i32
3197; CHECK-NEXT:    [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
3198; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]])
3199; CHECK-NEXT:    [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]]
3200; CHECK-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0
3201; CHECK-NEXT:    [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]]
3202; CHECK-NEXT:    [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]]
3203; CHECK-NEXT:    [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]]
3204; CHECK-NEXT:    [[TMP75:%.*]] = shl i32 [[TMP74]], 16
3205; CHECK-NEXT:    [[TMP76:%.*]] = ashr i32 [[TMP75]], 16
3206; CHECK-NEXT:    [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16
3207; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <4 x i16> [[TMP52]], i16 [[TMP77]], i64 2
3208; CHECK-NEXT:    [[TMP79:%.*]] = extractelement <4 x i16> [[X]], i64 3
3209; CHECK-NEXT:    [[TMP80:%.*]] = extractelement <4 x i16> [[Y]], i64 3
3210; CHECK-NEXT:    [[TMP81:%.*]] = sext i16 [[TMP79]] to i32
3211; CHECK-NEXT:    [[TMP82:%.*]] = sext i16 [[TMP80]] to i32
3212; CHECK-NEXT:    [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP82]]
3213; CHECK-NEXT:    [[TMP84:%.*]] = ashr i32 [[TMP83]], 30
3214; CHECK-NEXT:    [[TMP85:%.*]] = or i32 [[TMP84]], 1
3215; CHECK-NEXT:    [[TMP86:%.*]] = sitofp i32 [[TMP81]] to float
3216; CHECK-NEXT:    [[TMP87:%.*]] = sitofp i32 [[TMP82]] to float
3217; CHECK-NEXT:    [[TMP88:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP87]])
3218; CHECK-NEXT:    [[TMP89:%.*]] = fmul fast float [[TMP86]], [[TMP88]]
3219; CHECK-NEXT:    [[TMP90:%.*]] = call fast float @llvm.trunc.f32(float [[TMP89]])
3220; CHECK-NEXT:    [[TMP91:%.*]] = fneg fast float [[TMP90]]
3221; CHECK-NEXT:    [[TMP92:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP91]], float [[TMP87]], float [[TMP86]])
3222; CHECK-NEXT:    [[TMP93:%.*]] = fptosi float [[TMP90]] to i32
3223; CHECK-NEXT:    [[TMP94:%.*]] = call fast float @llvm.fabs.f32(float [[TMP92]])
3224; CHECK-NEXT:    [[TMP95:%.*]] = call fast float @llvm.fabs.f32(float [[TMP87]])
3225; CHECK-NEXT:    [[TMP96:%.*]] = fcmp fast oge float [[TMP94]], [[TMP95]]
3226; CHECK-NEXT:    [[TMP97:%.*]] = select i1 [[TMP96]], i32 [[TMP85]], i32 0
3227; CHECK-NEXT:    [[TMP98:%.*]] = add i32 [[TMP93]], [[TMP97]]
3228; CHECK-NEXT:    [[TMP99:%.*]] = mul i32 [[TMP98]], [[TMP82]]
3229; CHECK-NEXT:    [[TMP100:%.*]] = sub i32 [[TMP81]], [[TMP99]]
3230; CHECK-NEXT:    [[TMP101:%.*]] = shl i32 [[TMP100]], 16
3231; CHECK-NEXT:    [[TMP102:%.*]] = ashr i32 [[TMP101]], 16
3232; CHECK-NEXT:    [[TMP103:%.*]] = trunc i32 [[TMP102]] to i16
3233; CHECK-NEXT:    [[TMP104:%.*]] = insertelement <4 x i16> [[TMP78]], i16 [[TMP103]], i64 3
3234; CHECK-NEXT:    store <4 x i16> [[TMP104]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8
3235; CHECK-NEXT:    ret void
3236;
3237; GFX6-LABEL: srem_v4i16:
3238; GFX6:       ; %bb.0:
3239; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3240; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
3241; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3242; GFX6-NEXT:    s_mov_b32 s6, -1
3243; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3244; GFX6-NEXT:    s_sext_i32_i16 s8, s2
3245; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s8
3246; GFX6-NEXT:    s_sext_i32_i16 s9, s0
3247; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s9
3248; GFX6-NEXT:    s_xor_b32 s8, s9, s8
3249; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
3250; GFX6-NEXT:    s_ashr_i32 s8, s8, 30
3251; GFX6-NEXT:    s_or_b32 s8, s8, 1
3252; GFX6-NEXT:    v_mov_b32_e32 v3, s8
3253; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
3254; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
3255; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
3256; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
3257; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
3258; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
3259; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
3260; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
3261; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
3262; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s2
3263; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
3264; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
3265; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s0
3266; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v1
3267; GFX6-NEXT:    s_xor_b32 s8, s0, s2
3268; GFX6-NEXT:    s_ashr_i32 s8, s8, 30
3269; GFX6-NEXT:    s_or_b32 s8, s8, 1
3270; GFX6-NEXT:    v_mul_f32_e32 v3, v2, v3
3271; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
3272; GFX6-NEXT:    v_mad_f32 v2, -v3, v1, v2
3273; GFX6-NEXT:    v_cvt_i32_f32_e32 v3, v3
3274; GFX6-NEXT:    v_mov_b32_e32 v4, s8
3275; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
3276; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
3277; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
3278; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s2
3279; GFX6-NEXT:    s_sext_i32_i16 s2, s3
3280; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s2
3281; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s0, v1
3282; GFX6-NEXT:    s_sext_i32_i16 s0, s1
3283; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s0
3284; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
3285; GFX6-NEXT:    s_xor_b32 s0, s0, s2
3286; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
3287; GFX6-NEXT:    s_or_b32 s0, s0, 1
3288; GFX6-NEXT:    v_mul_f32_e32 v4, v1, v4
3289; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
3290; GFX6-NEXT:    v_mad_f32 v1, -v4, v2, v1
3291; GFX6-NEXT:    v_mov_b32_e32 v5, s0
3292; GFX6-NEXT:    s_ashr_i32 s0, s3, 16
3293; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v2|
3294; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
3295; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s0
3296; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
3297; GFX6-NEXT:    s_ashr_i32 s2, s1, 16
3298; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
3299; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s2
3300; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
3301; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s3
3302; GFX6-NEXT:    s_xor_b32 s3, s2, s0
3303; GFX6-NEXT:    s_ashr_i32 s3, s3, 30
3304; GFX6-NEXT:    v_mul_f32_e32 v5, v4, v5
3305; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
3306; GFX6-NEXT:    v_mad_f32 v4, -v5, v2, v4
3307; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
3308; GFX6-NEXT:    s_or_b32 s3, s3, 1
3309; GFX6-NEXT:    v_mov_b32_e32 v6, s3
3310; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v2|
3311; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v6, vcc
3312; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
3313; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s0
3314; GFX6-NEXT:    s_mov_b32 s0, 0xffff
3315; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
3316; GFX6-NEXT:    v_and_b32_e32 v1, s0, v1
3317; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s2, v2
3318; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
3319; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
3320; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
3321; GFX6-NEXT:    v_and_b32_e32 v0, s0, v0
3322; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
3323; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
3324; GFX6-NEXT:    s_endpgm
3325; GFX9-LABEL: srem_v4i16:
3326; GFX9:       ; %bb.0:
3327; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
3328; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
3329; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3330; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3331; GFX9-NEXT:    s_sext_i32_i16 s0, s6
3332; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
3333; GFX9-NEXT:    s_sext_i32_i16 s1, s4
3334; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s1
3335; GFX9-NEXT:    s_xor_b32 s0, s1, s0
3336; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
3337; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3338; GFX9-NEXT:    s_or_b32 s8, s0, 1
3339; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
3340; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
3341; GFX9-NEXT:    v_mad_f32 v1, -v3, v0, v1
3342; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
3343; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
3344; GFX9-NEXT:    s_cselect_b32 s0, s8, 0
3345; GFX9-NEXT:    s_ashr_i32 s9, s6, 16
3346; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
3347; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s9
3348; GFX9-NEXT:    s_ashr_i32 s8, s4, 16
3349; GFX9-NEXT:    v_add_u32_e32 v1, s0, v3
3350; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s8
3351; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
3352; GFX9-NEXT:    s_xor_b32 s0, s8, s9
3353; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3354; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s6
3355; GFX9-NEXT:    v_mul_f32_e32 v4, v3, v4
3356; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
3357; GFX9-NEXT:    v_mad_f32 v3, -v4, v0, v3
3358; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
3359; GFX9-NEXT:    s_or_b32 s6, s0, 1
3360; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, |v0|
3361; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
3362; GFX9-NEXT:    s_cselect_b32 s0, s6, 0
3363; GFX9-NEXT:    v_add_u32_e32 v0, s0, v4
3364; GFX9-NEXT:    s_sext_i32_i16 s0, s7
3365; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s0
3366; GFX9-NEXT:    s_sext_i32_i16 s1, s5
3367; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s1
3368; GFX9-NEXT:    s_xor_b32 s0, s1, s0
3369; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v3
3370; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3371; GFX9-NEXT:    s_or_b32 s6, s0, 1
3372; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s9
3373; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
3374; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
3375; GFX9-NEXT:    v_mad_f32 v4, -v5, v3, v4
3376; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v3|
3377; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
3378; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
3379; GFX9-NEXT:    s_cselect_b32 s0, s6, 0
3380; GFX9-NEXT:    s_ashr_i32 s6, s7, 16
3381; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s6
3382; GFX9-NEXT:    v_add_u32_e32 v3, s0, v5
3383; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s7
3384; GFX9-NEXT:    s_ashr_i32 s7, s5, 16
3385; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s7
3386; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v4
3387; GFX9-NEXT:    s_xor_b32 s0, s7, s6
3388; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3389; GFX9-NEXT:    s_or_b32 s9, s0, 1
3390; GFX9-NEXT:    v_mul_f32_e32 v6, v5, v6
3391; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
3392; GFX9-NEXT:    v_mad_f32 v5, -v6, v4, v5
3393; GFX9-NEXT:    v_cvt_i32_f32_e32 v6, v6
3394; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v4|
3395; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
3396; GFX9-NEXT:    s_cselect_b32 s0, s9, 0
3397; GFX9-NEXT:    v_add_u32_e32 v4, s0, v6
3398; GFX9-NEXT:    v_mul_lo_u32 v4, v4, s6
3399; GFX9-NEXT:    v_sub_u32_e32 v5, s4, v1
3400; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v3
3401; GFX9-NEXT:    v_sub_u32_e32 v0, s8, v0
3402; GFX9-NEXT:    v_sub_u32_e32 v3, s7, v4
3403; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
3404; GFX9-NEXT:    v_and_b32_e32 v1, v4, v1
3405; GFX9-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
3406; GFX9-NEXT:    v_and_b32_e32 v3, v4, v5
3407; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v3
3408; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
3409; GFX9-NEXT:    s_endpgm
  ; Input IR under test: one vector srem whose result is stored to %out.
3410  %r = srem <4 x i16> %x, %y
3411  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
3412  ret void
3413}
3414
; Unsigned division on the non-byte-sized type i3, stored to %out.
; The assertion lines in this function are generated by the update scripts
; named at the top of the file; regenerate them rather than editing by hand.
; Expected IR after -amdgpu-codegenprepare: both operands are zero-extended to
; i32, the quotient is estimated in float (rcp, fmul, trunc, fptoui) and
; incremented by 1 when |remainder estimate| >= |divisor|; the result is masked
; with 7 (the low 3 bits) before being truncated back to i3.
; The GFX6 and GFX9 prefixes hold the llc output for tahiti and gfx900.
3415define amdgpu_kernel void @udiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
3416; CHECK-LABEL: @udiv_i3(
3417; CHECK-NEXT:    [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32
3418; CHECK-NEXT:    [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32
3419; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
3420; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
3421; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
3422; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
3423; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
3424; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
3425; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
3426; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
3427; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
3428; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
3429; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
3430; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
3431; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
3432; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 7
3433; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i3
3434; CHECK-NEXT:    store i3 [[TMP17]], i3 addrspace(1)* [[OUT:%.*]], align 1
3435; CHECK-NEXT:    ret void
3436;
3437; GFX6-LABEL: udiv_i3:
3438; GFX6:       ; %bb.0:
3439; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3440; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
3441; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3442; GFX6-NEXT:    s_mov_b32 s6, -1
3443; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3444; GFX6-NEXT:    s_bfe_u32 s1, s0, 0x30008
3445; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v0, s1
3446; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v0
3447; GFX6-NEXT:    s_and_b32 s0, s0, 7
3448; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v2, s0
3449; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v1
3450; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
3451; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v1
3452; GFX6-NEXT:    v_mad_f32 v1, -v1, v0, v2
3453; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
3454; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
3455; GFX6-NEXT:    v_and_b32_e32 v0, 7, v0
3456; GFX6-NEXT:    buffer_store_byte v0, off, s[4:7], 0
3457; GFX6-NEXT:    s_endpgm
3458; GFX9-LABEL: udiv_i3:
3459; GFX9:       ; %bb.0:
3460; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
3461; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
3462; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3463; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3464; GFX9-NEXT:    s_bfe_u32 s0, s4, 0x30008
3465; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, s0
3466; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
3467; GFX9-NEXT:    s_and_b32 s0, s4, 7
3468; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, s0
3469; GFX9-NEXT:    v_mul_f32_e32 v1, v3, v1
3470; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
3471; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v1
3472; GFX9-NEXT:    v_mad_f32 v1, -v1, v0, v3
3473; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
3474; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
3475; GFX9-NEXT:    v_and_b32_e32 v0, 7, v0
3476; GFX9-NEXT:    global_store_byte v2, v0, s[2:3]
3477; GFX9-NEXT:    s_endpgm
  ; Input IR under test: one scalar udiv whose result is stored to %out.
3478  %r = udiv i3 %x, %y
3479  store i3 %r, i3 addrspace(1)* %out
3480  ret void
3481}
3482
; Unsigned remainder on the non-byte-sized type i3, stored to %out.
; The assertion lines in this function are generated by the update scripts
; named at the top of the file; regenerate them rather than editing by hand.
; Expected IR after -amdgpu-codegenprepare: the quotient is computed as in the
; unsigned i3 division (zext to i32, float reciprocal estimate, +1 correction),
; then remainder = x - quotient * y, masked with 7 and truncated back to i3.
; The GFX6 and GFX9 prefixes hold the llc output for tahiti and gfx900.
3483define amdgpu_kernel void @urem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
3484; CHECK-LABEL: @urem_i3(
3485; CHECK-NEXT:    [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32
3486; CHECK-NEXT:    [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32
3487; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
3488; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
3489; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
3490; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
3491; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
3492; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
3493; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
3494; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
3495; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
3496; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
3497; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
3498; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
3499; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
3500; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
3501; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
3502; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 7
3503; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i3
3504; CHECK-NEXT:    store i3 [[TMP19]], i3 addrspace(1)* [[OUT:%.*]], align 1
3505; CHECK-NEXT:    ret void
3506;
3507; GFX6-LABEL: urem_i3:
3508; GFX6:       ; %bb.0:
3509; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3510; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
3511; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3512; GFX6-NEXT:    s_mov_b32 s6, -1
3513; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3514; GFX6-NEXT:    s_bfe_u32 s1, s0, 0x30008
3515; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v0, s1
3516; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v0
3517; GFX6-NEXT:    s_and_b32 s2, s0, 7
3518; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v2, s2
3519; GFX6-NEXT:    s_lshr_b32 s1, s0, 8
3520; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v1
3521; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
3522; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v1
3523; GFX6-NEXT:    v_mad_f32 v1, -v1, v0, v2
3524; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
3525; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
3526; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s1
3527; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
3528; GFX6-NEXT:    v_and_b32_e32 v0, 7, v0
3529; GFX6-NEXT:    buffer_store_byte v0, off, s[4:7], 0
3530; GFX6-NEXT:    s_endpgm
3531; GFX9-LABEL: urem_i3:
3532; GFX9:       ; %bb.0:
3533; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
3534; GFX9-NEXT:    s_nop 0
3535; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3536; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3537; GFX9-NEXT:    s_bfe_u32 s3, s2, 0x30008
3538; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, s3
3539; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
3540; GFX9-NEXT:    s_and_b32 s4, s2, 7
3541; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, s4
3542; GFX9-NEXT:    s_lshr_b32 s3, s2, 8
3543; GFX9-NEXT:    v_mul_f32_e32 v1, v2, v1
3544; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
3545; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v1
3546; GFX9-NEXT:    v_mad_f32 v1, -v1, v0, v2
3547; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
3548; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3549; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
3550; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
3551; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
3552; GFX9-NEXT:    v_and_b32_e32 v0, 7, v0
3553; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
3554; GFX9-NEXT:    s_endpgm
  ; Input IR under test: one scalar urem whose result is stored to %out.
3555  %r = urem i3 %x, %y
3556  store i3 %r, i3 addrspace(1)* %out
3557  ret void
3558}
3559
; Signed division on the non-byte-sized type i3, stored to %out.
; The assertion lines in this function are generated by the update scripts
; named at the top of the file; regenerate them rather than editing by hand.
; Expected IR after -amdgpu-codegenprepare: both operands are sign-extended to
; i32; xor / ashr 30 / or 1 gives +1 or -1, i.e. the sign of the quotient; the
; quotient is estimated in float (rcp, fmul, trunc, fptosi) and bumped by that
; +/-1 when |remainder estimate| >= |divisor|; shl/ashr 29 then re-sign-extends
; the low 3 bits before the trunc back to i3.
; The GFX6 and GFX9 prefixes hold the llc output for tahiti and gfx900.
3560define amdgpu_kernel void @sdiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
3561; CHECK-LABEL: @sdiv_i3(
3562; CHECK-NEXT:    [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32
3563; CHECK-NEXT:    [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32
3564; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
3565; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
3566; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
3567; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
3568; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
3569; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
3570; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
3571; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
3572; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
3573; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
3574; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
3575; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
3576; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
3577; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
3578; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
3579; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
3580; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 29
3581; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 29
3582; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i3
3583; CHECK-NEXT:    store i3 [[TMP21]], i3 addrspace(1)* [[OUT:%.*]], align 1
3584; CHECK-NEXT:    ret void
3585;
3586; GFX6-LABEL: sdiv_i3:
3587; GFX6:       ; %bb.0:
3588; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3589; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
3590; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3591; GFX6-NEXT:    s_mov_b32 s6, -1
3592; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3593; GFX6-NEXT:    s_bfe_i32 s1, s0, 0x30008
3594; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s1
3595; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x30000
3596; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s0
3597; GFX6-NEXT:    s_xor_b32 s0, s0, s1
3598; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
3599; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
3600; GFX6-NEXT:    s_or_b32 s0, s0, 1
3601; GFX6-NEXT:    v_mov_b32_e32 v3, s0
3602; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
3603; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
3604; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
3605; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
3606; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
3607; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
3608; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
3609; GFX6-NEXT:    v_and_b32_e32 v0, 7, v0
3610; GFX6-NEXT:    buffer_store_byte v0, off, s[4:7], 0
3611; GFX6-NEXT:    s_endpgm
3612; GFX9-LABEL: sdiv_i3:
3613; GFX9:       ; %bb.0:
3614; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
3615; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
3616; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3617; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3618; GFX9-NEXT:    s_bfe_i32 s0, s4, 0x30008
3619; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
3620; GFX9-NEXT:    s_bfe_i32 s1, s4, 0x30000
3621; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s1
3622; GFX9-NEXT:    s_xor_b32 s0, s1, s0
3623; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
3624; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
3625; GFX9-NEXT:    s_or_b32 s4, s0, 1
3626; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
3627; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
3628; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
3629; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
3630; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
3631; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
3632; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
3633; GFX9-NEXT:    v_add_u32_e32 v0, s0, v3
3634; GFX9-NEXT:    v_and_b32_e32 v0, 7, v0
3635; GFX9-NEXT:    global_store_byte v1, v0, s[2:3]
3636; GFX9-NEXT:    s_endpgm
  ; Input IR under test: one scalar sdiv whose result is stored to %out.
3637  %r = sdiv i3 %x, %y
3638  store i3 %r, i3 addrspace(1)* %out
3639  ret void
3640}
3641
; srem_i3: signed remainder on the non-byte-sized type i3. Both operands are
; kernel arguments and the result is stored through %out.
;
; The opt assertions show the operation being widened to i32:
;   * both operands are sign-extended to i32;
;   * a correction term of +1 or -1 is derived from the sign of (x ^ y)
;     using ashr 30 followed by or 1;
;   * a quotient q is estimated in float as trunc(x * rcp(y));
;   * fmad computes x - q*y, and the correction term is added to q when the
;     magnitude of that value is >= the magnitude of y;
;   * the remainder is formed as x - q*y in i32, sign-extended in-register
;     from 3 bits (shl 29 / ashr 29) and truncated back to i3.
;
; The llc assertions for the tahiti and gfx900 runs check the same sequence in
; machine code; the value is masked with 7 and written with a byte store.
;
; All assertion lines below are autogenerated (see the NOTE lines at the top
; of this file); regenerate them with the update scripts instead of editing
; them by hand.
3642define amdgpu_kernel void @srem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
3643; CHECK-LABEL: @srem_i3(
3644; CHECK-NEXT:    [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32
3645; CHECK-NEXT:    [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32
3646; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
3647; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
3648; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
3649; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
3650; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
3651; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
3652; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
3653; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
3654; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
3655; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
3656; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
3657; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
3658; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
3659; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
3660; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
3661; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
3662; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
3663; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
3664; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 29
3665; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 29
3666; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i3
3667; CHECK-NEXT:    store i3 [[TMP23]], i3 addrspace(1)* [[OUT:%.*]], align 1
3668; CHECK-NEXT:    ret void
3669;
3670; GFX6-LABEL: srem_i3:
3671; GFX6:       ; %bb.0:
3672; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3673; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
3674; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3675; GFX6-NEXT:    s_mov_b32 s6, -1
3676; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3677; GFX6-NEXT:    s_bfe_i32 s1, s0, 0x30008
3678; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s1
3679; GFX6-NEXT:    s_bfe_i32 s3, s0, 0x30000
3680; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s3
3681; GFX6-NEXT:    s_xor_b32 s1, s3, s1
3682; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
3683; GFX6-NEXT:    s_ashr_i32 s1, s1, 30
3684; GFX6-NEXT:    s_or_b32 s1, s1, 1
3685; GFX6-NEXT:    v_mov_b32_e32 v3, s1
3686; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
3687; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
3688; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
3689; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
3690; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
3691; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
3692; GFX6-NEXT:    s_lshr_b32 s2, s0, 8
3693; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
3694; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
3695; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
3696; GFX6-NEXT:    v_and_b32_e32 v0, 7, v0
3697; GFX6-NEXT:    buffer_store_byte v0, off, s[4:7], 0
3698; GFX6-NEXT:    s_endpgm
3699; GFX9-LABEL: srem_i3:
3700; GFX9:       ; %bb.0:
3701; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
3702; GFX9-NEXT:    s_nop 0
3703; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3704; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3705; GFX9-NEXT:    s_bfe_i32 s2, s4, 0x30008
3706; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s2
3707; GFX9-NEXT:    s_bfe_i32 s3, s4, 0x30000
3708; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s3
3709; GFX9-NEXT:    s_xor_b32 s2, s3, s2
3710; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
3711; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
3712; GFX9-NEXT:    s_lshr_b32 s5, s4, 8
3713; GFX9-NEXT:    s_or_b32 s6, s2, 1
3714; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
3715; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
3716; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
3717; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v2
3718; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
3719; GFX9-NEXT:    s_cmp_lg_u64 s[2:3], 0
3720; GFX9-NEXT:    s_cselect_b32 s2, s6, 0
3721; GFX9-NEXT:    v_add_u32_e32 v0, s2, v2
3722; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s5
3723; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3724; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
3725; GFX9-NEXT:    v_and_b32_e32 v0, 7, v0
3726; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
3727; GFX9-NEXT:    s_endpgm
; IR under test: a single i3 srem whose result is stored to global memory.
3728  %r = srem i3 %x, %y
3729  store i3 %r, i3 addrspace(1)* %out
3730  ret void
3731}
3732
; udiv_v3i16: unsigned division on a <3 x i16> vector. Both operands are
; kernel arguments and the result vector is stored through %out.
;
; The opt assertions show the vector operation being scalarized; the same
; sequence is repeated for element 0, 1 and 2:
;   * extract the element from each operand and zero-extend it to i32;
;   * convert both to float and estimate the quotient q as
;     trunc(x * rcp(y));
;   * fmad computes x - q*y, and 1 is added to q when the magnitude of that
;     value is >= the magnitude of y;
;   * mask the result with 65535, truncate to i16 and insert it into the
;     result vector (which starts out as undef).
; The final vector is stored with align 8.
;
; The llc assertions for the tahiti and gfx900 runs check the same sequence in
; machine code. The first two result elements are combined into one dword
; (shift left by 16, then or) and written with a dword store; the third
; element is written with a separate short store at offset 4.
;
; All assertion lines below are autogenerated (see the NOTE lines at the top
; of this file); regenerate them with the update scripts instead of editing
; them by hand.
3733define amdgpu_kernel void @udiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
3734; CHECK-LABEL: @udiv_v3i16(
3735; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
3736; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
3737; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
3738; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
3739; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
3740; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
3741; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
3742; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
3743; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
3744; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
3745; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
3746; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
3747; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
3748; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
3749; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
3750; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
3751; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
3752; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 65535
3753; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
3754; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <3 x i16> undef, i16 [[TMP19]], i64 0
3755; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <3 x i16> [[X]], i64 1
3756; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <3 x i16> [[Y]], i64 1
3757; CHECK-NEXT:    [[TMP23:%.*]] = zext i16 [[TMP21]] to i32
3758; CHECK-NEXT:    [[TMP24:%.*]] = zext i16 [[TMP22]] to i32
3759; CHECK-NEXT:    [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float
3760; CHECK-NEXT:    [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float
3761; CHECK-NEXT:    [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]])
3762; CHECK-NEXT:    [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]]
3763; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]])
3764; CHECK-NEXT:    [[TMP30:%.*]] = fneg fast float [[TMP29]]
3765; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]])
3766; CHECK-NEXT:    [[TMP32:%.*]] = fptoui float [[TMP29]] to i32
3767; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]])
3768; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]])
3769; CHECK-NEXT:    [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]]
3770; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0
3771; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]]
3772; CHECK-NEXT:    [[TMP38:%.*]] = and i32 [[TMP37]], 65535
3773; CHECK-NEXT:    [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16
3774; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <3 x i16> [[TMP20]], i16 [[TMP39]], i64 1
3775; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <3 x i16> [[X]], i64 2
3776; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <3 x i16> [[Y]], i64 2
3777; CHECK-NEXT:    [[TMP43:%.*]] = zext i16 [[TMP41]] to i32
3778; CHECK-NEXT:    [[TMP44:%.*]] = zext i16 [[TMP42]] to i32
3779; CHECK-NEXT:    [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float
3780; CHECK-NEXT:    [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float
3781; CHECK-NEXT:    [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]])
3782; CHECK-NEXT:    [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]]
3783; CHECK-NEXT:    [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]])
3784; CHECK-NEXT:    [[TMP50:%.*]] = fneg fast float [[TMP49]]
3785; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]])
3786; CHECK-NEXT:    [[TMP52:%.*]] = fptoui float [[TMP49]] to i32
3787; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]])
3788; CHECK-NEXT:    [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]])
3789; CHECK-NEXT:    [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]]
3790; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0
3791; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]]
3792; CHECK-NEXT:    [[TMP58:%.*]] = and i32 [[TMP57]], 65535
3793; CHECK-NEXT:    [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16
3794; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <3 x i16> [[TMP40]], i16 [[TMP59]], i64 2
3795; CHECK-NEXT:    store <3 x i16> [[TMP60]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8
3796; CHECK-NEXT:    ret void
3797;
3798; GFX6-LABEL: udiv_v3i16:
3799; GFX6:       ; %bb.0:
3800; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3801; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
3802; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
3803; GFX6-NEXT:    s_mov_b32 s8, 0xffff
3804; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3805; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3806; GFX6-NEXT:    s_and_b32 s6, s0, s8
3807; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
3808; GFX6-NEXT:    s_and_b32 s6, s2, s8
3809; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
3810; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s0
3811; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s6
3812; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
3813; GFX6-NEXT:    s_lshr_b32 s0, s2, 16
3814; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s0
3815; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
3816; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
3817; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
3818; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
3819; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v2
3820; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
3821; GFX6-NEXT:    v_mul_f32_e32 v1, v4, v5
3822; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
3823; GFX6-NEXT:    s_and_b32 s0, s1, s8
3824; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
3825; GFX6-NEXT:    v_mad_f32 v2, -v1, v3, v4
3826; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, s0
3827; GFX6-NEXT:    s_and_b32 s0, s3, s8
3828; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s0
3829; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
3830; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
3831; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
3832; GFX6-NEXT:    s_mov_b32 s6, -1
3833; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3834; GFX6-NEXT:    v_mul_f32_e32 v2, v5, v6
3835; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
3836; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
3837; GFX6-NEXT:    v_mad_f32 v2, -v2, v4, v5
3838; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
3839; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
3840; GFX6-NEXT:    v_and_b32_e32 v0, s8, v0
3841; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
3842; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
3843; GFX6-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
3844; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3845; GFX6-NEXT:    s_endpgm
3846; GFX9-LABEL: udiv_v3i16:
3847; GFX9:       ; %bb.0:
3848; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
3849; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
3850; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
3851; GFX9-NEXT:    s_mov_b32 s8, 0xffff
3852; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3853; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3854; GFX9-NEXT:    s_and_b32 s0, s6, s8
3855; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s0
3856; GFX9-NEXT:    s_and_b32 s0, s4, s8
3857; GFX9-NEXT:    s_lshr_b32 s1, s6, 16
3858; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s0
3859; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
3860; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s1
3861; GFX9-NEXT:    s_lshr_b32 s0, s4, 16
3862; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s0
3863; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
3864; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v4
3865; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
3866; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
3867; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
3868; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
3869; GFX9-NEXT:    v_mul_f32_e32 v2, v5, v6
3870; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
3871; GFX9-NEXT:    s_and_b32 s0, s7, s8
3872; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
3873; GFX9-NEXT:    v_mad_f32 v3, -v2, v4, v5
3874; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s0
3875; GFX9-NEXT:    s_and_b32 s0, s5, s8
3876; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s0
3877; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v2
3878; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v5
3879; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
3880; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
3881; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
3882; GFX9-NEXT:    v_mul_f32_e32 v3, v6, v7
3883; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
3884; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v3
3885; GFX9-NEXT:    v_mad_f32 v3, -v3, v5, v6
3886; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
3887; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
3888; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
3889; GFX9-NEXT:    global_store_short v1, v3, s[2:3] offset:4
3890; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
3891; GFX9-NEXT:    s_endpgm
; IR under test: a single <3 x i16> udiv whose result is stored to global memory.
3892  %r = udiv <3 x i16> %x, %y
3893  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
3894  ret void
3895}
3896
; urem_v3i16: unsigned remainder on a <3 x i16> vector. Both operands are
; kernel arguments and the result vector is stored through %out.
;
; The opt assertions show the vector operation being scalarized; the same
; sequence is repeated for element 0, 1 and 2:
;   * extract the element from each operand and zero-extend it to i32;
;   * convert both to float and estimate the quotient q as
;     trunc(x * rcp(y));
;   * fmad computes x - q*y, and 1 is added to q when the magnitude of that
;     value is >= the magnitude of y;
;   * form the remainder in i32 as x - q*y (mul followed by sub);
;   * mask the result with 65535, truncate to i16 and insert it into the
;     result vector (which starts out as undef).
; The final vector is stored with align 8.
;
; The llc assertions for the tahiti and gfx900 runs check the same sequence in
; machine code. The first two result elements are combined into one dword
; (shift left by 16, then or) and written with a dword store; the third
; element is written with a separate short store at offset 4.
;
; All assertion lines below are autogenerated (see the NOTE lines at the top
; of this file); regenerate them with the update scripts instead of editing
; them by hand.
3897define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
3898; CHECK-LABEL: @urem_v3i16(
3899; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
3900; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
3901; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
3902; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
3903; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
3904; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
3905; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
3906; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
3907; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
3908; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
3909; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
3910; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
3911; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
3912; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
3913; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
3914; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
3915; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
3916; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]]
3917; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]]
3918; CHECK-NEXT:    [[TMP20:%.*]] = and i32 [[TMP19]], 65535
3919; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
3920; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <3 x i16> undef, i16 [[TMP21]], i64 0
3921; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <3 x i16> [[X]], i64 1
3922; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <3 x i16> [[Y]], i64 1
3923; CHECK-NEXT:    [[TMP25:%.*]] = zext i16 [[TMP23]] to i32
3924; CHECK-NEXT:    [[TMP26:%.*]] = zext i16 [[TMP24]] to i32
3925; CHECK-NEXT:    [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float
3926; CHECK-NEXT:    [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float
3927; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]])
3928; CHECK-NEXT:    [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]]
3929; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]])
3930; CHECK-NEXT:    [[TMP32:%.*]] = fneg fast float [[TMP31]]
3931; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]])
3932; CHECK-NEXT:    [[TMP34:%.*]] = fptoui float [[TMP31]] to i32
3933; CHECK-NEXT:    [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
3934; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]])
3935; CHECK-NEXT:    [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]]
3936; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0
3937; CHECK-NEXT:    [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]]
3938; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]]
3939; CHECK-NEXT:    [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]]
3940; CHECK-NEXT:    [[TMP42:%.*]] = and i32 [[TMP41]], 65535
3941; CHECK-NEXT:    [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16
3942; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <3 x i16> [[TMP22]], i16 [[TMP43]], i64 1
3943; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <3 x i16> [[X]], i64 2
3944; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <3 x i16> [[Y]], i64 2
3945; CHECK-NEXT:    [[TMP47:%.*]] = zext i16 [[TMP45]] to i32
3946; CHECK-NEXT:    [[TMP48:%.*]] = zext i16 [[TMP46]] to i32
3947; CHECK-NEXT:    [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float
3948; CHECK-NEXT:    [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float
3949; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]])
3950; CHECK-NEXT:    [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]]
3951; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]])
3952; CHECK-NEXT:    [[TMP54:%.*]] = fneg fast float [[TMP53]]
3953; CHECK-NEXT:    [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]])
3954; CHECK-NEXT:    [[TMP56:%.*]] = fptoui float [[TMP53]] to i32
3955; CHECK-NEXT:    [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]])
3956; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]])
3957; CHECK-NEXT:    [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]]
3958; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0
3959; CHECK-NEXT:    [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]]
3960; CHECK-NEXT:    [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]]
3961; CHECK-NEXT:    [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]]
3962; CHECK-NEXT:    [[TMP64:%.*]] = and i32 [[TMP63]], 65535
3963; CHECK-NEXT:    [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16
3964; CHECK-NEXT:    [[TMP66:%.*]] = insertelement <3 x i16> [[TMP44]], i16 [[TMP65]], i64 2
3965; CHECK-NEXT:    store <3 x i16> [[TMP66]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8
3966; CHECK-NEXT:    ret void
3967;
3968; GFX6-LABEL: urem_v3i16:
3969; GFX6:       ; %bb.0:
3970; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3971; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
3972; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
3973; GFX6-NEXT:    s_mov_b32 s8, 0xffff
3974; GFX6-NEXT:    s_mov_b32 s7, 0xf000
3975; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
3976; GFX6-NEXT:    v_mov_b32_e32 v1, s2
3977; GFX6-NEXT:    s_and_b32 s6, s0, s8
3978; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
3979; GFX6-NEXT:    s_and_b32 s6, s2, s8
3980; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s6
3981; GFX6-NEXT:    v_mov_b32_e32 v4, s0
3982; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v0
3983; GFX6-NEXT:    v_alignbit_b32 v4, s1, v4, 16
3984; GFX6-NEXT:    v_and_b32_e32 v5, s8, v4
3985; GFX6-NEXT:    v_alignbit_b32 v1, s3, v1, 16
3986; GFX6-NEXT:    v_mul_f32_e32 v3, v2, v3
3987; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
3988; GFX6-NEXT:    v_mad_f32 v2, -v3, v0, v2
3989; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v3
3990; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
3991; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, v5
3992; GFX6-NEXT:    v_and_b32_e32 v3, s8, v1
3993; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v6, vcc
3994; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s0
3995; GFX6-NEXT:    s_and_b32 s0, s1, s8
3996; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, v3
3997; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v2
3998; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s0
3999; GFX6-NEXT:    s_and_b32 s0, s3, s8
4000; GFX6-NEXT:    v_cvt_f32_u32_e32 v7, s0
4001; GFX6-NEXT:    v_mul_f32_e32 v5, v3, v5
4002; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
4003; GFX6-NEXT:    v_rcp_iflag_f32_e32 v8, v6
4004; GFX6-NEXT:    v_mad_f32 v3, -v5, v2, v3
4005; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v5
4006; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
4007; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
4008; GFX6-NEXT:    v_mul_f32_e32 v3, v7, v8
4009; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
4010; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
4011; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v4
4012; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v3
4013; GFX6-NEXT:    v_mad_f32 v3, -v3, v6, v7
4014; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v6
4015; GFX6-NEXT:    s_mov_b32 s6, -1
4016; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
4017; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s1
4018; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
4019; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
4020; GFX6-NEXT:    v_and_b32_e32 v0, s8, v0
4021; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s3, v3
4022; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
4023; GFX6-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
4024; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4025; GFX6-NEXT:    s_endpgm
4026; GFX9-LABEL: urem_v3i16:
4027; GFX9:       ; %bb.0:
4028; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
4029; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
4030; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
4031; GFX9-NEXT:    s_mov_b32 s8, 0xffff
4032; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4033; GFX9-NEXT:    s_and_b32 s0, s4, s8
4034; GFX9-NEXT:    s_and_b32 s1, s6, s8
4035; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s1
4036; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s0
4037; GFX9-NEXT:    s_lshr_b32 s6, s6, 16
4038; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s6
4039; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
4040; GFX9-NEXT:    s_lshr_b32 s4, s4, 16
4041; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s4
4042; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v2
4043; GFX9-NEXT:    v_mul_f32_e32 v3, v1, v3
4044; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
4045; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v3
4046; GFX9-NEXT:    v_mad_f32 v1, -v3, v0, v1
4047; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
4048; GFX9-NEXT:    v_mul_f32_e32 v1, v4, v5
4049; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v6, vcc
4050; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s1
4051; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
4052; GFX9-NEXT:    s_and_b32 s1, s7, s8
4053; GFX9-NEXT:    v_mad_f32 v3, -v1, v2, v4
4054; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s1
4055; GFX9-NEXT:    s_and_b32 s5, s5, s8
4056; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s5
4057; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
4058; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v4
4059; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
4060; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
4061; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
4062; GFX9-NEXT:    v_mul_f32_e32 v2, v5, v6
4063; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
4064; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v2
4065; GFX9-NEXT:    v_mad_f32 v2, -v2, v4, v5
4066; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
4067; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s6
4068; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v3, vcc
4069; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s1
4070; GFX9-NEXT:    v_mov_b32_e32 v3, 0
4071; GFX9-NEXT:    v_sub_u32_e32 v1, s4, v1
4072; GFX9-NEXT:    v_and_b32_e32 v0, 0xffff, v0
4073; GFX9-NEXT:    v_sub_u32_e32 v2, s5, v2
4074; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
4075; GFX9-NEXT:    global_store_short v3, v2, s[2:3] offset:4
4076; GFX9-NEXT:    global_store_dword v3, v0, s[2:3]
4077; GFX9-NEXT:    s_endpgm
; IR under test: a single <3 x i16> urem whose result is stored to global memory.
4078  %r = urem <3 x i16> %x, %y
4079  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
4080  ret void
4081}
4082
4083define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
4084; CHECK-LABEL: @sdiv_v3i16(
4085; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
4086; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
4087; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
4088; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
4089; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
4090; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
4091; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
4092; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
4093; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
4094; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
4095; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
4096; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
4097; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
4098; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
4099; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
4100; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
4101; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
4102; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
4103; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
4104; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
4105; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 16
4106; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
4107; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
4108; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <3 x i16> undef, i16 [[TMP23]], i64 0
4109; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <3 x i16> [[X]], i64 1
4110; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <3 x i16> [[Y]], i64 1
4111; CHECK-NEXT:    [[TMP27:%.*]] = sext i16 [[TMP25]] to i32
4112; CHECK-NEXT:    [[TMP28:%.*]] = sext i16 [[TMP26]] to i32
4113; CHECK-NEXT:    [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]]
4114; CHECK-NEXT:    [[TMP30:%.*]] = ashr i32 [[TMP29]], 30
4115; CHECK-NEXT:    [[TMP31:%.*]] = or i32 [[TMP30]], 1
4116; CHECK-NEXT:    [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float
4117; CHECK-NEXT:    [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float
4118; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
4119; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]]
4120; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]])
4121; CHECK-NEXT:    [[TMP37:%.*]] = fneg fast float [[TMP36]]
4122; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]])
4123; CHECK-NEXT:    [[TMP39:%.*]] = fptosi float [[TMP36]] to i32
4124; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]])
4125; CHECK-NEXT:    [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
4126; CHECK-NEXT:    [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]]
4127; CHECK-NEXT:    [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0
4128; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]]
4129; CHECK-NEXT:    [[TMP45:%.*]] = shl i32 [[TMP44]], 16
4130; CHECK-NEXT:    [[TMP46:%.*]] = ashr i32 [[TMP45]], 16
4131; CHECK-NEXT:    [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16
4132; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <3 x i16> [[TMP24]], i16 [[TMP47]], i64 1
4133; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <3 x i16> [[X]], i64 2
4134; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <3 x i16> [[Y]], i64 2
4135; CHECK-NEXT:    [[TMP51:%.*]] = sext i16 [[TMP49]] to i32
4136; CHECK-NEXT:    [[TMP52:%.*]] = sext i16 [[TMP50]] to i32
4137; CHECK-NEXT:    [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]]
4138; CHECK-NEXT:    [[TMP54:%.*]] = ashr i32 [[TMP53]], 30
4139; CHECK-NEXT:    [[TMP55:%.*]] = or i32 [[TMP54]], 1
4140; CHECK-NEXT:    [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float
4141; CHECK-NEXT:    [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float
4142; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]])
4143; CHECK-NEXT:    [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]]
4144; CHECK-NEXT:    [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]])
4145; CHECK-NEXT:    [[TMP61:%.*]] = fneg fast float [[TMP60]]
4146; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]])
4147; CHECK-NEXT:    [[TMP63:%.*]] = fptosi float [[TMP60]] to i32
4148; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]])
4149; CHECK-NEXT:    [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]])
4150; CHECK-NEXT:    [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]]
4151; CHECK-NEXT:    [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0
4152; CHECK-NEXT:    [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]]
4153; CHECK-NEXT:    [[TMP69:%.*]] = shl i32 [[TMP68]], 16
4154; CHECK-NEXT:    [[TMP70:%.*]] = ashr i32 [[TMP69]], 16
4155; CHECK-NEXT:    [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16
4156; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <3 x i16> [[TMP48]], i16 [[TMP71]], i64 2
4157; CHECK-NEXT:    store <3 x i16> [[TMP72]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8
4158; CHECK-NEXT:    ret void
4159;
4160; GFX6-LABEL: sdiv_v3i16:
4161; GFX6:       ; %bb.0:
4162; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4163; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
4164; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
4165; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4166; GFX6-NEXT:    s_mov_b32 s6, -1
4167; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4168; GFX6-NEXT:    s_sext_i32_i16 s9, s2
4169; GFX6-NEXT:    s_sext_i32_i16 s8, s0
4170; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s8
4171; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s9
4172; GFX6-NEXT:    s_xor_b32 s8, s9, s8
4173; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
4174; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
4175; GFX6-NEXT:    s_ashr_i32 s8, s8, 30
4176; GFX6-NEXT:    s_or_b32 s8, s8, 1
4177; GFX6-NEXT:    v_mov_b32_e32 v3, s8
4178; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
4179; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
4180; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
4181; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
4182; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
4183; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s0
4184; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
4185; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
4186; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
4187; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s2
4188; GFX6-NEXT:    v_rcp_iflag_f32_e32 v3, v1
4189; GFX6-NEXT:    s_xor_b32 s0, s2, s0
4190; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
4191; GFX6-NEXT:    s_or_b32 s0, s0, 1
4192; GFX6-NEXT:    v_mul_f32_e32 v3, v2, v3
4193; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
4194; GFX6-NEXT:    v_mad_f32 v2, -v3, v1, v2
4195; GFX6-NEXT:    v_mov_b32_e32 v4, s0
4196; GFX6-NEXT:    s_sext_i32_i16 s0, s1
4197; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
4198; GFX6-NEXT:    v_cvt_i32_f32_e32 v3, v3
4199; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s0
4200; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
4201; GFX6-NEXT:    s_sext_i32_i16 s1, s3
4202; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
4203; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s1
4204; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
4205; GFX6-NEXT:    s_xor_b32 s0, s1, s0
4206; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
4207; GFX6-NEXT:    s_or_b32 s0, s0, 1
4208; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
4209; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
4210; GFX6-NEXT:    v_mad_f32 v3, -v4, v2, v3
4211; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
4212; GFX6-NEXT:    v_mov_b32_e32 v5, s0
4213; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
4214; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
4215; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
4216; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
4217; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
4218; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
4219; GFX6-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
4220; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4221; GFX6-NEXT:    s_endpgm
4222; GFX9-LABEL: sdiv_v3i16:
4223; GFX9:       ; %bb.0:
4224; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
4225; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
4226; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
4227; GFX9-NEXT:    v_mov_b32_e32 v1, 0
4228; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4229; GFX9-NEXT:    s_sext_i32_i16 s1, s4
4230; GFX9-NEXT:    s_sext_i32_i16 s0, s6
4231; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
4232; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s1
4233; GFX9-NEXT:    s_xor_b32 s0, s1, s0
4234; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
4235; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
4236; GFX9-NEXT:    s_or_b32 s8, s0, 1
4237; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
4238; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
4239; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
4240; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
4241; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
4242; GFX9-NEXT:    s_cselect_b32 s0, s8, 0
4243; GFX9-NEXT:    s_ashr_i32 s1, s6, 16
4244; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
4245; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s1
4246; GFX9-NEXT:    s_ashr_i32 s4, s4, 16
4247; GFX9-NEXT:    v_add_u32_e32 v2, s0, v3
4248; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s4
4249; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v0
4250; GFX9-NEXT:    s_xor_b32 s0, s4, s1
4251; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
4252; GFX9-NEXT:    s_or_b32 s4, s0, 1
4253; GFX9-NEXT:    v_mul_f32_e32 v4, v3, v4
4254; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
4255; GFX9-NEXT:    v_mad_f32 v3, -v4, v0, v3
4256; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, |v0|
4257; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
4258; GFX9-NEXT:    s_sext_i32_i16 s1, s7
4259; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
4260; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s1
4261; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
4262; GFX9-NEXT:    v_and_b32_e32 v2, 0xffff, v2
4263; GFX9-NEXT:    v_add_u32_e32 v3, s0, v4
4264; GFX9-NEXT:    s_sext_i32_i16 s0, s5
4265; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s0
4266; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v0
4267; GFX9-NEXT:    s_xor_b32 s0, s0, s1
4268; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
4269; GFX9-NEXT:    s_or_b32 s4, s0, 1
4270; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
4271; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
4272; GFX9-NEXT:    v_mad_f32 v4, -v5, v0, v4
4273; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
4274; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v0|
4275; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
4276; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
4277; GFX9-NEXT:    v_add_u32_e32 v0, s0, v5
4278; GFX9-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
4279; GFX9-NEXT:    global_store_short v1, v0, s[2:3] offset:4
4280; GFX9-NEXT:    global_store_dword v1, v2, s[2:3]
4281; GFX9-NEXT:    s_endpgm
4282  %r = sdiv <3 x i16> %x, %y
4283  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
4284  ret void
4285}
4286
; Signed remainder on <3 x i16> kernel arguments, with the result stored to %out.
; The opt run expects every lane to be expanded on its own: both operands are
; sign-extended to i32, a quotient is estimated through a float reciprocal and
; corrected by +1/-1 when the fabs compare fires, and the remainder x - q * y
; is re-narrowed to i16 (shl/ashr by 16, trunc) before being reinserted.
; The GFX6 and GFX9 lines pin the llc output for tahiti and gfx900. All
; assertions below are autogenerated; regenerate them with the scripts named
; at the top of the file instead of editing them by hand.
4287define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
4288; CHECK-LABEL: @srem_v3i16(
4289; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
4290; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
4291; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
4292; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
4293; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
4294; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
4295; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
4296; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
4297; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
4298; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
4299; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
4300; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
4301; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
4302; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
4303; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
4304; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
4305; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
4306; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
4307; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
4308; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
4309; CHECK-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]]
4310; CHECK-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]]
4311; CHECK-NEXT:    [[TMP23:%.*]] = shl i32 [[TMP22]], 16
4312; CHECK-NEXT:    [[TMP24:%.*]] = ashr i32 [[TMP23]], 16
4313; CHECK-NEXT:    [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16
4314; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <3 x i16> undef, i16 [[TMP25]], i64 0
4315; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <3 x i16> [[X]], i64 1
4316; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <3 x i16> [[Y]], i64 1
4317; CHECK-NEXT:    [[TMP29:%.*]] = sext i16 [[TMP27]] to i32
4318; CHECK-NEXT:    [[TMP30:%.*]] = sext i16 [[TMP28]] to i32
4319; CHECK-NEXT:    [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]]
4320; CHECK-NEXT:    [[TMP32:%.*]] = ashr i32 [[TMP31]], 30
4321; CHECK-NEXT:    [[TMP33:%.*]] = or i32 [[TMP32]], 1
4322; CHECK-NEXT:    [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float
4323; CHECK-NEXT:    [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float
4324; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
4325; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]]
4326; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]])
4327; CHECK-NEXT:    [[TMP39:%.*]] = fneg fast float [[TMP38]]
4328; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]])
4329; CHECK-NEXT:    [[TMP41:%.*]] = fptosi float [[TMP38]] to i32
4330; CHECK-NEXT:    [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]])
4331; CHECK-NEXT:    [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]])
4332; CHECK-NEXT:    [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]]
4333; CHECK-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0
4334; CHECK-NEXT:    [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]]
4335; CHECK-NEXT:    [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]]
4336; CHECK-NEXT:    [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]]
4337; CHECK-NEXT:    [[TMP49:%.*]] = shl i32 [[TMP48]], 16
4338; CHECK-NEXT:    [[TMP50:%.*]] = ashr i32 [[TMP49]], 16
4339; CHECK-NEXT:    [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16
4340; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <3 x i16> [[TMP26]], i16 [[TMP51]], i64 1
4341; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <3 x i16> [[X]], i64 2
4342; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <3 x i16> [[Y]], i64 2
4343; CHECK-NEXT:    [[TMP55:%.*]] = sext i16 [[TMP53]] to i32
4344; CHECK-NEXT:    [[TMP56:%.*]] = sext i16 [[TMP54]] to i32
4345; CHECK-NEXT:    [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]]
4346; CHECK-NEXT:    [[TMP58:%.*]] = ashr i32 [[TMP57]], 30
4347; CHECK-NEXT:    [[TMP59:%.*]] = or i32 [[TMP58]], 1
4348; CHECK-NEXT:    [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float
4349; CHECK-NEXT:    [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float
4350; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]])
4351; CHECK-NEXT:    [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]]
4352; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]])
4353; CHECK-NEXT:    [[TMP65:%.*]] = fneg fast float [[TMP64]]
4354; CHECK-NEXT:    [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]])
4355; CHECK-NEXT:    [[TMP67:%.*]] = fptosi float [[TMP64]] to i32
4356; CHECK-NEXT:    [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
4357; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]])
4358; CHECK-NEXT:    [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]]
4359; CHECK-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0
4360; CHECK-NEXT:    [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]]
4361; CHECK-NEXT:    [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]]
4362; CHECK-NEXT:    [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]]
4363; CHECK-NEXT:    [[TMP75:%.*]] = shl i32 [[TMP74]], 16
4364; CHECK-NEXT:    [[TMP76:%.*]] = ashr i32 [[TMP75]], 16
4365; CHECK-NEXT:    [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16
4366; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <3 x i16> [[TMP52]], i16 [[TMP77]], i64 2
4367; CHECK-NEXT:    store <3 x i16> [[TMP78]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8
4368; CHECK-NEXT:    ret void
4369;
4370; GFX6-LABEL: srem_v3i16:
4371; GFX6:       ; %bb.0:
4372; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4373; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
4374; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
4375; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4376; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4377; GFX6-NEXT:    s_sext_i32_i16 s8, s2
4378; GFX6-NEXT:    s_sext_i32_i16 s6, s0
4379; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s6
4380; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s8
4381; GFX6-NEXT:    s_xor_b32 s6, s8, s6
4382; GFX6-NEXT:    s_ashr_i32 s6, s6, 30
4383; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
4384; GFX6-NEXT:    s_or_b32 s6, s6, 1
4385; GFX6-NEXT:    v_mov_b32_e32 v3, s6
4386; GFX6-NEXT:    s_mov_b32 s6, -1
4387; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
4388; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
4389; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
4390; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
4391; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
4392; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
4393; GFX6-NEXT:    v_mov_b32_e32 v1, s2
4394; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
4395; GFX6-NEXT:    v_mov_b32_e32 v2, s0
4396; GFX6-NEXT:    v_alignbit_b32 v2, s1, v2, 16
4397; GFX6-NEXT:    v_bfe_i32 v3, v2, 0, 16
4398; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, v3
4399; GFX6-NEXT:    v_alignbit_b32 v1, s3, v1, 16
4400; GFX6-NEXT:    v_bfe_i32 v5, v1, 0, 16
4401; GFX6-NEXT:    v_cvt_f32_i32_e32 v6, v5
4402; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v4
4403; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s0
4404; GFX6-NEXT:    v_xor_b32_e32 v3, v5, v3
4405; GFX6-NEXT:    s_sext_i32_i16 s0, s1
4406; GFX6-NEXT:    v_mul_f32_e32 v5, v6, v7
4407; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
4408; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
4409; GFX6-NEXT:    v_mad_f32 v6, -v5, v4, v6
4410; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
4411; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 30, v3
4412; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, |v4|
4413; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s0
4414; GFX6-NEXT:    v_or_b32_e32 v3, 1, v3
4415; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
4416; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
4417; GFX6-NEXT:    s_sext_i32_i16 s2, s3
4418; GFX6-NEXT:    v_mul_lo_u32 v2, v3, v2
4419; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s2
4420; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v4
4421; GFX6-NEXT:    s_xor_b32 s0, s2, s0
4422; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
4423; GFX6-NEXT:    s_or_b32 s0, s0, 1
4424; GFX6-NEXT:    v_mul_f32_e32 v5, v3, v5
4425; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
4426; GFX6-NEXT:    v_mad_f32 v3, -v5, v4, v3
4427; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
4428; GFX6-NEXT:    v_mov_b32_e32 v6, s0
4429; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v4|
4430; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
4431; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
4432; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s1
4433; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
4434; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
4435; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
4436; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s3, v3
4437; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
4438; GFX6-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
4439; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4440; GFX6-NEXT:    s_endpgm
4441; GFX9-LABEL: srem_v3i16:
4442; GFX9:       ; %bb.0:
4443; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
4444; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
4445; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x2c
4446; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4447; GFX9-NEXT:    s_sext_i32_i16 s8, s2
4448; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s8
4449; GFX9-NEXT:    s_sext_i32_i16 s9, s6
4450; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s9
4451; GFX9-NEXT:    s_xor_b32 s0, s9, s8
4452; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
4453; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
4454; GFX9-NEXT:    s_or_b32 s10, s0, 1
4455; GFX9-NEXT:    s_sext_i32_i16 s3, s3
4456; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
4457; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
4458; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
4459; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
4460; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
4461; GFX9-NEXT:    s_cselect_b32 s0, s10, 0
4462; GFX9-NEXT:    s_ashr_i32 s2, s2, 16
4463; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v2
4464; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s2
4465; GFX9-NEXT:    s_ashr_i32 s6, s6, 16
4466; GFX9-NEXT:    v_add_u32_e32 v1, s0, v2
4467; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s6
4468; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
4469; GFX9-NEXT:    s_xor_b32 s0, s6, s2
4470; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
4471; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s8
4472; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
4473; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
4474; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
4475; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
4476; GFX9-NEXT:    s_or_b32 s8, s0, 1
4477; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
4478; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
4479; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s3
4480; GFX9-NEXT:    s_cselect_b32 s0, s8, 0
4481; GFX9-NEXT:    v_add_u32_e32 v0, s0, v3
4482; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s2
4483; GFX9-NEXT:    s_sext_i32_i16 s2, s7
4484; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s2
4485; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v2
4486; GFX9-NEXT:    s_xor_b32 s0, s2, s3
4487; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
4488; GFX9-NEXT:    s_or_b32 s7, s0, 1
4489; GFX9-NEXT:    v_mul_f32_e32 v4, v3, v4
4490; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
4491; GFX9-NEXT:    v_mad_f32 v3, -v4, v2, v3
4492; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
4493; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, |v2|
4494; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
4495; GFX9-NEXT:    s_cselect_b32 s0, s7, 0
4496; GFX9-NEXT:    v_add_u32_e32 v2, s0, v4
4497; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s3
4498; GFX9-NEXT:    v_sub_u32_e32 v1, s9, v1
4499; GFX9-NEXT:    v_mov_b32_e32 v3, 0
4500; GFX9-NEXT:    v_sub_u32_e32 v0, s6, v0
4501; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff, v1
4502; GFX9-NEXT:    v_sub_u32_e32 v2, s2, v2
4503; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
4504; GFX9-NEXT:    global_store_short v3, v2, s[4:5] offset:4
4505; GFX9-NEXT:    global_store_dword v3, v0, s[4:5]
4506; GFX9-NEXT:    s_endpgm
4507  %r = srem <3 x i16> %x, %y
4508  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
4509  ret void
4510}
4511
; Unsigned division on <3 x i15> kernel arguments, with the result stored to %out.
; The opt run expects every lane to be expanded on its own: both operands are
; zero-extended to i32, a quotient is estimated through a float reciprocal and
; bumped by 1 when the fabs compare fires, and the result is masked with 32767
; and truncated back to i15 before being reinserted.
; The GFX6 and GFX9 lines pin the llc output for tahiti and gfx900. All
; assertions below are autogenerated; regenerate them with the scripts named
; at the top of the file instead of editing them by hand.
4512define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
4513; CHECK-LABEL: @udiv_v3i15(
4514; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
4515; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
4516; CHECK-NEXT:    [[TMP3:%.*]] = zext i15 [[TMP1]] to i32
4517; CHECK-NEXT:    [[TMP4:%.*]] = zext i15 [[TMP2]] to i32
4518; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
4519; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
4520; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
4521; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
4522; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
4523; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
4524; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
4525; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
4526; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
4527; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
4528; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
4529; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
4530; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
4531; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 32767
4532; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i15
4533; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <3 x i15> undef, i15 [[TMP19]], i64 0
4534; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <3 x i15> [[X]], i64 1
4535; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <3 x i15> [[Y]], i64 1
4536; CHECK-NEXT:    [[TMP23:%.*]] = zext i15 [[TMP21]] to i32
4537; CHECK-NEXT:    [[TMP24:%.*]] = zext i15 [[TMP22]] to i32
4538; CHECK-NEXT:    [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float
4539; CHECK-NEXT:    [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float
4540; CHECK-NEXT:    [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]])
4541; CHECK-NEXT:    [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]]
4542; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]])
4543; CHECK-NEXT:    [[TMP30:%.*]] = fneg fast float [[TMP29]]
4544; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]])
4545; CHECK-NEXT:    [[TMP32:%.*]] = fptoui float [[TMP29]] to i32
4546; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]])
4547; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]])
4548; CHECK-NEXT:    [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]]
4549; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0
4550; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]]
4551; CHECK-NEXT:    [[TMP38:%.*]] = and i32 [[TMP37]], 32767
4552; CHECK-NEXT:    [[TMP39:%.*]] = trunc i32 [[TMP38]] to i15
4553; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <3 x i15> [[TMP20]], i15 [[TMP39]], i64 1
4554; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <3 x i15> [[X]], i64 2
4555; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <3 x i15> [[Y]], i64 2
4556; CHECK-NEXT:    [[TMP43:%.*]] = zext i15 [[TMP41]] to i32
4557; CHECK-NEXT:    [[TMP44:%.*]] = zext i15 [[TMP42]] to i32
4558; CHECK-NEXT:    [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float
4559; CHECK-NEXT:    [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float
4560; CHECK-NEXT:    [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]])
4561; CHECK-NEXT:    [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]]
4562; CHECK-NEXT:    [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]])
4563; CHECK-NEXT:    [[TMP50:%.*]] = fneg fast float [[TMP49]]
4564; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]])
4565; CHECK-NEXT:    [[TMP52:%.*]] = fptoui float [[TMP49]] to i32
4566; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]])
4567; CHECK-NEXT:    [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]])
4568; CHECK-NEXT:    [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]]
4569; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0
4570; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]]
4571; CHECK-NEXT:    [[TMP58:%.*]] = and i32 [[TMP57]], 32767
4572; CHECK-NEXT:    [[TMP59:%.*]] = trunc i32 [[TMP58]] to i15
4573; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <3 x i15> [[TMP40]], i15 [[TMP59]], i64 2
4574; CHECK-NEXT:    store <3 x i15> [[TMP60]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8
4575; CHECK-NEXT:    ret void
4576;
4577; GFX6-LABEL: udiv_v3i15:
4578; GFX6:       ; %bb.0:
4579; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4580; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
4581; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
4582; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4583; GFX6-NEXT:    s_mov_b32 s6, -1
4584; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4585; GFX6-NEXT:    v_mov_b32_e32 v0, s2
4586; GFX6-NEXT:    v_alignbit_b32 v0, s3, v0, 30
4587; GFX6-NEXT:    s_movk_i32 s3, 0x7fff
4588; GFX6-NEXT:    s_and_b32 s9, s0, s3
4589; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
4590; GFX6-NEXT:    v_mov_b32_e32 v2, s0
4591; GFX6-NEXT:    s_and_b32 s8, s2, s3
4592; GFX6-NEXT:    s_bfe_u32 s0, s0, 0xf000f
4593; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s0
4594; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s8
4595; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v1
4596; GFX6-NEXT:    s_bfe_u32 s2, s2, 0xf000f
4597; GFX6-NEXT:    v_alignbit_b32 v2, s1, v2, 30
4598; GFX6-NEXT:    v_cvt_f32_u32_e32 v6, s2
4599; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
4600; GFX6-NEXT:    v_rcp_iflag_f32_e32 v7, v5
4601; GFX6-NEXT:    v_and_b32_e32 v2, s3, v2
4602; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
4603; GFX6-NEXT:    v_mad_f32 v3, -v4, v1, v3
4604; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
4605; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, v2
4606; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
4607; GFX6-NEXT:    v_mul_f32_e32 v1, v6, v7
4608; GFX6-NEXT:    v_and_b32_e32 v0, s3, v0
4609; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
4610; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
4611; GFX6-NEXT:    v_mad_f32 v4, -v1, v5, v6
4612; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
4613; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, v0
4614; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v2
4615; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v5
4616; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
4617; GFX6-NEXT:    v_mul_f32_e32 v1, v0, v6
4618; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
4619; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v1
4620; GFX6-NEXT:    v_mad_f32 v0, -v1, v2, v0
4621; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, v2
4622; GFX6-NEXT:    v_and_b32_e32 v2, s3, v3
4623; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v5, vcc
4624; GFX6-NEXT:    v_and_b32_e32 v3, s3, v4
4625; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
4626; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
4627; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
4628; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
4629; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4630; GFX6-NEXT:    s_waitcnt expcnt(0)
4631; GFX6-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
4632; GFX6-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
4633; GFX6-NEXT:    s_endpgm
4634; GFX9-LABEL: udiv_v3i15:
4635; GFX9:       ; %bb.0:
4636; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
4637; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
4638; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
4639; GFX9-NEXT:    s_movk_i32 s8, 0x7fff
4640; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4641; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4642; GFX9-NEXT:    s_and_b32 s0, s4, s8
4643; GFX9-NEXT:    s_and_b32 s1, s6, s8
4644; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s1
4645; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s0
4646; GFX9-NEXT:    s_bfe_u32 s0, s6, 0xf000f
4647; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s0
4648; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
4649; GFX9-NEXT:    v_mov_b32_e32 v3, s6
4650; GFX9-NEXT:    s_bfe_u32 s1, s4, 0xf000f
4651; GFX9-NEXT:    v_alignbit_b32 v3, s7, v3, 30
4652; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
4653; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s1
4654; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v6
4655; GFX9-NEXT:    v_and_b32_e32 v3, s8, v3
4656; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
4657; GFX9-NEXT:    v_mad_f32 v4, -v5, v1, v4
4658; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
4659; GFX9-NEXT:    v_cvt_f32_u32_e32 v3, v3
4660; GFX9-NEXT:    v_mov_b32_e32 v0, s4
4661; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v1
4662; GFX9-NEXT:    v_alignbit_b32 v0, s5, v0, 30
4663; GFX9-NEXT:    v_mul_f32_e32 v1, v7, v8
4664; GFX9-NEXT:    v_and_b32_e32 v0, s8, v0
4665; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
4666; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
4667; GFX9-NEXT:    v_mad_f32 v5, -v1, v6, v7
4668; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
4669; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, v0
4670; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v3
4671; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, v6
4672; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
4673; GFX9-NEXT:    v_mul_f32_e32 v1, v0, v7
4674; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
4675; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v1
4676; GFX9-NEXT:    v_mad_f32 v0, -v1, v3, v0
4677; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, v3
4678; GFX9-NEXT:    v_and_b32_e32 v3, s8, v4
4679; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v6, vcc
4680; GFX9-NEXT:    v_and_b32_e32 v4, s8, v5
4681; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
4682; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
4683; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
4684; GFX9-NEXT:    v_or_b32_e32 v0, v3, v0
4685; GFX9-NEXT:    global_store_dword v2, v0, s[2:3]
4686; GFX9-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
4687; GFX9-NEXT:    global_store_short v2, v0, s[2:3] offset:4
4688; GFX9-NEXT:    s_endpgm
4689  %r = udiv <3 x i15> %x, %y
4690  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
4691  ret void
4692}
4693
4694define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
4695; CHECK-LABEL: @urem_v3i15(
4696; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
4697; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
4698; CHECK-NEXT:    [[TMP3:%.*]] = zext i15 [[TMP1]] to i32
4699; CHECK-NEXT:    [[TMP4:%.*]] = zext i15 [[TMP2]] to i32
4700; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
4701; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
4702; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
4703; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
4704; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
4705; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
4706; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
4707; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
4708; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
4709; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
4710; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
4711; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
4712; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
4713; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]]
4714; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]]
4715; CHECK-NEXT:    [[TMP20:%.*]] = and i32 [[TMP19]], 32767
4716; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i15
4717; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <3 x i15> undef, i15 [[TMP21]], i64 0
4718; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <3 x i15> [[X]], i64 1
4719; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <3 x i15> [[Y]], i64 1
4720; CHECK-NEXT:    [[TMP25:%.*]] = zext i15 [[TMP23]] to i32
4721; CHECK-NEXT:    [[TMP26:%.*]] = zext i15 [[TMP24]] to i32
4722; CHECK-NEXT:    [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float
4723; CHECK-NEXT:    [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float
4724; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]])
4725; CHECK-NEXT:    [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]]
4726; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]])
4727; CHECK-NEXT:    [[TMP32:%.*]] = fneg fast float [[TMP31]]
4728; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]])
4729; CHECK-NEXT:    [[TMP34:%.*]] = fptoui float [[TMP31]] to i32
4730; CHECK-NEXT:    [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
4731; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]])
4732; CHECK-NEXT:    [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]]
4733; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0
4734; CHECK-NEXT:    [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]]
4735; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]]
4736; CHECK-NEXT:    [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]]
4737; CHECK-NEXT:    [[TMP42:%.*]] = and i32 [[TMP41]], 32767
4738; CHECK-NEXT:    [[TMP43:%.*]] = trunc i32 [[TMP42]] to i15
4739; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <3 x i15> [[TMP22]], i15 [[TMP43]], i64 1
4740; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <3 x i15> [[X]], i64 2
4741; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <3 x i15> [[Y]], i64 2
4742; CHECK-NEXT:    [[TMP47:%.*]] = zext i15 [[TMP45]] to i32
4743; CHECK-NEXT:    [[TMP48:%.*]] = zext i15 [[TMP46]] to i32
4744; CHECK-NEXT:    [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float
4745; CHECK-NEXT:    [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float
4746; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]])
4747; CHECK-NEXT:    [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]]
4748; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]])
4749; CHECK-NEXT:    [[TMP54:%.*]] = fneg fast float [[TMP53]]
4750; CHECK-NEXT:    [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]])
4751; CHECK-NEXT:    [[TMP56:%.*]] = fptoui float [[TMP53]] to i32
4752; CHECK-NEXT:    [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]])
4753; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]])
4754; CHECK-NEXT:    [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]]
4755; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0
4756; CHECK-NEXT:    [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]]
4757; CHECK-NEXT:    [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]]
4758; CHECK-NEXT:    [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]]
4759; CHECK-NEXT:    [[TMP64:%.*]] = and i32 [[TMP63]], 32767
4760; CHECK-NEXT:    [[TMP65:%.*]] = trunc i32 [[TMP64]] to i15
4761; CHECK-NEXT:    [[TMP66:%.*]] = insertelement <3 x i15> [[TMP44]], i15 [[TMP65]], i64 2
4762; CHECK-NEXT:    store <3 x i15> [[TMP66]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8
4763; CHECK-NEXT:    ret void
4764;
4765; GFX6-LABEL: urem_v3i15:
4766; GFX6:       ; %bb.0:
4767; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4768; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
4769; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
4770; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4771; GFX6-NEXT:    s_mov_b32 s6, -1
4772; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4773; GFX6-NEXT:    v_mov_b32_e32 v0, s2
4774; GFX6-NEXT:    v_alignbit_b32 v0, s3, v0, 30
4775; GFX6-NEXT:    s_movk_i32 s3, 0x7fff
4776; GFX6-NEXT:    s_and_b32 s10, s0, s3
4777; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s10
4778; GFX6-NEXT:    s_and_b32 s9, s2, s3
4779; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s9
4780; GFX6-NEXT:    v_mov_b32_e32 v2, s0
4781; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v1
4782; GFX6-NEXT:    v_alignbit_b32 v2, s1, v2, 30
4783; GFX6-NEXT:    s_bfe_u32 s1, s0, 0xf000f
4784; GFX6-NEXT:    v_cvt_f32_u32_e32 v5, s1
4785; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
4786; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
4787; GFX6-NEXT:    v_mad_f32 v3, -v4, v1, v3
4788; GFX6-NEXT:    v_cvt_u32_f32_e32 v4, v4
4789; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
4790; GFX6-NEXT:    s_bfe_u32 s10, s2, 0xf000f
4791; GFX6-NEXT:    v_cvt_f32_u32_e32 v3, s10
4792; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v4, vcc
4793; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s0
4794; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v5
4795; GFX6-NEXT:    v_and_b32_e32 v2, s3, v2
4796; GFX6-NEXT:    v_and_b32_e32 v0, s3, v0
4797; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s2, v1
4798; GFX6-NEXT:    v_mul_f32_e32 v1, v3, v4
4799; GFX6-NEXT:    v_cvt_f32_u32_e32 v4, v2
4800; GFX6-NEXT:    v_cvt_f32_u32_e32 v7, v0
4801; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
4802; GFX6-NEXT:    v_mad_f32 v3, -v1, v5, v3
4803; GFX6-NEXT:    v_rcp_iflag_f32_e32 v8, v4
4804; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
4805; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
4806; GFX6-NEXT:    s_lshr_b32 s0, s0, 15
4807; GFX6-NEXT:    v_mul_f32_e32 v3, v7, v8
4808; GFX6-NEXT:    v_trunc_f32_e32 v3, v3
4809; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v3
4810; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4811; GFX6-NEXT:    v_mad_f32 v3, -v3, v4, v7
4812; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
4813; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
4814; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s0
4815; GFX6-NEXT:    v_mul_lo_u32 v2, v3, v2
4816; GFX6-NEXT:    s_lshr_b32 s8, s2, 15
4817; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s8, v1
4818; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, v2, v0
4819; GFX6-NEXT:    v_and_b32_e32 v3, s3, v3
4820; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
4821; GFX6-NEXT:    v_and_b32_e32 v2, s3, v6
4822; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
4823; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
4824; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
4825; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4826; GFX6-NEXT:    s_waitcnt expcnt(0)
4827; GFX6-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
4828; GFX6-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
4829; GFX6-NEXT:    s_endpgm
4830; GFX9-LABEL: urem_v3i15:
4831; GFX9:       ; %bb.0:
4832; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
4833; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
4834; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
4835; GFX9-NEXT:    s_movk_i32 s8, 0x7fff
4836; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4837; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4838; GFX9-NEXT:    v_mov_b32_e32 v0, s4
4839; GFX9-NEXT:    v_alignbit_b32 v0, s5, v0, 30
4840; GFX9-NEXT:    s_and_b32 s5, s6, s8
4841; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s5
4842; GFX9-NEXT:    s_and_b32 s0, s4, s8
4843; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s0
4844; GFX9-NEXT:    s_bfe_u32 s5, s6, 0xf000f
4845; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v1
4846; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s5
4847; GFX9-NEXT:    v_mov_b32_e32 v3, s6
4848; GFX9-NEXT:    v_alignbit_b32 v3, s7, v3, 30
4849; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
4850; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
4851; GFX9-NEXT:    v_mad_f32 v4, -v5, v1, v4
4852; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
4853; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v1
4854; GFX9-NEXT:    s_bfe_u32 s1, s4, 0xf000f
4855; GFX9-NEXT:    v_and_b32_e32 v3, s8, v3
4856; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
4857; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, v3
4858; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s1
4859; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v6
4860; GFX9-NEXT:    v_and_b32_e32 v0, s8, v0
4861; GFX9-NEXT:    v_rcp_iflag_f32_e32 v9, v5
4862; GFX9-NEXT:    s_lshr_b32 s0, s6, 15
4863; GFX9-NEXT:    v_mul_f32_e32 v4, v7, v8
4864; GFX9-NEXT:    v_cvt_f32_u32_e32 v8, v0
4865; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
4866; GFX9-NEXT:    v_mad_f32 v7, -v4, v6, v7
4867; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v7|, v6
4868; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
4869; GFX9-NEXT:    v_mul_f32_e32 v6, v8, v9
4870; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
4871; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v6
4872; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
4873; GFX9-NEXT:    v_mad_f32 v6, -v6, v5, v8
4874; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, v5
4875; GFX9-NEXT:    v_mul_lo_u32 v4, v4, s0
4876; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
4877; GFX9-NEXT:    v_mul_lo_u32 v3, v5, v3
4878; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s6
4879; GFX9-NEXT:    s_lshr_b32 s0, s4, 15
4880; GFX9-NEXT:    v_sub_u32_e32 v4, s0, v4
4881; GFX9-NEXT:    v_and_b32_e32 v4, s8, v4
4882; GFX9-NEXT:    v_sub_u32_e32 v5, s4, v1
4883; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v3
4884; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
4885; GFX9-NEXT:    v_and_b32_e32 v3, s8, v5
4886; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
4887; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
4888; GFX9-NEXT:    v_or_b32_e32 v0, v3, v0
4889; GFX9-NEXT:    global_store_dword v2, v0, s[2:3]
4890; GFX9-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
4891; GFX9-NEXT:    global_store_short v2, v0, s[2:3] offset:4
4892; GFX9-NEXT:    s_endpgm
4893  %r = urem <3 x i15> %x, %y
4894  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
4895  ret void
4896}
4897
; Test: signed division of two <3 x i15> kernel arguments, result stored to %out.
;
; The opt-run assertions below show the division being scalarized: each lane is
; extracted, sign-extended to i32 and divided through a float sequence
; (sitofp, amdgcn.rcp, fmul, trunc, fmad.ftz, then an fabs/fcmp-selected
; correction term). The correction term is built as (xor of the operands,
; ashr 30, or 1), i.e. +1 or -1 depending on whether the operand signs differ.
; Each lane result is re-sign-extended from 15 bits (shl 17 / ashr 17),
; truncated to i15 and inserted back into the result vector.
;
; The llc-run assertions pin the resulting machine code for tahiti and gfx900.
; All assertion lines are autogenerated (see the NOTE lines at the top of the
; file); regenerate them with the update scripts rather than editing by hand.
4898define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
4899; CHECK-LABEL: @sdiv_v3i15(
4900; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
4901; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
4902; CHECK-NEXT:    [[TMP3:%.*]] = sext i15 [[TMP1]] to i32
4903; CHECK-NEXT:    [[TMP4:%.*]] = sext i15 [[TMP2]] to i32
4904; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
4905; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
4906; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
4907; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
4908; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
4909; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
4910; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
4911; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
4912; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
4913; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
4914; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
4915; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
4916; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
4917; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
4918; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
4919; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
4920; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 17
4921; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 17
4922; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i15
4923; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <3 x i15> undef, i15 [[TMP23]], i64 0
4924; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <3 x i15> [[X]], i64 1
4925; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <3 x i15> [[Y]], i64 1
4926; CHECK-NEXT:    [[TMP27:%.*]] = sext i15 [[TMP25]] to i32
4927; CHECK-NEXT:    [[TMP28:%.*]] = sext i15 [[TMP26]] to i32
4928; CHECK-NEXT:    [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]]
4929; CHECK-NEXT:    [[TMP30:%.*]] = ashr i32 [[TMP29]], 30
4930; CHECK-NEXT:    [[TMP31:%.*]] = or i32 [[TMP30]], 1
4931; CHECK-NEXT:    [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float
4932; CHECK-NEXT:    [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float
4933; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
4934; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]]
4935; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]])
4936; CHECK-NEXT:    [[TMP37:%.*]] = fneg fast float [[TMP36]]
4937; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]])
4938; CHECK-NEXT:    [[TMP39:%.*]] = fptosi float [[TMP36]] to i32
4939; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]])
4940; CHECK-NEXT:    [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
4941; CHECK-NEXT:    [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]]
4942; CHECK-NEXT:    [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0
4943; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]]
4944; CHECK-NEXT:    [[TMP45:%.*]] = shl i32 [[TMP44]], 17
4945; CHECK-NEXT:    [[TMP46:%.*]] = ashr i32 [[TMP45]], 17
4946; CHECK-NEXT:    [[TMP47:%.*]] = trunc i32 [[TMP46]] to i15
4947; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <3 x i15> [[TMP24]], i15 [[TMP47]], i64 1
4948; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <3 x i15> [[X]], i64 2
4949; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <3 x i15> [[Y]], i64 2
4950; CHECK-NEXT:    [[TMP51:%.*]] = sext i15 [[TMP49]] to i32
4951; CHECK-NEXT:    [[TMP52:%.*]] = sext i15 [[TMP50]] to i32
4952; CHECK-NEXT:    [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]]
4953; CHECK-NEXT:    [[TMP54:%.*]] = ashr i32 [[TMP53]], 30
4954; CHECK-NEXT:    [[TMP55:%.*]] = or i32 [[TMP54]], 1
4955; CHECK-NEXT:    [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float
4956; CHECK-NEXT:    [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float
4957; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]])
4958; CHECK-NEXT:    [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]]
4959; CHECK-NEXT:    [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]])
4960; CHECK-NEXT:    [[TMP61:%.*]] = fneg fast float [[TMP60]]
4961; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]])
4962; CHECK-NEXT:    [[TMP63:%.*]] = fptosi float [[TMP60]] to i32
4963; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]])
4964; CHECK-NEXT:    [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]])
4965; CHECK-NEXT:    [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]]
4966; CHECK-NEXT:    [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0
4967; CHECK-NEXT:    [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]]
4968; CHECK-NEXT:    [[TMP69:%.*]] = shl i32 [[TMP68]], 17
4969; CHECK-NEXT:    [[TMP70:%.*]] = ashr i32 [[TMP69]], 17
4970; CHECK-NEXT:    [[TMP71:%.*]] = trunc i32 [[TMP70]] to i15
4971; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <3 x i15> [[TMP48]], i15 [[TMP71]], i64 2
4972; CHECK-NEXT:    store <3 x i15> [[TMP72]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8
4973; CHECK-NEXT:    ret void
4974;
4975; GFX6-LABEL: sdiv_v3i15:
4976; GFX6:       ; %bb.0:
4977; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4978; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
4979; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
4980; GFX6-NEXT:    s_mov_b32 s7, 0xf000
4981; GFX6-NEXT:    s_mov_b32 s6, -1
4982; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
4983; GFX6-NEXT:    v_mov_b32_e32 v0, s2
4984; GFX6-NEXT:    v_alignbit_b32 v0, s3, v0, 30
4985; GFX6-NEXT:    s_bfe_i32 s3, s0, 0xf0000
4986; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s3
4987; GFX6-NEXT:    v_mov_b32_e32 v1, s0
4988; GFX6-NEXT:    v_alignbit_b32 v1, s1, v1, 30
4989; GFX6-NEXT:    s_bfe_i32 s1, s2, 0xf0000
4990; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s1
4991; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
4992; GFX6-NEXT:    s_xor_b32 s1, s1, s3
4993; GFX6-NEXT:    s_bfe_i32 s0, s0, 0xf000f
4994; GFX6-NEXT:    s_ashr_i32 s1, s1, 30
4995; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
4996; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
4997; GFX6-NEXT:    v_mad_f32 v3, -v4, v2, v3
4998; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
4999; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
5000; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s0
5001; GFX6-NEXT:    s_or_b32 s1, s1, 1
5002; GFX6-NEXT:    v_mov_b32_e32 v5, s1
5003; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
5004; GFX6-NEXT:    s_bfe_i32 s1, s2, 0xf000f
5005; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
5006; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s1
5007; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
5008; GFX6-NEXT:    s_xor_b32 s0, s1, s0
5009; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 15
5010; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
5011; GFX6-NEXT:    v_mul_f32_e32 v5, v4, v5
5012; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
5013; GFX6-NEXT:    v_mad_f32 v4, -v5, v3, v4
5014; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v3|
5015; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
5016; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, v1
5017; GFX6-NEXT:    s_or_b32 s0, s0, 1
5018; GFX6-NEXT:    v_mov_b32_e32 v6, s0
5019; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
5020; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 15
5021; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
5022; GFX6-NEXT:    v_cvt_f32_i32_e32 v5, v0
5023; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v4
5024; GFX6-NEXT:    v_xor_b32_e32 v0, v0, v1
5025; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
5026; GFX6-NEXT:    v_or_b32_e32 v0, 1, v0
5027; GFX6-NEXT:    v_mul_f32_e32 v1, v5, v6
5028; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
5029; GFX6-NEXT:    v_mad_f32 v5, -v1, v4, v5
5030; GFX6-NEXT:    v_cvt_i32_f32_e32 v1, v1
5031; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, |v4|
5032; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
5033; GFX6-NEXT:    s_movk_i32 s0, 0x7fff
5034; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
5035; GFX6-NEXT:    v_and_b32_e32 v3, s0, v3
5036; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
5037; GFX6-NEXT:    v_and_b32_e32 v2, s0, v2
5038; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
5039; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
5040; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
5041; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5042; GFX6-NEXT:    s_waitcnt expcnt(0)
5043; GFX6-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
5044; GFX6-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
5045; GFX6-NEXT:    s_endpgm
5046; GFX9-LABEL: sdiv_v3i15:
5047; GFX9:       ; %bb.0:
5048; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5049; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
5050; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
5051; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5052; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5053; GFX9-NEXT:    s_bfe_i32 s1, s4, 0xf0000
5054; GFX9-NEXT:    s_bfe_i32 s0, s6, 0xf0000
5055; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s0
5056; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s1
5057; GFX9-NEXT:    s_xor_b32 s0, s1, s0
5058; GFX9-NEXT:    v_mov_b32_e32 v0, s4
5059; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v3
5060; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
5061; GFX9-NEXT:    v_alignbit_b32 v0, s5, v0, 30
5062; GFX9-NEXT:    s_or_b32 s5, s0, 1
5063; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
5064; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
5065; GFX9-NEXT:    v_mad_f32 v4, -v5, v3, v4
5066; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v3|
5067; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
5068; GFX9-NEXT:    s_cselect_b32 s0, s5, 0
5069; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
5070; GFX9-NEXT:    s_bfe_i32 s1, s6, 0xf000f
5071; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s1
5072; GFX9-NEXT:    v_mov_b32_e32 v1, s6
5073; GFX9-NEXT:    v_add_u32_e32 v4, s0, v5
5074; GFX9-NEXT:    s_bfe_i32 s0, s4, 0xf000f
5075; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, s0
5076; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v3
5077; GFX9-NEXT:    v_alignbit_b32 v1, s7, v1, 30
5078; GFX9-NEXT:    s_xor_b32 s0, s0, s1
5079; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
5080; GFX9-NEXT:    v_mul_f32_e32 v6, v5, v6
5081; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
5082; GFX9-NEXT:    v_mad_f32 v5, -v6, v3, v5
5083; GFX9-NEXT:    v_bfe_i32 v1, v1, 0, 15
5084; GFX9-NEXT:    s_or_b32 s4, s0, 1
5085; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v3|
5086; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, v1
5087; GFX9-NEXT:    v_cvt_i32_f32_e32 v6, v6
5088; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
5089; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
5090; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 15
5091; GFX9-NEXT:    v_add_u32_e32 v5, s0, v6
5092; GFX9-NEXT:    v_cvt_f32_i32_e32 v6, v0
5093; GFX9-NEXT:    v_rcp_iflag_f32_e32 v7, v3
5094; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v1
5095; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
5096; GFX9-NEXT:    v_or_b32_e32 v0, 1, v0
5097; GFX9-NEXT:    v_mul_f32_e32 v1, v6, v7
5098; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
5099; GFX9-NEXT:    v_cvt_i32_f32_e32 v7, v1
5100; GFX9-NEXT:    v_mad_f32 v1, -v1, v3, v6
5101; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v3|
5102; GFX9-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
5103; GFX9-NEXT:    s_movk_i32 s0, 0x7fff
5104; GFX9-NEXT:    v_add_u32_e32 v0, v7, v0
5105; GFX9-NEXT:    v_and_b32_e32 v3, s0, v4
5106; GFX9-NEXT:    v_and_b32_e32 v4, s0, v5
5107; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
5108; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
5109; GFX9-NEXT:    v_or_b32_e32 v3, v3, v4
5110; GFX9-NEXT:    v_or_b32_e32 v0, v3, v0
5111; GFX9-NEXT:    global_store_dword v2, v0, s[2:3]
5112; GFX9-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
5113; GFX9-NEXT:    global_store_short v2, v0, s[2:3] offset:4
5114; GFX9-NEXT:    s_endpgm
; Input IR under test: one vector sdiv whose result is stored through %out.
5115  %r = sdiv <3 x i15> %x, %y
5116  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
5117  ret void
5118}
5119
; Test: signed remainder of two <3 x i15> kernel arguments, result stored to %out.
;
; The opt-run assertions below show the operation being scalarized: each lane
; is extracted, sign-extended to i32, and a quotient is computed through a
; float sequence (sitofp, amdgcn.rcp, fmul, trunc, fmad.ftz, then an
; fabs/fcmp-selected correction term built from xor / ashr 30 / or 1).
; The remainder is then formed as dividend - quotient * divisor (mul + sub),
; re-sign-extended from 15 bits (shl 17 / ashr 17), truncated to i15 and
; inserted back into the result vector.
;
; The llc-run assertions pin the resulting machine code for tahiti and gfx900.
; All assertion lines are autogenerated (see the NOTE lines at the top of the
; file); regenerate them with the update scripts rather than editing by hand.
5120define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
5121; CHECK-LABEL: @srem_v3i15(
5122; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
5123; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
5124; CHECK-NEXT:    [[TMP3:%.*]] = sext i15 [[TMP1]] to i32
5125; CHECK-NEXT:    [[TMP4:%.*]] = sext i15 [[TMP2]] to i32
5126; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
5127; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
5128; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
5129; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
5130; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
5131; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
5132; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
5133; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
5134; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
5135; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
5136; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
5137; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
5138; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
5139; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
5140; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
5141; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
5142; CHECK-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]]
5143; CHECK-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]]
5144; CHECK-NEXT:    [[TMP23:%.*]] = shl i32 [[TMP22]], 17
5145; CHECK-NEXT:    [[TMP24:%.*]] = ashr i32 [[TMP23]], 17
5146; CHECK-NEXT:    [[TMP25:%.*]] = trunc i32 [[TMP24]] to i15
5147; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <3 x i15> undef, i15 [[TMP25]], i64 0
5148; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <3 x i15> [[X]], i64 1
5149; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <3 x i15> [[Y]], i64 1
5150; CHECK-NEXT:    [[TMP29:%.*]] = sext i15 [[TMP27]] to i32
5151; CHECK-NEXT:    [[TMP30:%.*]] = sext i15 [[TMP28]] to i32
5152; CHECK-NEXT:    [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]]
5153; CHECK-NEXT:    [[TMP32:%.*]] = ashr i32 [[TMP31]], 30
5154; CHECK-NEXT:    [[TMP33:%.*]] = or i32 [[TMP32]], 1
5155; CHECK-NEXT:    [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float
5156; CHECK-NEXT:    [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float
5157; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
5158; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]]
5159; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]])
5160; CHECK-NEXT:    [[TMP39:%.*]] = fneg fast float [[TMP38]]
5161; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]])
5162; CHECK-NEXT:    [[TMP41:%.*]] = fptosi float [[TMP38]] to i32
5163; CHECK-NEXT:    [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]])
5164; CHECK-NEXT:    [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]])
5165; CHECK-NEXT:    [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]]
5166; CHECK-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0
5167; CHECK-NEXT:    [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]]
5168; CHECK-NEXT:    [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]]
5169; CHECK-NEXT:    [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]]
5170; CHECK-NEXT:    [[TMP49:%.*]] = shl i32 [[TMP48]], 17
5171; CHECK-NEXT:    [[TMP50:%.*]] = ashr i32 [[TMP49]], 17
5172; CHECK-NEXT:    [[TMP51:%.*]] = trunc i32 [[TMP50]] to i15
5173; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <3 x i15> [[TMP26]], i15 [[TMP51]], i64 1
5174; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <3 x i15> [[X]], i64 2
5175; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <3 x i15> [[Y]], i64 2
5176; CHECK-NEXT:    [[TMP55:%.*]] = sext i15 [[TMP53]] to i32
5177; CHECK-NEXT:    [[TMP56:%.*]] = sext i15 [[TMP54]] to i32
5178; CHECK-NEXT:    [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]]
5179; CHECK-NEXT:    [[TMP58:%.*]] = ashr i32 [[TMP57]], 30
5180; CHECK-NEXT:    [[TMP59:%.*]] = or i32 [[TMP58]], 1
5181; CHECK-NEXT:    [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float
5182; CHECK-NEXT:    [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float
5183; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]])
5184; CHECK-NEXT:    [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]]
5185; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]])
5186; CHECK-NEXT:    [[TMP65:%.*]] = fneg fast float [[TMP64]]
5187; CHECK-NEXT:    [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]])
5188; CHECK-NEXT:    [[TMP67:%.*]] = fptosi float [[TMP64]] to i32
5189; CHECK-NEXT:    [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
5190; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]])
5191; CHECK-NEXT:    [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]]
5192; CHECK-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0
5193; CHECK-NEXT:    [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]]
5194; CHECK-NEXT:    [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]]
5195; CHECK-NEXT:    [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]]
5196; CHECK-NEXT:    [[TMP75:%.*]] = shl i32 [[TMP74]], 17
5197; CHECK-NEXT:    [[TMP76:%.*]] = ashr i32 [[TMP75]], 17
5198; CHECK-NEXT:    [[TMP77:%.*]] = trunc i32 [[TMP76]] to i15
5199; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <3 x i15> [[TMP52]], i15 [[TMP77]], i64 2
5200; CHECK-NEXT:    store <3 x i15> [[TMP78]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8
5201; CHECK-NEXT:    ret void
5202;
5203; GFX6-LABEL: srem_v3i15:
5204; GFX6:       ; %bb.0:
5205; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5206; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
5207; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
5208; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5209; GFX6-NEXT:    s_mov_b32 s6, -1
5210; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5211; GFX6-NEXT:    v_mov_b32_e32 v0, s2
5212; GFX6-NEXT:    v_alignbit_b32 v0, s3, v0, 30
5213; GFX6-NEXT:    s_movk_i32 s3, 0x7fff
5214; GFX6-NEXT:    s_and_b32 s11, s0, s3
5215; GFX6-NEXT:    s_bfe_i32 s11, s11, 0xf0000
5216; GFX6-NEXT:    v_cvt_f32_i32_e32 v2, s11
5217; GFX6-NEXT:    s_and_b32 s9, s2, s3
5218; GFX6-NEXT:    s_bfe_i32 s9, s9, 0xf0000
5219; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s9
5220; GFX6-NEXT:    v_rcp_iflag_f32_e32 v4, v2
5221; GFX6-NEXT:    s_xor_b32 s9, s9, s11
5222; GFX6-NEXT:    s_ashr_i32 s9, s9, 30
5223; GFX6-NEXT:    s_or_b32 s9, s9, 1
5224; GFX6-NEXT:    v_mul_f32_e32 v4, v3, v4
5225; GFX6-NEXT:    v_trunc_f32_e32 v4, v4
5226; GFX6-NEXT:    v_mad_f32 v3, -v4, v2, v3
5227; GFX6-NEXT:    v_cvt_i32_f32_e32 v4, v4
5228; GFX6-NEXT:    v_mov_b32_e32 v5, s9
5229; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
5230; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
5231; GFX6-NEXT:    v_mov_b32_e32 v1, s0
5232; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
5233; GFX6-NEXT:    s_bfe_u32 s12, s0, 0xf000f
5234; GFX6-NEXT:    v_alignbit_b32 v1, s1, v1, 30
5235; GFX6-NEXT:    v_mul_lo_u32 v2, v2, s0
5236; GFX6-NEXT:    s_lshr_b32 s1, s0, 15
5237; GFX6-NEXT:    s_bfe_i32 s0, s12, 0xf0000
5238; GFX6-NEXT:    v_cvt_f32_i32_e32 v3, s0
5239; GFX6-NEXT:    s_bfe_u32 s10, s2, 0xf000f
5240; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s2, v2
5241; GFX6-NEXT:    s_lshr_b32 s8, s2, 15
5242; GFX6-NEXT:    s_bfe_i32 s2, s10, 0xf0000
5243; GFX6-NEXT:    v_cvt_f32_i32_e32 v4, s2
5244; GFX6-NEXT:    v_rcp_iflag_f32_e32 v5, v3
5245; GFX6-NEXT:    s_xor_b32 s0, s2, s0
5246; GFX6-NEXT:    s_ashr_i32 s0, s0, 30
5247; GFX6-NEXT:    s_or_b32 s0, s0, 1
5248; GFX6-NEXT:    v_mul_f32_e32 v5, v4, v5
5249; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
5250; GFX6-NEXT:    v_mad_f32 v4, -v5, v3, v4
5251; GFX6-NEXT:    v_cvt_i32_f32_e32 v5, v5
5252; GFX6-NEXT:    v_and_b32_e32 v1, s3, v1
5253; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v3|
5254; GFX6-NEXT:    v_mov_b32_e32 v6, s0
5255; GFX6-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
5256; GFX6-NEXT:    v_bfe_i32 v4, v1, 0, 15
5257; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
5258; GFX6-NEXT:    v_cvt_f32_i32_e32 v5, v4
5259; GFX6-NEXT:    v_and_b32_e32 v0, s3, v0
5260; GFX6-NEXT:    v_bfe_i32 v6, v0, 0, 15
5261; GFX6-NEXT:    v_cvt_f32_i32_e32 v7, v6
5262; GFX6-NEXT:    v_rcp_iflag_f32_e32 v8, v5
5263; GFX6-NEXT:    v_xor_b32_e32 v4, v6, v4
5264; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 30, v4
5265; GFX6-NEXT:    v_or_b32_e32 v4, 1, v4
5266; GFX6-NEXT:    v_mul_f32_e32 v6, v7, v8
5267; GFX6-NEXT:    v_trunc_f32_e32 v6, v6
5268; GFX6-NEXT:    v_mad_f32 v7, -v6, v5, v7
5269; GFX6-NEXT:    v_cvt_i32_f32_e32 v6, v6
5270; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v7|, |v5|
5271; GFX6-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
5272; GFX6-NEXT:    v_mul_lo_u32 v3, v3, s1
5273; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
5274; GFX6-NEXT:    v_mul_lo_u32 v1, v4, v1
5275; GFX6-NEXT:    v_and_b32_e32 v2, s3, v2
5276; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s8, v3
5277; GFX6-NEXT:    v_and_b32_e32 v3, s3, v3
5278; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, v1, v0
5279; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
5280; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
5281; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
5282; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
5283; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5284; GFX6-NEXT:    s_waitcnt expcnt(0)
5285; GFX6-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
5286; GFX6-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
5287; GFX6-NEXT:    s_endpgm
5288; GFX9-LABEL: srem_v3i15:
5289; GFX9:       ; %bb.0:
5290; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5291; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
5292; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
5293; GFX9-NEXT:    s_movk_i32 s8, 0x7fff
5294; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5295; GFX9-NEXT:    s_and_b32 s0, s4, s8
5296; GFX9-NEXT:    s_and_b32 s1, s6, s8
5297; GFX9-NEXT:    s_bfe_i32 s1, s1, 0xf0000
5298; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s1
5299; GFX9-NEXT:    s_bfe_i32 s0, s0, 0xf0000
5300; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s0
5301; GFX9-NEXT:    s_xor_b32 s0, s0, s1
5302; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v2
5303; GFX9-NEXT:    v_mov_b32_e32 v0, s4
5304; GFX9-NEXT:    v_mov_b32_e32 v1, s6
5305; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
5306; GFX9-NEXT:    v_mul_f32_e32 v4, v3, v4
5307; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
5308; GFX9-NEXT:    v_mad_f32 v3, -v4, v2, v3
5309; GFX9-NEXT:    v_cvt_i32_f32_e32 v4, v4
5310; GFX9-NEXT:    v_alignbit_b32 v0, s5, v0, 30
5311; GFX9-NEXT:    v_alignbit_b32 v1, s7, v1, 30
5312; GFX9-NEXT:    s_or_b32 s11, s0, 1
5313; GFX9-NEXT:    s_lshr_b32 s9, s4, 15
5314; GFX9-NEXT:    s_bfe_u32 s5, s4, 0xf000f
5315; GFX9-NEXT:    s_lshr_b32 s7, s6, 15
5316; GFX9-NEXT:    s_bfe_u32 s10, s6, 0xf000f
5317; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, |v2|
5318; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
5319; GFX9-NEXT:    s_cselect_b32 s0, s11, 0
5320; GFX9-NEXT:    v_add_u32_e32 v2, s0, v4
5321; GFX9-NEXT:    s_bfe_i32 s0, s10, 0xf0000
5322; GFX9-NEXT:    v_cvt_f32_i32_e32 v3, s0
5323; GFX9-NEXT:    s_bfe_i32 s1, s5, 0xf0000
5324; GFX9-NEXT:    v_cvt_f32_i32_e32 v4, s1
5325; GFX9-NEXT:    s_xor_b32 s0, s1, s0
5326; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v3
5327; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
5328; GFX9-NEXT:    s_or_b32 s5, s0, 1
5329; GFX9-NEXT:    v_and_b32_e32 v1, s8, v1
5330; GFX9-NEXT:    v_mul_f32_e32 v5, v4, v5
5331; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
5332; GFX9-NEXT:    v_mad_f32 v4, -v5, v3, v4
5333; GFX9-NEXT:    v_cvt_i32_f32_e32 v5, v5
5334; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v3|
5335; GFX9-NEXT:    s_cmp_lg_u64 s[0:1], 0
5336; GFX9-NEXT:    s_cselect_b32 s0, s5, 0
5337; GFX9-NEXT:    v_bfe_i32 v4, v1, 0, 15
5338; GFX9-NEXT:    v_add_u32_e32 v3, s0, v5
5339; GFX9-NEXT:    v_cvt_f32_i32_e32 v5, v4
5340; GFX9-NEXT:    v_and_b32_e32 v0, s8, v0
5341; GFX9-NEXT:    v_bfe_i32 v6, v0, 0, 15
5342; GFX9-NEXT:    v_cvt_f32_i32_e32 v7, v6
5343; GFX9-NEXT:    v_rcp_iflag_f32_e32 v8, v5
5344; GFX9-NEXT:    v_xor_b32_e32 v4, v6, v4
5345; GFX9-NEXT:    v_ashrrev_i32_e32 v4, 30, v4
5346; GFX9-NEXT:    v_or_b32_e32 v4, 1, v4
5347; GFX9-NEXT:    v_mul_f32_e32 v6, v7, v8
5348; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
5349; GFX9-NEXT:    v_cvt_i32_f32_e32 v8, v6
5350; GFX9-NEXT:    v_mad_f32 v6, -v6, v5, v7
5351; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, |v5|
5352; GFX9-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
5353; GFX9-NEXT:    v_mul_lo_u32 v3, v3, s7
5354; GFX9-NEXT:    v_add_u32_e32 v4, v8, v4
5355; GFX9-NEXT:    v_mul_lo_u32 v2, v2, s6
5356; GFX9-NEXT:    v_mul_lo_u32 v1, v4, v1
5357; GFX9-NEXT:    v_sub_u32_e32 v3, s9, v3
5358; GFX9-NEXT:    v_and_b32_e32 v3, s8, v3
5359; GFX9-NEXT:    v_sub_u32_e32 v2, s4, v2
5360; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v1
5361; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
5362; GFX9-NEXT:    v_and_b32_e32 v2, s8, v2
5363; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
5364; GFX9-NEXT:    v_or_b32_e32 v2, v2, v3
5365; GFX9-NEXT:    v_mov_b32_e32 v4, 0
5366; GFX9-NEXT:    v_or_b32_e32 v0, v2, v0
5367; GFX9-NEXT:    global_store_dword v4, v0, s[2:3]
5368; GFX9-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
5369; GFX9-NEXT:    global_store_short v4, v0, s[2:3] offset:4
5370; GFX9-NEXT:    s_endpgm
; Input IR under test: one vector srem whose result is stored through %out.
5371  %r = srem <3 x i15> %x, %y
5372  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
5373  ret void
5374}
5375
; Scalar unsigned division of %x by the odd constant 1235195.
; The CHECK lines expect the udiv to come out of -amdgpu-codegenprepare
; unchanged. The GFX6 and GFX9 lines expect the final code to use a
; multiply-high by 0xb2a50881 followed by a sub / shift-by-1 / add /
; shift-by-20 fix-up sequence.
5376define amdgpu_kernel void @udiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
5377; CHECK-LABEL: @udiv_i32_oddk_denom(
5378; CHECK-NEXT:    [[R:%.*]] = udiv i32 [[X:%.*]], 1235195
5379; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
5380; CHECK-NEXT:    ret void
5381;
5382; GFX6-LABEL: udiv_i32_oddk_denom:
5383; GFX6:       ; %bb.0:
5384; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5385; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
5386; GFX6-NEXT:    v_mov_b32_e32 v0, 0xb2a50881
5387; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5388; GFX6-NEXT:    s_mov_b32 s6, -1
5389; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5390; GFX6-NEXT:    v_mul_hi_u32 v0, s0, v0
5391; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s0, v0
5392; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
5393; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
5394; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 20, v0
5395; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5396; GFX6-NEXT:    s_endpgm
5397; GFX9-LABEL: udiv_i32_oddk_denom:
5398; GFX9:       ; %bb.0:
5399; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5400; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
5401; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5402; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5403; GFX9-NEXT:    s_mul_hi_u32 s0, s4, 0xb2a50881
5404; GFX9-NEXT:    s_sub_i32 s1, s4, s0
5405; GFX9-NEXT:    s_lshr_b32 s1, s1, 1
5406; GFX9-NEXT:    s_add_i32 s1, s1, s0
5407; GFX9-NEXT:    s_lshr_b32 s0, s1, 20
5408; GFX9-NEXT:    v_mov_b32_e32 v1, s0
5409; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
5410; GFX9-NEXT:    s_endpgm
5411  %r = udiv i32 %x, 1235195
5412  store i32 %r, i32 addrspace(1)* %out
5413  ret void
5414}
5415
; Scalar unsigned division of %x by the power-of-two constant 4096.
; The CHECK lines expect the udiv to come out of -amdgpu-codegenprepare
; unchanged. The GFX6 and GFX9 lines expect the final code to be a single
; logical shift right by 12.
5416define amdgpu_kernel void @udiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
5417; CHECK-LABEL: @udiv_i32_pow2k_denom(
5418; CHECK-NEXT:    [[R:%.*]] = udiv i32 [[X:%.*]], 4096
5419; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
5420; CHECK-NEXT:    ret void
5421;
5422; GFX6-LABEL: udiv_i32_pow2k_denom:
5423; GFX6:       ; %bb.0:
5424; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5425; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
5426; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5427; GFX6-NEXT:    s_mov_b32 s6, -1
5428; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5429; GFX6-NEXT:    s_lshr_b32 s0, s0, 12
5430; GFX6-NEXT:    v_mov_b32_e32 v0, s0
5431; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5432; GFX6-NEXT:    s_endpgm
5433; GFX9-LABEL: udiv_i32_pow2k_denom:
5434; GFX9:       ; %bb.0:
5435; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5436; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
5437; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5438; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5439; GFX9-NEXT:    s_lshr_b32 s0, s4, 12
5440; GFX9-NEXT:    v_mov_b32_e32 v1, s0
5441; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
5442; GFX9-NEXT:    s_endpgm
5443  %r = udiv i32 %x, 4096
5444  store i32 %r, i32 addrspace(1)* %out
5445  ret void
5446}
5447
; Scalar unsigned division of %x by a variable power of two (4096 << %y).
; The CHECK lines expect the shl and udiv to come out of
; -amdgpu-codegenprepare unchanged. The GFX6 and GFX9 lines expect the
; final code to add 12 to %y and use the sum as the amount of one logical
; shift right of %x.
5448define amdgpu_kernel void @udiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
5449; CHECK-LABEL: @udiv_i32_pow2_shl_denom(
5450; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
5451; CHECK-NEXT:    [[R:%.*]] = udiv i32 [[X:%.*]], [[SHL_Y]]
5452; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
5453; CHECK-NEXT:    ret void
5454;
5455; GFX6-LABEL: udiv_i32_pow2_shl_denom:
5456; GFX6:       ; %bb.0:
5457; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5458; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5459; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5460; GFX6-NEXT:    s_mov_b32 s6, -1
5461; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5462; GFX6-NEXT:    s_add_i32 s1, s1, 12
5463; GFX6-NEXT:    s_lshr_b32 s0, s0, s1
5464; GFX6-NEXT:    v_mov_b32_e32 v0, s0
5465; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5466; GFX6-NEXT:    s_endpgm
5467; GFX9-LABEL: udiv_i32_pow2_shl_denom:
5468; GFX9:       ; %bb.0:
5469; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5470; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
5471; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5472; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5473; GFX9-NEXT:    s_add_i32 s0, s5, 12
5474; GFX9-NEXT:    s_lshr_b32 s0, s4, s0
5475; GFX9-NEXT:    v_mov_b32_e32 v1, s0
5476; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
5477; GFX9-NEXT:    s_endpgm
5478  %shl.y = shl i32 4096, %y
5479  %r = udiv i32 %x, %shl.y
5480  store i32 %r, i32 addrspace(1)* %out
5481  ret void
5482}
5483
; <2 x i32> unsigned division by the splat power-of-two constant 4096.
; The CHECK lines expect -amdgpu-codegenprepare to scalarize the vector
; udiv into one extractelement / udiv-by-4096 / insertelement group per
; element. The GFX6 and GFX9 lines expect the final code to shift each
; element right by 12.
5484define amdgpu_kernel void @udiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
5485; CHECK-LABEL: @udiv_v2i32_pow2k_denom(
5486; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
5487; CHECK-NEXT:    [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096
5488; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
5489; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
5490; CHECK-NEXT:    [[TMP5:%.*]] = udiv i32 [[TMP4]], 4096
5491; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
5492; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
5493; CHECK-NEXT:    ret void
5494;
5495; GFX6-LABEL: udiv_v2i32_pow2k_denom:
5496; GFX6:       ; %bb.0:
5497; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5498; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5499; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5500; GFX6-NEXT:    s_mov_b32 s6, -1
5501; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5502; GFX6-NEXT:    s_lshr_b32 s0, s0, 12
5503; GFX6-NEXT:    s_lshr_b32 s1, s1, 12
5504; GFX6-NEXT:    v_mov_b32_e32 v0, s0
5505; GFX6-NEXT:    v_mov_b32_e32 v1, s1
5506; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5507; GFX6-NEXT:    s_endpgm
5508; GFX9-LABEL: udiv_v2i32_pow2k_denom:
5509; GFX9:       ; %bb.0:
5510; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5511; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
5512; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5513; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5514; GFX9-NEXT:    s_lshr_b32 s0, s4, 12
5515; GFX9-NEXT:    s_lshr_b32 s1, s5, 12
5516; GFX9-NEXT:    v_mov_b32_e32 v0, s0
5517; GFX9-NEXT:    v_mov_b32_e32 v1, s1
5518; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
5519; GFX9-NEXT:    s_endpgm
5520  %r = udiv <2 x i32> %x, <i32 4096, i32 4096>
5521  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
5522  ret void
5523}
5524
; <2 x i32> unsigned division by a constant vector that mixes a power of
; two (element 0: 4096) with a non-power of two (element 1: 4095).
; The CHECK lines expect -amdgpu-codegenprepare to scalarize the vector
; udiv into one scalar udiv per element. The GFX6 and GFX9 lines expect
; element 0 to become a shift right by 12 and element 1 to become a
; multiply-high by 0x100101 followed by a sub / shift-by-1 / add /
; shift-by-11 fix-up sequence.
5525define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
5526; CHECK-LABEL: @udiv_v2i32_mixed_pow2k_denom(
5527; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
5528; CHECK-NEXT:    [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096
5529; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
5530; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
5531; CHECK-NEXT:    [[TMP5:%.*]] = udiv i32 [[TMP4]], 4095
5532; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
5533; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
5534; CHECK-NEXT:    ret void
5535;
5536; GFX6-LABEL: udiv_v2i32_mixed_pow2k_denom:
5537; GFX6:       ; %bb.0:
5538; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5539; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5540; GFX6-NEXT:    v_mov_b32_e32 v0, 0x100101
5541; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5542; GFX6-NEXT:    s_mov_b32 s6, -1
5543; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5544; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
5545; GFX6-NEXT:    s_lshr_b32 s0, s0, 12
5546; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s1, v0
5547; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
5548; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
5549; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 11, v0
5550; GFX6-NEXT:    v_mov_b32_e32 v0, s0
5551; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5552; GFX6-NEXT:    s_endpgm
5553; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom:
5554; GFX9:       ; %bb.0:
5555; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5556; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
5557; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5558; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5559; GFX9-NEXT:    s_mul_hi_u32 s1, s5, 0x100101
5560; GFX9-NEXT:    s_lshr_b32 s0, s4, 12
5561; GFX9-NEXT:    s_sub_i32 s4, s5, s1
5562; GFX9-NEXT:    s_lshr_b32 s4, s4, 1
5563; GFX9-NEXT:    s_add_i32 s4, s4, s1
5564; GFX9-NEXT:    s_lshr_b32 s1, s4, 11
5565; GFX9-NEXT:    v_mov_b32_e32 v0, s0
5566; GFX9-NEXT:    v_mov_b32_e32 v1, s1
5567; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
5568; GFX9-NEXT:    s_endpgm
5569  %r = udiv <2 x i32> %x, <i32 4096, i32 4095>
5570  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
5571  ret void
5572}
5573
; <2 x i32> unsigned division by a variable power-of-two vector
; (<4096, 4096> << %y).
; The CHECK lines expect -amdgpu-codegenprepare to scalarize the udiv and
; expand each element into a reciprocal-based sequence: uitofp, a call to
; llvm.amdgcn.rcp.f32, an fmul by 0x41EFFFFFC0000000, fptoui, a 64-bit
; multiply refinement step, and two compare/select quotient corrections.
; The GFX6 and GFX9 lines likewise expect v_rcp_iflag_f32-based code for
; both elements, not the add-and-shift form the scalar test
; @udiv_i32_pow2_shl_denom expects.
5574define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
5575; CHECK-LABEL: @udiv_v2i32_pow2_shl_denom(
5576; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
5577; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
5578; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
5579; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
5580; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
5581; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
5582; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
5583; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[TMP2]]
5584; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
5585; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
5586; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
5587; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
5588; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
5589; CHECK-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
5590; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
5591; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
5592; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
5593; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
5594; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
5595; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
5596; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
5597; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
5598; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
5599; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
5600; CHECK-NEXT:    [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
5601; CHECK-NEXT:    [[TMP25:%.*]] = add i32 [[TMP21]], 1
5602; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]]
5603; CHECK-NEXT:    [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]]
5604; CHECK-NEXT:    [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]]
5605; CHECK-NEXT:    [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]]
5606; CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[TMP26]], 1
5607; CHECK-NEXT:    [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]]
5608; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <2 x i32> undef, i32 [[TMP31]], i64 0
5609; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <2 x i32> [[X]], i64 1
5610; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
5611; CHECK-NEXT:    [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float
5612; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
5613; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000
5614; CHECK-NEXT:    [[TMP38:%.*]] = fptoui float [[TMP37]] to i32
5615; CHECK-NEXT:    [[TMP39:%.*]] = sub i32 0, [[TMP34]]
5616; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]]
5617; CHECK-NEXT:    [[TMP41:%.*]] = zext i32 [[TMP38]] to i64
5618; CHECK-NEXT:    [[TMP42:%.*]] = zext i32 [[TMP40]] to i64
5619; CHECK-NEXT:    [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]]
5620; CHECK-NEXT:    [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
5621; CHECK-NEXT:    [[TMP45:%.*]] = lshr i64 [[TMP43]], 32
5622; CHECK-NEXT:    [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32
5623; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]]
5624; CHECK-NEXT:    [[TMP48:%.*]] = zext i32 [[TMP33]] to i64
5625; CHECK-NEXT:    [[TMP49:%.*]] = zext i32 [[TMP47]] to i64
5626; CHECK-NEXT:    [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]]
5627; CHECK-NEXT:    [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
5628; CHECK-NEXT:    [[TMP52:%.*]] = lshr i64 [[TMP50]], 32
5629; CHECK-NEXT:    [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32
5630; CHECK-NEXT:    [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]]
5631; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]]
5632; CHECK-NEXT:    [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]]
5633; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP53]], 1
5634; CHECK-NEXT:    [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]]
5635; CHECK-NEXT:    [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]]
5636; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]]
5637; CHECK-NEXT:    [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]]
5638; CHECK-NEXT:    [[TMP62:%.*]] = add i32 [[TMP58]], 1
5639; CHECK-NEXT:    [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]]
5640; CHECK-NEXT:    [[TMP64:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP63]], i64 1
5641; CHECK-NEXT:    store <2 x i32> [[TMP64]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
5642; CHECK-NEXT:    ret void
5643;
5644; GFX6-LABEL: udiv_v2i32_pow2_shl_denom:
5645; GFX6:       ; %bb.0:
5646; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
5647; GFX6-NEXT:    s_movk_i32 s4, 0x1000
5648; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5649; GFX6-NEXT:    s_mov_b32 s6, -1
5650; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5651; GFX6-NEXT:    s_lshl_b32 s8, s4, s2
5652; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
5653; GFX6-NEXT:    s_lshl_b32 s9, s4, s3
5654; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
5655; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5656; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
5657; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
5658; GFX6-NEXT:    s_mov_b32 s0, 0x4f7ffffe
5659; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
5660; GFX6-NEXT:    v_mul_f32_e32 v0, s0, v0
5661; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
5662; GFX6-NEXT:    v_mul_f32_e32 v1, s0, v1
5663; GFX6-NEXT:    s_sub_i32 s0, 0, s8
5664; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
5665; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v0
5666; GFX6-NEXT:    s_sub_i32 s0, 0, s9
5667; GFX6-NEXT:    v_mul_lo_u32 v3, s0, v1
5668; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
5669; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
5670; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
5671; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5672; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
5673; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
5674; GFX6-NEXT:    v_mul_hi_u32 v1, s3, v1
5675; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s8
5676; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
5677; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s9
5678; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s2, v2
5679; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v2
5680; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
5681; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v2
5682; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
5683; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
5684; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
5685; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
5686; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s3, v4
5687; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
5688; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v2
5689; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
5690; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v2
5691; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
5692; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
5693; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
5694; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
5695; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5696; GFX6-NEXT:    s_endpgm
5697; GFX9-LABEL: udiv_v2i32_pow2_shl_denom:
5698; GFX9:       ; %bb.0:
5699; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
5700; GFX9-NEXT:    s_movk_i32 s4, 0x1000
5701; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5702; GFX9-NEXT:    s_lshl_b32 s7, s4, s2
5703; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s7
5704; GFX9-NEXT:    s_lshl_b32 s6, s4, s3
5705; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s6
5706; GFX9-NEXT:    s_mov_b32 s2, 0x4f7ffffe
5707; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
5708; GFX9-NEXT:    s_sub_i32 s3, 0, s6
5709; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
5710; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
5711; GFX9-NEXT:    v_mul_f32_e32 v0, s2, v0
5712; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
5713; GFX9-NEXT:    v_mul_f32_e32 v1, s2, v1
5714; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
5715; GFX9-NEXT:    s_sub_i32 s2, 0, s7
5716; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v0
5717; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v1
5718; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
5719; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
5720; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
5721; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
5722; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5723; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
5724; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
5725; GFX9-NEXT:    v_mul_hi_u32 v1, s3, v1
5726; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5727; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s7
5728; GFX9-NEXT:    v_add_u32_e32 v5, 1, v0
5729; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s6
5730; GFX9-NEXT:    v_add_u32_e32 v6, 1, v1
5731; GFX9-NEXT:    v_sub_u32_e32 v3, s2, v3
5732; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
5733; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
5734; GFX9-NEXT:    v_subrev_u32_e32 v5, s7, v3
5735; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
5736; GFX9-NEXT:    v_sub_u32_e32 v4, s3, v4
5737; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s6, v4
5738; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
5739; GFX9-NEXT:    v_subrev_u32_e32 v3, s6, v4
5740; GFX9-NEXT:    v_add_u32_e32 v5, 1, v0
5741; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[0:1]
5742; GFX9-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[0:1]
5743; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
5744; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
5745; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
5746; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
5747; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
5748; GFX9-NEXT:    s_endpgm
5749  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
5750  %r = udiv <2 x i32> %x, %shl.y
5751  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
5752  ret void
5753}
5754
; Scalar unsigned remainder of %x by the odd constant 1235195 (0x12d8fb).
; The CHECK lines expect the urem to come out of -amdgpu-codegenprepare
; unchanged. The GFX6 and GFX9 lines expect the final code to compute the
; quotient with a multiply-high by 0xb2a50881 plus a shift fix-up, then
; multiply it by 0x12d8fb and subtract the product from %x.
5755define amdgpu_kernel void @urem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
5756; CHECK-LABEL: @urem_i32_oddk_denom(
5757; CHECK-NEXT:    [[R:%.*]] = urem i32 [[X:%.*]], 1235195
5758; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
5759; CHECK-NEXT:    ret void
5760;
5761; GFX6-LABEL: urem_i32_oddk_denom:
5762; GFX6:       ; %bb.0:
5763; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
5764; GFX6-NEXT:    v_mov_b32_e32 v0, 0xb2a50881
5765; GFX6-NEXT:    s_mov_b32 s2, 0x12d8fb
5766; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5767; GFX6-NEXT:    s_mov_b32 s3, 0xf000
5768; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5769; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
5770; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s4, v0
5771; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
5772; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
5773; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 20, v0
5774; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
5775; GFX6-NEXT:    s_mov_b32 s2, -1
5776; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
5777; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5778; GFX6-NEXT:    s_endpgm
5779; GFX9-LABEL: urem_i32_oddk_denom:
5780; GFX9:       ; %bb.0:
5781; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5782; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
5783; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5784; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5785; GFX9-NEXT:    s_mul_hi_u32 s0, s4, 0xb2a50881
5786; GFX9-NEXT:    s_sub_i32 s1, s4, s0
5787; GFX9-NEXT:    s_lshr_b32 s1, s1, 1
5788; GFX9-NEXT:    s_add_i32 s1, s1, s0
5789; GFX9-NEXT:    s_lshr_b32 s0, s1, 20
5790; GFX9-NEXT:    s_mul_i32 s0, s0, 0x12d8fb
5791; GFX9-NEXT:    s_sub_i32 s0, s4, s0
5792; GFX9-NEXT:    v_mov_b32_e32 v1, s0
5793; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
5794; GFX9-NEXT:    s_endpgm
5795  %r = urem i32 %x, 1235195
5796  store i32 %r, i32 addrspace(1)* %out
5797  ret void
5798}
5799
; Scalar unsigned remainder of %x by the power-of-two constant 4096.
; The CHECK lines expect the urem to come out of -amdgpu-codegenprepare
; unchanged. The GFX6 and GFX9 lines expect the final code to be a single
; bitwise AND with 0xfff.
5800define amdgpu_kernel void @urem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
5801; CHECK-LABEL: @urem_i32_pow2k_denom(
5802; CHECK-NEXT:    [[R:%.*]] = urem i32 [[X:%.*]], 4096
5803; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
5804; CHECK-NEXT:    ret void
5805;
5806; GFX6-LABEL: urem_i32_pow2k_denom:
5807; GFX6:       ; %bb.0:
5808; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5809; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
5810; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5811; GFX6-NEXT:    s_mov_b32 s6, -1
5812; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5813; GFX6-NEXT:    s_and_b32 s0, s0, 0xfff
5814; GFX6-NEXT:    v_mov_b32_e32 v0, s0
5815; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5816; GFX6-NEXT:    s_endpgm
5817; GFX9-LABEL: urem_i32_pow2k_denom:
5818; GFX9:       ; %bb.0:
5819; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5820; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
5821; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5822; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5823; GFX9-NEXT:    s_and_b32 s0, s4, 0xfff
5824; GFX9-NEXT:    v_mov_b32_e32 v1, s0
5825; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
5826; GFX9-NEXT:    s_endpgm
5827  %r = urem i32 %x, 4096
5828  store i32 %r, i32 addrspace(1)* %out
5829  ret void
5830}
5831
; Scalar unsigned remainder of %x by a variable power of two (4096 << %y).
; The CHECK lines expect the shl and urem to come out of
; -amdgpu-codegenprepare unchanged. The GFX6 and GFX9 lines expect the
; final code to build a mask as (0x1000 << %y) + -1 and AND it with %x.
5832define amdgpu_kernel void @urem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
5833; CHECK-LABEL: @urem_i32_pow2_shl_denom(
5834; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
5835; CHECK-NEXT:    [[R:%.*]] = urem i32 [[X:%.*]], [[SHL_Y]]
5836; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
5837; CHECK-NEXT:    ret void
5838;
5839; GFX6-LABEL: urem_i32_pow2_shl_denom:
5840; GFX6:       ; %bb.0:
5841; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5842; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5843; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5844; GFX6-NEXT:    s_mov_b32 s6, -1
5845; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5846; GFX6-NEXT:    s_lshl_b32 s1, 0x1000, s1
5847; GFX6-NEXT:    s_add_i32 s1, s1, -1
5848; GFX6-NEXT:    s_and_b32 s0, s0, s1
5849; GFX6-NEXT:    v_mov_b32_e32 v0, s0
5850; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5851; GFX6-NEXT:    s_endpgm
5852; GFX9-LABEL: urem_i32_pow2_shl_denom:
5853; GFX9:       ; %bb.0:
5854; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5855; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
5856; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5857; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5858; GFX9-NEXT:    s_lshl_b32 s0, 0x1000, s5
5859; GFX9-NEXT:    s_add_i32 s0, s0, -1
5860; GFX9-NEXT:    s_and_b32 s0, s4, s0
5861; GFX9-NEXT:    v_mov_b32_e32 v1, s0
5862; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
5863; GFX9-NEXT:    s_endpgm
5864  %shl.y = shl i32 4096, %y
5865  %r = urem i32 %x, %shl.y
5866  store i32 %r, i32 addrspace(1)* %out
5867  ret void
5868}
5869
; <2 x i32> unsigned remainder by the splat power-of-two constant 4096.
; The CHECK lines expect -amdgpu-codegenprepare to scalarize the vector
; urem into one extractelement / urem-by-4096 / insertelement group per
; element. The GFX6 and GFX9 lines expect the final code to AND each
; element with 0xfff.
5870define amdgpu_kernel void @urem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
5871; CHECK-LABEL: @urem_v2i32_pow2k_denom(
5872; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
5873; CHECK-NEXT:    [[TMP2:%.*]] = urem i32 [[TMP1]], 4096
5874; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
5875; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
5876; CHECK-NEXT:    [[TMP5:%.*]] = urem i32 [[TMP4]], 4096
5877; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
5878; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
5879; CHECK-NEXT:    ret void
5880;
5881; GFX6-LABEL: urem_v2i32_pow2k_denom:
5882; GFX6:       ; %bb.0:
5883; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5884; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
5885; GFX6-NEXT:    s_movk_i32 s2, 0xfff
5886; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5887; GFX6-NEXT:    s_mov_b32 s6, -1
5888; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5889; GFX6-NEXT:    s_and_b32 s0, s0, s2
5890; GFX6-NEXT:    s_and_b32 s1, s1, s2
5891; GFX6-NEXT:    v_mov_b32_e32 v0, s0
5892; GFX6-NEXT:    v_mov_b32_e32 v1, s1
5893; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5894; GFX6-NEXT:    s_endpgm
5895; GFX9-LABEL: urem_v2i32_pow2k_denom:
5896; GFX9:       ; %bb.0:
5897; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5898; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
5899; GFX9-NEXT:    s_movk_i32 s0, 0xfff
5900; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5901; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5902; GFX9-NEXT:    s_and_b32 s1, s4, s0
5903; GFX9-NEXT:    s_and_b32 s0, s5, s0
5904; GFX9-NEXT:    v_mov_b32_e32 v0, s1
5905; GFX9-NEXT:    v_mov_b32_e32 v1, s0
5906; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
5907; GFX9-NEXT:    s_endpgm
5908  %r = urem <2 x i32> %x, <i32 4096, i32 4096>
5909  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
5910  ret void
5911}
5912
5913define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
5914; CHECK-LABEL: @urem_v2i32_pow2_shl_denom(
5915; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
5916; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
5917; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
5918; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
5919; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
5920; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
5921; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
5922; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[TMP2]]
5923; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
5924; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
5925; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
5926; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
5927; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
5928; CHECK-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
5929; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
5930; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
5931; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
5932; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
5933; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
5934; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
5935; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
5936; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
5937; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
5938; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
5939; CHECK-NEXT:    [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
5940; CHECK-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]]
5941; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]]
5942; CHECK-NEXT:    [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]]
5943; CHECK-NEXT:    [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]]
5944; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]]
5945; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <2 x i32> undef, i32 [[TMP29]], i64 0
5946; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <2 x i32> [[X]], i64 1
5947; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
5948; CHECK-NEXT:    [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float
5949; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
5950; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000
5951; CHECK-NEXT:    [[TMP36:%.*]] = fptoui float [[TMP35]] to i32
5952; CHECK-NEXT:    [[TMP37:%.*]] = sub i32 0, [[TMP32]]
5953; CHECK-NEXT:    [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]]
5954; CHECK-NEXT:    [[TMP39:%.*]] = zext i32 [[TMP36]] to i64
5955; CHECK-NEXT:    [[TMP40:%.*]] = zext i32 [[TMP38]] to i64
5956; CHECK-NEXT:    [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]]
5957; CHECK-NEXT:    [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32
5958; CHECK-NEXT:    [[TMP43:%.*]] = lshr i64 [[TMP41]], 32
5959; CHECK-NEXT:    [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
5960; CHECK-NEXT:    [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]]
5961; CHECK-NEXT:    [[TMP46:%.*]] = zext i32 [[TMP31]] to i64
5962; CHECK-NEXT:    [[TMP47:%.*]] = zext i32 [[TMP45]] to i64
5963; CHECK-NEXT:    [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]]
5964; CHECK-NEXT:    [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32
5965; CHECK-NEXT:    [[TMP50:%.*]] = lshr i64 [[TMP48]], 32
5966; CHECK-NEXT:    [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
5967; CHECK-NEXT:    [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]]
5968; CHECK-NEXT:    [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]]
5969; CHECK-NEXT:    [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]]
5970; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]]
5971; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]]
5972; CHECK-NEXT:    [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]]
5973; CHECK-NEXT:    [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]]
5974; CHECK-NEXT:    [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]]
5975; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <2 x i32> [[TMP30]], i32 [[TMP59]], i64 1
5976; CHECK-NEXT:    store <2 x i32> [[TMP60]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
5977; CHECK-NEXT:    ret void
5978;
5979; GFX6-LABEL: urem_v2i32_pow2_shl_denom:
5980; GFX6:       ; %bb.0:
5981; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
5982; GFX6-NEXT:    s_movk_i32 s4, 0x1000
5983; GFX6-NEXT:    s_mov_b32 s7, 0xf000
5984; GFX6-NEXT:    s_mov_b32 s6, -1
5985; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
5986; GFX6-NEXT:    s_lshl_b32 s8, s4, s2
5987; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
5988; GFX6-NEXT:    s_lshl_b32 s3, s4, s3
5989; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s3
5990; GFX6-NEXT:    s_mov_b32 s4, 0x4f7ffffe
5991; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
5992; GFX6-NEXT:    s_sub_i32 s2, 0, s8
5993; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
5994; GFX6-NEXT:    v_mul_f32_e32 v0, s4, v0
5995; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
5996; GFX6-NEXT:    v_mul_f32_e32 v1, s4, v1
5997; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
5998; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5999; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6000; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v0
6001; GFX6-NEXT:    s_sub_i32 s2, 0, s3
6002; GFX6-NEXT:    v_mul_lo_u32 v3, s2, v1
6003; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
6004; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
6005; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
6006; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6007; GFX6-NEXT:    v_mul_hi_u32 v0, s0, v0
6008; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
6009; GFX6-NEXT:    v_mul_hi_u32 v1, s1, v1
6010; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s8
6011; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s3
6012; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
6013; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s8, v0
6014; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
6015; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6016; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s8, v0
6017; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
6018; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6019; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
6020; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
6021; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
6022; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6023; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
6024; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
6025; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6026; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
6027; GFX6-NEXT:    s_endpgm
6028; GFX9-LABEL: urem_v2i32_pow2_shl_denom:
6029; GFX9:       ; %bb.0:
6030; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
6031; GFX9-NEXT:    s_movk_i32 s4, 0x1000
6032; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6033; GFX9-NEXT:    s_lshl_b32 s5, s4, s3
6034; GFX9-NEXT:    s_lshl_b32 s4, s4, s2
6035; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s4
6036; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s5
6037; GFX9-NEXT:    s_mov_b32 s2, 0x4f7ffffe
6038; GFX9-NEXT:    s_sub_i32 s3, 0, s5
6039; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6040; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
6041; GFX9-NEXT:    v_mul_f32_e32 v0, s2, v0
6042; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
6043; GFX9-NEXT:    v_mul_f32_e32 v1, s2, v1
6044; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
6045; GFX9-NEXT:    s_sub_i32 s2, 0, s4
6046; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v0
6047; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v1
6048; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
6049; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
6050; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6051; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
6052; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
6053; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6054; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
6055; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
6056; GFX9-NEXT:    v_mul_hi_u32 v1, s3, v1
6057; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6058; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s4
6059; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s5
6060; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
6061; GFX9-NEXT:    v_subrev_u32_e32 v3, s4, v0
6062; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
6063; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
6064; GFX9-NEXT:    v_sub_u32_e32 v1, s3, v1
6065; GFX9-NEXT:    v_subrev_u32_e32 v3, s4, v0
6066; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
6067; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
6068; GFX9-NEXT:    v_subrev_u32_e32 v4, s5, v1
6069; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v1
6070; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
6071; GFX9-NEXT:    v_subrev_u32_e32 v3, s5, v1
6072; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v1
6073; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
6074; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
6075; GFX9-NEXT:    s_endpgm
6076  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
6077  %r = urem <2 x i32> %x, %shl.y
6078  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
6079  ret void
6080}
6081
; Scalar signed division of %x by the odd constant 1235195, stored to %out.
; The opt-level checks expect the sdiv to be left untouched in the IR.
; The llc checks pin a multiply-high by 0xd9528441, an add of the dividend,
; an arithmetic shift right by 20, and an add of the sign bit (logical
; shift right by 31); no reciprocal instruction appears in the expected output.
6082define amdgpu_kernel void @sdiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
6083; CHECK-LABEL: @sdiv_i32_oddk_denom(
6084; CHECK-NEXT:    [[R:%.*]] = sdiv i32 [[X:%.*]], 1235195
6085; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
6086; CHECK-NEXT:    ret void
6087;
6088; GFX6-LABEL: sdiv_i32_oddk_denom:
6089; GFX6:       ; %bb.0:
6090; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6091; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
6092; GFX6-NEXT:    v_mov_b32_e32 v0, 0xd9528441
6093; GFX6-NEXT:    s_mov_b32 s7, 0xf000
6094; GFX6-NEXT:    s_mov_b32 s6, -1
6095; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6096; GFX6-NEXT:    v_mul_hi_i32 v0, s0, v0
6097; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
6098; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
6099; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 20, v0
6100; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
6101; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
6102; GFX6-NEXT:    s_endpgm
6103; GFX9-LABEL: sdiv_i32_oddk_denom:
6104; GFX9:       ; %bb.0:
6105; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
6106; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
6107; GFX9-NEXT:    v_mov_b32_e32 v0, 0
6108; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6109; GFX9-NEXT:    s_mul_hi_i32 s0, s4, 0xd9528441
6110; GFX9-NEXT:    s_add_i32 s0, s0, s4
6111; GFX9-NEXT:    s_lshr_b32 s1, s0, 31
6112; GFX9-NEXT:    s_ashr_i32 s0, s0, 20
6113; GFX9-NEXT:    s_add_i32 s0, s0, s1
6114; GFX9-NEXT:    v_mov_b32_e32 v1, s0
6115; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
6116; GFX9-NEXT:    s_endpgm
6117  %r = sdiv i32 %x, 1235195
6118  store i32 %r, i32 addrspace(1)* %out
6119  ret void
6120}
6121
; Scalar signed division of %x by the power-of-two constant 4096, stored
; to %out. The opt-level checks expect the sdiv to remain in the IR.
; The llc checks pin a pure shift/add sequence on both targets:
; arithmetic shift right by 31, logical shift right by 20, add to the
; dividend, then arithmetic shift right by 12.
6122define amdgpu_kernel void @sdiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
6123; CHECK-LABEL: @sdiv_i32_pow2k_denom(
6124; CHECK-NEXT:    [[R:%.*]] = sdiv i32 [[X:%.*]], 4096
6125; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
6126; CHECK-NEXT:    ret void
6127;
6128; GFX6-LABEL: sdiv_i32_pow2k_denom:
6129; GFX6:       ; %bb.0:
6130; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6131; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
6132; GFX6-NEXT:    s_mov_b32 s7, 0xf000
6133; GFX6-NEXT:    s_mov_b32 s6, -1
6134; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6135; GFX6-NEXT:    s_ashr_i32 s1, s0, 31
6136; GFX6-NEXT:    s_lshr_b32 s1, s1, 20
6137; GFX6-NEXT:    s_add_i32 s0, s0, s1
6138; GFX6-NEXT:    s_ashr_i32 s0, s0, 12
6139; GFX6-NEXT:    v_mov_b32_e32 v0, s0
6140; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
6141; GFX6-NEXT:    s_endpgm
6142; GFX9-LABEL: sdiv_i32_pow2k_denom:
6143; GFX9:       ; %bb.0:
6144; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
6145; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
6146; GFX9-NEXT:    v_mov_b32_e32 v0, 0
6147; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6148; GFX9-NEXT:    s_ashr_i32 s0, s4, 31
6149; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
6150; GFX9-NEXT:    s_add_i32 s4, s4, s0
6151; GFX9-NEXT:    s_ashr_i32 s0, s4, 12
6152; GFX9-NEXT:    v_mov_b32_e32 v1, s0
6153; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
6154; GFX9-NEXT:    s_endpgm
6155  %r = sdiv i32 %x, 4096
6156  store i32 %r, i32 addrspace(1)* %out
6157  ret void
6158}
6159
; Scalar signed division of %x by a run-time divisor computed as
; (4096 << %y), stored to %out. The opt-level checks expect both the shl
; and the sdiv to be left as-is in the IR.
; The llc checks pin a full expansion: both operands are made
; non-negative with an ashr-31 / add / xor sequence, the divisor goes through
; v_cvt_f32_u32 + v_rcp_iflag_f32 scaled by 0x4f7ffffe, the estimate is
; refined with mul_lo/mul_hi, the quotient is corrected twice with
; compare + cndmask, and the sign is restored with a final xor / subrev.
6160define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
6161; CHECK-LABEL: @sdiv_i32_pow2_shl_denom(
6162; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
6163; CHECK-NEXT:    [[R:%.*]] = sdiv i32 [[X:%.*]], [[SHL_Y]]
6164; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
6165; CHECK-NEXT:    ret void
6166;
6167; GFX6-LABEL: sdiv_i32_pow2_shl_denom:
6168; GFX6:       ; %bb.0:
6169; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
6170; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6171; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6172; GFX6-NEXT:    s_lshl_b32 s3, 0x1000, s3
6173; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
6174; GFX6-NEXT:    s_add_i32 s3, s3, s4
6175; GFX6-NEXT:    s_xor_b32 s7, s3, s4
6176; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s7
6177; GFX6-NEXT:    s_sub_i32 s3, 0, s7
6178; GFX6-NEXT:    s_ashr_i32 s5, s2, 31
6179; GFX6-NEXT:    s_add_i32 s2, s2, s5
6180; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6181; GFX6-NEXT:    s_xor_b32 s6, s2, s5
6182; GFX6-NEXT:    s_xor_b32 s4, s5, s4
6183; GFX6-NEXT:    s_mov_b32 s2, -1
6184; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
6185; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
6186; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
6187; GFX6-NEXT:    s_mov_b32 s3, 0xf000
6188; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
6189; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
6190; GFX6-NEXT:    v_mul_hi_u32 v0, s6, v0
6191; GFX6-NEXT:    v_mul_lo_u32 v1, v0, s7
6192; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
6193; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s6, v1
6194; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s7, v1
6195; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
6196; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6197; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
6198; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
6199; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
6200; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6201; GFX6-NEXT:    v_xor_b32_e32 v0, s4, v0
6202; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s4, v0
6203; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6204; GFX6-NEXT:    s_endpgm
6205; GFX9-LABEL: sdiv_i32_pow2_shl_denom:
6206; GFX9:       ; %bb.0:
6207; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
6208; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6209; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6210; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6211; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s3
6212; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
6213; GFX9-NEXT:    s_add_i32 s3, s3, s4
6214; GFX9-NEXT:    s_xor_b32 s5, s3, s4
6215; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s5
6216; GFX9-NEXT:    s_sub_i32 s3, 0, s5
6217; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6218; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
6219; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
6220; GFX9-NEXT:    v_mul_lo_u32 v1, s3, v0
6221; GFX9-NEXT:    s_ashr_i32 s3, s2, 31
6222; GFX9-NEXT:    s_add_i32 s2, s2, s3
6223; GFX9-NEXT:    s_xor_b32 s2, s2, s3
6224; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
6225; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
6226; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
6227; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s5
6228; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
6229; GFX9-NEXT:    v_sub_u32_e32 v1, s2, v1
6230; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v1
6231; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
6232; GFX9-NEXT:    v_subrev_u32_e32 v3, s5, v1
6233; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
6234; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
6235; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v1
6236; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
6237; GFX9-NEXT:    s_xor_b32 s2, s3, s4
6238; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
6239; GFX9-NEXT:    v_subrev_u32_e32 v0, s2, v0
6240; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
6241; GFX9-NEXT:    s_endpgm
6242  %shl.y = shl i32 4096, %y
6243  %r = sdiv i32 %x, %shl.y
6244  store i32 %r, i32 addrspace(1)* %out
6245  ret void
6246}
6247
; Two-element vector signed division of %x by the splat constant
; <4096, 4096>, stored to %out. The opt-level checks expect the vector
; sdiv to be scalarized: each lane is extracted, divided by 4096 with a scalar
; sdiv, and re-inserted (starting from undef).
; The llc checks pin, for each lane, the same shift/add sequence
; as the scalar power-of-two case (ashr 31, lshr 20, add, ashr 12) followed by
; a single 64-bit store.
6248define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
6249; CHECK-LABEL: @sdiv_v2i32_pow2k_denom(
6250; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
6251; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096
6252; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
6253; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
6254; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4096
6255; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
6256; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
6257; CHECK-NEXT:    ret void
6258;
6259; GFX6-LABEL: sdiv_v2i32_pow2k_denom:
6260; GFX6:       ; %bb.0:
6261; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6262; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6263; GFX6-NEXT:    s_mov_b32 s7, 0xf000
6264; GFX6-NEXT:    s_mov_b32 s6, -1
6265; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6266; GFX6-NEXT:    s_ashr_i32 s2, s0, 31
6267; GFX6-NEXT:    s_lshr_b32 s2, s2, 20
6268; GFX6-NEXT:    s_ashr_i32 s3, s1, 31
6269; GFX6-NEXT:    s_add_i32 s0, s0, s2
6270; GFX6-NEXT:    s_lshr_b32 s2, s3, 20
6271; GFX6-NEXT:    s_add_i32 s1, s1, s2
6272; GFX6-NEXT:    s_ashr_i32 s0, s0, 12
6273; GFX6-NEXT:    s_ashr_i32 s1, s1, 12
6274; GFX6-NEXT:    v_mov_b32_e32 v0, s0
6275; GFX6-NEXT:    v_mov_b32_e32 v1, s1
6276; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
6277; GFX6-NEXT:    s_endpgm
6278; GFX9-LABEL: sdiv_v2i32_pow2k_denom:
6279; GFX9:       ; %bb.0:
6280; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
6281; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
6282; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6283; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6284; GFX9-NEXT:    s_ashr_i32 s0, s4, 31
6285; GFX9-NEXT:    s_ashr_i32 s1, s5, 31
6286; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
6287; GFX9-NEXT:    s_lshr_b32 s1, s1, 20
6288; GFX9-NEXT:    s_add_i32 s0, s4, s0
6289; GFX9-NEXT:    s_add_i32 s1, s5, s1
6290; GFX9-NEXT:    s_ashr_i32 s0, s0, 12
6291; GFX9-NEXT:    s_ashr_i32 s1, s1, 12
6292; GFX9-NEXT:    v_mov_b32_e32 v0, s0
6293; GFX9-NEXT:    v_mov_b32_e32 v1, s1
6294; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
6295; GFX9-NEXT:    s_endpgm
6296  %r = sdiv <2 x i32> %x, <i32 4096, i32 4096>
6297  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
6298  ret void
6299}
6300
; Two-element vector signed division of %x by the mixed constant
; <4096, 4095> (lane 0 a power of two, lane 1 not), stored to %out.
; The opt-level checks expect scalarization into one sdiv by 4096 and one
; sdiv by 4095. The llc checks pin a different lowering per lane:
; lane 0 uses the shift/add sequence (ashr 31, lshr 20, add, ashr 12), while
; lane 1 uses a multiply-high by 0x80080081, an add of the dividend, an
; arithmetic shift right by 11 and an add of the sign bit.
; NOTE(review): the doubled "s" in the function name looks like a typo for
; "sdiv"; it is kept because the label checks below reference it - confirm
; before renaming and regenerating.
6301define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
6302; CHECK-LABEL: @ssdiv_v2i32_mixed_pow2k_denom(
6303; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
6304; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096
6305; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
6306; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
6307; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4095
6308; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
6309; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
6310; CHECK-NEXT:    ret void
6311;
6312; GFX6-LABEL: ssdiv_v2i32_mixed_pow2k_denom:
6313; GFX6:       ; %bb.0:
6314; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6315; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6316; GFX6-NEXT:    v_mov_b32_e32 v0, 0x80080081
6317; GFX6-NEXT:    s_mov_b32 s7, 0xf000
6318; GFX6-NEXT:    s_mov_b32 s6, -1
6319; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6320; GFX6-NEXT:    v_mul_hi_i32 v0, s1, v0
6321; GFX6-NEXT:    s_ashr_i32 s2, s0, 31
6322; GFX6-NEXT:    s_lshr_b32 s2, s2, 20
6323; GFX6-NEXT:    s_add_i32 s0, s0, s2
6324; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s1, v0
6325; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
6326; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 11, v0
6327; GFX6-NEXT:    s_ashr_i32 s0, s0, 12
6328; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
6329; GFX6-NEXT:    v_mov_b32_e32 v0, s0
6330; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
6331; GFX6-NEXT:    s_endpgm
6332; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom:
6333; GFX9:       ; %bb.0:
6334; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
6335; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
6336; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6337; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6338; GFX9-NEXT:    s_ashr_i32 s0, s4, 31
6339; GFX9-NEXT:    s_mul_hi_i32 s1, s5, 0x80080081
6340; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
6341; GFX9-NEXT:    s_add_i32 s1, s1, s5
6342; GFX9-NEXT:    s_add_i32 s0, s4, s0
6343; GFX9-NEXT:    s_lshr_b32 s4, s1, 31
6344; GFX9-NEXT:    s_ashr_i32 s1, s1, 11
6345; GFX9-NEXT:    s_ashr_i32 s0, s0, 12
6346; GFX9-NEXT:    s_add_i32 s1, s1, s4
6347; GFX9-NEXT:    v_mov_b32_e32 v0, s0
6348; GFX9-NEXT:    v_mov_b32_e32 v1, s1
6349; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
6350; GFX9-NEXT:    s_endpgm
6351  %r = sdiv <2 x i32> %x, <i32 4096, i32 4095>
6352  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
6353  ret void
6354}
6355
; Two-element vector signed division of %x by a run-time divisor computed
; as (<4096, 4096> << %y), stored to %out.
; Unlike the scalar variant of this test, the opt-level checks expect the
; division to be expanded in the IR, once per lane:
;   - sign masks via ashr 31 and their xor (quotient sign),
;   - operands made non-negative via add / xor with the sign mask,
;   - uitofp of the divisor, llvm.amdgcn.rcp.f32, fmul by
;     0x41EFFFFFC0000000 and fptoui to get an initial reciprocal estimate,
;   - one refinement step and the quotient estimate through 64-bit multiplies
;     whose high halves are taken with lshr 32 / trunc,
;   - two icmp uge / select correction steps on quotient and remainder,
;   - sign restoration via xor / sub with the quotient sign mask.
; The llc checks pin the matching machine sequence for both
; lanes, interleaved by the scheduler, ending in a single 64-bit store.
6356define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
6357; CHECK-LABEL: @sdiv_v2i32_pow2_shl_denom(
6358; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
6359; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
6360; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
6361; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
6362; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
6363; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
6364; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]]
6365; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]]
6366; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]]
6367; CHECK-NEXT:    [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]]
6368; CHECK-NEXT:    [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float
6369; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]])
6370; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000
6371; CHECK-NEXT:    [[TMP13:%.*]] = fptoui float [[TMP12]] to i32
6372; CHECK-NEXT:    [[TMP14:%.*]] = sub i32 0, [[TMP9]]
6373; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]]
6374; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP13]] to i64
6375; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
6376; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
6377; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
6378; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
6379; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
6380; CHECK-NEXT:    [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]]
6381; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP8]] to i64
6382; CHECK-NEXT:    [[TMP24:%.*]] = zext i32 [[TMP22]] to i64
6383; CHECK-NEXT:    [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]]
6384; CHECK-NEXT:    [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
6385; CHECK-NEXT:    [[TMP27:%.*]] = lshr i64 [[TMP25]], 32
6386; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
6387; CHECK-NEXT:    [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]]
6388; CHECK-NEXT:    [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]]
6389; CHECK-NEXT:    [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]]
6390; CHECK-NEXT:    [[TMP32:%.*]] = add i32 [[TMP28]], 1
6391; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]]
6392; CHECK-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]]
6393; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]]
6394; CHECK-NEXT:    [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]]
6395; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP33]], 1
6396; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]]
6397; CHECK-NEXT:    [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]]
6398; CHECK-NEXT:    [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]]
6399; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <2 x i32> undef, i32 [[TMP40]], i64 0
6400; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i32> [[X]], i64 1
6401; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
6402; CHECK-NEXT:    [[TMP44:%.*]] = ashr i32 [[TMP42]], 31
6403; CHECK-NEXT:    [[TMP45:%.*]] = ashr i32 [[TMP43]], 31
6404; CHECK-NEXT:    [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]]
6405; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]]
6406; CHECK-NEXT:    [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]]
6407; CHECK-NEXT:    [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]]
6408; CHECK-NEXT:    [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]]
6409; CHECK-NEXT:    [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float
6410; CHECK-NEXT:    [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]])
6411; CHECK-NEXT:    [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000
6412; CHECK-NEXT:    [[TMP54:%.*]] = fptoui float [[TMP53]] to i32
6413; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 0, [[TMP50]]
6414; CHECK-NEXT:    [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]]
6415; CHECK-NEXT:    [[TMP57:%.*]] = zext i32 [[TMP54]] to i64
6416; CHECK-NEXT:    [[TMP58:%.*]] = zext i32 [[TMP56]] to i64
6417; CHECK-NEXT:    [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]]
6418; CHECK-NEXT:    [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32
6419; CHECK-NEXT:    [[TMP61:%.*]] = lshr i64 [[TMP59]], 32
6420; CHECK-NEXT:    [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32
6421; CHECK-NEXT:    [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]]
6422; CHECK-NEXT:    [[TMP64:%.*]] = zext i32 [[TMP49]] to i64
6423; CHECK-NEXT:    [[TMP65:%.*]] = zext i32 [[TMP63]] to i64
6424; CHECK-NEXT:    [[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]]
6425; CHECK-NEXT:    [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32
6426; CHECK-NEXT:    [[TMP68:%.*]] = lshr i64 [[TMP66]], 32
6427; CHECK-NEXT:    [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32
6428; CHECK-NEXT:    [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]]
6429; CHECK-NEXT:    [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]]
6430; CHECK-NEXT:    [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]]
6431; CHECK-NEXT:    [[TMP73:%.*]] = add i32 [[TMP69]], 1
6432; CHECK-NEXT:    [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]]
6433; CHECK-NEXT:    [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]]
6434; CHECK-NEXT:    [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]]
6435; CHECK-NEXT:    [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]]
6436; CHECK-NEXT:    [[TMP78:%.*]] = add i32 [[TMP74]], 1
6437; CHECK-NEXT:    [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]]
6438; CHECK-NEXT:    [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]]
6439; CHECK-NEXT:    [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]]
6440; CHECK-NEXT:    [[TMP82:%.*]] = insertelement <2 x i32> [[TMP41]], i32 [[TMP81]], i64 1
6441; CHECK-NEXT:    store <2 x i32> [[TMP82]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
6442; CHECK-NEXT:    ret void
6443;
6444; GFX6-LABEL: sdiv_v2i32_pow2_shl_denom:
6445; GFX6:       ; %bb.0:
6446; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
6447; GFX6-NEXT:    s_movk_i32 s10, 0x1000
6448; GFX6-NEXT:    s_mov_b32 s13, 0x4f7ffffe
6449; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6450; GFX6-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xb
6451; GFX6-NEXT:    s_mov_b32 s7, 0xf000
6452; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6453; GFX6-NEXT:    s_lshl_b32 s2, s10, s2
6454; GFX6-NEXT:    s_ashr_i32 s11, s2, 31
6455; GFX6-NEXT:    s_add_i32 s2, s2, s11
6456; GFX6-NEXT:    s_xor_b32 s12, s2, s11
6457; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s12
6458; GFX6-NEXT:    s_lshl_b32 s0, s10, s3
6459; GFX6-NEXT:    s_sub_i32 s3, 0, s12
6460; GFX6-NEXT:    s_ashr_i32 s2, s0, 31
6461; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6462; GFX6-NEXT:    s_add_i32 s0, s0, s2
6463; GFX6-NEXT:    s_xor_b32 s10, s0, s2
6464; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s10
6465; GFX6-NEXT:    v_mul_f32_e32 v0, s13, v0
6466; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
6467; GFX6-NEXT:    s_ashr_i32 s1, s8, 31
6468; GFX6-NEXT:    s_add_i32 s0, s8, s1
6469; GFX6-NEXT:    s_xor_b32 s0, s0, s1
6470; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
6471; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
6472; GFX6-NEXT:    s_xor_b32 s3, s1, s11
6473; GFX6-NEXT:    s_mov_b32 s6, -1
6474; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
6475; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
6476; GFX6-NEXT:    v_mul_hi_u32 v0, s0, v0
6477; GFX6-NEXT:    v_mul_f32_e32 v1, s13, v2
6478; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
6479; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s12
6480; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
6481; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s0, v2
6482; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v2
6483; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
6484; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s12, v2
6485; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
6486; GFX6-NEXT:    s_sub_i32 s0, 0, s10
6487; GFX6-NEXT:    v_mul_lo_u32 v3, s0, v1
6488; GFX6-NEXT:    s_ashr_i32 s0, s9, 31
6489; GFX6-NEXT:    s_add_i32 s1, s9, s0
6490; GFX6-NEXT:    s_xor_b32 s1, s1, s0
6491; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
6492; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
6493; GFX6-NEXT:    s_xor_b32 s2, s0, s2
6494; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
6495; GFX6-NEXT:    v_mul_hi_u32 v1, s1, v1
6496; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s12, v2
6497; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
6498; GFX6-NEXT:    v_xor_b32_e32 v0, s3, v0
6499; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s10
6500; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
6501; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s3, v0
6502; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s1, v2
6503; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v2
6504; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
6505; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s10, v2
6506; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
6507; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
6508; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
6509; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
6510; GFX6-NEXT:    v_xor_b32_e32 v1, s2, v1
6511; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s2, v1
6512; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
6513; GFX6-NEXT:    s_endpgm
6514; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom:
6515; GFX9:       ; %bb.0:
6516; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
6517; GFX9-NEXT:    s_movk_i32 s8, 0x1000
6518; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
6519; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x2c
6520; GFX9-NEXT:    s_mov_b32 s11, 0x4f7ffffe
6521; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6522; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6523; GFX9-NEXT:    s_lshl_b32 s2, s8, s2
6524; GFX9-NEXT:    s_ashr_i32 s9, s2, 31
6525; GFX9-NEXT:    s_add_i32 s2, s2, s9
6526; GFX9-NEXT:    s_xor_b32 s10, s2, s9
6527; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s10
6528; GFX9-NEXT:    s_lshl_b32 s0, s8, s3
6529; GFX9-NEXT:    s_ashr_i32 s1, s0, 31
6530; GFX9-NEXT:    s_add_i32 s0, s0, s1
6531; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6532; GFX9-NEXT:    s_xor_b32 s8, s0, s1
6533; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s8
6534; GFX9-NEXT:    s_sub_i32 s0, 0, s10
6535; GFX9-NEXT:    v_mul_f32_e32 v0, s11, v0
6536; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
6537; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
6538; GFX9-NEXT:    s_sub_i32 s3, 0, s8
6539; GFX9-NEXT:    v_mul_lo_u32 v3, s0, v0
6540; GFX9-NEXT:    v_mul_f32_e32 v1, s11, v1
6541; GFX9-NEXT:    s_ashr_i32 s0, s6, 31
6542; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
6543; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v3
6544; GFX9-NEXT:    s_add_i32 s2, s6, s0
6545; GFX9-NEXT:    s_xor_b32 s2, s2, s0
6546; GFX9-NEXT:    s_xor_b32 s0, s0, s9
6547; GFX9-NEXT:    v_add_u32_e32 v0, v0, v3
6548; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
6549; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v1
6550; GFX9-NEXT:    s_ashr_i32 s3, s7, 31
6551; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s10
6552; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
6553; GFX9-NEXT:    v_add_u32_e32 v5, 1, v0
6554; GFX9-NEXT:    v_sub_u32_e32 v4, s2, v4
6555; GFX9-NEXT:    s_add_i32 s2, s7, s3
6556; GFX9-NEXT:    s_xor_b32 s2, s2, s3
6557; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
6558; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v4
6559; GFX9-NEXT:    v_mul_hi_u32 v1, s2, v1
6560; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
6561; GFX9-NEXT:    v_subrev_u32_e32 v5, s10, v4
6562; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
6563; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v4
6564; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
6565; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
6566; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s8
6567; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
6568; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v0
6569; GFX9-NEXT:    v_subrev_u32_e32 v0, s0, v0
6570; GFX9-NEXT:    v_sub_u32_e32 v3, s2, v3
6571; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
6572; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
6573; GFX9-NEXT:    v_subrev_u32_e32 v4, s8, v3
6574; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
6575; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
6576; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
6577; GFX9-NEXT:    s_xor_b32 s0, s3, s1
6578; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
6579; GFX9-NEXT:    v_xor_b32_e32 v1, s0, v1
6580; GFX9-NEXT:    v_subrev_u32_e32 v1, s0, v1
6581; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
6582; GFX9-NEXT:    s_endpgm
6583  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
6584  %r = sdiv <2 x i32> %x, %shl.y
6585  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
6586  ret void
6587}
6588
; Scalar signed remainder of %x by the odd constant 1235195, stored to
; %out. The opt-level checks expect the srem to be left untouched in the IR.
; The llc checks pin a quotient computed by multiply-high with
; 0xd9528441 (add dividend, ashr 20, add sign bit), then a multiply of that
; quotient by 0x12d8fb (= 1235195) and a subtraction from the dividend.
6589define amdgpu_kernel void @srem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
6590; CHECK-LABEL: @srem_i32_oddk_denom(
6591; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], 1235195
6592; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
6593; CHECK-NEXT:    ret void
6594;
6595; GFX6-LABEL: srem_i32_oddk_denom:
6596; GFX6:       ; %bb.0:
6597; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
6598; GFX6-NEXT:    v_mov_b32_e32 v0, 0xd9528441
6599; GFX6-NEXT:    s_mov_b32 s2, 0x12d8fb
6600; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6601; GFX6-NEXT:    s_mov_b32 s3, 0xf000
6602; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6603; GFX6-NEXT:    v_mul_hi_i32 v0, s4, v0
6604; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
6605; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
6606; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 20, v0
6607; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
6608; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
6609; GFX6-NEXT:    s_mov_b32 s2, -1
6610; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
6611; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6612; GFX6-NEXT:    s_endpgm
6613; GFX9-LABEL: srem_i32_oddk_denom:
6614; GFX9:       ; %bb.0:
6615; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
6616; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
6617; GFX9-NEXT:    v_mov_b32_e32 v0, 0
6618; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6619; GFX9-NEXT:    s_mul_hi_i32 s0, s4, 0xd9528441
6620; GFX9-NEXT:    s_add_i32 s0, s0, s4
6621; GFX9-NEXT:    s_lshr_b32 s1, s0, 31
6622; GFX9-NEXT:    s_ashr_i32 s0, s0, 20
6623; GFX9-NEXT:    s_add_i32 s0, s0, s1
6624; GFX9-NEXT:    s_mul_i32 s0, s0, 0x12d8fb
6625; GFX9-NEXT:    s_sub_i32 s0, s4, s0
6626; GFX9-NEXT:    v_mov_b32_e32 v1, s0
6627; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
6628; GFX9-NEXT:    s_endpgm
6629  %r = srem i32 %x, 1235195
6630  store i32 %r, i32 addrspace(1)* %out
6631  ret void
6632}
6633
; Scalar signed remainder of %x by the power-of-two constant 4096, stored
; to %out. The opt-level checks expect the srem to remain in the IR.
; The llc checks pin a shift/mask sequence on both targets:
; ashr 31, lshr 20, add to the dividend, and with 0xfffff000, then subtract
; the masked value from the dividend.
6634define amdgpu_kernel void @srem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
6635; CHECK-LABEL: @srem_i32_pow2k_denom(
6636; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], 4096
6637; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
6638; CHECK-NEXT:    ret void
6639;
6640; GFX6-LABEL: srem_i32_pow2k_denom:
6641; GFX6:       ; %bb.0:
6642; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6643; GFX6-NEXT:    s_load_dword s0, s[0:1], 0xb
6644; GFX6-NEXT:    s_mov_b32 s7, 0xf000
6645; GFX6-NEXT:    s_mov_b32 s6, -1
6646; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6647; GFX6-NEXT:    s_ashr_i32 s1, s0, 31
6648; GFX6-NEXT:    s_lshr_b32 s1, s1, 20
6649; GFX6-NEXT:    s_add_i32 s1, s0, s1
6650; GFX6-NEXT:    s_and_b32 s1, s1, 0xfffff000
6651; GFX6-NEXT:    s_sub_i32 s0, s0, s1
6652; GFX6-NEXT:    v_mov_b32_e32 v0, s0
6653; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
6654; GFX6-NEXT:    s_endpgm
6655; GFX9-LABEL: srem_i32_pow2k_denom:
6656; GFX9:       ; %bb.0:
6657; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
6658; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
6659; GFX9-NEXT:    v_mov_b32_e32 v0, 0
6660; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6661; GFX9-NEXT:    s_ashr_i32 s0, s4, 31
6662; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
6663; GFX9-NEXT:    s_add_i32 s0, s4, s0
6664; GFX9-NEXT:    s_and_b32 s0, s0, 0xfffff000
6665; GFX9-NEXT:    s_sub_i32 s0, s4, s0
6666; GFX9-NEXT:    v_mov_b32_e32 v1, s0
6667; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
6668; GFX9-NEXT:    s_endpgm
6669  %r = srem i32 %x, 4096
6670  store i32 %r, i32 addrspace(1)* %out
6671  ret void
6672}
6673
; Test: scalar i32 signed remainder where the divisor is 4096 shifted left by
; the run-time value %y; the remainder is stored through %out.
; The opt assertions below expect the srem instruction to be left in place.
; The tahiti and gfx900 llc assertions expect a v_rcp_iflag_f32 based sequence
; with two compare/select correction steps, followed by an xor/subtract against
; an ashr-by-31 value.
6674define amdgpu_kernel void @srem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
6675; CHECK-LABEL: @srem_i32_pow2_shl_denom(
6676; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
6677; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], [[SHL_Y]]
6678; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
6679; CHECK-NEXT:    ret void
6680;
6681; GFX6-LABEL: srem_i32_pow2_shl_denom:
6682; GFX6:       ; %bb.0:
6683; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
6684; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6685; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6686; GFX6-NEXT:    s_lshl_b32 s3, 0x1000, s3
6687; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
6688; GFX6-NEXT:    s_add_i32 s3, s3, s4
6689; GFX6-NEXT:    s_xor_b32 s6, s3, s4
6690; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s6
6691; GFX6-NEXT:    s_sub_i32 s3, 0, s6
6692; GFX6-NEXT:    s_ashr_i32 s4, s2, 31
6693; GFX6-NEXT:    s_add_i32 s2, s2, s4
6694; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6695; GFX6-NEXT:    s_xor_b32 s5, s2, s4
6696; GFX6-NEXT:    s_mov_b32 s2, -1
6697; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
6698; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
6699; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
6700; GFX6-NEXT:    s_mov_b32 s3, 0xf000
6701; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
6702; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
6703; GFX6-NEXT:    v_mul_hi_u32 v0, s5, v0
6704; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s6
6705; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s5, v0
6706; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s6, v0
6707; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
6708; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
6709; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s6, v0
6710; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
6711; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
6712; GFX6-NEXT:    v_xor_b32_e32 v0, s4, v0
6713; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s4, v0
6714; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6715; GFX6-NEXT:    s_endpgm
6716; GFX9-LABEL: srem_i32_pow2_shl_denom:
6717; GFX9:       ; %bb.0:
6718; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
6719; GFX9-NEXT:    s_nop 0
6720; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
6721; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6722; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s3
6723; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
6724; GFX9-NEXT:    s_add_i32 s3, s3, s4
6725; GFX9-NEXT:    s_xor_b32 s3, s3, s4
6726; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
6727; GFX9-NEXT:    s_sub_i32 s4, 0, s3
6728; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6729; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
6730; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
6731; GFX9-NEXT:    v_mul_lo_u32 v1, s4, v0
6732; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
6733; GFX9-NEXT:    s_add_i32 s2, s2, s4
6734; GFX9-NEXT:    s_xor_b32 s2, s2, s4
6735; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
6736; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
6737; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
6738; GFX9-NEXT:    v_mov_b32_e32 v1, 0
6739; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
6740; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
6741; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v0
6742; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
6743; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6744; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v0
6745; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
6746; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6747; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
6748; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
6749; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
6750; GFX9-NEXT:    s_endpgm
  ; The divisor is not a compile-time constant: it is 4096 << %y.
6751  %shl.y = shl i32 4096, %y
6752  %r = srem i32 %x, %shl.y
6753  store i32 %r, i32 addrspace(1)* %out
6754  ret void
6755}
6756
; Test: <2 x i32> signed remainder by the constant splat <4096, 4096>; the
; result vector is stored through %out.
; The opt assertions below expect the vector srem to be split into one scalar
; "srem i32 ..., 4096" per element, reassembled with insertelement.
; The tahiti and gfx900 llc assertions expect, per element, a scalar
; ashr/lshr/add/and/sub sequence with no reciprocal or multiply instructions.
6757define amdgpu_kernel void @srem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
6758; CHECK-LABEL: @srem_v2i32_pow2k_denom(
6759; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
6760; CHECK-NEXT:    [[TMP2:%.*]] = srem i32 [[TMP1]], 4096
6761; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
6762; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
6763; CHECK-NEXT:    [[TMP5:%.*]] = srem i32 [[TMP4]], 4096
6764; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
6765; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
6766; CHECK-NEXT:    ret void
6767;
6768; GFX6-LABEL: srem_v2i32_pow2k_denom:
6769; GFX6:       ; %bb.0:
6770; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6771; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6772; GFX6-NEXT:    s_movk_i32 s2, 0xf000
6773; GFX6-NEXT:    s_mov_b32 s7, 0xf000
6774; GFX6-NEXT:    s_mov_b32 s6, -1
6775; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6776; GFX6-NEXT:    s_ashr_i32 s3, s0, 31
6777; GFX6-NEXT:    s_lshr_b32 s3, s3, 20
6778; GFX6-NEXT:    s_add_i32 s3, s0, s3
6779; GFX6-NEXT:    s_and_b32 s3, s3, s2
6780; GFX6-NEXT:    s_sub_i32 s0, s0, s3
6781; GFX6-NEXT:    s_ashr_i32 s3, s1, 31
6782; GFX6-NEXT:    s_lshr_b32 s3, s3, 20
6783; GFX6-NEXT:    s_add_i32 s3, s1, s3
6784; GFX6-NEXT:    s_and_b32 s2, s3, s2
6785; GFX6-NEXT:    s_sub_i32 s1, s1, s2
6786; GFX6-NEXT:    v_mov_b32_e32 v0, s0
6787; GFX6-NEXT:    v_mov_b32_e32 v1, s1
6788; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
6789; GFX6-NEXT:    s_endpgm
6790; GFX9-LABEL: srem_v2i32_pow2k_denom:
6791; GFX9:       ; %bb.0:
6792; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
6793; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
6794; GFX9-NEXT:    s_movk_i32 s6, 0xf000
6795; GFX9-NEXT:    v_mov_b32_e32 v2, 0
6796; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6797; GFX9-NEXT:    s_ashr_i32 s0, s4, 31
6798; GFX9-NEXT:    s_ashr_i32 s1, s5, 31
6799; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
6800; GFX9-NEXT:    s_lshr_b32 s1, s1, 20
6801; GFX9-NEXT:    s_add_i32 s0, s4, s0
6802; GFX9-NEXT:    s_add_i32 s1, s5, s1
6803; GFX9-NEXT:    s_and_b32 s0, s0, s6
6804; GFX9-NEXT:    s_and_b32 s1, s1, s6
6805; GFX9-NEXT:    s_sub_i32 s0, s4, s0
6806; GFX9-NEXT:    s_sub_i32 s1, s5, s1
6807; GFX9-NEXT:    v_mov_b32_e32 v0, s0
6808; GFX9-NEXT:    v_mov_b32_e32 v1, s1
6809; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
6810; GFX9-NEXT:    s_endpgm
6811  %r = srem <2 x i32> %x, <i32 4096, i32 4096>
6812  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
6813  ret void
6814}
6815
; Test: <2 x i32> signed remainder where each divisor lane is 4096 shifted left
; by the matching lane of the run-time vector %y; the result vector is stored
; through %out.
; The opt assertions below expect a full per-element expansion at the IR level:
; ashr/add/xor on both operands, uitofp + llvm.amdgcn.rcp.f32 + fmul + fptoui,
; 64-bit multiplies with lshr-by-32 to take high halves, two icmp-uge/sub/select
; correction steps, and a final xor/sub using the ashr-by-31 of the %x element.
; The tahiti and gfx900 llc assertions expect the matching v_rcp_iflag_f32 based
; instruction sequence for both lanes.
6816define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
6817; CHECK-LABEL: @srem_v2i32_pow2_shl_denom(
6818; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
6819; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
6820; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
6821; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
6822; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
6823; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]]
6824; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]]
6825; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]]
6826; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]]
6827; CHECK-NEXT:    [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float
6828; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
6829; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000
6830; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP11]] to i32
6831; CHECK-NEXT:    [[TMP13:%.*]] = sub i32 0, [[TMP8]]
6832; CHECK-NEXT:    [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]]
6833; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP12]] to i64
6834; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP14]] to i64
6835; CHECK-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]]
6836; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
6837; CHECK-NEXT:    [[TMP19:%.*]] = lshr i64 [[TMP17]], 32
6838; CHECK-NEXT:    [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32
6839; CHECK-NEXT:    [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]]
6840; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP7]] to i64
6841; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP21]] to i64
6842; CHECK-NEXT:    [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]]
6843; CHECK-NEXT:    [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
6844; CHECK-NEXT:    [[TMP26:%.*]] = lshr i64 [[TMP24]], 32
6845; CHECK-NEXT:    [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32
6846; CHECK-NEXT:    [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]]
6847; CHECK-NEXT:    [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]]
6848; CHECK-NEXT:    [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]]
6849; CHECK-NEXT:    [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]]
6850; CHECK-NEXT:    [[TMP32:%.*]] = select i1 [[TMP30]], i32 [[TMP31]], i32 [[TMP29]]
6851; CHECK-NEXT:    [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]]
6852; CHECK-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]]
6853; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]]
6854; CHECK-NEXT:    [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]]
6855; CHECK-NEXT:    [[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]]
6856; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <2 x i32> undef, i32 [[TMP37]], i64 0
6857; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <2 x i32> [[X]], i64 1
6858; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
6859; CHECK-NEXT:    [[TMP41:%.*]] = ashr i32 [[TMP39]], 31
6860; CHECK-NEXT:    [[TMP42:%.*]] = ashr i32 [[TMP40]], 31
6861; CHECK-NEXT:    [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]]
6862; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]]
6863; CHECK-NEXT:    [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]]
6864; CHECK-NEXT:    [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]]
6865; CHECK-NEXT:    [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float
6866; CHECK-NEXT:    [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]])
6867; CHECK-NEXT:    [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000
6868; CHECK-NEXT:    [[TMP50:%.*]] = fptoui float [[TMP49]] to i32
6869; CHECK-NEXT:    [[TMP51:%.*]] = sub i32 0, [[TMP46]]
6870; CHECK-NEXT:    [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]]
6871; CHECK-NEXT:    [[TMP53:%.*]] = zext i32 [[TMP50]] to i64
6872; CHECK-NEXT:    [[TMP54:%.*]] = zext i32 [[TMP52]] to i64
6873; CHECK-NEXT:    [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]]
6874; CHECK-NEXT:    [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32
6875; CHECK-NEXT:    [[TMP57:%.*]] = lshr i64 [[TMP55]], 32
6876; CHECK-NEXT:    [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32
6877; CHECK-NEXT:    [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]]
6878; CHECK-NEXT:    [[TMP60:%.*]] = zext i32 [[TMP45]] to i64
6879; CHECK-NEXT:    [[TMP61:%.*]] = zext i32 [[TMP59]] to i64
6880; CHECK-NEXT:    [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]]
6881; CHECK-NEXT:    [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32
6882; CHECK-NEXT:    [[TMP64:%.*]] = lshr i64 [[TMP62]], 32
6883; CHECK-NEXT:    [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32
6884; CHECK-NEXT:    [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]]
6885; CHECK-NEXT:    [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]]
6886; CHECK-NEXT:    [[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]]
6887; CHECK-NEXT:    [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]]
6888; CHECK-NEXT:    [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]]
6889; CHECK-NEXT:    [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]]
6890; CHECK-NEXT:    [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]]
6891; CHECK-NEXT:    [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]]
6892; CHECK-NEXT:    [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]]
6893; CHECK-NEXT:    [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]]
6894; CHECK-NEXT:    [[TMP76:%.*]] = insertelement <2 x i32> [[TMP38]], i32 [[TMP75]], i64 1
6895; CHECK-NEXT:    store <2 x i32> [[TMP76]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
6896; CHECK-NEXT:    ret void
6897;
6898; GFX6-LABEL: srem_v2i32_pow2_shl_denom:
6899; GFX6:       ; %bb.0:
6900; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
6901; GFX6-NEXT:    s_movk_i32 s6, 0x1000
6902; GFX6-NEXT:    s_mov_b32 s10, 0x4f7ffffe
6903; GFX6-NEXT:    s_mov_b32 s7, 0xf000
6904; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6905; GFX6-NEXT:    s_lshl_b32 s2, s6, s2
6906; GFX6-NEXT:    s_ashr_i32 s4, s2, 31
6907; GFX6-NEXT:    s_add_i32 s2, s2, s4
6908; GFX6-NEXT:    s_xor_b32 s9, s2, s4
6909; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s9
6910; GFX6-NEXT:    s_lshl_b32 s2, s6, s3
6911; GFX6-NEXT:    s_ashr_i32 s6, s2, 31
6912; GFX6-NEXT:    s_add_i32 s2, s2, s6
6913; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6914; GFX6-NEXT:    s_sub_i32 s8, 0, s9
6915; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6916; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
6917; GFX6-NEXT:    v_mul_f32_e32 v0, s10, v0
6918; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
6919; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
6920; GFX6-NEXT:    s_ashr_i32 s3, s0, 31
6921; GFX6-NEXT:    s_add_i32 s0, s0, s3
6922; GFX6-NEXT:    v_mul_lo_u32 v1, s8, v0
6923; GFX6-NEXT:    s_xor_b32 s8, s2, s6
6924; GFX6-NEXT:    v_cvt_f32_u32_e32 v2, s8
6925; GFX6-NEXT:    s_xor_b32 s0, s0, s3
6926; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
6927; GFX6-NEXT:    s_sub_i32 s2, 0, s8
6928; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v2
6929; GFX6-NEXT:    s_mov_b32 s6, -1
6930; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
6931; GFX6-NEXT:    v_mul_hi_u32 v0, s0, v0
6932; GFX6-NEXT:    v_mul_f32_e32 v1, s10, v2
6933; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
6934; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s9
6935; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
6936; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
6937; GFX6-NEXT:    s_ashr_i32 s0, s1, 31
6938; GFX6-NEXT:    v_mul_hi_u32 v2, v1, v2
6939; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v0
6940; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v0
6941; GFX6-NEXT:    s_add_i32 s1, s1, s0
6942; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
6943; GFX6-NEXT:    s_xor_b32 s1, s1, s0
6944; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
6945; GFX6-NEXT:    v_mul_hi_u32 v1, s1, v1
6946; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v0
6947; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v0
6948; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
6949; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s8
6950; GFX6-NEXT:    v_xor_b32_e32 v0, s3, v0
6951; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s3, v0
6952; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
6953; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s8, v1
6954; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v1
6955; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6956; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s8, v1
6957; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v1
6958; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6959; GFX6-NEXT:    v_xor_b32_e32 v1, s0, v1
6960; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s0, v1
6961; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
6962; GFX6-NEXT:    s_endpgm
6963; GFX9-LABEL: srem_v2i32_pow2_shl_denom:
6964; GFX9:       ; %bb.0:
6965; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
6966; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
6967; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x34
6968; GFX9-NEXT:    s_movk_i32 s8, 0x1000
6969; GFX9-NEXT:    s_mov_b32 s9, 0x4f7ffffe
6970; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6971; GFX9-NEXT:    s_lshl_b32 s0, s8, s6
6972; GFX9-NEXT:    s_ashr_i32 s1, s0, 31
6973; GFX9-NEXT:    s_add_i32 s0, s0, s1
6974; GFX9-NEXT:    s_xor_b32 s0, s0, s1
6975; GFX9-NEXT:    s_lshl_b32 s1, s8, s7
6976; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s0
6977; GFX9-NEXT:    s_ashr_i32 s6, s1, 31
6978; GFX9-NEXT:    s_add_i32 s1, s1, s6
6979; GFX9-NEXT:    s_xor_b32 s1, s1, s6
6980; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s1
6981; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
6982; GFX9-NEXT:    s_sub_i32 s7, 0, s0
6983; GFX9-NEXT:    s_ashr_i32 s6, s4, 31
6984; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
6985; GFX9-NEXT:    v_mul_f32_e32 v0, s9, v0
6986; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
6987; GFX9-NEXT:    s_add_i32 s4, s4, s6
6988; GFX9-NEXT:    v_mul_f32_e32 v1, s9, v1
6989; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
6990; GFX9-NEXT:    v_mul_lo_u32 v2, s7, v0
6991; GFX9-NEXT:    s_sub_i32 s7, 0, s1
6992; GFX9-NEXT:    s_xor_b32 s4, s4, s6
6993; GFX9-NEXT:    v_mul_lo_u32 v3, s7, v1
6994; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
6995; GFX9-NEXT:    s_ashr_i32 s7, s5, 31
6996; GFX9-NEXT:    s_add_i32 s5, s5, s7
6997; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
6998; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
6999; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
7000; GFX9-NEXT:    s_xor_b32 s5, s5, s7
7001; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
7002; GFX9-NEXT:    v_mul_hi_u32 v1, s5, v1
7003; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s0
7004; GFX9-NEXT:    v_mov_b32_e32 v2, 0
7005; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s1
7006; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
7007; GFX9-NEXT:    v_subrev_u32_e32 v3, s0, v0
7008; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s0, v0
7009; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
7010; GFX9-NEXT:    v_subrev_u32_e32 v3, s0, v0
7011; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s0, v0
7012; GFX9-NEXT:    v_sub_u32_e32 v1, s5, v1
7013; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
7014; GFX9-NEXT:    v_subrev_u32_e32 v3, s1, v1
7015; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s1, v1
7016; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
7017; GFX9-NEXT:    v_subrev_u32_e32 v3, s1, v1
7018; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s1, v1
7019; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
7020; GFX9-NEXT:    v_xor_b32_e32 v0, s6, v0
7021; GFX9-NEXT:    v_xor_b32_e32 v1, s7, v1
7022; GFX9-NEXT:    v_subrev_u32_e32 v0, s6, v0
7023; GFX9-NEXT:    v_subrev_u32_e32 v1, s7, v1
7024; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
7025; GFX9-NEXT:    s_endpgm
  ; Each divisor lane is 4096 << %y[lane], computed at run time.
7026  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
7027  %r = srem <2 x i32> %x, %shl.y
7028  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
7029  ret void
7030}
7031
; Test: scalar i64 unsigned division by the odd constant 1235195949943; the
; quotient is stored through %out.
; The opt assertions below expect the udiv instruction to be left in place.
; The tahiti and gfx900 llc assertions expect a long expansion: a v_rcp_f32 on a
; value built from float constants, several v_mul_lo/v_mul_hi rounds with carry
; chains, and a final compare/select between the computed 64-bit value and that
; value plus 1 or plus 2.
; 1235195949943 equals 0x11f976a7377, which matches the 0x11f and 0x976a7377
; constants materialized in the expected instruction sequences.
7032define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
7033; CHECK-LABEL: @udiv_i64_oddk_denom(
7034; CHECK-NEXT:    [[R:%.*]] = udiv i64 [[X:%.*]], 1235195949943
7035; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
7036; CHECK-NEXT:    ret void
7037;
7038; GFX6-LABEL: udiv_i64_oddk_denom:
7039; GFX6:       ; %bb.0:
7040; GFX6-NEXT:    v_mov_b32_e32 v0, 0x4f176a73
7041; GFX6-NEXT:    v_mov_b32_e32 v1, 0x4f800000
7042; GFX6-NEXT:    v_madmk_f32 v0, v1, 0x438f8000, v0
7043; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
7044; GFX6-NEXT:    s_movk_i32 s2, 0xfee0
7045; GFX6-NEXT:    s_mov_b32 s3, 0x68958c89
7046; GFX6-NEXT:    v_mov_b32_e32 v8, 0
7047; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
7048; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
7049; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
7050; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
7051; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
7052; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
7053; GFX6-NEXT:    v_mov_b32_e32 v7, 0
7054; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
7055; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s2
7056; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s3
7057; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s3
7058; GFX6-NEXT:    s_mov_b32 s11, 0xf000
7059; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7060; GFX6-NEXT:    s_mov_b32 s8, s4
7061; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7062; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s3
7063; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
7064; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
7065; GFX6-NEXT:    v_mul_hi_u32 v4, v0, v2
7066; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v3
7067; GFX6-NEXT:    v_mul_hi_u32 v9, v1, v2
7068; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
7069; GFX6-NEXT:    s_movk_i32 s4, 0x11e
7070; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
7071; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v3
7072; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
7073; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
7074; GFX6-NEXT:    s_mov_b32 s10, -1
7075; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
7076; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v4, v3, vcc
7077; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
7078; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7079; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
7080; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
7081; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s2
7082; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s3
7083; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
7084; GFX6-NEXT:    v_mul_lo_u32 v6, v2, s3
7085; GFX6-NEXT:    s_mov_b32 s2, 0x976a7377
7086; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
7087; GFX6-NEXT:    v_mul_lo_u32 v5, v0, s3
7088; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
7089; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v4
7090; GFX6-NEXT:    v_mul_hi_u32 v10, v0, v4
7091; GFX6-NEXT:    v_mul_hi_u32 v9, v0, v5
7092; GFX6-NEXT:    v_mul_hi_u32 v11, v2, v4
7093; GFX6-NEXT:    s_movk_i32 s3, 0x11f
7094; GFX6-NEXT:    s_mov_b32 s9, s5
7095; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
7096; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v8, v10, vcc
7097; GFX6-NEXT:    v_mul_lo_u32 v10, v2, v5
7098; GFX6-NEXT:    v_mul_hi_u32 v5, v2, v5
7099; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v4
7100; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
7101; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v9, v5, vcc
7102; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v11, v7, vcc
7103; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
7104; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
7105; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
7106; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[0:1]
7107; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
7108; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7109; GFX6-NEXT:    v_mul_lo_u32 v2, s6, v1
7110; GFX6-NEXT:    v_mul_hi_u32 v3, s6, v0
7111; GFX6-NEXT:    v_mul_hi_u32 v4, s6, v1
7112; GFX6-NEXT:    v_mul_hi_u32 v5, s7, v1
7113; GFX6-NEXT:    v_mul_lo_u32 v1, s7, v1
7114; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7115; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
7116; GFX6-NEXT:    v_mul_lo_u32 v4, s7, v0
7117; GFX6-NEXT:    v_mul_hi_u32 v0, s7, v0
7118; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
7119; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
7120; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
7121; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
7122; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
7123; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s3
7124; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s2
7125; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s2
7126; GFX6-NEXT:    v_mov_b32_e32 v5, s3
7127; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7128; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s2
7129; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
7130; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s7, v2
7131; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s6, v3
7132; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
7133; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s2, v3
7134; GFX6-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
7135; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s4, v4
7136; GFX6-NEXT:    s_mov_b32 s2, 0x976a7376
7137; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
7138; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s2, v5
7139; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
7140; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v4
7141; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
7142; GFX6-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
7143; GFX6-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
7144; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 1, v0
7145; GFX6-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
7146; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
7147; GFX6-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[0:1]
7148; GFX6-NEXT:    v_mov_b32_e32 v6, s7
7149; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
7150; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v2
7151; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
7152; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s2, v3
7153; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
7154; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s3, v2
7155; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
7156; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
7157; GFX6-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
7158; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
7159; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
7160; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
7161; GFX6-NEXT:    s_endpgm
7162; GFX9-LABEL: udiv_i64_oddk_denom:
7163; GFX9:       ; %bb.0:
7164; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4f176a73
7165; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4f800000
7166; GFX9-NEXT:    v_madmk_f32 v0, v1, 0x438f8000, v0
7167; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
7168; GFX9-NEXT:    s_movk_i32 s4, 0xfee0
7169; GFX9-NEXT:    s_mov_b32 s5, 0x68958c89
7170; GFX9-NEXT:    v_mov_b32_e32 v6, 0
7171; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
7172; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
7173; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
7174; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
7175; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
7176; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
7177; GFX9-NEXT:    s_movk_i32 s8, 0x11f
7178; GFX9-NEXT:    s_mov_b32 s9, 0x976a7376
7179; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s4
7180; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s5
7181; GFX9-NEXT:    v_mul_lo_u32 v5, v1, s5
7182; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s5
7183; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
7184; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
7185; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
7186; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v2
7187; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
7188; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
7189; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
7190; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
7191; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
7192; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v4
7193; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
7194; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
7195; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
7196; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v6, vcc
7197; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
7198; GFX9-NEXT:    v_mov_b32_e32 v5, 0
7199; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
7200; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
7201; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
7202; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s4
7203; GFX9-NEXT:    v_mul_hi_u32 v7, v0, s5
7204; GFX9-NEXT:    v_mul_lo_u32 v8, v2, s5
7205; GFX9-NEXT:    v_mul_lo_u32 v9, v0, s5
7206; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
7207; GFX9-NEXT:    v_add_u32_e32 v4, v7, v4
7208; GFX9-NEXT:    v_add_u32_e32 v4, v4, v8
7209; GFX9-NEXT:    v_mul_lo_u32 v7, v0, v4
7210; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v9
7211; GFX9-NEXT:    v_mul_hi_u32 v10, v0, v4
7212; GFX9-NEXT:    v_mul_hi_u32 v11, v2, v4
7213; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
7214; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v7
7215; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v10, vcc
7216; GFX9-NEXT:    v_mul_lo_u32 v10, v2, v9
7217; GFX9-NEXT:    v_mul_hi_u32 v9, v2, v9
7218; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4
7219; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v10
7220; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v8, v9, vcc
7221; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v11, v6, vcc
7222; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v7, v2
7223; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v5, v4, vcc
7224; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3]
7225; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
7226; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
7227; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7228; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v1
7229; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v0
7230; GFX9-NEXT:    v_mul_hi_u32 v4, s6, v1
7231; GFX9-NEXT:    v_mul_hi_u32 v7, s7, v1
7232; GFX9-NEXT:    v_mul_lo_u32 v1, s7, v1
7233; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
7234; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
7235; GFX9-NEXT:    v_mul_lo_u32 v4, s7, v0
7236; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
7237; GFX9-NEXT:    s_mov_b32 s2, 0x976a7377
7238; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
7239; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
7240; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v7, v6, vcc
7241; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
7242; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v2, vcc
7243; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s8
7244; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s2
7245; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s2
7246; GFX9-NEXT:    v_mov_b32_e32 v5, s8
7247; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
7248; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s2
7249; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
7250; GFX9-NEXT:    v_sub_u32_e32 v4, s7, v2
7251; GFX9-NEXT:    v_sub_co_u32_e64 v3, s[0:1], s6, v3
7252; GFX9-NEXT:    v_subb_co_u32_e64 v4, vcc, v4, v5, s[0:1]
7253; GFX9-NEXT:    v_subrev_co_u32_e32 v5, vcc, s2, v3
7254; GFX9-NEXT:    v_subbrev_co_u32_e32 v4, vcc, 0, v4, vcc
7255; GFX9-NEXT:    s_movk_i32 s6, 0x11e
7256; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v4
7257; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
7258; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s9, v5
7259; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
7260; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v4
7261; GFX9-NEXT:    v_cndmask_b32_e32 v4, v7, v5, vcc
7262; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, 2, v0
7263; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
7264; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, 1, v0
7265; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v1, vcc
7266; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v4
7267; GFX9-NEXT:    v_cndmask_b32_e64 v4, v9, v7, s[2:3]
7268; GFX9-NEXT:    v_mov_b32_e32 v7, s7
7269; GFX9-NEXT:    v_subb_co_u32_e64 v2, vcc, v7, v2, s[0:1]
7270; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v2
7271; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
7272; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s9, v3
7273; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
7274; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v2
7275; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
7276; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
7277; GFX9-NEXT:    v_cndmask_b32_e64 v2, v8, v5, s[2:3]
7278; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
7279; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
7280; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[4:5]
7281; GFX9-NEXT:    s_endpgm
7282  %r = udiv i64 %x, 1235195949943
7283  store i64 %r, i64 addrspace(1)* %out
7284  ret void
7285}
7286
; Test: scalar i64 unsigned division by the constant 4096; the quotient is
; stored through %out.
; The opt assertions below expect the udiv instruction to be left in place.
; The tahiti and gfx900 llc assertions expect the division to become a single
; s_lshr_b64 by 12, with no reciprocal or multiply instructions.
7287define amdgpu_kernel void @udiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
7288; CHECK-LABEL: @udiv_i64_pow2k_denom(
7289; CHECK-NEXT:    [[R:%.*]] = udiv i64 [[X:%.*]], 4096
7290; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
7291; CHECK-NEXT:    ret void
7292;
7293; GFX6-LABEL: udiv_i64_pow2k_denom:
7294; GFX6:       ; %bb.0:
7295; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
7296; GFX6-NEXT:    s_mov_b32 s7, 0xf000
7297; GFX6-NEXT:    s_mov_b32 s6, -1
7298; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7299; GFX6-NEXT:    s_mov_b32 s4, s0
7300; GFX6-NEXT:    s_mov_b32 s5, s1
7301; GFX6-NEXT:    s_lshr_b64 s[0:1], s[2:3], 12
7302; GFX6-NEXT:    v_mov_b32_e32 v0, s0
7303; GFX6-NEXT:    v_mov_b32_e32 v1, s1
7304; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
7305; GFX6-NEXT:    s_endpgm
7306; GFX9-LABEL: udiv_i64_pow2k_denom:
7307; GFX9:       ; %bb.0:
7308; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
7309; GFX9-NEXT:    v_mov_b32_e32 v2, 0
7310; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7311; GFX9-NEXT:    s_lshr_b64 s[2:3], s[2:3], 12
7312; GFX9-NEXT:    v_mov_b32_e32 v0, s2
7313; GFX9-NEXT:    v_mov_b32_e32 v1, s3
7314; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
7315; GFX9-NEXT:    s_endpgm
7316  %r = udiv i64 %x, 4096
7317  store i64 %r, i64 addrspace(1)* %out
7318  ret void
7319}
7320
; Test: scalar i64 unsigned division where the divisor is 4096 shifted left by
; the run-time value %y; the quotient is stored through %out.
; The opt assertions below expect the shl and udiv instructions to be left in
; place.
; The tahiti and gfx900 llc assertions expect the division to become an
; s_add_i32 of 12 to the loaded shift amount followed by one s_lshr_b64 by that
; sum, with no reciprocal or multiply instructions.
7321define amdgpu_kernel void @udiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
7322; CHECK-LABEL: @udiv_i64_pow2_shl_denom(
7323; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
7324; CHECK-NEXT:    [[R:%.*]] = udiv i64 [[X:%.*]], [[SHL_Y]]
7325; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
7326; CHECK-NEXT:    ret void
7327;
7328; GFX6-LABEL: udiv_i64_pow2_shl_denom:
7329; GFX6:       ; %bb.0:
7330; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
7331; GFX6-NEXT:    s_load_dword s8, s[0:1], 0xd
7332; GFX6-NEXT:    s_mov_b32 s3, 0xf000
7333; GFX6-NEXT:    s_mov_b32 s2, -1
7334; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7335; GFX6-NEXT:    s_mov_b32 s0, s4
7336; GFX6-NEXT:    s_add_i32 s8, s8, 12
7337; GFX6-NEXT:    s_mov_b32 s1, s5
7338; GFX6-NEXT:    s_lshr_b64 s[4:5], s[6:7], s8
7339; GFX6-NEXT:    v_mov_b32_e32 v0, s4
7340; GFX6-NEXT:    v_mov_b32_e32 v1, s5
7341; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
7342; GFX6-NEXT:    s_endpgm
7343; GFX9-LABEL: udiv_i64_pow2_shl_denom:
7344; GFX9:       ; %bb.0:
7345; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
7346; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x34
7347; GFX9-NEXT:    v_mov_b32_e32 v2, 0
7348; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7349; GFX9-NEXT:    s_add_i32 s2, s2, 12
7350; GFX9-NEXT:    s_lshr_b64 s[0:1], s[6:7], s2
7351; GFX9-NEXT:    v_mov_b32_e32 v0, s0
7352; GFX9-NEXT:    v_mov_b32_e32 v1, s1
7353; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
7354; GFX9-NEXT:    s_endpgm
  ; The divisor is not a compile-time constant: it is 4096 << %y.
7355  %shl.y = shl i64 4096, %y
7356  %r = udiv i64 %x, %shl.y
7357  store i64 %r, i64 addrspace(1)* %out
7358  ret void
7359}
7360
; <2 x i64> unsigned division by the splat constant <4096, 4096>.
; The opt output scalarizes the vector udiv into one udiv-by-4096 per element
; (extractelement / insertelement). Both llc targets (tahiti and gfx900) are
; expected to emit one s_lshr_b64 by 12 per element followed by a single
; dwordx4 store.
7361define amdgpu_kernel void @udiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
7362; CHECK-LABEL: @udiv_v2i64_pow2k_denom(
7363; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
7364; CHECK-NEXT:    [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096
7365; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
7366; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
7367; CHECK-NEXT:    [[TMP5:%.*]] = udiv i64 [[TMP4]], 4096
7368; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
7369; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
7370; CHECK-NEXT:    ret void
7371;
7372; GFX6-LABEL: udiv_v2i64_pow2k_denom:
7373; GFX6:       ; %bb.0:
7374; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
7375; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
7376; GFX6-NEXT:    s_mov_b32 s7, 0xf000
7377; GFX6-NEXT:    s_mov_b32 s6, -1
7378; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7379; GFX6-NEXT:    s_lshr_b64 s[0:1], s[0:1], 12
7380; GFX6-NEXT:    s_lshr_b64 s[2:3], s[2:3], 12
7381; GFX6-NEXT:    v_mov_b32_e32 v0, s0
7382; GFX6-NEXT:    v_mov_b32_e32 v1, s1
7383; GFX6-NEXT:    v_mov_b32_e32 v2, s2
7384; GFX6-NEXT:    v_mov_b32_e32 v3, s3
7385; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
7386; GFX6-NEXT:    s_endpgm
7387; GFX9-LABEL: udiv_v2i64_pow2k_denom:
7388; GFX9:       ; %bb.0:
7389; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
7390; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
7391; GFX9-NEXT:    v_mov_b32_e32 v4, 0
7392; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7393; GFX9-NEXT:    s_lshr_b64 s[0:1], s[4:5], 12
7394; GFX9-NEXT:    s_lshr_b64 s[4:5], s[6:7], 12
7395; GFX9-NEXT:    v_mov_b32_e32 v0, s0
7396; GFX9-NEXT:    v_mov_b32_e32 v1, s1
7397; GFX9-NEXT:    v_mov_b32_e32 v2, s4
7398; GFX9-NEXT:    v_mov_b32_e32 v3, s5
7399; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
7400; GFX9-NEXT:    s_endpgm
7401  %r = udiv <2 x i64> %x, <i64 4096, i64 4096>
7402  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
7403  ret void
7404}
7405
; <2 x i64> unsigned division by <4096, 4095>: one power-of-two lane and one
; non-power-of-two lane in the same vector.
; The opt output scalarizes the udiv and leaves both scalar udivs in place.
; In the llc output for both targets (tahiti and gfx900), lane 0 is a single
; s_lshr_b64 by 12, while lane 1 is expanded inline into a long
; multiply / add-with-carry / select sequence that starts from a v_rcp_f32
; estimate; the 0xfff constant in that sequence is the divisor 4095.
7406define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
7407; CHECK-LABEL: @udiv_v2i64_mixed_pow2k_denom(
7408; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
7409; CHECK-NEXT:    [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096
7410; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
7411; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
7412; CHECK-NEXT:    [[TMP5:%.*]] = udiv i64 [[TMP4]], 4095
7413; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
7414; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
7415; CHECK-NEXT:    ret void
7416;
7417; GFX6-LABEL: udiv_v2i64_mixed_pow2k_denom:
7418; GFX6:       ; %bb.0:
7419; GFX6-NEXT:    v_mov_b32_e32 v0, 0x4f800000
7420; GFX6-NEXT:    v_madak_f32 v0, 0, v0, 0x457ff000
7421; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
7422; GFX6-NEXT:    s_movk_i32 s6, 0xf001
7423; GFX6-NEXT:    v_mov_b32_e32 v7, 0
7424; GFX6-NEXT:    v_mov_b32_e32 v2, 0
7425; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
7426; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
7427; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
7428; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
7429; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
7430; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
7431; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
7432; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
7433; GFX6-NEXT:    s_movk_i32 s0, 0xfff
7434; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s6
7435; GFX6-NEXT:    v_mul_lo_u32 v5, v1, s6
7436; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s6
7437; GFX6-NEXT:    s_mov_b32 s7, 0xf000
7438; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, v0, v3
7439; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
7440; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v4
7441; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v3
7442; GFX6-NEXT:    v_mul_hi_u32 v8, v0, v3
7443; GFX6-NEXT:    v_mul_hi_u32 v9, v1, v3
7444; GFX6-NEXT:    v_mul_lo_u32 v3, v1, v3
7445; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
7446; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, v7, v8, vcc
7447; GFX6-NEXT:    v_mul_lo_u32 v8, v1, v4
7448; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
7449; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
7450; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v6, v4, vcc
7451; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v9, v2, vcc
7452; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
7453; GFX6-NEXT:    v_add_i32_e64 v0, s[2:3], v0, v3
7454; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v7, v5, vcc
7455; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s6
7456; GFX6-NEXT:    v_addc_u32_e64 v3, vcc, v1, v4, s[2:3]
7457; GFX6-NEXT:    v_mul_lo_u32 v6, v3, s6
7458; GFX6-NEXT:    v_mul_lo_u32 v8, v0, s6
7459; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, v0, v5
7460; GFX6-NEXT:    s_mov_b32 s6, -1
7461; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
7462; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v5
7463; GFX6-NEXT:    v_mul_hi_u32 v9, v0, v8
7464; GFX6-NEXT:    v_mul_hi_u32 v10, v0, v5
7465; GFX6-NEXT:    v_mul_hi_u32 v11, v3, v5
7466; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
7467; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v7, v10, vcc
7468; GFX6-NEXT:    v_mul_lo_u32 v10, v3, v8
7469; GFX6-NEXT:    v_mul_hi_u32 v8, v3, v8
7470; GFX6-NEXT:    v_mul_lo_u32 v3, v3, v5
7471; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
7472; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, v9, v8, vcc
7473; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v11, v2, vcc
7474; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
7475; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
7476; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
7477; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[2:3]
7478; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
7479; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7480; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7481; GFX6-NEXT:    v_mul_lo_u32 v3, s10, v1
7482; GFX6-NEXT:    v_mul_hi_u32 v4, s10, v0
7483; GFX6-NEXT:    v_mul_hi_u32 v5, s10, v1
7484; GFX6-NEXT:    v_mul_hi_u32 v6, s11, v1
7485; GFX6-NEXT:    v_mul_lo_u32 v1, s11, v1
7486; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
7487; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v7, v5, vcc
7488; GFX6-NEXT:    v_mul_lo_u32 v5, s11, v0
7489; GFX6-NEXT:    v_mul_hi_u32 v0, s11, v0
7490; GFX6-NEXT:    s_lshr_b64 s[2:3], s[8:9], 12
7491; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
7492; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v4, v0, vcc
7493; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v6, v2, vcc
7494; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
7495; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v7, v2, vcc
7496; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s0
7497; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s0
7498; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s0
7499; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7500; GFX6-NEXT:    v_mov_b32_e32 v3, s11
7501; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s10, v4
7502; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v3, v2, vcc
7503; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s0, v4
7504; GFX6-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v2, vcc
7505; GFX6-NEXT:    s_movk_i32 s0, 0xffe
7506; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v3
7507; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
7508; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
7509; GFX6-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
7510; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 2, v0
7511; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
7512; GFX6-NEXT:    v_add_i32_e32 v7, vcc, 1, v0
7513; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v4
7514; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
7515; GFX6-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[0:1]
7516; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
7517; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
7518; GFX6-NEXT:    v_cndmask_b32_e64 v2, -1, v4, s[0:1]
7519; GFX6-NEXT:    v_cndmask_b32_e32 v3, v8, v6, vcc
7520; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v2
7521; GFX6-NEXT:    v_cndmask_b32_e64 v3, v1, v3, s[0:1]
7522; GFX6-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
7523; GFX6-NEXT:    v_cndmask_b32_e64 v2, v0, v1, s[0:1]
7524; GFX6-NEXT:    v_mov_b32_e32 v0, s2
7525; GFX6-NEXT:    v_mov_b32_e32 v1, s3
7526; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
7527; GFX6-NEXT:    s_endpgm
7528; GFX9-LABEL: udiv_v2i64_mixed_pow2k_denom:
7529; GFX9:       ; %bb.0:
7530; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4f800000
7531; GFX9-NEXT:    v_madak_f32 v0, 0, v0, 0x457ff000
7532; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
7533; GFX9-NEXT:    s_movk_i32 s4, 0xf001
7534; GFX9-NEXT:    v_mov_b32_e32 v7, 0
7535; GFX9-NEXT:    v_mov_b32_e32 v5, 0
7536; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
7537; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
7538; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
7539; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
7540; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
7541; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
7542; GFX9-NEXT:    s_movk_i32 s8, 0xfff
7543; GFX9-NEXT:    v_mul_hi_u32 v2, v0, s4
7544; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s4
7545; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s4
7546; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
7547; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
7548; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v3
7549; GFX9-NEXT:    v_mul_lo_u32 v4, v0, v2
7550; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v2
7551; GFX9-NEXT:    v_mul_hi_u32 v9, v1, v2
7552; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
7553; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v6, v4
7554; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v7, v8, vcc
7555; GFX9-NEXT:    v_mul_lo_u32 v8, v1, v3
7556; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
7557; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v8
7558; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v3, vcc
7559; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
7560; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
7561; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
7562; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v4, vcc
7563; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
7564; GFX9-NEXT:    v_mul_hi_u32 v4, v0, s4
7565; GFX9-NEXT:    v_mul_lo_u32 v6, v2, s4
7566; GFX9-NEXT:    v_mul_lo_u32 v8, v0, s4
7567; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
7568; GFX9-NEXT:    v_sub_u32_e32 v4, v4, v0
7569; GFX9-NEXT:    v_add_u32_e32 v4, v4, v6
7570; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v4
7571; GFX9-NEXT:    v_mul_hi_u32 v9, v0, v8
7572; GFX9-NEXT:    v_mul_hi_u32 v10, v0, v4
7573; GFX9-NEXT:    v_mul_hi_u32 v11, v2, v4
7574; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
7575; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v9, v6
7576; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v7, v10, vcc
7577; GFX9-NEXT:    v_mul_lo_u32 v10, v2, v8
7578; GFX9-NEXT:    v_mul_hi_u32 v8, v2, v8
7579; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4
7580; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7581; GFX9-NEXT:    s_lshr_b64 s[4:5], s[4:5], 12
7582; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v10
7583; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v9, v8, vcc
7584; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v11, v5, vcc
7585; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v6, v2
7586; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v4, vcc
7587; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3]
7588; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
7589; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
7590; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v1
7591; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v0
7592; GFX9-NEXT:    v_mul_hi_u32 v4, s6, v1
7593; GFX9-NEXT:    v_mul_hi_u32 v6, s7, v1
7594; GFX9-NEXT:    v_mul_lo_u32 v1, s7, v1
7595; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
7596; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v4, vcc
7597; GFX9-NEXT:    v_mul_lo_u32 v4, s7, v0
7598; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
7599; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
7600; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
7601; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
7602; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v6, v5, vcc
7603; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
7604; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v2, vcc
7605; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s8
7606; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s8
7607; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s8
7608; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, s6, v4
7609; GFX9-NEXT:    s_movk_i32 s6, 0xffe
7610; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
7611; GFX9-NEXT:    v_mov_b32_e32 v3, s7
7612; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v3, v2, vcc
7613; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s8, v4
7614; GFX9-NEXT:    v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc
7615; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v3
7616; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
7617; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
7618; GFX9-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
7619; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, 2, v0
7620; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
7621; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, 1, v0
7622; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v1, vcc
7623; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v4
7624; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
7625; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
7626; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v3
7627; GFX9-NEXT:    v_cndmask_b32_e32 v2, -1, v4, vcc
7628; GFX9-NEXT:    v_cndmask_b32_e64 v3, v9, v7, s[0:1]
7629; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
7630; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
7631; GFX9-NEXT:    v_cndmask_b32_e64 v1, v8, v6, s[0:1]
7632; GFX9-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
7633; GFX9-NEXT:    v_mov_b32_e32 v0, s4
7634; GFX9-NEXT:    v_mov_b32_e32 v1, s5
7635; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7636; GFX9-NEXT:    global_store_dwordx4 v5, v[0:3], s[2:3]
7637; GFX9-NEXT:    s_endpgm
7638  %r = udiv <2 x i64> %x, <i64 4096, i64 4095>
7639  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
7640  ret void
7641}
7642
; <2 x i64> unsigned division by <4096, 4096> << %y, i.e. a power-of-two
; divisor with a per-lane runtime shift.
; The opt output keeps the vector shl and scalarizes the udiv into one scalar
; udiv per lane. Both llc targets (tahiti and gfx900) are expected to add 12
; to each lane's shift amount and emit one s_lshr_b64 per lane, with no
; division expansion.
7643define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
7644; CHECK-LABEL: @udiv_v2i64_pow2_shl_denom(
7645; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
7646; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
7647; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
7648; CHECK-NEXT:    [[TMP3:%.*]] = udiv i64 [[TMP1]], [[TMP2]]
7649; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0
7650; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
7651; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
7652; CHECK-NEXT:    [[TMP7:%.*]] = udiv i64 [[TMP5]], [[TMP6]]
7653; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
7654; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
7655; CHECK-NEXT:    ret void
7656;
7657; GFX6-LABEL: udiv_v2i64_pow2_shl_denom:
7658; GFX6:       ; %bb.0:
7659; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
7660; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
7661; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x11
7662; GFX6-NEXT:    s_mov_b32 s7, 0xf000
7663; GFX6-NEXT:    s_mov_b32 s6, -1
7664; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7665; GFX6-NEXT:    s_add_i32 s0, s0, 12
7666; GFX6-NEXT:    s_add_i32 s2, s2, 12
7667; GFX6-NEXT:    s_lshr_b64 s[0:1], s[8:9], s0
7668; GFX6-NEXT:    s_lshr_b64 s[2:3], s[10:11], s2
7669; GFX6-NEXT:    v_mov_b32_e32 v0, s0
7670; GFX6-NEXT:    v_mov_b32_e32 v1, s1
7671; GFX6-NEXT:    v_mov_b32_e32 v2, s2
7672; GFX6-NEXT:    v_mov_b32_e32 v3, s3
7673; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
7674; GFX6-NEXT:    s_endpgm
7675; GFX9-LABEL: udiv_v2i64_pow2_shl_denom:
7676; GFX9:       ; %bb.0:
7677; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
7678; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
7679; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x44
7680; GFX9-NEXT:    v_mov_b32_e32 v4, 0
7681; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7682; GFX9-NEXT:    s_add_i32 s0, s8, 12
7683; GFX9-NEXT:    s_add_i32 s8, s10, 12
7684; GFX9-NEXT:    s_lshr_b64 s[0:1], s[4:5], s0
7685; GFX9-NEXT:    s_lshr_b64 s[4:5], s[6:7], s8
7686; GFX9-NEXT:    v_mov_b32_e32 v0, s0
7687; GFX9-NEXT:    v_mov_b32_e32 v1, s1
7688; GFX9-NEXT:    v_mov_b32_e32 v2, s4
7689; GFX9-NEXT:    v_mov_b32_e32 v3, s5
7690; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
7691; GFX9-NEXT:    s_endpgm
7692  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
7693  %r = udiv <2 x i64> %x, %shl.y
7694  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
7695  ret void
7696}
7697
; Unsigned i64 remainder by the odd constant 1235195393993 (0x11f9761f7c9).
; The opt output leaves the urem untouched. Both llc targets (tahiti and
; gfx900) are expected to expand it inline, starting from a v_rcp_f32
; estimate and finishing with compare/select fix-ups; the divisor appears in
; the expected code split into its high and low dwords, 0x11f and 0x9761f7c9.
7698define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
7699; CHECK-LABEL: @urem_i64_oddk_denom(
7700; CHECK-NEXT:    [[R:%.*]] = urem i64 [[X:%.*]], 1235195393993
7701; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
7702; CHECK-NEXT:    ret void
7703;
7704; GFX6-LABEL: urem_i64_oddk_denom:
7705; GFX6:       ; %bb.0:
7706; GFX6-NEXT:    v_mov_b32_e32 v0, 0x4f1761f8
7707; GFX6-NEXT:    v_mov_b32_e32 v1, 0x4f800000
7708; GFX6-NEXT:    v_madmk_f32 v0, v1, 0x438f8000, v0
7709; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
7710; GFX6-NEXT:    s_movk_i32 s2, 0xfee0
7711; GFX6-NEXT:    s_mov_b32 s3, 0x689e0837
7712; GFX6-NEXT:    v_mov_b32_e32 v8, 0
7713; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
7714; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
7715; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
7716; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
7717; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
7718; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
7719; GFX6-NEXT:    v_mov_b32_e32 v7, 0
7720; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
7721; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s2
7722; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s3
7723; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s3
7724; GFX6-NEXT:    s_movk_i32 s12, 0x11f
7725; GFX6-NEXT:    s_mov_b32 s13, 0x9761f7c9
7726; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7727; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s3
7728; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
7729; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
7730; GFX6-NEXT:    v_mul_hi_u32 v4, v0, v2
7731; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v3
7732; GFX6-NEXT:    v_mul_hi_u32 v9, v1, v2
7733; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
7734; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7735; GFX6-NEXT:    s_mov_b32 s9, s5
7736; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
7737; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v3
7738; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
7739; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
7740; GFX6-NEXT:    s_movk_i32 s5, 0x11e
7741; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
7742; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v4, v3, vcc
7743; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
7744; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7745; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
7746; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
7747; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s2
7748; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s3
7749; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
7750; GFX6-NEXT:    v_mul_lo_u32 v6, v2, s3
7751; GFX6-NEXT:    s_mov_b32 s8, s4
7752; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
7753; GFX6-NEXT:    v_mul_lo_u32 v5, v0, s3
7754; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
7755; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v4
7756; GFX6-NEXT:    v_mul_hi_u32 v10, v0, v4
7757; GFX6-NEXT:    v_mul_hi_u32 v9, v0, v5
7758; GFX6-NEXT:    v_mul_hi_u32 v11, v2, v4
7759; GFX6-NEXT:    s_mov_b32 s4, 0x9761f7c8
7760; GFX6-NEXT:    s_mov_b32 s11, 0xf000
7761; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
7762; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v8, v10, vcc
7763; GFX6-NEXT:    v_mul_lo_u32 v10, v2, v5
7764; GFX6-NEXT:    v_mul_hi_u32 v5, v2, v5
7765; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v4
7766; GFX6-NEXT:    s_mov_b32 s10, -1
7767; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
7768; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v9, v5, vcc
7769; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v11, v7, vcc
7770; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
7771; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
7772; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
7773; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[0:1]
7774; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
7775; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
7776; GFX6-NEXT:    v_mul_lo_u32 v2, s6, v1
7777; GFX6-NEXT:    v_mul_hi_u32 v3, s6, v0
7778; GFX6-NEXT:    v_mul_hi_u32 v4, s6, v1
7779; GFX6-NEXT:    v_mul_hi_u32 v5, s7, v1
7780; GFX6-NEXT:    v_mul_lo_u32 v1, s7, v1
7781; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7782; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
7783; GFX6-NEXT:    v_mul_lo_u32 v4, s7, v0
7784; GFX6-NEXT:    v_mul_hi_u32 v0, s7, v0
7785; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
7786; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
7787; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
7788; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
7789; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
7790; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s12
7791; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s13
7792; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s13
7793; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s13
7794; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
7795; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
7796; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s7, v1
7797; GFX6-NEXT:    v_mov_b32_e32 v3, s12
7798; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
7799; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
7800; GFX6-NEXT:    v_subrev_i32_e64 v4, s[0:1], s13, v0
7801; GFX6-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
7802; GFX6-NEXT:    v_cmp_lt_u32_e64 s[2:3], s5, v5
7803; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
7804; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
7805; GFX6-NEXT:    v_cmp_lt_u32_e64 s[2:3], s4, v4
7806; GFX6-NEXT:    v_subrev_i32_e64 v3, s[0:1], s13, v4
7807; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
7808; GFX6-NEXT:    v_cmp_eq_u32_e64 s[2:3], s12, v5
7809; GFX6-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
7810; GFX6-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
7811; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
7812; GFX6-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
7813; GFX6-NEXT:    v_mov_b32_e32 v5, s7
7814; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
7815; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s5, v1
7816; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
7817; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v0
7818; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
7819; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s12, v1
7820; GFX6-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
7821; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
7822; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
7823; GFX6-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
7824; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
7825; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
7826; GFX6-NEXT:    s_endpgm
7827; GFX9-LABEL: urem_i64_oddk_denom:
7828; GFX9:       ; %bb.0:
7829; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4f1761f8
7830; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4f800000
7831; GFX9-NEXT:    v_madmk_f32 v0, v1, 0x438f8000, v0
7832; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
7833; GFX9-NEXT:    s_movk_i32 s4, 0xfee0
7834; GFX9-NEXT:    s_mov_b32 s5, 0x689e0837
7835; GFX9-NEXT:    v_mov_b32_e32 v6, 0
7836; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
7837; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
7838; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
7839; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
7840; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
7841; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
7842; GFX9-NEXT:    s_movk_i32 s8, 0x11f
7843; GFX9-NEXT:    s_mov_b32 s9, 0x9761f7c9
7844; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s4
7845; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s5
7846; GFX9-NEXT:    v_mul_lo_u32 v5, v1, s5
7847; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s5
7848; GFX9-NEXT:    s_mov_b32 s10, 0x9761f7c8
7849; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
7850; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
7851; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
7852; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v2
7853; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
7854; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
7855; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
7856; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
7857; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
7858; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v4
7859; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
7860; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
7861; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
7862; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v6, vcc
7863; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
7864; GFX9-NEXT:    v_mov_b32_e32 v5, 0
7865; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
7866; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
7867; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
7868; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s4
7869; GFX9-NEXT:    v_mul_hi_u32 v7, v0, s5
7870; GFX9-NEXT:    v_mul_lo_u32 v8, v2, s5
7871; GFX9-NEXT:    v_mul_lo_u32 v9, v0, s5
7872; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
7873; GFX9-NEXT:    v_add_u32_e32 v4, v7, v4
7874; GFX9-NEXT:    v_add_u32_e32 v4, v4, v8
7875; GFX9-NEXT:    v_mul_lo_u32 v7, v0, v4
7876; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v9
7877; GFX9-NEXT:    v_mul_hi_u32 v10, v0, v4
7878; GFX9-NEXT:    v_mul_hi_u32 v11, v2, v4
7879; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
7880; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v7
7881; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v10, vcc
7882; GFX9-NEXT:    v_mul_lo_u32 v10, v2, v9
7883; GFX9-NEXT:    v_mul_hi_u32 v9, v2, v9
7884; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4
7885; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v10
7886; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v8, v9, vcc
7887; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v11, v6, vcc
7888; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v7, v2
7889; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v5, v4, vcc
7890; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3]
7891; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
7892; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
7893; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7894; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v1
7895; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v0
7896; GFX9-NEXT:    v_mul_hi_u32 v4, s6, v1
7897; GFX9-NEXT:    v_mul_hi_u32 v7, s7, v1
7898; GFX9-NEXT:    v_mul_lo_u32 v1, s7, v1
7899; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
7900; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
7901; GFX9-NEXT:    v_mul_lo_u32 v4, s7, v0
7902; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
7903; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
7904; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
7905; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v7, v6, vcc
7906; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
7907; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v2, vcc
7908; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s8
7909; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s9
7910; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s9
7911; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s9
7912; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
7913; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
7914; GFX9-NEXT:    v_sub_co_u32_e64 v0, s[0:1], s6, v0
7915; GFX9-NEXT:    v_sub_u32_e32 v2, s7, v1
7916; GFX9-NEXT:    v_mov_b32_e32 v3, s8
7917; GFX9-NEXT:    v_subb_co_u32_e64 v2, vcc, v2, v3, s[0:1]
7918; GFX9-NEXT:    v_subrev_co_u32_e64 v4, s[2:3], s9, v0
7919; GFX9-NEXT:    v_subbrev_co_u32_e64 v5, vcc, 0, v2, s[2:3]
7920; GFX9-NEXT:    s_movk_i32 s6, 0x11e
7921; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v5
7922; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
7923; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s10, v4
7924; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
7925; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v5
7926; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
7927; GFX9-NEXT:    v_subb_co_u32_e64 v2, vcc, v2, v3, s[2:3]
7928; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s9, v4
7929; GFX9-NEXT:    v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc
7930; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v7
7931; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[2:3]
7932; GFX9-NEXT:    v_mov_b32_e32 v5, s7
7933; GFX9-NEXT:    v_subb_co_u32_e64 v1, vcc, v5, v1, s[0:1]
7934; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v1
7935; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
7936; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s10, v0
7937; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
7938; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v1
7939; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
7940; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
7941; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
7942; GFX9-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[2:3]
7943; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
7944; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[4:5]
7945; GFX9-NEXT:    s_endpgm
7946  %r = urem i64 %x, 1235195393993
7947  store i64 %r, i64 addrspace(1)* %out
7948  ret void
7949}
7950
; Unsigned i64 remainder by the power-of-two constant 4096.
; The opt output leaves the urem untouched. Both llc targets (tahiti and
; gfx900) are expected to reduce it to a single s_and_b32 of one dword of %x
; with 0xfff, storing 0 as the high dword of the result.
7951define amdgpu_kernel void @urem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
7952; CHECK-LABEL: @urem_i64_pow2k_denom(
7953; CHECK-NEXT:    [[R:%.*]] = urem i64 [[X:%.*]], 4096
7954; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
7955; CHECK-NEXT:    ret void
7956;
7957; GFX6-LABEL: urem_i64_pow2k_denom:
7958; GFX6:       ; %bb.0:
7959; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
7960; GFX6-NEXT:    s_mov_b32 s3, 0xf000
7961; GFX6-NEXT:    s_mov_b32 s2, -1
7962; GFX6-NEXT:    v_mov_b32_e32 v1, 0
7963; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7964; GFX6-NEXT:    s_mov_b32 s0, s4
7965; GFX6-NEXT:    s_and_b32 s4, s6, 0xfff
7966; GFX6-NEXT:    s_mov_b32 s1, s5
7967; GFX6-NEXT:    v_mov_b32_e32 v0, s4
7968; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
7969; GFX6-NEXT:    s_endpgm
7970; GFX9-LABEL: urem_i64_pow2k_denom:
7971; GFX9:       ; %bb.0:
7972; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
7973; GFX9-NEXT:    v_mov_b32_e32 v1, 0
7974; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
7975; GFX9-NEXT:    s_and_b32 s2, s2, 0xfff
7976; GFX9-NEXT:    v_mov_b32_e32 v0, s2
7977; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
7978; GFX9-NEXT:    s_endpgm
7979  %r = urem i64 %x, 4096
7980  store i64 %r, i64 addrspace(1)* %out
7981  ret void
7982}
7983
; Unsigned i64 remainder whose divisor is a power of two shifted left by a
; runtime amount: %out = %x urem (4096 << %y).
; The opt output keeps the shl + urem pair unchanged. Both llc targets (tahiti
; and gfx900) are expected to build the mask (0x1000 << shift) - 1 with
; s_lshl_b64 followed by a 64-bit add of -1 (s_add_u32 / s_addc_u32), and then
; s_and_b64 that mask with %x, with no division expansion.
7984define amdgpu_kernel void @urem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
7985; CHECK-LABEL: @urem_i64_pow2_shl_denom(
7986; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
7987; CHECK-NEXT:    [[R:%.*]] = urem i64 [[X:%.*]], [[SHL_Y]]
7988; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
7989; CHECK-NEXT:    ret void
7990;
7991; GFX6-LABEL: urem_i64_pow2_shl_denom:
7992; GFX6:       ; %bb.0:
7993; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
7994; GFX6-NEXT:    s_load_dword s8, s[0:1], 0xd
7995; GFX6-NEXT:    s_mov_b32 s3, 0xf000
7996; GFX6-NEXT:    s_mov_b32 s2, -1
7997; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
7998; GFX6-NEXT:    s_mov_b32 s0, s4
7999; GFX6-NEXT:    s_mov_b32 s1, s5
8000; GFX6-NEXT:    s_mov_b32 s5, 0
8001; GFX6-NEXT:    s_movk_i32 s4, 0x1000
8002; GFX6-NEXT:    s_lshl_b64 s[4:5], s[4:5], s8
8003; GFX6-NEXT:    s_add_u32 s4, s4, -1
8004; GFX6-NEXT:    s_addc_u32 s5, s5, -1
8005; GFX6-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
8006; GFX6-NEXT:    v_mov_b32_e32 v0, s4
8007; GFX6-NEXT:    v_mov_b32_e32 v1, s5
8008; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
8009; GFX6-NEXT:    s_endpgm
8010; GFX9-LABEL: urem_i64_pow2_shl_denom:
8011; GFX9:       ; %bb.0:
8012; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
8013; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x34
8014; GFX9-NEXT:    s_mov_b32 s1, 0
8015; GFX9-NEXT:    s_movk_i32 s0, 0x1000
8016; GFX9-NEXT:    v_mov_b32_e32 v2, 0
8017; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8018; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
8019; GFX9-NEXT:    s_add_u32 s0, s0, -1
8020; GFX9-NEXT:    s_addc_u32 s1, s1, -1
8021; GFX9-NEXT:    s_and_b64 s[0:1], s[6:7], s[0:1]
8022; GFX9-NEXT:    v_mov_b32_e32 v0, s0
8023; GFX9-NEXT:    v_mov_b32_e32 v1, s1
8024; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
8025; GFX9-NEXT:    s_endpgm
8026  %shl.y = shl i64 4096, %y
8027  %r = urem i64 %x, %shl.y
8028  store i64 %r, i64 addrspace(1)* %out
8029  ret void
8030}
8031
; Test: unsigned remainder of a <2 x i64> by the constant splat <4096, 4096>;
; the result vector is stored to %out.
; The IR-level assertions expect the vector urem to be split into one
; "urem i64 ..., 4096" per element, stitched together with
; extractelement / insertelement.
; The llc assertions expect only the low dword of each element to be masked
; with 0xfff (s_and_b32), while the high dword of each result element is
; written from a register holding zero.
8032define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
8033; CHECK-LABEL: @urem_v2i64_pow2k_denom(
8034; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
8035; CHECK-NEXT:    [[TMP2:%.*]] = urem i64 [[TMP1]], 4096
8036; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
8037; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
8038; CHECK-NEXT:    [[TMP5:%.*]] = urem i64 [[TMP4]], 4096
8039; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
8040; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
8041; CHECK-NEXT:    ret void
8042;
8043; GFX6-LABEL: urem_v2i64_pow2k_denom:
8044; GFX6:       ; %bb.0:
8045; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
8046; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
8047; GFX6-NEXT:    s_movk_i32 s8, 0xfff
8048; GFX6-NEXT:    v_mov_b32_e32 v1, 0
8049; GFX6-NEXT:    s_mov_b32 s7, 0xf000
8050; GFX6-NEXT:    s_mov_b32 s6, -1
8051; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8052; GFX6-NEXT:    s_and_b32 s0, s0, s8
8053; GFX6-NEXT:    s_and_b32 s1, s2, s8
8054; GFX6-NEXT:    v_mov_b32_e32 v0, s0
8055; GFX6-NEXT:    v_mov_b32_e32 v2, s1
8056; GFX6-NEXT:    v_mov_b32_e32 v3, v1
8057; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
8058; GFX6-NEXT:    s_endpgm
8059; GFX9-LABEL: urem_v2i64_pow2k_denom:
8060; GFX9:       ; %bb.0:
8061; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
8062; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
8063; GFX9-NEXT:    s_movk_i32 s0, 0xfff
8064; GFX9-NEXT:    v_mov_b32_e32 v1, 0
8065; GFX9-NEXT:    v_mov_b32_e32 v3, v1
8066; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8067; GFX9-NEXT:    s_and_b32 s1, s4, s0
8068; GFX9-NEXT:    s_and_b32 s0, s6, s0
8069; GFX9-NEXT:    v_mov_b32_e32 v0, s1
8070; GFX9-NEXT:    v_mov_b32_e32 v2, s0
8071; GFX9-NEXT:    global_store_dwordx4 v1, v[0:3], s[2:3]
8072; GFX9-NEXT:    s_endpgm
8073  %r = urem <2 x i64> %x, <i64 4096, i64 4096>
8074  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
8075  ret void
8076}
8077
; Test: unsigned remainder of a <2 x i64> by a per-lane variable denominator
; formed as <4096, 4096> << %y; the result vector is stored to %out.
; The IR-level assertions expect the shl to remain a vector operation while
; the urem is split into two scalar i64 urems (extractelement on both the
; numerator and the shifted denominator, then insertelement).
; The llc assertions expect each lane to be lowered as a mask: s_lshl_b64 of
; the 0x1000 constant, an add of -1 with carry, then s_and_b64 with the
; corresponding lane of %x.
8078define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
8079; CHECK-LABEL: @urem_v2i64_pow2_shl_denom(
8080; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
8081; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
8082; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
8083; CHECK-NEXT:    [[TMP3:%.*]] = urem i64 [[TMP1]], [[TMP2]]
8084; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0
8085; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
8086; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
8087; CHECK-NEXT:    [[TMP7:%.*]] = urem i64 [[TMP5]], [[TMP6]]
8088; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
8089; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
8090; CHECK-NEXT:    ret void
8091;
8092; GFX6-LABEL: urem_v2i64_pow2_shl_denom:
8093; GFX6:       ; %bb.0:
8094; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
8095; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
8096; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x11
8097; GFX6-NEXT:    s_mov_b32 s13, 0
8098; GFX6-NEXT:    s_movk_i32 s12, 0x1000
8099; GFX6-NEXT:    s_mov_b32 s7, 0xf000
8100; GFX6-NEXT:    s_mov_b32 s6, -1
8101; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8102; GFX6-NEXT:    s_lshl_b64 s[2:3], s[12:13], s2
8103; GFX6-NEXT:    s_lshl_b64 s[0:1], s[12:13], s0
8104; GFX6-NEXT:    s_add_u32 s0, s0, -1
8105; GFX6-NEXT:    s_addc_u32 s1, s1, -1
8106; GFX6-NEXT:    s_and_b64 s[0:1], s[8:9], s[0:1]
8107; GFX6-NEXT:    s_add_u32 s2, s2, -1
8108; GFX6-NEXT:    s_addc_u32 s3, s3, -1
8109; GFX6-NEXT:    s_and_b64 s[2:3], s[10:11], s[2:3]
8110; GFX6-NEXT:    v_mov_b32_e32 v0, s0
8111; GFX6-NEXT:    v_mov_b32_e32 v1, s1
8112; GFX6-NEXT:    v_mov_b32_e32 v2, s2
8113; GFX6-NEXT:    v_mov_b32_e32 v3, s3
8114; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
8115; GFX6-NEXT:    s_endpgm
8116; GFX9-LABEL: urem_v2i64_pow2_shl_denom:
8117; GFX9:       ; %bb.0:
8118; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
8119; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
8120; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x44
8121; GFX9-NEXT:    s_mov_b32 s1, 0
8122; GFX9-NEXT:    s_movk_i32 s0, 0x1000
8123; GFX9-NEXT:    v_mov_b32_e32 v4, 0
8124; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8125; GFX9-NEXT:    s_lshl_b64 s[10:11], s[0:1], s10
8126; GFX9-NEXT:    s_lshl_b64 s[0:1], s[0:1], s8
8127; GFX9-NEXT:    s_add_u32 s0, s0, -1
8128; GFX9-NEXT:    s_addc_u32 s1, s1, -1
8129; GFX9-NEXT:    s_and_b64 s[0:1], s[4:5], s[0:1]
8130; GFX9-NEXT:    s_add_u32 s4, s10, -1
8131; GFX9-NEXT:    s_addc_u32 s5, s11, -1
8132; GFX9-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
8133; GFX9-NEXT:    v_mov_b32_e32 v0, s0
8134; GFX9-NEXT:    v_mov_b32_e32 v1, s1
8135; GFX9-NEXT:    v_mov_b32_e32 v2, s4
8136; GFX9-NEXT:    v_mov_b32_e32 v3, s5
8137; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
8138; GFX9-NEXT:    s_endpgm
8139  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
8140  %r = urem <2 x i64> %x, %shl.y
8141  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
8142  ret void
8143}
8144
; Test: signed division of an i64 by the odd constant 1235195 (0x12d8fb);
; the quotient is stored to %out.
; The IR-level assertions expect the sdiv to be left untouched by the opt run.
; The llc assertions expect a long inline expansion that contains no divide
; instruction: it starts from v_rcp_f32 and continues through
; v_mul_lo_u32 / v_mul_hi_u32 and add-with-carry chains. The immediates
; 0x12d8fb and 0xffed2705 (the 32-bit two's-complement negation of 0x12d8fb)
; both appear as scalar constants in the expected output.
8145define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
8146; CHECK-LABEL: @sdiv_i64_oddk_denom(
8147; CHECK-NEXT:    [[R:%.*]] = sdiv i64 [[X:%.*]], 1235195
8148; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
8149; CHECK-NEXT:    ret void
8150;
8151; GFX6-LABEL: sdiv_i64_oddk_denom:
8152; GFX6:       ; %bb.0:
8153; GFX6-NEXT:    v_mov_b32_e32 v0, 0x4f800000
8154; GFX6-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
8155; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
8156; GFX6-NEXT:    s_mov_b32 s2, 0xffed2705
8157; GFX6-NEXT:    v_mov_b32_e32 v8, 0
8158; GFX6-NEXT:    v_mov_b32_e32 v7, 0
8159; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
8160; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
8161; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
8162; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
8163; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
8164; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
8165; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
8166; GFX6-NEXT:    s_mov_b32 s7, 0xf000
8167; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
8168; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s2
8169; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s2
8170; GFX6-NEXT:    s_mov_b32 s6, -1
8171; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8172; GFX6-NEXT:    s_mov_b32 s4, s8
8173; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8174; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
8175; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
8176; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v4
8177; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v2
8178; GFX6-NEXT:    v_mul_hi_u32 v9, v1, v2
8179; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
8180; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
8181; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
8182; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
8183; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v3, vcc
8184; GFX6-NEXT:    s_mov_b32 s5, s9
8185; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
8186; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
8187; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
8188; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8189; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
8190; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
8191; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
8192; GFX6-NEXT:    v_mul_lo_u32 v4, v2, s2
8193; GFX6-NEXT:    v_mul_hi_u32 v5, s2, v0
8194; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
8195; GFX6-NEXT:    v_mul_lo_u32 v5, v0, s2
8196; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, v0, v4
8197; GFX6-NEXT:    v_mul_lo_u32 v10, v0, v4
8198; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v4
8199; GFX6-NEXT:    v_mul_hi_u32 v11, v0, v5
8200; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v5
8201; GFX6-NEXT:    v_mul_lo_u32 v5, v2, v5
8202; GFX6-NEXT:    v_mul_hi_u32 v6, v2, v4
8203; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
8204; GFX6-NEXT:    v_addc_u32_e32 v11, vcc, v8, v12, vcc
8205; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v4
8206; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
8207; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v11, v9, vcc
8208; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v6, v7, vcc
8209; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
8210; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
8211; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
8212; GFX6-NEXT:    s_ashr_i32 s2, s11, 31
8213; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[0:1]
8214; GFX6-NEXT:    s_add_u32 s0, s10, s2
8215; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
8216; GFX6-NEXT:    s_mov_b32 s3, s2
8217; GFX6-NEXT:    s_addc_u32 s1, s11, s2
8218; GFX6-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
8219; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
8220; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v1
8221; GFX6-NEXT:    v_mul_hi_u32 v3, s0, v0
8222; GFX6-NEXT:    v_mul_hi_u32 v4, s0, v1
8223; GFX6-NEXT:    v_mul_hi_u32 v5, s1, v1
8224; GFX6-NEXT:    v_mul_lo_u32 v1, s1, v1
8225; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8226; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
8227; GFX6-NEXT:    v_mul_lo_u32 v4, s1, v0
8228; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
8229; GFX6-NEXT:    s_mov_b32 s3, 0x12d8fb
8230; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
8231; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
8232; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
8233; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
8234; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
8235; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s3
8236; GFX6-NEXT:    v_mul_hi_u32 v3, s3, v0
8237; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s3
8238; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8239; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s0, v4
8240; GFX6-NEXT:    v_mov_b32_e32 v3, s1
8241; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v3, v2, vcc
8242; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s3, v4
8243; GFX6-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v2, vcc
8244; GFX6-NEXT:    s_mov_b32 s0, 0x12d8fa
8245; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v3
8246; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
8247; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
8248; GFX6-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
8249; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 2, v0
8250; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
8251; GFX6-NEXT:    v_add_i32_e32 v7, vcc, 1, v0
8252; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v4
8253; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
8254; GFX6-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[0:1]
8255; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
8256; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
8257; GFX6-NEXT:    v_cndmask_b32_e64 v2, -1, v4, s[0:1]
8258; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v2
8259; GFX6-NEXT:    v_cndmask_b32_e32 v2, v7, v5, vcc
8260; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
8261; GFX6-NEXT:    v_cndmask_b32_e32 v3, v8, v6, vcc
8262; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
8263; GFX6-NEXT:    v_xor_b32_e32 v0, s2, v0
8264; GFX6-NEXT:    v_xor_b32_e32 v1, s2, v1
8265; GFX6-NEXT:    v_mov_b32_e32 v2, s2
8266; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
8267; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
8268; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
8269; GFX6-NEXT:    s_endpgm
8270; GFX9-LABEL: sdiv_i64_oddk_denom:
8271; GFX9:       ; %bb.0:
8272; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4f800000
8273; GFX9-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
8274; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
8275; GFX9-NEXT:    s_mov_b32 s8, 0xffed2705
8276; GFX9-NEXT:    v_mov_b32_e32 v7, 0
8277; GFX9-NEXT:    v_mov_b32_e32 v5, 0
8278; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
8279; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
8280; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
8281; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
8282; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
8283; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
8284; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
8285; GFX9-NEXT:    v_mul_hi_u32 v3, s8, v0
8286; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s8
8287; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s8
8288; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
8289; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
8290; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
8291; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
8292; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v2
8293; GFX9-NEXT:    v_mul_hi_u32 v9, v1, v2
8294; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
8295; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
8296; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v7, v8, vcc
8297; GFX9-NEXT:    v_mul_lo_u32 v8, v1, v4
8298; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
8299; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v8
8300; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v4, vcc
8301; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
8302; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
8303; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
8304; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v4, vcc
8305; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
8306; GFX9-NEXT:    v_mul_lo_u32 v4, v2, s8
8307; GFX9-NEXT:    v_mul_hi_u32 v6, s8, v0
8308; GFX9-NEXT:    v_mul_lo_u32 v8, v0, s8
8309; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
8310; GFX9-NEXT:    v_add_u32_e32 v4, v6, v4
8311; GFX9-NEXT:    v_sub_u32_e32 v4, v4, v0
8312; GFX9-NEXT:    v_mul_lo_u32 v10, v0, v4
8313; GFX9-NEXT:    v_mul_hi_u32 v11, v0, v8
8314; GFX9-NEXT:    v_mul_hi_u32 v12, v0, v4
8315; GFX9-NEXT:    v_mul_hi_u32 v9, v2, v8
8316; GFX9-NEXT:    v_mul_lo_u32 v8, v2, v8
8317; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
8318; GFX9-NEXT:    v_mul_hi_u32 v6, v2, v4
8319; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v7, v12, vcc
8320; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4
8321; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v10, v8
8322; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
8323; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v5, vcc
8324; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
8325; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v4, vcc
8326; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3]
8327; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8328; GFX9-NEXT:    s_ashr_i32 s2, s7, 31
8329; GFX9-NEXT:    s_add_u32 s0, s6, s2
8330; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
8331; GFX9-NEXT:    s_mov_b32 s3, s2
8332; GFX9-NEXT:    s_addc_u32 s1, s7, s2
8333; GFX9-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
8334; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
8335; GFX9-NEXT:    v_mul_lo_u32 v2, s0, v1
8336; GFX9-NEXT:    v_mul_hi_u32 v3, s0, v0
8337; GFX9-NEXT:    v_mul_hi_u32 v4, s0, v1
8338; GFX9-NEXT:    v_mul_hi_u32 v6, s1, v1
8339; GFX9-NEXT:    v_mul_lo_u32 v1, s1, v1
8340; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
8341; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v4, vcc
8342; GFX9-NEXT:    v_mul_lo_u32 v4, s1, v0
8343; GFX9-NEXT:    v_mul_hi_u32 v0, s1, v0
8344; GFX9-NEXT:    s_mov_b32 s3, 0x12d8fb
8345; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
8346; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
8347; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v6, v5, vcc
8348; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
8349; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v2, vcc
8350; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s3
8351; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s3
8352; GFX9-NEXT:    v_mul_hi_u32 v3, s3, v0
8353; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, s0, v4
8354; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
8355; GFX9-NEXT:    v_mov_b32_e32 v3, s1
8356; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v3, v2, vcc
8357; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s3, v4
8358; GFX9-NEXT:    v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc
8359; GFX9-NEXT:    s_mov_b32 s3, 0x12d8fa
8360; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s3, v3
8361; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
8362; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
8363; GFX9-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
8364; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, 2, v0
8365; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
8366; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, 1, v0
8367; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v1, vcc
8368; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s3, v4
8369; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
8370; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
8371; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v3
8372; GFX9-NEXT:    v_cndmask_b32_e32 v2, -1, v4, vcc
8373; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
8374; GFX9-NEXT:    v_cndmask_b32_e64 v2, v8, v6, s[0:1]
8375; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
8376; GFX9-NEXT:    v_cndmask_b32_e64 v3, v9, v7, s[0:1]
8377; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
8378; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
8379; GFX9-NEXT:    v_xor_b32_e32 v1, s2, v1
8380; GFX9-NEXT:    v_mov_b32_e32 v2, s2
8381; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s2, v0
8382; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
8383; GFX9-NEXT:    global_store_dwordx2 v5, v[0:1], s[4:5]
8384; GFX9-NEXT:    s_endpgm
8385  %r = sdiv i64 %x, 1235195
8386  store i64 %r, i64 addrspace(1)* %out
8387  ret void
8388}
8389
; Test: signed division of an i64 by the constant 4096; the quotient is
; stored to %out.
; The IR-level assertions expect the sdiv to be left untouched by the opt run.
; The llc assertions expect a shift-based lowering: the high dword of %x is
; arithmetic-shifted right by 31 and then logically shifted right by 20
; (yielding 0xfff when %x is negative and 0 otherwise), that value is added
; to %x with carry into the high dword, and the sum is shifted with
; s_ashr_i64 by 12.
8390define amdgpu_kernel void @sdiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
8391; CHECK-LABEL: @sdiv_i64_pow2k_denom(
8392; CHECK-NEXT:    [[R:%.*]] = sdiv i64 [[X:%.*]], 4096
8393; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
8394; CHECK-NEXT:    ret void
8395;
8396; GFX6-LABEL: sdiv_i64_pow2k_denom:
8397; GFX6:       ; %bb.0:
8398; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
8399; GFX6-NEXT:    s_mov_b32 s7, 0xf000
8400; GFX6-NEXT:    s_mov_b32 s6, -1
8401; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8402; GFX6-NEXT:    s_mov_b32 s4, s0
8403; GFX6-NEXT:    s_ashr_i32 s0, s3, 31
8404; GFX6-NEXT:    s_lshr_b32 s0, s0, 20
8405; GFX6-NEXT:    s_add_u32 s0, s2, s0
8406; GFX6-NEXT:    s_mov_b32 s5, s1
8407; GFX6-NEXT:    s_addc_u32 s1, s3, 0
8408; GFX6-NEXT:    s_ashr_i64 s[0:1], s[0:1], 12
8409; GFX6-NEXT:    v_mov_b32_e32 v0, s0
8410; GFX6-NEXT:    v_mov_b32_e32 v1, s1
8411; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
8412; GFX6-NEXT:    s_endpgm
8413; GFX9-LABEL: sdiv_i64_pow2k_denom:
8414; GFX9:       ; %bb.0:
8415; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
8416; GFX9-NEXT:    v_mov_b32_e32 v2, 0
8417; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8418; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
8419; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
8420; GFX9-NEXT:    s_add_u32 s2, s2, s4
8421; GFX9-NEXT:    s_addc_u32 s3, s3, 0
8422; GFX9-NEXT:    s_ashr_i64 s[2:3], s[2:3], 12
8423; GFX9-NEXT:    v_mov_b32_e32 v0, s2
8424; GFX9-NEXT:    v_mov_b32_e32 v1, s3
8425; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
8426; GFX9-NEXT:    s_endpgm
8427  %r = sdiv i64 %x, 4096
8428  store i64 %r, i64 addrspace(1)* %out
8429  ret void
8430}
8431
; Test: signed division of an i64 by a variable denominator formed as
; 4096 << %y; the quotient is stored to %out.
; The IR-level assertions expect the shl/sdiv pair to come out of the opt run
; unchanged.
; The llc assertions expect the full inline division expansion rather than a
; shift-only sequence: after s_lshl_b64 builds the denominator, both operands
; go through an s_ashr_i32 ... 31 / add / s_xor_b64 step, the quotient is
; derived from v_rcp_f32 plus v_mul_lo_u32 / v_mul_hi_u32 and carry chains,
; and the result is finally XORed with and reduced by the XOR of the two
; sign masks.
8432define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
8433; CHECK-LABEL: @sdiv_i64_pow2_shl_denom(
8434; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
8435; CHECK-NEXT:    [[R:%.*]] = sdiv i64 [[X:%.*]], [[SHL_Y]]
8436; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
8437; CHECK-NEXT:    ret void
8438;
8439; GFX6-LABEL: sdiv_i64_pow2_shl_denom:
8440; GFX6:       ; %bb.0:
8441; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xd
8442; GFX6-NEXT:    s_mov_b32 s3, 0
8443; GFX6-NEXT:    s_movk_i32 s2, 0x1000
8444; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
8445; GFX6-NEXT:    s_mov_b32 s7, 0xf000
8446; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8447; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
8448; GFX6-NEXT:    s_ashr_i32 s12, s3, 31
8449; GFX6-NEXT:    s_add_u32 s2, s2, s12
8450; GFX6-NEXT:    s_mov_b32 s13, s12
8451; GFX6-NEXT:    s_addc_u32 s3, s3, s12
8452; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[12:13]
8453; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
8454; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s3
8455; GFX6-NEXT:    s_sub_u32 s4, 0, s2
8456; GFX6-NEXT:    s_subb_u32 s5, 0, s3
8457; GFX6-NEXT:    s_ashr_i32 s14, s11, 31
8458; GFX6-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
8459; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
8460; GFX6-NEXT:    s_mov_b32 s15, s14
8461; GFX6-NEXT:    s_mov_b32 s6, -1
8462; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
8463; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
8464; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
8465; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
8466; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
8467; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
8468; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v0
8469; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v1
8470; GFX6-NEXT:    v_mul_lo_u32 v5, s5, v0
8471; GFX6-NEXT:    v_mul_lo_u32 v4, s4, v0
8472; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8473; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
8474; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
8475; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
8476; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
8477; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
8478; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
8479; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
8480; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
8481; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
8482; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
8483; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
8484; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
8485; GFX6-NEXT:    v_mov_b32_e32 v4, 0
8486; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
8487; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8488; GFX6-NEXT:    v_mov_b32_e32 v6, 0
8489; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
8490; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
8491; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
8492; GFX6-NEXT:    v_mul_lo_u32 v5, s4, v2
8493; GFX6-NEXT:    v_mul_hi_u32 v7, s4, v0
8494; GFX6-NEXT:    v_mul_lo_u32 v8, s5, v0
8495; GFX6-NEXT:    s_mov_b32 s5, s9
8496; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
8497; GFX6-NEXT:    v_mul_lo_u32 v7, s4, v0
8498; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
8499; GFX6-NEXT:    v_mul_lo_u32 v10, v0, v5
8500; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v5
8501; GFX6-NEXT:    v_mul_hi_u32 v11, v0, v7
8502; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v7
8503; GFX6-NEXT:    v_mul_lo_u32 v7, v2, v7
8504; GFX6-NEXT:    v_mul_hi_u32 v8, v2, v5
8505; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
8506; GFX6-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
8507; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v5
8508; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
8509; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
8510; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
8511; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
8512; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
8513; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
8514; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[0:1]
8515; GFX6-NEXT:    s_add_u32 s0, s10, s14
8516; GFX6-NEXT:    s_addc_u32 s1, s11, s14
8517; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
8518; GFX6-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
8519; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
8520; GFX6-NEXT:    v_mul_lo_u32 v2, s10, v1
8521; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v0
8522; GFX6-NEXT:    v_mul_hi_u32 v5, s10, v1
8523; GFX6-NEXT:    v_mul_hi_u32 v7, s11, v1
8524; GFX6-NEXT:    v_mul_lo_u32 v1, s11, v1
8525; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8526; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
8527; GFX6-NEXT:    v_mul_lo_u32 v5, s11, v0
8528; GFX6-NEXT:    v_mul_hi_u32 v0, s11, v0
8529; GFX6-NEXT:    s_mov_b32 s4, s8
8530; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
8531; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
8532; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
8533; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
8534; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
8535; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
8536; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
8537; GFX6-NEXT:    v_mul_lo_u32 v4, s3, v0
8538; GFX6-NEXT:    v_mov_b32_e32 v5, s3
8539; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8540; GFX6-NEXT:    v_mul_lo_u32 v3, s2, v0
8541; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
8542; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s11, v2
8543; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s10, v3
8544; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
8545; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s2, v3
8546; GFX6-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
8547; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v4
8548; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
8549; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v5
8550; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
8551; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v4
8552; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
8553; GFX6-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
8554; GFX6-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
8555; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 1, v0
8556; GFX6-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
8557; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
8558; GFX6-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[0:1]
8559; GFX6-NEXT:    v_mov_b32_e32 v6, s11
8560; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
8561; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
8562; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
8563; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s2, v3
8564; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
8565; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s3, v2
8566; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
8567; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
8568; GFX6-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
8569; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
8570; GFX6-NEXT:    s_xor_b64 s[0:1], s[14:15], s[12:13]
8571; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
8572; GFX6-NEXT:    v_xor_b32_e32 v0, s0, v0
8573; GFX6-NEXT:    v_xor_b32_e32 v1, s1, v1
8574; GFX6-NEXT:    v_mov_b32_e32 v2, s1
8575; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
8576; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
8577; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
8578; GFX6-NEXT:    s_endpgm
8579; GFX9-LABEL: sdiv_i64_pow2_shl_denom:
8580; GFX9:       ; %bb.0:
8581; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x34
8582; GFX9-NEXT:    s_mov_b32 s3, 0
8583; GFX9-NEXT:    s_movk_i32 s2, 0x1000
8584; GFX9-NEXT:    v_mov_b32_e32 v2, 0
8585; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8586; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
8587; GFX9-NEXT:    s_ashr_i32 s8, s3, 31
8588; GFX9-NEXT:    s_add_u32 s2, s2, s8
8589; GFX9-NEXT:    s_mov_b32 s9, s8
8590; GFX9-NEXT:    s_addc_u32 s3, s3, s8
8591; GFX9-NEXT:    s_xor_b64 s[10:11], s[2:3], s[8:9]
8592; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s10
8593; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s11
8594; GFX9-NEXT:    s_sub_u32 s12, 0, s10
8595; GFX9-NEXT:    s_subb_u32 s4, 0, s11
8596; GFX9-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
8597; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
8598; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
8599; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
8600; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
8601; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
8602; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
8603; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
8604; GFX9-NEXT:    v_mul_hi_u32 v4, s12, v0
8605; GFX9-NEXT:    v_mul_lo_u32 v3, s12, v1
8606; GFX9-NEXT:    v_mul_lo_u32 v6, s4, v0
8607; GFX9-NEXT:    v_mul_lo_u32 v5, s12, v0
8608; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
8609; GFX9-NEXT:    v_add_u32_e32 v3, v3, v6
8610; GFX9-NEXT:    v_mul_hi_u32 v4, v0, v5
8611; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v3
8612; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v3
8613; GFX9-NEXT:    v_mul_hi_u32 v7, v1, v5
8614; GFX9-NEXT:    v_mul_lo_u32 v5, v1, v5
8615; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
8616; GFX9-NEXT:    v_mul_hi_u32 v9, v1, v3
8617; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v8, vcc
8618; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v3
8619; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
8620; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v7, vcc
8621; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v9, v2, vcc
8622; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
8623; GFX9-NEXT:    v_mov_b32_e32 v6, 0
8624; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v3
8625; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v5, vcc
8626; GFX9-NEXT:    v_addc_co_u32_e64 v3, vcc, v1, v4, s[2:3]
8627; GFX9-NEXT:    v_mul_lo_u32 v5, s12, v3
8628; GFX9-NEXT:    v_mul_hi_u32 v7, s12, v0
8629; GFX9-NEXT:    v_mul_lo_u32 v8, s4, v0
8630; GFX9-NEXT:    v_mul_lo_u32 v9, s12, v0
8631; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
8632; GFX9-NEXT:    v_add_u32_e32 v5, v7, v5
8633; GFX9-NEXT:    v_add_u32_e32 v5, v5, v8
8634; GFX9-NEXT:    v_mul_lo_u32 v10, v0, v5
8635; GFX9-NEXT:    v_mul_hi_u32 v11, v0, v9
8636; GFX9-NEXT:    v_mul_hi_u32 v12, v0, v5
8637; GFX9-NEXT:    v_mul_hi_u32 v8, v3, v9
8638; GFX9-NEXT:    v_mul_lo_u32 v9, v3, v9
8639; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
8640; GFX9-NEXT:    v_mul_hi_u32 v7, v3, v5
8641; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v12, vcc
8642; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v5
8643; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v10, v9
8644; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v8, vcc
8645; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v2, vcc
8646; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v8, v3
8647; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8648; GFX9-NEXT:    s_ashr_i32 s12, s7, 31
8649; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v6, v5, vcc
8650; GFX9-NEXT:    v_add_u32_e32 v1, v1, v4
8651; GFX9-NEXT:    s_add_u32 s0, s6, s12
8652; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v5, s[2:3]
8653; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
8654; GFX9-NEXT:    s_mov_b32 s13, s12
8655; GFX9-NEXT:    s_addc_u32 s1, s7, s12
8656; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[12:13]
8657; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
8658; GFX9-NEXT:    v_mul_lo_u32 v3, s6, v1
8659; GFX9-NEXT:    v_mul_hi_u32 v4, s6, v0
8660; GFX9-NEXT:    v_mul_hi_u32 v5, s6, v1
8661; GFX9-NEXT:    v_mul_hi_u32 v7, s7, v1
8662; GFX9-NEXT:    v_mul_lo_u32 v1, s7, v1
8663; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
8664; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
8665; GFX9-NEXT:    v_mul_lo_u32 v5, s7, v0
8666; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
8667; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
8668; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v4, v0, vcc
8669; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v2, vcc
8670; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
8671; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v6, v3, vcc
8672; GFX9-NEXT:    v_mul_lo_u32 v3, s10, v1
8673; GFX9-NEXT:    v_mul_hi_u32 v4, s10, v0
8674; GFX9-NEXT:    v_mul_lo_u32 v5, s11, v0
8675; GFX9-NEXT:    v_mov_b32_e32 v6, s11
8676; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
8677; GFX9-NEXT:    v_mul_lo_u32 v4, s10, v0
8678; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
8679; GFX9-NEXT:    v_sub_u32_e32 v5, s7, v3
8680; GFX9-NEXT:    v_sub_co_u32_e64 v4, s[0:1], s6, v4
8681; GFX9-NEXT:    v_subb_co_u32_e64 v5, vcc, v5, v6, s[0:1]
8682; GFX9-NEXT:    v_subrev_co_u32_e32 v6, vcc, s10, v4
8683; GFX9-NEXT:    v_subbrev_co_u32_e32 v5, vcc, 0, v5, vcc
8684; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v5
8685; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
8686; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v6
8687; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
8688; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v5
8689; GFX9-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc
8690; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, 2, v0
8691; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
8692; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, 1, v0
8693; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v1, vcc
8694; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v5
8695; GFX9-NEXT:    v_cndmask_b32_e64 v5, v9, v7, s[2:3]
8696; GFX9-NEXT:    v_mov_b32_e32 v7, s7
8697; GFX9-NEXT:    v_subb_co_u32_e64 v3, vcc, v7, v3, s[0:1]
8698; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
8699; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
8700; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v4
8701; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
8702; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v3
8703; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v4, vcc
8704; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
8705; GFX9-NEXT:    v_cndmask_b32_e64 v3, v8, v6, s[2:3]
8706; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
8707; GFX9-NEXT:    s_xor_b64 s[0:1], s[12:13], s[8:9]
8708; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
8709; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v0
8710; GFX9-NEXT:    v_xor_b32_e32 v1, s1, v1
8711; GFX9-NEXT:    v_mov_b32_e32 v3, s1
8712; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v0
8713; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
8714; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
8715; GFX9-NEXT:    s_endpgm
8716  %shl.y = shl i64 4096, %y
8717  %r = sdiv i64 %x, %shl.y
8718  store i64 %r, i64 addrspace(1)* %out
8719  ret void
8720}
8721
; Test: signed division of a <2 x i64> vector by a splat power-of-two
; constant (4096 in both lanes), with the result stored to %out.
;
; What the autogenerated assertions below pin down:
;  * opt output: the vector sdiv is split into two scalar `sdiv i64 ..., 4096`
;    operations (extractelement / sdiv / insertelement per lane).
;  * llc output for both targets: each lane is lowered to a short scalar shift
;    sequence (s_ashr_i32 by 31, s_lshr_b32 by 20, 64-bit add with carry,
;    s_ashr_i64 by 12); no reciprocal-based division expansion is emitted.
8722define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
8723; CHECK-LABEL: @sdiv_v2i64_pow2k_denom(
8724; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
8725; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096
8726; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
8727; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
8728; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4096
8729; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
8730; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
8731; CHECK-NEXT:    ret void
8732;
8733; GFX6-LABEL: sdiv_v2i64_pow2k_denom:
8734; GFX6:       ; %bb.0:
8735; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
8736; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
8737; GFX6-NEXT:    s_mov_b32 s7, 0xf000
8738; GFX6-NEXT:    s_mov_b32 s6, -1
8739; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8740; GFX6-NEXT:    s_ashr_i32 s8, s1, 31
8741; GFX6-NEXT:    s_lshr_b32 s8, s8, 20
8742; GFX6-NEXT:    s_add_u32 s0, s0, s8
8743; GFX6-NEXT:    s_addc_u32 s1, s1, 0
8744; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
8745; GFX6-NEXT:    s_ashr_i64 s[0:1], s[0:1], 12
8746; GFX6-NEXT:    s_lshr_b32 s8, s8, 20
8747; GFX6-NEXT:    s_add_u32 s2, s2, s8
8748; GFX6-NEXT:    s_addc_u32 s3, s3, 0
8749; GFX6-NEXT:    s_ashr_i64 s[2:3], s[2:3], 12
8750; GFX6-NEXT:    v_mov_b32_e32 v0, s0
8751; GFX6-NEXT:    v_mov_b32_e32 v1, s1
8752; GFX6-NEXT:    v_mov_b32_e32 v2, s2
8753; GFX6-NEXT:    v_mov_b32_e32 v3, s3
8754; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
8755; GFX6-NEXT:    s_endpgm
8756; GFX9-LABEL: sdiv_v2i64_pow2k_denom:
8757; GFX9:       ; %bb.0:
8758; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
8759; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
8760; GFX9-NEXT:    v_mov_b32_e32 v4, 0
8761; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8762; GFX9-NEXT:    s_ashr_i32 s0, s5, 31
8763; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
8764; GFX9-NEXT:    s_add_u32 s0, s4, s0
8765; GFX9-NEXT:    s_addc_u32 s1, s5, 0
8766; GFX9-NEXT:    s_ashr_i32 s4, s7, 31
8767; GFX9-NEXT:    s_ashr_i64 s[0:1], s[0:1], 12
8768; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
8769; GFX9-NEXT:    s_add_u32 s4, s6, s4
8770; GFX9-NEXT:    s_addc_u32 s5, s7, 0
8771; GFX9-NEXT:    s_ashr_i64 s[4:5], s[4:5], 12
8772; GFX9-NEXT:    v_mov_b32_e32 v0, s0
8773; GFX9-NEXT:    v_mov_b32_e32 v1, s1
8774; GFX9-NEXT:    v_mov_b32_e32 v2, s4
8775; GFX9-NEXT:    v_mov_b32_e32 v3, s5
8776; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
8777; GFX9-NEXT:    s_endpgm
; Input IR under test: both divisor lanes are the same constant, 4096 (2^12).
8778  %r = sdiv <2 x i64> %x, <i64 4096, i64 4096>
8779  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
8780  ret void
8781}
8782
; Test: signed division of a <2 x i64> vector by a constant vector whose lanes
; differ in kind: lane 0 divides by 4096 (a power of two), lane 1 divides by
; 4095 (not a power of two). The result is stored to %out.
;
; What the autogenerated assertions below pin down:
;  * opt output: the vector sdiv is split into `sdiv i64 ..., 4096` for lane 0
;    and `sdiv i64 ..., 4095` for lane 1.
;  * llc output for both targets: lane 0 uses the short scalar shift sequence
;    (s_ashr_i32 by 31, s_lshr_b32 by 20, add with carry, s_ashr_i64 by 12),
;    while lane 1 goes through the long expansion that starts from v_rcp_f32,
;    refines the estimate with v_mul_hi_u32 / v_mul_lo_u32 chains, corrects the
;    quotient with compare/select, and finishes with an xor/subtract against
;    the sign word obtained from `s_ashr_i32 ..., 31` of the dividend's high
;    half.
;
; NOTE(review): the leading "ssdiv" in the function name looks like a typo for
; "sdiv" (the neighbouring tests are named sdiv_v2i64_*). It is left unchanged
; here because the symbol name is matched by the three label checks below.
8783define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
8784; CHECK-LABEL: @ssdiv_v2i64_mixed_pow2k_denom(
8785; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
8786; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096
8787; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
8788; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
8789; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4095
8790; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
8791; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
8792; CHECK-NEXT:    ret void
8793;
8794; GFX6-LABEL: ssdiv_v2i64_mixed_pow2k_denom:
8795; GFX6:       ; %bb.0:
8796; GFX6-NEXT:    v_mov_b32_e32 v0, 0x457ff000
8797; GFX6-NEXT:    v_mov_b32_e32 v1, 0x4f800000
8798; GFX6-NEXT:    v_mac_f32_e32 v0, 0, v1
8799; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
8800; GFX6-NEXT:    s_movk_i32 s6, 0xf001
8801; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
8802; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
8803; GFX6-NEXT:    s_mov_b32 s7, 0xf000
8804; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
8805; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
8806; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
8807; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
8808; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
8809; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
8810; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
8811; GFX6-NEXT:    s_ashr_i32 s0, s9, 31
8812; GFX6-NEXT:    s_lshr_b32 s0, s0, 20
8813; GFX6-NEXT:    v_mul_hi_u32 v2, s6, v0
8814; GFX6-NEXT:    v_mul_lo_u32 v3, v1, s6
8815; GFX6-NEXT:    s_add_u32 s2, s8, s0
8816; GFX6-NEXT:    s_addc_u32 s3, s9, 0
8817; GFX6-NEXT:    s_ashr_i32 s8, s11, 31
8818; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
8819; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s6
8820; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
8821; GFX6-NEXT:    v_mul_lo_u32 v4, v0, v2
8822; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
8823; GFX6-NEXT:    v_mul_hi_u32 v5, v0, v3
8824; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
8825; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
8826; GFX6-NEXT:    s_ashr_i64 s[2:3], s[2:3], 12
8827; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
8828; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
8829; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v3
8830; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
8831; GFX6-NEXT:    s_mov_b32 s9, s8
8832; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
8833; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
8834; GFX6-NEXT:    v_mov_b32_e32 v4, 0
8835; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
8836; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8837; GFX6-NEXT:    v_mov_b32_e32 v6, 0
8838; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
8839; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
8840; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
8841; GFX6-NEXT:    v_mul_lo_u32 v5, v2, s6
8842; GFX6-NEXT:    v_mul_hi_u32 v7, s6, v0
8843; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
8844; GFX6-NEXT:    v_mul_lo_u32 v7, v0, s6
8845; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, v0, v5
8846; GFX6-NEXT:    v_mul_lo_u32 v10, v0, v5
8847; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v5
8848; GFX6-NEXT:    v_mul_hi_u32 v11, v0, v7
8849; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v7
8850; GFX6-NEXT:    v_mul_lo_u32 v7, v2, v7
8851; GFX6-NEXT:    v_mul_hi_u32 v8, v2, v5
8852; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
8853; GFX6-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
8854; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v5
8855; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
8856; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
8857; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
8858; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
8859; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
8860; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
8861; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[0:1]
8862; GFX6-NEXT:    s_add_u32 s0, s10, s8
8863; GFX6-NEXT:    s_addc_u32 s1, s11, s8
8864; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
8865; GFX6-NEXT:    s_xor_b64 s[0:1], s[0:1], s[8:9]
8866; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
8867; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v1
8868; GFX6-NEXT:    v_mul_hi_u32 v3, s0, v0
8869; GFX6-NEXT:    v_mul_hi_u32 v5, s0, v1
8870; GFX6-NEXT:    v_mul_hi_u32 v7, s1, v1
8871; GFX6-NEXT:    v_mul_lo_u32 v1, s1, v1
8872; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8873; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
8874; GFX6-NEXT:    v_mul_lo_u32 v5, s1, v0
8875; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
8876; GFX6-NEXT:    s_movk_i32 s9, 0xfff
8877; GFX6-NEXT:    s_mov_b32 s6, -1
8878; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
8879; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
8880; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
8881; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
8882; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
8883; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s9
8884; GFX6-NEXT:    v_mul_hi_u32 v3, s9, v0
8885; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s9
8886; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
8887; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s0, v4
8888; GFX6-NEXT:    v_mov_b32_e32 v3, s1
8889; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v3, v2, vcc
8890; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v4
8891; GFX6-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v2, vcc
8892; GFX6-NEXT:    s_movk_i32 s0, 0xffe
8893; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v3
8894; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
8895; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
8896; GFX6-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
8897; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 2, v0
8898; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
8899; GFX6-NEXT:    v_add_i32_e32 v7, vcc, 1, v0
8900; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v4
8901; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
8902; GFX6-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[0:1]
8903; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
8904; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
8905; GFX6-NEXT:    v_cndmask_b32_e64 v2, -1, v4, s[0:1]
8906; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v2
8907; GFX6-NEXT:    v_cndmask_b32_e32 v2, v7, v5, vcc
8908; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
8909; GFX6-NEXT:    v_cndmask_b32_e32 v3, v8, v6, vcc
8910; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
8911; GFX6-NEXT:    v_xor_b32_e32 v0, s8, v0
8912; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s8, v0
8913; GFX6-NEXT:    v_xor_b32_e32 v1, s8, v1
8914; GFX6-NEXT:    v_mov_b32_e32 v3, s8
8915; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v1, v3, vcc
8916; GFX6-NEXT:    v_mov_b32_e32 v0, s2
8917; GFX6-NEXT:    v_mov_b32_e32 v1, s3
8918; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
8919; GFX6-NEXT:    s_endpgm
8920; GFX9-LABEL: ssdiv_v2i64_mixed_pow2k_denom:
8921; GFX9:       ; %bb.0:
8922; GFX9-NEXT:    v_mov_b32_e32 v0, 0x457ff000
8923; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4f800000
8924; GFX9-NEXT:    v_mac_f32_e32 v0, 0, v1
8925; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
8926; GFX9-NEXT:    s_movk_i32 s8, 0xf001
8927; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
8928; GFX9-NEXT:    v_mov_b32_e32 v4, 0
8929; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
8930; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
8931; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
8932; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
8933; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
8934; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
8935; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
8936; GFX9-NEXT:    s_ashr_i32 s2, s5, 31
8937; GFX9-NEXT:    s_lshr_b32 s2, s2, 20
8938; GFX9-NEXT:    v_mul_hi_u32 v2, s8, v0
8939; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s8
8940; GFX9-NEXT:    v_mul_lo_u32 v5, v0, s8
8941; GFX9-NEXT:    s_add_u32 s4, s4, s2
8942; GFX9-NEXT:    s_addc_u32 s5, s5, 0
8943; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
8944; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
8945; GFX9-NEXT:    v_mul_lo_u32 v3, v0, v2
8946; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v5
8947; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
8948; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
8949; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
8950; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v6, v3
8951; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v7, vcc
8952; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v5
8953; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v5
8954; GFX9-NEXT:    s_ashr_i64 s[4:5], s[4:5], 12
8955; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
8956; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v5, vcc
8957; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v4, vcc
8958; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
8959; GFX9-NEXT:    v_mov_b32_e32 v6, 0
8960; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
8961; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v5, vcc
8962; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
8963; GFX9-NEXT:    v_mul_lo_u32 v5, v2, s8
8964; GFX9-NEXT:    v_mul_hi_u32 v7, s8, v0
8965; GFX9-NEXT:    v_mul_lo_u32 v8, v0, s8
8966; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
8967; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
8968; GFX9-NEXT:    v_add_u32_e32 v5, v7, v5
8969; GFX9-NEXT:    v_sub_u32_e32 v5, v5, v0
8970; GFX9-NEXT:    v_mul_lo_u32 v10, v0, v5
8971; GFX9-NEXT:    v_mul_hi_u32 v11, v0, v8
8972; GFX9-NEXT:    v_mul_hi_u32 v12, v0, v5
8973; GFX9-NEXT:    v_mul_hi_u32 v9, v2, v8
8974; GFX9-NEXT:    v_mul_lo_u32 v8, v2, v8
8975; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
8976; GFX9-NEXT:    v_mul_hi_u32 v7, v2, v5
8977; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v12, vcc
8978; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v5
8979; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v10, v8
8980; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
8981; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v4, vcc
8982; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
8983; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v6, v5, vcc
8984; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v5, s[2:3]
8985; GFX9-NEXT:    s_ashr_i32 s2, s7, 31
8986; GFX9-NEXT:    s_add_u32 s6, s6, s2
8987; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
8988; GFX9-NEXT:    s_mov_b32 s3, s2
8989; GFX9-NEXT:    s_addc_u32 s7, s7, s2
8990; GFX9-NEXT:    s_xor_b64 s[6:7], s[6:7], s[2:3]
8991; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
8992; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v1
8993; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v0
8994; GFX9-NEXT:    v_mul_hi_u32 v5, s6, v1
8995; GFX9-NEXT:    v_mul_hi_u32 v7, s7, v1
8996; GFX9-NEXT:    v_mul_lo_u32 v1, s7, v1
8997; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
8998; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
8999; GFX9-NEXT:    v_mul_lo_u32 v5, s7, v0
9000; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
9001; GFX9-NEXT:    s_movk_i32 s3, 0xfff
9002; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
9003; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
9004; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v7, v4, vcc
9005; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
9006; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v6, v2, vcc
9007; GFX9-NEXT:    v_mul_lo_u32 v5, v0, s3
9008; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s3
9009; GFX9-NEXT:    v_mul_hi_u32 v3, s3, v0
9010; GFX9-NEXT:    v_sub_co_u32_e32 v5, vcc, s6, v5
9011; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
9012; GFX9-NEXT:    v_mov_b32_e32 v3, s7
9013; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v3, v2, vcc
9014; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s3, v5
9015; GFX9-NEXT:    v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc
9016; GFX9-NEXT:    s_movk_i32 s3, 0xffe
9017; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s3, v3
9018; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
9019; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
9020; GFX9-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
9021; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, 2, v0
9022; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v1, vcc
9023; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, 1, v0
9024; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v1, vcc
9025; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s3, v5
9026; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
9027; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
9028; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v3
9029; GFX9-NEXT:    v_cndmask_b32_e32 v2, -1, v5, vcc
9030; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
9031; GFX9-NEXT:    v_cndmask_b32_e64 v2, v8, v6, s[0:1]
9032; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
9033; GFX9-NEXT:    v_cndmask_b32_e64 v3, v9, v7, s[0:1]
9034; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
9035; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
9036; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s2, v0
9037; GFX9-NEXT:    v_xor_b32_e32 v1, s2, v1
9038; GFX9-NEXT:    v_mov_b32_e32 v3, s2
9039; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
9040; GFX9-NEXT:    v_mov_b32_e32 v0, s4
9041; GFX9-NEXT:    v_mov_b32_e32 v1, s5
9042; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
9043; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[8:9]
9044; GFX9-NEXT:    s_endpgm
; Input IR under test: lane 0 divisor is 4096 (2^12), lane 1 divisor is 4095.
9045  %r = sdiv <2 x i64> %x, <i64 4096, i64 4095>
9046  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
9047  ret void
9048}
9049
9050define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
9051; CHECK-LABEL: @sdiv_v2i64_pow2_shl_denom(
9052; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
9053; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
9054; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
9055; CHECK-NEXT:    [[TMP3:%.*]] = sdiv i64 [[TMP1]], [[TMP2]]
9056; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0
9057; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
9058; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
9059; CHECK-NEXT:    [[TMP7:%.*]] = sdiv i64 [[TMP5]], [[TMP6]]
9060; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
9061; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
9062; CHECK-NEXT:    ret void
9063;
9064; GFX6-LABEL: sdiv_v2i64_pow2_shl_denom:
9065; GFX6:       ; %bb.0:
9066; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x11
9067; GFX6-NEXT:    s_mov_b32 s3, 0
9068; GFX6-NEXT:    s_movk_i32 s2, 0x1000
9069; GFX6-NEXT:    s_mov_b32 s18, 0x4f800000
9070; GFX6-NEXT:    s_mov_b32 s19, 0x5f7ffffc
9071; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
9072; GFX6-NEXT:    s_lshl_b64 s[12:13], s[2:3], s6
9073; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
9074; GFX6-NEXT:    s_ashr_i32 s16, s3, 31
9075; GFX6-NEXT:    s_add_u32 s2, s2, s16
9076; GFX6-NEXT:    s_mov_b32 s17, s16
9077; GFX6-NEXT:    s_addc_u32 s3, s3, s16
9078; GFX6-NEXT:    s_xor_b64 s[14:15], s[2:3], s[16:17]
9079; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s14
9080; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s15
9081; GFX6-NEXT:    s_mov_b32 s20, 0x2f800000
9082; GFX6-NEXT:    s_mov_b32 s21, 0xcf800000
9083; GFX6-NEXT:    s_sub_u32 s6, 0, s14
9084; GFX6-NEXT:    v_mac_f32_e32 v0, s18, v1
9085; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
9086; GFX6-NEXT:    s_subb_u32 s7, 0, s15
9087; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
9088; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
9089; GFX6-NEXT:    v_mul_f32_e32 v0, s19, v0
9090; GFX6-NEXT:    v_mul_f32_e32 v1, s20, v0
9091; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
9092; GFX6-NEXT:    v_mac_f32_e32 v0, s21, v1
9093; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
9094; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
9095; GFX6-NEXT:    v_mul_hi_u32 v3, s6, v0
9096; GFX6-NEXT:    v_mul_lo_u32 v2, s6, v1
9097; GFX6-NEXT:    v_mul_lo_u32 v4, s7, v0
9098; GFX6-NEXT:    v_mul_lo_u32 v5, s6, v0
9099; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9100; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
9101; GFX6-NEXT:    v_mul_lo_u32 v3, v0, v2
9102; GFX6-NEXT:    v_mul_hi_u32 v4, v0, v5
9103; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
9104; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
9105; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
9106; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
9107; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v6, vcc
9108; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v5
9109; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v5
9110; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
9111; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
9112; GFX6-NEXT:    v_mov_b32_e32 v4, 0
9113; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
9114; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9115; GFX6-NEXT:    v_mov_b32_e32 v6, 0
9116; GFX6-NEXT:    v_add_i32_e64 v0, s[2:3], v0, v2
9117; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
9118; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[2:3]
9119; GFX6-NEXT:    v_mul_lo_u32 v5, s6, v2
9120; GFX6-NEXT:    v_mul_hi_u32 v7, s6, v0
9121; GFX6-NEXT:    v_mul_lo_u32 v8, s7, v0
9122; GFX6-NEXT:    s_mov_b32 s7, 0xf000
9123; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
9124; GFX6-NEXT:    v_mul_lo_u32 v7, s6, v0
9125; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
9126; GFX6-NEXT:    v_mul_lo_u32 v10, v0, v5
9127; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v5
9128; GFX6-NEXT:    v_mul_hi_u32 v11, v0, v7
9129; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v7
9130; GFX6-NEXT:    v_mul_lo_u32 v7, v2, v7
9131; GFX6-NEXT:    v_mul_hi_u32 v8, v2, v5
9132; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
9133; GFX6-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
9134; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v5
9135; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
9136; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
9137; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
9138; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
9139; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
9140; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
9141; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[2:3]
9142; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
9143; GFX6-NEXT:    s_ashr_i32 s2, s9, 31
9144; GFX6-NEXT:    s_add_u32 s0, s8, s2
9145; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
9146; GFX6-NEXT:    s_mov_b32 s3, s2
9147; GFX6-NEXT:    s_addc_u32 s1, s9, s2
9148; GFX6-NEXT:    s_xor_b64 s[8:9], s[0:1], s[2:3]
9149; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
9150; GFX6-NEXT:    v_mul_lo_u32 v2, s8, v1
9151; GFX6-NEXT:    v_mul_hi_u32 v3, s8, v0
9152; GFX6-NEXT:    v_mul_hi_u32 v5, s8, v1
9153; GFX6-NEXT:    v_mul_hi_u32 v7, s9, v1
9154; GFX6-NEXT:    v_mul_lo_u32 v1, s9, v1
9155; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9156; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
9157; GFX6-NEXT:    v_mul_lo_u32 v5, s9, v0
9158; GFX6-NEXT:    v_mul_hi_u32 v0, s9, v0
9159; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[16:17]
9160; GFX6-NEXT:    s_mov_b32 s6, -1
9161; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
9162; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
9163; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
9164; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
9165; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
9166; GFX6-NEXT:    v_mul_lo_u32 v2, s14, v1
9167; GFX6-NEXT:    v_mul_hi_u32 v3, s14, v0
9168; GFX6-NEXT:    v_mul_lo_u32 v5, s15, v0
9169; GFX6-NEXT:    v_mov_b32_e32 v7, s15
9170; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9171; GFX6-NEXT:    v_mul_lo_u32 v3, s14, v0
9172; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
9173; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s9, v2
9174; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s8, v3
9175; GFX6-NEXT:    v_subb_u32_e64 v5, s[0:1], v5, v7, vcc
9176; GFX6-NEXT:    v_subrev_i32_e64 v7, s[0:1], s14, v3
9177; GFX6-NEXT:    v_subbrev_u32_e64 v5, s[0:1], 0, v5, s[0:1]
9178; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v5
9179; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
9180; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v7
9181; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
9182; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s15, v5
9183; GFX6-NEXT:    v_cndmask_b32_e64 v5, v8, v7, s[0:1]
9184; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 2, v0
9185; GFX6-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
9186; GFX6-NEXT:    v_add_i32_e64 v9, s[0:1], 1, v0
9187; GFX6-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v1, s[0:1]
9188; GFX6-NEXT:    s_ashr_i32 s8, s13, 31
9189; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
9190; GFX6-NEXT:    s_add_u32 s12, s12, s8
9191; GFX6-NEXT:    v_cndmask_b32_e64 v5, v10, v8, s[0:1]
9192; GFX6-NEXT:    v_mov_b32_e32 v8, s9
9193; GFX6-NEXT:    s_mov_b32 s9, s8
9194; GFX6-NEXT:    s_addc_u32 s13, s13, s8
9195; GFX6-NEXT:    s_xor_b64 s[12:13], s[12:13], s[8:9]
9196; GFX6-NEXT:    v_cvt_f32_u32_e32 v10, s12
9197; GFX6-NEXT:    v_cvt_f32_u32_e32 v11, s13
9198; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v8, v2, vcc
9199; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s15, v2
9200; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
9201; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s14, v3
9202; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
9203; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s15, v2
9204; GFX6-NEXT:    v_mac_f32_e32 v10, s18, v11
9205; GFX6-NEXT:    v_cndmask_b32_e32 v2, v8, v3, vcc
9206; GFX6-NEXT:    v_rcp_f32_e32 v3, v10
9207; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
9208; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
9209; GFX6-NEXT:    s_sub_u32 s14, 0, s12
9210; GFX6-NEXT:    v_mul_f32_e32 v3, s19, v3
9211; GFX6-NEXT:    v_mul_f32_e32 v5, s20, v3
9212; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
9213; GFX6-NEXT:    v_mac_f32_e32 v3, s21, v5
9214; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
9215; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v5
9216; GFX6-NEXT:    v_cndmask_b32_e64 v2, v9, v7, s[0:1]
9217; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
9218; GFX6-NEXT:    v_mul_hi_u32 v2, s14, v3
9219; GFX6-NEXT:    v_mul_lo_u32 v7, s14, v5
9220; GFX6-NEXT:    s_subb_u32 s15, 0, s13
9221; GFX6-NEXT:    v_mul_lo_u32 v8, s15, v3
9222; GFX6-NEXT:    v_xor_b32_e32 v0, s2, v0
9223; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
9224; GFX6-NEXT:    v_mul_lo_u32 v7, s14, v3
9225; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
9226; GFX6-NEXT:    v_mul_lo_u32 v8, v3, v2
9227; GFX6-NEXT:    v_mul_hi_u32 v10, v3, v2
9228; GFX6-NEXT:    v_mul_hi_u32 v9, v3, v7
9229; GFX6-NEXT:    v_mul_hi_u32 v11, v5, v2
9230; GFX6-NEXT:    v_mul_lo_u32 v2, v5, v2
9231; GFX6-NEXT:    v_xor_b32_e32 v1, s3, v1
9232; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
9233; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
9234; GFX6-NEXT:    v_mul_lo_u32 v10, v5, v7
9235; GFX6-NEXT:    v_mul_hi_u32 v7, v5, v7
9236; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
9237; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v9, v7, vcc
9238; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v11, v4, vcc
9239; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
9240; GFX6-NEXT:    v_add_i32_e64 v2, s[0:1], v3, v2
9241; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v6, v8, vcc
9242; GFX6-NEXT:    v_addc_u32_e64 v3, vcc, v5, v7, s[0:1]
9243; GFX6-NEXT:    v_mul_lo_u32 v8, s14, v3
9244; GFX6-NEXT:    v_mul_hi_u32 v9, s14, v2
9245; GFX6-NEXT:    v_mul_lo_u32 v10, s15, v2
9246; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
9247; GFX6-NEXT:    v_mul_lo_u32 v9, s14, v2
9248; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
9249; GFX6-NEXT:    v_mul_lo_u32 v12, v2, v8
9250; GFX6-NEXT:    v_mul_hi_u32 v14, v2, v8
9251; GFX6-NEXT:    v_mul_hi_u32 v13, v2, v9
9252; GFX6-NEXT:    v_mul_hi_u32 v11, v3, v9
9253; GFX6-NEXT:    v_mul_lo_u32 v9, v3, v9
9254; GFX6-NEXT:    v_mul_hi_u32 v10, v3, v8
9255; GFX6-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
9256; GFX6-NEXT:    v_addc_u32_e32 v13, vcc, 0, v14, vcc
9257; GFX6-NEXT:    v_mul_lo_u32 v3, v3, v8
9258; GFX6-NEXT:    v_add_i32_e32 v9, vcc, v12, v9
9259; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v13, v11, vcc
9260; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v10, v4, vcc
9261; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v9, v3
9262; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v6, v8, vcc
9263; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
9264; GFX6-NEXT:    s_ashr_i32 s14, s11, 31
9265; GFX6-NEXT:    v_addc_u32_e64 v5, vcc, v5, v8, s[0:1]
9266; GFX6-NEXT:    s_add_u32 s0, s10, s14
9267; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
9268; GFX6-NEXT:    s_mov_b32 s15, s14
9269; GFX6-NEXT:    s_addc_u32 s1, s11, s14
9270; GFX6-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
9271; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
9272; GFX6-NEXT:    v_mul_lo_u32 v5, s10, v3
9273; GFX6-NEXT:    v_mul_hi_u32 v7, s10, v2
9274; GFX6-NEXT:    v_mul_hi_u32 v9, s10, v3
9275; GFX6-NEXT:    v_mul_hi_u32 v10, s11, v3
9276; GFX6-NEXT:    v_mul_lo_u32 v3, s11, v3
9277; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
9278; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v9, vcc
9279; GFX6-NEXT:    v_mul_lo_u32 v9, s11, v2
9280; GFX6-NEXT:    v_mul_hi_u32 v2, s11, v2
9281; GFX6-NEXT:    v_mov_b32_e32 v8, s3
9282; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
9283; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v7, v2, vcc
9284; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v10, v4, vcc
9285; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
9286; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v4, vcc
9287; GFX6-NEXT:    v_mul_lo_u32 v4, s12, v3
9288; GFX6-NEXT:    v_mul_hi_u32 v5, s12, v2
9289; GFX6-NEXT:    v_mul_lo_u32 v6, s13, v2
9290; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
9291; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v8, vcc
9292; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
9293; GFX6-NEXT:    v_mul_lo_u32 v5, s12, v2
9294; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
9295; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s11, v4
9296; GFX6-NEXT:    v_mov_b32_e32 v7, s13
9297; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s10, v5
9298; GFX6-NEXT:    v_subb_u32_e64 v6, s[0:1], v6, v7, vcc
9299; GFX6-NEXT:    v_subrev_i32_e64 v7, s[0:1], s12, v5
9300; GFX6-NEXT:    v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1]
9301; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v6
9302; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
9303; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v7
9304; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
9305; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v6
9306; GFX6-NEXT:    v_cndmask_b32_e64 v6, v8, v7, s[0:1]
9307; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 2, v2
9308; GFX6-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1]
9309; GFX6-NEXT:    v_add_i32_e64 v9, s[0:1], 1, v2
9310; GFX6-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
9311; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
9312; GFX6-NEXT:    v_cndmask_b32_e64 v6, v10, v8, s[0:1]
9313; GFX6-NEXT:    v_mov_b32_e32 v8, s11
9314; GFX6-NEXT:    v_subb_u32_e32 v4, vcc, v8, v4, vcc
9315; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s13, v4
9316; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
9317; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s12, v5
9318; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
9319; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v4
9320; GFX6-NEXT:    v_cndmask_b32_e32 v4, v8, v5, vcc
9321; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
9322; GFX6-NEXT:    v_cndmask_b32_e64 v4, v9, v7, s[0:1]
9323; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
9324; GFX6-NEXT:    s_xor_b64 s[0:1], s[14:15], s[8:9]
9325; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
9326; GFX6-NEXT:    v_xor_b32_e32 v2, s0, v2
9327; GFX6-NEXT:    v_xor_b32_e32 v3, s1, v3
9328; GFX6-NEXT:    v_mov_b32_e32 v4, s1
9329; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s0, v2
9330; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
9331; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
9332; GFX6-NEXT:    s_endpgm
9333; GFX9-LABEL: sdiv_v2i64_pow2_shl_denom:
9334; GFX9:       ; %bb.0:
9335; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
9336; GFX9-NEXT:    s_mov_b32 s3, 0
9337; GFX9-NEXT:    s_movk_i32 s2, 0x1000
9338; GFX9-NEXT:    s_mov_b32 s18, 0x4f800000
9339; GFX9-NEXT:    s_mov_b32 s19, 0x5f7ffffc
9340; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
9341; GFX9-NEXT:    s_lshl_b64 s[8:9], s[2:3], s6
9342; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
9343; GFX9-NEXT:    s_ashr_i32 s12, s3, 31
9344; GFX9-NEXT:    s_add_u32 s2, s2, s12
9345; GFX9-NEXT:    s_mov_b32 s13, s12
9346; GFX9-NEXT:    s_addc_u32 s3, s3, s12
9347; GFX9-NEXT:    s_xor_b64 s[10:11], s[2:3], s[12:13]
9348; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s10
9349; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s11
9350; GFX9-NEXT:    s_mov_b32 s20, 0x2f800000
9351; GFX9-NEXT:    s_mov_b32 s21, 0xcf800000
9352; GFX9-NEXT:    s_sub_u32 s14, 0, s10
9353; GFX9-NEXT:    v_mac_f32_e32 v0, s18, v1
9354; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
9355; GFX9-NEXT:    s_subb_u32 s4, 0, s11
9356; GFX9-NEXT:    v_mov_b32_e32 v6, 0
9357; GFX9-NEXT:    v_mul_f32_e32 v0, s19, v0
9358; GFX9-NEXT:    v_mul_f32_e32 v1, s20, v0
9359; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
9360; GFX9-NEXT:    v_mac_f32_e32 v0, s21, v1
9361; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
9362; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
9363; GFX9-NEXT:    v_mul_hi_u32 v3, s14, v0
9364; GFX9-NEXT:    v_mul_lo_u32 v2, s14, v1
9365; GFX9-NEXT:    v_mul_lo_u32 v5, s4, v0
9366; GFX9-NEXT:    v_mul_lo_u32 v4, s14, v0
9367; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
9368; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
9369; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
9370; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v2
9371; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
9372; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
9373; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
9374; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
9375; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
9376; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v4
9377; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
9378; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
9379; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
9380; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v6, vcc
9381; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
9382; GFX9-NEXT:    v_mov_b32_e32 v5, 0
9383; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
9384; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
9385; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
9386; GFX9-NEXT:    v_mul_lo_u32 v4, s14, v2
9387; GFX9-NEXT:    v_mul_hi_u32 v7, s14, v0
9388; GFX9-NEXT:    v_mul_lo_u32 v8, s4, v0
9389; GFX9-NEXT:    v_mul_lo_u32 v9, s14, v0
9390; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
9391; GFX9-NEXT:    v_add_u32_e32 v4, v7, v4
9392; GFX9-NEXT:    v_add_u32_e32 v4, v4, v8
9393; GFX9-NEXT:    v_mul_lo_u32 v10, v0, v4
9394; GFX9-NEXT:    v_mul_hi_u32 v11, v0, v9
9395; GFX9-NEXT:    v_mul_hi_u32 v12, v0, v4
9396; GFX9-NEXT:    v_mul_hi_u32 v8, v2, v9
9397; GFX9-NEXT:    v_mul_lo_u32 v9, v2, v9
9398; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
9399; GFX9-NEXT:    v_mul_hi_u32 v7, v2, v4
9400; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v12, vcc
9401; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4
9402; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v10, v9
9403; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v8, vcc
9404; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v6, vcc
9405; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
9406; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
9407; GFX9-NEXT:    s_ashr_i32 s14, s5, 31
9408; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v5, v4, vcc
9409; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
9410; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3]
9411; GFX9-NEXT:    s_add_u32 s2, s4, s14
9412; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
9413; GFX9-NEXT:    s_addc_u32 s3, s5, s14
9414; GFX9-NEXT:    s_mov_b32 s15, s14
9415; GFX9-NEXT:    s_xor_b64 s[16:17], s[2:3], s[14:15]
9416; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
9417; GFX9-NEXT:    v_mul_lo_u32 v2, s16, v1
9418; GFX9-NEXT:    v_mul_hi_u32 v3, s16, v0
9419; GFX9-NEXT:    v_mul_hi_u32 v4, s16, v1
9420; GFX9-NEXT:    v_mul_hi_u32 v7, s17, v1
9421; GFX9-NEXT:    v_mul_lo_u32 v1, s17, v1
9422; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
9423; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
9424; GFX9-NEXT:    v_mul_lo_u32 v4, s17, v0
9425; GFX9-NEXT:    v_mul_hi_u32 v0, s17, v0
9426; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
9427; GFX9-NEXT:    s_xor_b64 s[12:13], s[14:15], s[12:13]
9428; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
9429; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
9430; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v7, v6, vcc
9431; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
9432; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v2, vcc
9433; GFX9-NEXT:    v_mul_lo_u32 v2, s10, v1
9434; GFX9-NEXT:    v_mul_hi_u32 v3, s10, v0
9435; GFX9-NEXT:    v_mul_lo_u32 v4, s11, v0
9436; GFX9-NEXT:    v_mov_b32_e32 v7, s11
9437; GFX9-NEXT:    s_ashr_i32 s14, s9, 31
9438; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
9439; GFX9-NEXT:    v_mul_lo_u32 v3, s10, v0
9440; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
9441; GFX9-NEXT:    v_sub_u32_e32 v4, s17, v2
9442; GFX9-NEXT:    s_mov_b32 s15, s14
9443; GFX9-NEXT:    v_sub_co_u32_e64 v3, s[0:1], s16, v3
9444; GFX9-NEXT:    v_subb_co_u32_e64 v4, vcc, v4, v7, s[0:1]
9445; GFX9-NEXT:    v_subrev_co_u32_e32 v7, vcc, s10, v3
9446; GFX9-NEXT:    v_subbrev_co_u32_e32 v4, vcc, 0, v4, vcc
9447; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v4
9448; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
9449; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v7
9450; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
9451; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v4
9452; GFX9-NEXT:    v_cndmask_b32_e32 v4, v8, v7, vcc
9453; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, 2, v0
9454; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v1, vcc
9455; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, 1, v0
9456; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, 0, v1, vcc
9457; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v4
9458; GFX9-NEXT:    v_cndmask_b32_e64 v4, v10, v8, s[2:3]
9459; GFX9-NEXT:    v_mov_b32_e32 v8, s17
9460; GFX9-NEXT:    v_subb_co_u32_e64 v2, vcc, v8, v2, s[0:1]
9461; GFX9-NEXT:    s_add_u32 s0, s8, s14
9462; GFX9-NEXT:    s_addc_u32 s1, s9, s14
9463; GFX9-NEXT:    s_xor_b64 s[8:9], s[0:1], s[14:15]
9464; GFX9-NEXT:    v_cvt_f32_u32_e32 v10, s8
9465; GFX9-NEXT:    v_cvt_f32_u32_e32 v11, s9
9466; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v2
9467; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
9468; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
9469; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
9470; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v2
9471; GFX9-NEXT:    v_mac_f32_e32 v10, s18, v11
9472; GFX9-NEXT:    v_cndmask_b32_e32 v2, v8, v3, vcc
9473; GFX9-NEXT:    v_rcp_f32_e32 v3, v10
9474; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
9475; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
9476; GFX9-NEXT:    v_cndmask_b32_e64 v2, v9, v7, s[2:3]
9477; GFX9-NEXT:    v_mul_f32_e32 v3, s19, v3
9478; GFX9-NEXT:    v_mul_f32_e32 v4, s20, v3
9479; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
9480; GFX9-NEXT:    v_mac_f32_e32 v3, s21, v4
9481; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
9482; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
9483; GFX9-NEXT:    s_sub_u32 s2, 0, s8
9484; GFX9-NEXT:    s_subb_u32 s3, 0, s9
9485; GFX9-NEXT:    v_mul_hi_u32 v7, s2, v3
9486; GFX9-NEXT:    v_mul_lo_u32 v8, s2, v4
9487; GFX9-NEXT:    v_mul_lo_u32 v9, s3, v3
9488; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
9489; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v3
9490; GFX9-NEXT:    v_add_u32_e32 v7, v7, v8
9491; GFX9-NEXT:    v_add_u32_e32 v7, v7, v9
9492; GFX9-NEXT:    v_mul_lo_u32 v8, v3, v7
9493; GFX9-NEXT:    v_mul_hi_u32 v9, v3, v2
9494; GFX9-NEXT:    v_mul_hi_u32 v10, v3, v7
9495; GFX9-NEXT:    v_mul_hi_u32 v11, v4, v7
9496; GFX9-NEXT:    v_mul_lo_u32 v7, v4, v7
9497; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v9, v8
9498; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v10, vcc
9499; GFX9-NEXT:    v_mul_lo_u32 v10, v4, v2
9500; GFX9-NEXT:    v_mul_hi_u32 v2, v4, v2
9501; GFX9-NEXT:    s_ashr_i32 s10, s7, 31
9502; GFX9-NEXT:    s_mov_b32 s11, s10
9503; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v10
9504; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v9, v2, vcc
9505; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v6, vcc
9506; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v7
9507; GFX9-NEXT:    v_add_co_u32_e64 v2, s[0:1], v3, v2
9508; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v5, v8, vcc
9509; GFX9-NEXT:    v_addc_co_u32_e64 v3, vcc, v4, v7, s[0:1]
9510; GFX9-NEXT:    v_mul_lo_u32 v8, s2, v3
9511; GFX9-NEXT:    v_mul_hi_u32 v9, s2, v2
9512; GFX9-NEXT:    v_mul_lo_u32 v10, s3, v2
9513; GFX9-NEXT:    v_mul_lo_u32 v11, s2, v2
9514; GFX9-NEXT:    v_add_u32_e32 v4, v4, v7
9515; GFX9-NEXT:    v_add_u32_e32 v8, v9, v8
9516; GFX9-NEXT:    v_add_u32_e32 v8, v8, v10
9517; GFX9-NEXT:    v_mul_lo_u32 v12, v2, v8
9518; GFX9-NEXT:    v_mul_hi_u32 v13, v2, v11
9519; GFX9-NEXT:    v_mul_hi_u32 v14, v2, v8
9520; GFX9-NEXT:    v_mul_hi_u32 v10, v3, v11
9521; GFX9-NEXT:    v_mul_lo_u32 v11, v3, v11
9522; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v13, v12
9523; GFX9-NEXT:    v_mul_hi_u32 v9, v3, v8
9524; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v14, vcc
9525; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v8
9526; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, v12, v11
9527; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v13, v10, vcc
9528; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v9, v6, vcc
9529; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v10, v3
9530; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v5, v8, vcc
9531; GFX9-NEXT:    v_addc_co_u32_e64 v4, vcc, v4, v8, s[0:1]
9532; GFX9-NEXT:    s_add_u32 s0, s6, s10
9533; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
9534; GFX9-NEXT:    s_addc_u32 s1, s7, s10
9535; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
9536; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
9537; GFX9-NEXT:    v_mul_lo_u32 v4, s6, v3
9538; GFX9-NEXT:    v_mul_hi_u32 v7, s6, v2
9539; GFX9-NEXT:    v_mul_hi_u32 v9, s6, v3
9540; GFX9-NEXT:    v_mul_hi_u32 v10, s7, v3
9541; GFX9-NEXT:    v_mul_lo_u32 v3, s7, v3
9542; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v7, v4
9543; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v9, vcc
9544; GFX9-NEXT:    v_mul_lo_u32 v9, s7, v2
9545; GFX9-NEXT:    v_mul_hi_u32 v2, s7, v2
9546; GFX9-NEXT:    v_xor_b32_e32 v0, s12, v0
9547; GFX9-NEXT:    v_xor_b32_e32 v1, s13, v1
9548; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v9
9549; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v7, v2, vcc
9550; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v10, v6, vcc
9551; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
9552; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
9553; GFX9-NEXT:    v_mul_lo_u32 v4, s8, v3
9554; GFX9-NEXT:    v_mul_hi_u32 v5, s8, v2
9555; GFX9-NEXT:    v_mul_lo_u32 v7, s9, v2
9556; GFX9-NEXT:    v_mov_b32_e32 v8, s13
9557; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s12, v0
9558; GFX9-NEXT:    v_add_u32_e32 v4, v5, v4
9559; GFX9-NEXT:    v_mul_lo_u32 v5, s8, v2
9560; GFX9-NEXT:    v_add_u32_e32 v4, v4, v7
9561; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v8, vcc
9562; GFX9-NEXT:    v_sub_u32_e32 v7, s7, v4
9563; GFX9-NEXT:    v_mov_b32_e32 v8, s9
9564; GFX9-NEXT:    v_sub_co_u32_e64 v5, s[0:1], s6, v5
9565; GFX9-NEXT:    v_subb_co_u32_e64 v7, vcc, v7, v8, s[0:1]
9566; GFX9-NEXT:    v_subrev_co_u32_e32 v8, vcc, s8, v5
9567; GFX9-NEXT:    v_subbrev_co_u32_e32 v7, vcc, 0, v7, vcc
9568; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v7
9569; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
9570; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v8
9571; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
9572; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v7
9573; GFX9-NEXT:    v_cndmask_b32_e32 v7, v9, v8, vcc
9574; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, 2, v2
9575; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v3, vcc
9576; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, 1, v2
9577; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v3, vcc
9578; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v7
9579; GFX9-NEXT:    v_cndmask_b32_e64 v7, v11, v9, s[2:3]
9580; GFX9-NEXT:    v_mov_b32_e32 v9, s7
9581; GFX9-NEXT:    v_subb_co_u32_e64 v4, vcc, v9, v4, s[0:1]
9582; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v4
9583; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
9584; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v5
9585; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
9586; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v4
9587; GFX9-NEXT:    v_cndmask_b32_e32 v4, v9, v5, vcc
9588; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
9589; GFX9-NEXT:    v_cndmask_b32_e64 v4, v10, v8, s[2:3]
9590; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
9591; GFX9-NEXT:    s_xor_b64 s[0:1], s[10:11], s[14:15]
9592; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
9593; GFX9-NEXT:    v_xor_b32_e32 v2, s0, v2
9594; GFX9-NEXT:    v_xor_b32_e32 v3, s1, v3
9595; GFX9-NEXT:    v_mov_b32_e32 v4, s1
9596; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s0, v2
9597; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v4, vcc
9598; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
9599; GFX9-NEXT:    global_store_dwordx4 v6, v[0:3], s[4:5]
9600; GFX9-NEXT:    s_endpgm
9601  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
9602  %r = sdiv <2 x i64> %x, %shl.y
9603  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
9604  ret void
9605}
9606
; Signed 64-bit remainder by an odd constant divisor (1235195 == 0x12d8fb).
;
; CHECK lines (the opt -amdgpu-codegenprepare RUN line at the top of the file):
;   the srem is expected to be left unchanged in the IR.
; GFX6 / GFX9 lines (the two llc RUN lines):
;   the remainder is expected to be expanded inline into a v_rcp_f32-based
;   multiply-and-correct sequence. The divisor shows up in the expected
;   assembly as the literals 0x12d8fb, 0x12d8fa (divisor - 1, used by the
;   v_cmp_lt_u32 compares) and 0xffed2705 (its 32-bit negation).
;
; NOTE: every assertion below is autogenerated (see the NOTE lines at the top
; of the file); regenerate with the update scripts instead of editing by hand.
9607define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
9608; CHECK-LABEL: @srem_i64_oddk_denom(
9609; CHECK-NEXT:    [[R:%.*]] = srem i64 [[X:%.*]], 1235195
9610; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
9611; CHECK-NEXT:    ret void
9612;
9613; GFX6-LABEL: srem_i64_oddk_denom:
9614; GFX6:       ; %bb.0:
9615; GFX6-NEXT:    v_mov_b32_e32 v0, 0x4f800000
9616; GFX6-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
9617; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
9618; GFX6-NEXT:    s_mov_b32 s2, 0xffed2705
9619; GFX6-NEXT:    v_mov_b32_e32 v8, 0
9620; GFX6-NEXT:    v_mov_b32_e32 v7, 0
9621; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
9622; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
9623; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
9624; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
9625; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
9626; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
9627; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
9628; GFX6-NEXT:    s_mov_b32 s7, 0xf000
9629; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
9630; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s2
9631; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s2
9632; GFX6-NEXT:    s_mov_b32 s6, -1
9633; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
9634; GFX6-NEXT:    s_mov_b32 s4, s8
9635; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9636; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
9637; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
9638; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v4
9639; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v2
9640; GFX6-NEXT:    v_mul_hi_u32 v9, v1, v2
9641; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
9642; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
9643; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
9644; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
9645; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v3, vcc
9646; GFX6-NEXT:    s_mov_b32 s5, s9
9647; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
9648; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
9649; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
9650; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9651; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
9652; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
9653; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
9654; GFX6-NEXT:    v_mul_lo_u32 v4, v2, s2
9655; GFX6-NEXT:    v_mul_hi_u32 v5, s2, v0
9656; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
9657; GFX6-NEXT:    v_mul_lo_u32 v5, v0, s2
9658; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, v0, v4
9659; GFX6-NEXT:    v_mul_lo_u32 v10, v0, v4
9660; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v4
9661; GFX6-NEXT:    v_mul_hi_u32 v11, v0, v5
9662; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v5
9663; GFX6-NEXT:    v_mul_lo_u32 v5, v2, v5
9664; GFX6-NEXT:    v_mul_hi_u32 v6, v2, v4
9665; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
9666; GFX6-NEXT:    v_addc_u32_e32 v11, vcc, v8, v12, vcc
9667; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v4
9668; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
9669; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v11, v9, vcc
9670; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v6, v7, vcc
9671; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
9672; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
9673; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
9674; GFX6-NEXT:    s_ashr_i32 s2, s11, 31
9675; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[0:1]
9676; GFX6-NEXT:    s_add_u32 s0, s10, s2
9677; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
9678; GFX6-NEXT:    s_mov_b32 s3, s2
9679; GFX6-NEXT:    s_addc_u32 s1, s11, s2
9680; GFX6-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
9681; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
9682; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v1
9683; GFX6-NEXT:    v_mul_hi_u32 v3, s0, v0
9684; GFX6-NEXT:    v_mul_hi_u32 v4, s0, v1
9685; GFX6-NEXT:    v_mul_hi_u32 v5, s1, v1
9686; GFX6-NEXT:    v_mul_lo_u32 v1, s1, v1
9687; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9688; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
9689; GFX6-NEXT:    v_mul_lo_u32 v4, s1, v0
9690; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
9691; GFX6-NEXT:    s_mov_b32 s3, 0x12d8fb
9692; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
9693; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
9694; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
9695; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
9696; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
9697; GFX6-NEXT:    v_mul_hi_u32 v2, s3, v0
9698; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s3
9699; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s3
9700; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
9701; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
9702; GFX6-NEXT:    v_mov_b32_e32 v2, s1
9703; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
9704; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v0
9705; GFX6-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v1, vcc
9706; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s3, v2
9707; GFX6-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v3, vcc
9708; GFX6-NEXT:    s_mov_b32 s0, 0x12d8fa
9709; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v2
9710; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
9711; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
9712; GFX6-NEXT:    v_cndmask_b32_e32 v6, -1, v6, vcc
9713; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
9714; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v0
9715; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
9716; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
9717; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v1
9718; GFX6-NEXT:    v_cndmask_b32_e64 v5, -1, v5, s[0:1]
9719; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
9720; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
9721; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
9722; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
9723; GFX6-NEXT:    v_xor_b32_e32 v0, s2, v0
9724; GFX6-NEXT:    v_xor_b32_e32 v1, s2, v1
9725; GFX6-NEXT:    v_mov_b32_e32 v2, s2
9726; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
9727; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
9728; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
9729; GFX6-NEXT:    s_endpgm
9730; GFX9-LABEL: srem_i64_oddk_denom:
9731; GFX9:       ; %bb.0:
9732; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4f800000
9733; GFX9-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
9734; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
9735; GFX9-NEXT:    s_mov_b32 s8, 0xffed2705
9736; GFX9-NEXT:    v_mov_b32_e32 v7, 0
9737; GFX9-NEXT:    v_mov_b32_e32 v5, 0
9738; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
9739; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
9740; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
9741; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
9742; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
9743; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
9744; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
9745; GFX9-NEXT:    v_mul_hi_u32 v3, s8, v0
9746; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s8
9747; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s8
9748; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
9749; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
9750; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
9751; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
9752; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v2
9753; GFX9-NEXT:    v_mul_hi_u32 v9, v1, v2
9754; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
9755; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
9756; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v7, v8, vcc
9757; GFX9-NEXT:    v_mul_lo_u32 v8, v1, v4
9758; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
9759; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v8
9760; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v4, vcc
9761; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
9762; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
9763; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
9764; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v4, vcc
9765; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
9766; GFX9-NEXT:    v_mul_lo_u32 v4, v2, s8
9767; GFX9-NEXT:    v_mul_hi_u32 v6, s8, v0
9768; GFX9-NEXT:    v_mul_lo_u32 v8, v0, s8
9769; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
9770; GFX9-NEXT:    v_add_u32_e32 v4, v6, v4
9771; GFX9-NEXT:    v_sub_u32_e32 v4, v4, v0
9772; GFX9-NEXT:    v_mul_lo_u32 v10, v0, v4
9773; GFX9-NEXT:    v_mul_hi_u32 v11, v0, v8
9774; GFX9-NEXT:    v_mul_hi_u32 v12, v0, v4
9775; GFX9-NEXT:    v_mul_hi_u32 v9, v2, v8
9776; GFX9-NEXT:    v_mul_lo_u32 v8, v2, v8
9777; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
9778; GFX9-NEXT:    v_mul_hi_u32 v6, v2, v4
9779; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v7, v12, vcc
9780; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4
9781; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v10, v8
9782; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
9783; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v5, vcc
9784; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
9785; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v4, vcc
9786; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3]
9787; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
9788; GFX9-NEXT:    s_ashr_i32 s2, s7, 31
9789; GFX9-NEXT:    s_add_u32 s0, s6, s2
9790; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
9791; GFX9-NEXT:    s_mov_b32 s3, s2
9792; GFX9-NEXT:    s_addc_u32 s1, s7, s2
9793; GFX9-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
9794; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
9795; GFX9-NEXT:    v_mul_lo_u32 v2, s0, v1
9796; GFX9-NEXT:    v_mul_hi_u32 v3, s0, v0
9797; GFX9-NEXT:    v_mul_hi_u32 v4, s0, v1
9798; GFX9-NEXT:    v_mul_hi_u32 v6, s1, v1
9799; GFX9-NEXT:    v_mul_lo_u32 v1, s1, v1
9800; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
9801; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v4, vcc
9802; GFX9-NEXT:    v_mul_lo_u32 v4, s1, v0
9803; GFX9-NEXT:    v_mul_hi_u32 v0, s1, v0
9804; GFX9-NEXT:    s_mov_b32 s3, 0x12d8fb
9805; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
9806; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
9807; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v6, v5, vcc
9808; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
9809; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v2, vcc
9810; GFX9-NEXT:    v_mul_hi_u32 v2, s3, v0
9811; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s3
9812; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
9813; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
9814; GFX9-NEXT:    v_mov_b32_e32 v2, s1
9815; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v0
9816; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
9817; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s3, v0
9818; GFX9-NEXT:    v_subbrev_co_u32_e32 v3, vcc, 0, v1, vcc
9819; GFX9-NEXT:    v_subrev_co_u32_e32 v4, vcc, s3, v2
9820; GFX9-NEXT:    v_subbrev_co_u32_e32 v6, vcc, 0, v3, vcc
9821; GFX9-NEXT:    s_mov_b32 s3, 0x12d8fa
9822; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s3, v2
9823; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
9824; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
9825; GFX9-NEXT:    v_cndmask_b32_e32 v7, -1, v7, vcc
9826; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
9827; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s3, v0
9828; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
9829; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
9830; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
9831; GFX9-NEXT:    v_cndmask_b32_e32 v6, -1, v6, vcc
9832; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
9833; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
9834; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
9835; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
9836; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
9837; GFX9-NEXT:    v_xor_b32_e32 v1, s2, v1
9838; GFX9-NEXT:    v_mov_b32_e32 v2, s2
9839; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s2, v0
9840; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
9841; GFX9-NEXT:    global_store_dwordx2 v5, v[0:1], s[4:5]
9842; GFX9-NEXT:    s_endpgm
; Test input: %r = %x srem 1235195, stored through the global (addrspace(1))
; pointer %out.
9843  %r = srem i64 %x, 1235195
9844  store i64 %r, i64 addrspace(1)* %out
9845  ret void
9846}
9847
; Signed 64-bit remainder by a constant power of two (4096 == 0x1000).
;
; CHECK lines (the opt -amdgpu-codegenprepare RUN line at the top of the file):
;   the srem is expected to be left unchanged in the IR.
; GFX6 / GFX9 lines (the two llc RUN lines):
;   no reciprocal-based expansion is expected here. The expected code is a short
;   scalar sequence: take the sign of the high dword (s_ashr_i32 ..., 31),
;   shift it right by 20 to form the bias added to %x, clear the low 12 bits
;   of the sum (s_and_b32 ..., 0xfffff000), and subtract that from %x.
;
; NOTE: every assertion below is autogenerated (see the NOTE lines at the top
; of the file); regenerate with the update scripts instead of editing by hand.
9848define amdgpu_kernel void @srem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
9849; CHECK-LABEL: @srem_i64_pow2k_denom(
9850; CHECK-NEXT:    [[R:%.*]] = srem i64 [[X:%.*]], 4096
9851; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
9852; CHECK-NEXT:    ret void
9853;
9854; GFX6-LABEL: srem_i64_pow2k_denom:
9855; GFX6:       ; %bb.0:
9856; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
9857; GFX6-NEXT:    s_mov_b32 s3, 0xf000
9858; GFX6-NEXT:    s_mov_b32 s2, -1
9859; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
9860; GFX6-NEXT:    s_mov_b32 s0, s4
9861; GFX6-NEXT:    s_ashr_i32 s4, s7, 31
9862; GFX6-NEXT:    s_lshr_b32 s4, s4, 20
9863; GFX6-NEXT:    s_add_u32 s4, s6, s4
9864; GFX6-NEXT:    s_mov_b32 s1, s5
9865; GFX6-NEXT:    s_addc_u32 s5, s7, 0
9866; GFX6-NEXT:    s_and_b32 s4, s4, 0xfffff000
9867; GFX6-NEXT:    s_sub_u32 s4, s6, s4
9868; GFX6-NEXT:    s_subb_u32 s5, s7, s5
9869; GFX6-NEXT:    v_mov_b32_e32 v0, s4
9870; GFX6-NEXT:    v_mov_b32_e32 v1, s5
9871; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
9872; GFX6-NEXT:    s_endpgm
9873; GFX9-LABEL: srem_i64_pow2k_denom:
9874; GFX9:       ; %bb.0:
9875; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
9876; GFX9-NEXT:    v_mov_b32_e32 v2, 0
9877; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
9878; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
9879; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
9880; GFX9-NEXT:    s_add_u32 s4, s2, s4
9881; GFX9-NEXT:    s_addc_u32 s5, s3, 0
9882; GFX9-NEXT:    s_and_b32 s4, s4, 0xfffff000
9883; GFX9-NEXT:    s_sub_u32 s2, s2, s4
9884; GFX9-NEXT:    s_subb_u32 s3, s3, s5
9885; GFX9-NEXT:    v_mov_b32_e32 v0, s2
9886; GFX9-NEXT:    v_mov_b32_e32 v1, s3
9887; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
9888; GFX9-NEXT:    s_endpgm
; Test input: %r = %x srem 4096, stored through the global (addrspace(1))
; pointer %out.
9889  %r = srem i64 %x, 4096
9890  store i64 %r, i64 addrspace(1)* %out
9891  ret void
9892}
9893
9894define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
9895; CHECK-LABEL: @srem_i64_pow2_shl_denom(
9896; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
9897; CHECK-NEXT:    [[R:%.*]] = srem i64 [[X:%.*]], [[SHL_Y]]
9898; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
9899; CHECK-NEXT:    ret void
9900;
9901; GFX6-LABEL: srem_i64_pow2_shl_denom:
9902; GFX6:       ; %bb.0:
9903; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xd
9904; GFX6-NEXT:    s_mov_b32 s3, 0
9905; GFX6-NEXT:    s_movk_i32 s2, 0x1000
9906; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
9907; GFX6-NEXT:    s_mov_b32 s7, 0xf000
9908; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
9909; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
9910; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
9911; GFX6-NEXT:    s_add_u32 s2, s2, s4
9912; GFX6-NEXT:    s_mov_b32 s5, s4
9913; GFX6-NEXT:    s_addc_u32 s3, s3, s4
9914; GFX6-NEXT:    s_xor_b64 s[12:13], s[2:3], s[4:5]
9915; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s12
9916; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s13
9917; GFX6-NEXT:    s_sub_u32 s2, 0, s12
9918; GFX6-NEXT:    s_subb_u32 s3, 0, s13
9919; GFX6-NEXT:    s_ashr_i32 s14, s11, 31
9920; GFX6-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
9921; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
9922; GFX6-NEXT:    s_mov_b32 s15, s14
9923; GFX6-NEXT:    s_mov_b32 s6, -1
9924; GFX6-NEXT:    s_mov_b32 s4, s8
9925; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
9926; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
9927; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
9928; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
9929; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
9930; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
9931; GFX6-NEXT:    s_mov_b32 s5, s9
9932; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
9933; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
9934; GFX6-NEXT:    v_mul_lo_u32 v5, s3, v0
9935; GFX6-NEXT:    v_mul_lo_u32 v4, s2, v0
9936; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9937; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
9938; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
9939; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
9940; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
9941; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
9942; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
9943; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
9944; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
9945; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
9946; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
9947; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
9948; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
9949; GFX6-NEXT:    v_mov_b32_e32 v4, 0
9950; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
9951; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9952; GFX6-NEXT:    v_mov_b32_e32 v6, 0
9953; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
9954; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
9955; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
9956; GFX6-NEXT:    v_mul_lo_u32 v5, s2, v2
9957; GFX6-NEXT:    v_mul_hi_u32 v7, s2, v0
9958; GFX6-NEXT:    v_mul_lo_u32 v8, s3, v0
9959; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
9960; GFX6-NEXT:    v_mul_lo_u32 v7, s2, v0
9961; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
9962; GFX6-NEXT:    v_mul_lo_u32 v10, v0, v5
9963; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v5
9964; GFX6-NEXT:    v_mul_hi_u32 v11, v0, v7
9965; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v7
9966; GFX6-NEXT:    v_mul_lo_u32 v7, v2, v7
9967; GFX6-NEXT:    v_mul_hi_u32 v8, v2, v5
9968; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
9969; GFX6-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
9970; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v5
9971; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
9972; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
9973; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
9974; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
9975; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
9976; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
9977; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[0:1]
9978; GFX6-NEXT:    s_add_u32 s0, s10, s14
9979; GFX6-NEXT:    s_addc_u32 s1, s11, s14
9980; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
9981; GFX6-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
9982; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
9983; GFX6-NEXT:    v_mul_lo_u32 v2, s10, v1
9984; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v0
9985; GFX6-NEXT:    v_mul_hi_u32 v5, s10, v1
9986; GFX6-NEXT:    v_mul_hi_u32 v7, s11, v1
9987; GFX6-NEXT:    v_mul_lo_u32 v1, s11, v1
9988; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
9989; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
9990; GFX6-NEXT:    v_mul_lo_u32 v5, s11, v0
9991; GFX6-NEXT:    v_mul_hi_u32 v0, s11, v0
9992; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
9993; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
9994; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
9995; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
9996; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
9997; GFX6-NEXT:    v_mul_lo_u32 v1, s12, v1
9998; GFX6-NEXT:    v_mul_hi_u32 v2, s12, v0
9999; GFX6-NEXT:    v_mul_lo_u32 v3, s13, v0
10000; GFX6-NEXT:    v_mul_lo_u32 v0, s12, v0
10001; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
10002; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
10003; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s11, v1
10004; GFX6-NEXT:    v_mov_b32_e32 v3, s13
10005; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s10, v0
10006; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
10007; GFX6-NEXT:    v_subrev_i32_e64 v4, s[0:1], s12, v0
10008; GFX6-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
10009; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s13, v5
10010; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
10011; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
10012; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v4
10013; GFX6-NEXT:    v_subrev_i32_e64 v3, s[0:1], s12, v4
10014; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
10015; GFX6-NEXT:    v_cmp_eq_u32_e64 s[2:3], s13, v5
10016; GFX6-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
10017; GFX6-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
10018; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
10019; GFX6-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
10020; GFX6-NEXT:    v_mov_b32_e32 v5, s11
10021; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
10022; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s13, v1
10023; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
10024; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
10025; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
10026; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v1
10027; GFX6-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
10028; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
10029; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
10030; GFX6-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
10031; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
10032; GFX6-NEXT:    v_xor_b32_e32 v0, s14, v0
10033; GFX6-NEXT:    v_xor_b32_e32 v1, s14, v1
10034; GFX6-NEXT:    v_mov_b32_e32 v2, s14
10035; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s14, v0
10036; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
10037; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
10038; GFX6-NEXT:    s_endpgm
10039; GFX9-LABEL: srem_i64_pow2_shl_denom:
10040; GFX9:       ; %bb.0:
10041; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x34
10042; GFX9-NEXT:    s_mov_b32 s3, 0
10043; GFX9-NEXT:    s_movk_i32 s2, 0x1000
10044; GFX9-NEXT:    v_mov_b32_e32 v2, 0
10045; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
10046; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
10047; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
10048; GFX9-NEXT:    s_add_u32 s2, s2, s4
10049; GFX9-NEXT:    s_mov_b32 s5, s4
10050; GFX9-NEXT:    s_addc_u32 s3, s3, s4
10051; GFX9-NEXT:    s_xor_b64 s[8:9], s[2:3], s[4:5]
10052; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
10053; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
10054; GFX9-NEXT:    s_sub_u32 s10, 0, s8
10055; GFX9-NEXT:    s_subb_u32 s4, 0, s9
10056; GFX9-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
10057; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
10058; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
10059; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
10060; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
10061; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
10062; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
10063; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
10064; GFX9-NEXT:    v_mul_hi_u32 v4, s10, v0
10065; GFX9-NEXT:    v_mul_lo_u32 v3, s10, v1
10066; GFX9-NEXT:    v_mul_lo_u32 v6, s4, v0
10067; GFX9-NEXT:    v_mul_lo_u32 v5, s10, v0
10068; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
10069; GFX9-NEXT:    v_add_u32_e32 v3, v3, v6
10070; GFX9-NEXT:    v_mul_hi_u32 v4, v0, v5
10071; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v3
10072; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v3
10073; GFX9-NEXT:    v_mul_hi_u32 v7, v1, v5
10074; GFX9-NEXT:    v_mul_lo_u32 v5, v1, v5
10075; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
10076; GFX9-NEXT:    v_mul_hi_u32 v9, v1, v3
10077; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v8, vcc
10078; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v3
10079; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
10080; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v7, vcc
10081; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v9, v2, vcc
10082; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
10083; GFX9-NEXT:    v_mov_b32_e32 v6, 0
10084; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v3
10085; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v5, vcc
10086; GFX9-NEXT:    v_addc_co_u32_e64 v3, vcc, v1, v4, s[2:3]
10087; GFX9-NEXT:    v_mul_lo_u32 v5, s10, v3
10088; GFX9-NEXT:    v_mul_hi_u32 v7, s10, v0
10089; GFX9-NEXT:    v_mul_lo_u32 v8, s4, v0
10090; GFX9-NEXT:    v_mul_lo_u32 v9, s10, v0
10091; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
10092; GFX9-NEXT:    v_add_u32_e32 v5, v7, v5
10093; GFX9-NEXT:    v_add_u32_e32 v5, v5, v8
10094; GFX9-NEXT:    v_mul_lo_u32 v10, v0, v5
10095; GFX9-NEXT:    v_mul_hi_u32 v11, v0, v9
10096; GFX9-NEXT:    v_mul_hi_u32 v12, v0, v5
10097; GFX9-NEXT:    v_mul_hi_u32 v8, v3, v9
10098; GFX9-NEXT:    v_mul_lo_u32 v9, v3, v9
10099; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
10100; GFX9-NEXT:    v_mul_hi_u32 v7, v3, v5
10101; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v12, vcc
10102; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v5
10103; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v10, v9
10104; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v8, vcc
10105; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v2, vcc
10106; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v8, v3
10107; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
10108; GFX9-NEXT:    s_ashr_i32 s10, s7, 31
10109; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v6, v5, vcc
10110; GFX9-NEXT:    v_add_u32_e32 v1, v1, v4
10111; GFX9-NEXT:    s_add_u32 s0, s6, s10
10112; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v5, s[2:3]
10113; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
10114; GFX9-NEXT:    s_mov_b32 s11, s10
10115; GFX9-NEXT:    s_addc_u32 s1, s7, s10
10116; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
10117; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
10118; GFX9-NEXT:    v_mul_lo_u32 v3, s6, v1
10119; GFX9-NEXT:    v_mul_hi_u32 v4, s6, v0
10120; GFX9-NEXT:    v_mul_hi_u32 v5, s6, v1
10121; GFX9-NEXT:    v_mul_hi_u32 v7, s7, v1
10122; GFX9-NEXT:    v_mul_lo_u32 v1, s7, v1
10123; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
10124; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
10125; GFX9-NEXT:    v_mul_lo_u32 v5, s7, v0
10126; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
10127; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
10128; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v4, v0, vcc
10129; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v2, vcc
10130; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
10131; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v6, v3, vcc
10132; GFX9-NEXT:    v_mul_lo_u32 v1, s8, v1
10133; GFX9-NEXT:    v_mul_hi_u32 v3, s8, v0
10134; GFX9-NEXT:    v_mul_lo_u32 v4, s9, v0
10135; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
10136; GFX9-NEXT:    v_add_u32_e32 v1, v3, v1
10137; GFX9-NEXT:    v_add_u32_e32 v1, v1, v4
10138; GFX9-NEXT:    v_sub_co_u32_e64 v0, s[0:1], s6, v0
10139; GFX9-NEXT:    v_sub_u32_e32 v3, s7, v1
10140; GFX9-NEXT:    v_mov_b32_e32 v4, s9
10141; GFX9-NEXT:    v_subb_co_u32_e64 v3, vcc, v3, v4, s[0:1]
10142; GFX9-NEXT:    v_subrev_co_u32_e64 v5, s[2:3], s8, v0
10143; GFX9-NEXT:    v_subbrev_co_u32_e64 v6, vcc, 0, v3, s[2:3]
10144; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v6
10145; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
10146; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v5
10147; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
10148; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v6
10149; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
10150; GFX9-NEXT:    v_subb_co_u32_e64 v3, vcc, v3, v4, s[2:3]
10151; GFX9-NEXT:    v_subrev_co_u32_e32 v4, vcc, s8, v5
10152; GFX9-NEXT:    v_subbrev_co_u32_e32 v3, vcc, 0, v3, vcc
10153; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v7
10154; GFX9-NEXT:    v_cndmask_b32_e64 v3, v6, v3, s[2:3]
10155; GFX9-NEXT:    v_mov_b32_e32 v6, s7
10156; GFX9-NEXT:    v_subb_co_u32_e64 v1, vcc, v6, v1, s[0:1]
10157; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
10158; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
10159; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
10160; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
10161; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v1
10162; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
10163; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
10164; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
10165; GFX9-NEXT:    v_cndmask_b32_e64 v3, v5, v4, s[2:3]
10166; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
10167; GFX9-NEXT:    v_xor_b32_e32 v0, s10, v0
10168; GFX9-NEXT:    v_xor_b32_e32 v1, s10, v1
10169; GFX9-NEXT:    v_mov_b32_e32 v3, s10
10170; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s10, v0
10171; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
10172; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
10173; GFX9-NEXT:    s_endpgm
10174  %shl.y = shl i64 4096, %y
10175  %r = srem i64 %x, %shl.y
10176  store i64 %r, i64 addrspace(1)* %out
10177  ret void
10178}
10179
; Test: signed remainder of a <2 x i64> kernel argument by the constant splat
; <4096, 4096>; the resulting vector is stored to %out.
;
; Expected output of the opt run: the vector srem is split into one scalar
; `srem i64 ..., 4096` per element, stitched back together with an
; extractelement / insertelement chain before the store.
;
; Expected output of the llc runs (tahiti and gfx900): for each element a short
; scalar sequence (s_ashr_i32 by 31, s_lshr_b32 by 20, s_add_u32/s_addc_u32,
; s_and_b32 with the s8 mask register, s_sub_u32/s_subb_u32), then a single
; dwordx4 store of both results.
;
; The check lines below are autogenerated (see the NOTE lines at the top of the
; file); regenerate them with the update scripts instead of editing by hand.
10180define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
10181; CHECK-LABEL: @srem_v2i64_pow2k_denom(
10182; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
10183; CHECK-NEXT:    [[TMP2:%.*]] = srem i64 [[TMP1]], 4096
10184; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
10185; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
10186; CHECK-NEXT:    [[TMP5:%.*]] = srem i64 [[TMP4]], 4096
10187; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
10188; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
10189; CHECK-NEXT:    ret void
10190;
10191; GFX6-LABEL: srem_v2i64_pow2k_denom:
10192; GFX6:       ; %bb.0:
10193; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
10194; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
10195; GFX6-NEXT:    s_movk_i32 s8, 0xf000
10196; GFX6-NEXT:    s_mov_b32 s7, 0xf000
10197; GFX6-NEXT:    s_mov_b32 s6, -1
10198; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
10199; GFX6-NEXT:    s_ashr_i32 s9, s1, 31
10200; GFX6-NEXT:    s_lshr_b32 s9, s9, 20
10201; GFX6-NEXT:    s_add_u32 s9, s0, s9
10202; GFX6-NEXT:    s_addc_u32 s10, s1, 0
10203; GFX6-NEXT:    s_and_b32 s9, s9, s8
10204; GFX6-NEXT:    s_sub_u32 s0, s0, s9
10205; GFX6-NEXT:    s_subb_u32 s1, s1, s10
10206; GFX6-NEXT:    s_ashr_i32 s9, s3, 31
10207; GFX6-NEXT:    s_lshr_b32 s9, s9, 20
10208; GFX6-NEXT:    s_add_u32 s9, s2, s9
10209; GFX6-NEXT:    s_addc_u32 s10, s3, 0
10210; GFX6-NEXT:    s_and_b32 s8, s9, s8
10211; GFX6-NEXT:    s_sub_u32 s2, s2, s8
10212; GFX6-NEXT:    s_subb_u32 s3, s3, s10
10213; GFX6-NEXT:    v_mov_b32_e32 v0, s0
10214; GFX6-NEXT:    v_mov_b32_e32 v1, s1
10215; GFX6-NEXT:    v_mov_b32_e32 v2, s2
10216; GFX6-NEXT:    v_mov_b32_e32 v3, s3
10217; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
10218; GFX6-NEXT:    s_endpgm
;
10219; GFX9-LABEL: srem_v2i64_pow2k_denom:
10220; GFX9:       ; %bb.0:
10221; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
10222; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
10223; GFX9-NEXT:    s_movk_i32 s8, 0xf000
10224; GFX9-NEXT:    v_mov_b32_e32 v4, 0
10225; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
10226; GFX9-NEXT:    s_ashr_i32 s0, s5, 31
10227; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
10228; GFX9-NEXT:    s_add_u32 s0, s4, s0
10229; GFX9-NEXT:    s_addc_u32 s1, s5, 0
10230; GFX9-NEXT:    s_and_b32 s0, s0, s8
10231; GFX9-NEXT:    s_sub_u32 s0, s4, s0
10232; GFX9-NEXT:    s_subb_u32 s1, s5, s1
10233; GFX9-NEXT:    s_ashr_i32 s4, s7, 31
10234; GFX9-NEXT:    s_lshr_b32 s4, s4, 20
10235; GFX9-NEXT:    s_add_u32 s4, s6, s4
10236; GFX9-NEXT:    s_addc_u32 s5, s7, 0
10237; GFX9-NEXT:    s_and_b32 s4, s4, s8
10238; GFX9-NEXT:    s_sub_u32 s4, s6, s4
10239; GFX9-NEXT:    s_subb_u32 s5, s7, s5
10240; GFX9-NEXT:    v_mov_b32_e32 v0, s0
10241; GFX9-NEXT:    v_mov_b32_e32 v1, s1
10242; GFX9-NEXT:    v_mov_b32_e32 v2, s4
10243; GFX9-NEXT:    v_mov_b32_e32 v3, s5
10244; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
10245; GFX9-NEXT:    s_endpgm
; Input IR under test: both lanes are divided by the same power-of-two
; constant (4096 = 2^12), and the remainder vector is written to %out.
10246  %r = srem <2 x i64> %x, <i64 4096, i64 4096>
10247  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
10248  ret void
10249}
10250
10251define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
10252; CHECK-LABEL: @srem_v2i64_pow2_shl_denom(
10253; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
10254; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
10255; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
10256; CHECK-NEXT:    [[TMP3:%.*]] = srem i64 [[TMP1]], [[TMP2]]
10257; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0
10258; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
10259; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
10260; CHECK-NEXT:    [[TMP7:%.*]] = srem i64 [[TMP5]], [[TMP6]]
10261; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
10262; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
10263; CHECK-NEXT:    ret void
10264;
10265; GFX6-LABEL: srem_v2i64_pow2_shl_denom:
10266; GFX6:       ; %bb.0:
10267; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x11
10268; GFX6-NEXT:    s_mov_b32 s3, 0
10269; GFX6-NEXT:    s_movk_i32 s2, 0x1000
10270; GFX6-NEXT:    s_mov_b32 s18, 0x4f800000
10271; GFX6-NEXT:    s_mov_b32 s19, 0x5f7ffffc
10272; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
10273; GFX6-NEXT:    s_lshl_b64 s[14:15], s[2:3], s6
10274; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
10275; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
10276; GFX6-NEXT:    s_add_u32 s2, s2, s4
10277; GFX6-NEXT:    s_mov_b32 s5, s4
10278; GFX6-NEXT:    s_addc_u32 s3, s3, s4
10279; GFX6-NEXT:    s_xor_b64 s[16:17], s[2:3], s[4:5]
10280; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s16
10281; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s17
10282; GFX6-NEXT:    s_mov_b32 s20, 0x2f800000
10283; GFX6-NEXT:    s_mov_b32 s21, 0xcf800000
10284; GFX6-NEXT:    s_sub_u32 s6, 0, s16
10285; GFX6-NEXT:    v_mac_f32_e32 v0, s18, v1
10286; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
10287; GFX6-NEXT:    s_subb_u32 s7, 0, s17
10288; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
10289; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
10290; GFX6-NEXT:    v_mul_f32_e32 v0, s19, v0
10291; GFX6-NEXT:    v_mul_f32_e32 v1, s20, v0
10292; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
10293; GFX6-NEXT:    v_mac_f32_e32 v0, s21, v1
10294; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
10295; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
10296; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
10297; GFX6-NEXT:    s_ashr_i32 s12, s9, 31
10298; GFX6-NEXT:    s_add_u32 s0, s8, s12
10299; GFX6-NEXT:    v_mul_hi_u32 v3, s6, v0
10300; GFX6-NEXT:    v_mul_lo_u32 v2, s6, v1
10301; GFX6-NEXT:    v_mul_lo_u32 v4, s7, v0
10302; GFX6-NEXT:    v_mul_lo_u32 v5, s6, v0
10303; GFX6-NEXT:    s_mov_b32 s13, s12
10304; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
10305; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
10306; GFX6-NEXT:    v_mul_lo_u32 v3, v0, v2
10307; GFX6-NEXT:    v_mul_hi_u32 v4, v0, v5
10308; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
10309; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v2
10310; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
10311; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
10312; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v6, vcc
10313; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v5
10314; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v5
10315; GFX6-NEXT:    s_addc_u32 s1, s9, s12
10316; GFX6-NEXT:    s_xor_b64 s[8:9], s[0:1], s[12:13]
10317; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
10318; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
10319; GFX6-NEXT:    v_mov_b32_e32 v4, 0
10320; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
10321; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
10322; GFX6-NEXT:    v_mov_b32_e32 v6, 0
10323; GFX6-NEXT:    v_add_i32_e64 v0, s[2:3], v0, v2
10324; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
10325; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[2:3]
10326; GFX6-NEXT:    v_mul_lo_u32 v5, s6, v2
10327; GFX6-NEXT:    v_mul_hi_u32 v7, s6, v0
10328; GFX6-NEXT:    v_mul_lo_u32 v8, s7, v0
10329; GFX6-NEXT:    s_mov_b32 s7, 0xf000
10330; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
10331; GFX6-NEXT:    v_mul_lo_u32 v7, s6, v0
10332; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
10333; GFX6-NEXT:    v_mul_lo_u32 v10, v0, v5
10334; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v5
10335; GFX6-NEXT:    v_mul_hi_u32 v11, v0, v7
10336; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v7
10337; GFX6-NEXT:    v_mul_lo_u32 v7, v2, v7
10338; GFX6-NEXT:    v_mul_hi_u32 v8, v2, v5
10339; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
10340; GFX6-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
10341; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v5
10342; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
10343; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
10344; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
10345; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
10346; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
10347; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
10348; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[2:3]
10349; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
10350; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
10351; GFX6-NEXT:    v_mul_lo_u32 v2, s8, v1
10352; GFX6-NEXT:    v_mul_hi_u32 v3, s8, v0
10353; GFX6-NEXT:    v_mul_hi_u32 v5, s8, v1
10354; GFX6-NEXT:    v_mul_hi_u32 v7, s9, v1
10355; GFX6-NEXT:    v_mul_lo_u32 v1, s9, v1
10356; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
10357; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
10358; GFX6-NEXT:    v_mul_lo_u32 v5, s9, v0
10359; GFX6-NEXT:    v_mul_hi_u32 v0, s9, v0
10360; GFX6-NEXT:    s_mov_b32 s6, -1
10361; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
10362; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
10363; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
10364; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
10365; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
10366; GFX6-NEXT:    v_mul_lo_u32 v1, s16, v1
10367; GFX6-NEXT:    v_mul_hi_u32 v2, s16, v0
10368; GFX6-NEXT:    v_mul_lo_u32 v3, s17, v0
10369; GFX6-NEXT:    v_mul_lo_u32 v0, s16, v0
10370; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
10371; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
10372; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s9, v1
10373; GFX6-NEXT:    v_mov_b32_e32 v3, s17
10374; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s8, v0
10375; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
10376; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s16, v0
10377; GFX6-NEXT:    v_subbrev_u32_e64 v7, s[2:3], 0, v2, s[0:1]
10378; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s17, v7
10379; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
10380; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
10381; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s16, v5
10382; GFX6-NEXT:    v_subrev_i32_e64 v3, s[0:1], s16, v5
10383; GFX6-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
10384; GFX6-NEXT:    v_cmp_eq_u32_e64 s[2:3], s17, v7
10385; GFX6-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
10386; GFX6-NEXT:    s_ashr_i32 s2, s15, 31
10387; GFX6-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
10388; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
10389; GFX6-NEXT:    s_add_u32 s8, s14, s2
10390; GFX6-NEXT:    v_cndmask_b32_e64 v2, v7, v2, s[0:1]
10391; GFX6-NEXT:    v_mov_b32_e32 v7, s9
10392; GFX6-NEXT:    s_mov_b32 s3, s2
10393; GFX6-NEXT:    s_addc_u32 s9, s15, s2
10394; GFX6-NEXT:    s_xor_b64 s[8:9], s[8:9], s[2:3]
10395; GFX6-NEXT:    v_cvt_f32_u32_e32 v8, s8
10396; GFX6-NEXT:    v_cvt_f32_u32_e32 v9, s9
10397; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v7, v1, vcc
10398; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s17, v1
10399; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
10400; GFX6-NEXT:    v_mac_f32_e32 v8, s18, v9
10401; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s16, v0
10402; GFX6-NEXT:    v_rcp_f32_e32 v8, v8
10403; GFX6-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
10404; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s17, v1
10405; GFX6-NEXT:    v_cndmask_b32_e32 v7, v7, v10, vcc
10406; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
10407; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
10408; GFX6-NEXT:    v_cndmask_b32_e64 v2, v5, v3, s[0:1]
10409; GFX6-NEXT:    v_mul_f32_e32 v3, s19, v8
10410; GFX6-NEXT:    v_mul_f32_e32 v5, s20, v3
10411; GFX6-NEXT:    v_trunc_f32_e32 v5, v5
10412; GFX6-NEXT:    v_mac_f32_e32 v3, s21, v5
10413; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
10414; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v5
10415; GFX6-NEXT:    s_sub_u32 s2, 0, s8
10416; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
10417; GFX6-NEXT:    v_mul_hi_u32 v2, s2, v3
10418; GFX6-NEXT:    v_mul_lo_u32 v7, s2, v5
10419; GFX6-NEXT:    s_subb_u32 s3, 0, s9
10420; GFX6-NEXT:    v_mul_lo_u32 v8, s3, v3
10421; GFX6-NEXT:    s_ashr_i32 s14, s11, 31
10422; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
10423; GFX6-NEXT:    v_mul_lo_u32 v7, s2, v3
10424; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
10425; GFX6-NEXT:    v_mul_lo_u32 v8, v3, v2
10426; GFX6-NEXT:    v_mul_hi_u32 v10, v3, v2
10427; GFX6-NEXT:    v_mul_hi_u32 v9, v3, v7
10428; GFX6-NEXT:    v_mul_hi_u32 v11, v5, v2
10429; GFX6-NEXT:    v_mul_lo_u32 v2, v5, v2
10430; GFX6-NEXT:    s_mov_b32 s15, s14
10431; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
10432; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
10433; GFX6-NEXT:    v_mul_lo_u32 v10, v5, v7
10434; GFX6-NEXT:    v_mul_hi_u32 v7, v5, v7
10435; GFX6-NEXT:    v_xor_b32_e32 v0, s12, v0
10436; GFX6-NEXT:    v_xor_b32_e32 v1, s12, v1
10437; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
10438; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v9, v7, vcc
10439; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v11, v4, vcc
10440; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
10441; GFX6-NEXT:    v_add_i32_e64 v2, s[0:1], v3, v2
10442; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v6, v8, vcc
10443; GFX6-NEXT:    v_addc_u32_e64 v3, vcc, v5, v7, s[0:1]
10444; GFX6-NEXT:    v_mul_lo_u32 v8, s2, v3
10445; GFX6-NEXT:    v_mul_hi_u32 v9, s2, v2
10446; GFX6-NEXT:    v_mul_lo_u32 v10, s3, v2
10447; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
10448; GFX6-NEXT:    v_mul_lo_u32 v9, s2, v2
10449; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
10450; GFX6-NEXT:    v_mul_lo_u32 v12, v2, v8
10451; GFX6-NEXT:    v_mul_hi_u32 v14, v2, v8
10452; GFX6-NEXT:    v_mul_hi_u32 v13, v2, v9
10453; GFX6-NEXT:    v_mul_hi_u32 v11, v3, v9
10454; GFX6-NEXT:    v_mul_lo_u32 v9, v3, v9
10455; GFX6-NEXT:    v_mul_hi_u32 v10, v3, v8
10456; GFX6-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
10457; GFX6-NEXT:    v_addc_u32_e32 v13, vcc, 0, v14, vcc
10458; GFX6-NEXT:    v_mul_lo_u32 v3, v3, v8
10459; GFX6-NEXT:    v_add_i32_e32 v9, vcc, v12, v9
10460; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v13, v11, vcc
10461; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v10, v4, vcc
10462; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v9, v3
10463; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v6, v8, vcc
10464; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
10465; GFX6-NEXT:    v_addc_u32_e64 v5, vcc, v5, v8, s[0:1]
10466; GFX6-NEXT:    s_add_u32 s0, s10, s14
10467; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
10468; GFX6-NEXT:    s_addc_u32 s1, s11, s14
10469; GFX6-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
10470; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
10471; GFX6-NEXT:    v_mul_lo_u32 v5, s10, v3
10472; GFX6-NEXT:    v_mul_hi_u32 v7, s10, v2
10473; GFX6-NEXT:    v_mul_hi_u32 v9, s10, v3
10474; GFX6-NEXT:    v_mul_hi_u32 v10, s11, v3
10475; GFX6-NEXT:    v_mul_lo_u32 v3, s11, v3
10476; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
10477; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v9, vcc
10478; GFX6-NEXT:    v_mul_lo_u32 v9, s11, v2
10479; GFX6-NEXT:    v_mul_hi_u32 v2, s11, v2
10480; GFX6-NEXT:    v_mov_b32_e32 v8, s12
10481; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
10482; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v7, v2, vcc
10483; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v10, v4, vcc
10484; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
10485; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v4, vcc
10486; GFX6-NEXT:    v_mul_lo_u32 v3, s8, v3
10487; GFX6-NEXT:    v_mul_hi_u32 v4, s8, v2
10488; GFX6-NEXT:    v_mul_lo_u32 v5, s9, v2
10489; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s12, v0
10490; GFX6-NEXT:    v_mul_lo_u32 v2, s8, v2
10491; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v8, vcc
10492; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
10493; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
10494; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s11, v3
10495; GFX6-NEXT:    v_mov_b32_e32 v5, s9
10496; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s10, v2
10497; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
10498; GFX6-NEXT:    v_subrev_i32_e64 v6, s[0:1], s8, v2
10499; GFX6-NEXT:    v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1]
10500; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s9, v7
10501; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, s[0:1]
10502; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
10503; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s8, v6
10504; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s8, v6
10505; GFX6-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
10506; GFX6-NEXT:    v_cmp_eq_u32_e64 s[2:3], s9, v7
10507; GFX6-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
10508; GFX6-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
10509; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
10510; GFX6-NEXT:    v_cndmask_b32_e64 v4, v7, v4, s[0:1]
10511; GFX6-NEXT:    v_mov_b32_e32 v7, s11
10512; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v7, v3, vcc
10513; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
10514; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
10515; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
10516; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
10517; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v3
10518; GFX6-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
10519; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
10520; GFX6-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
10521; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
10522; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
10523; GFX6-NEXT:    v_xor_b32_e32 v2, s14, v2
10524; GFX6-NEXT:    v_xor_b32_e32 v3, s14, v3
10525; GFX6-NEXT:    v_mov_b32_e32 v4, s14
10526; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s14, v2
10527; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
10528; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
10529; GFX6-NEXT:    s_endpgm
10530; GFX9-LABEL: srem_v2i64_pow2_shl_denom:
10531; GFX9:       ; %bb.0:
10532; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
10533; GFX9-NEXT:    s_mov_b32 s3, 0
10534; GFX9-NEXT:    s_movk_i32 s2, 0x1000
10535; GFX9-NEXT:    s_mov_b32 s16, 0x4f800000
10536; GFX9-NEXT:    s_mov_b32 s17, 0x5f7ffffc
10537; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
10538; GFX9-NEXT:    s_lshl_b64 s[12:13], s[2:3], s6
10539; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
10540; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
10541; GFX9-NEXT:    s_add_u32 s2, s2, s4
10542; GFX9-NEXT:    s_mov_b32 s5, s4
10543; GFX9-NEXT:    s_addc_u32 s3, s3, s4
10544; GFX9-NEXT:    s_xor_b64 s[14:15], s[2:3], s[4:5]
10545; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s14
10546; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s15
10547; GFX9-NEXT:    s_mov_b32 s18, 0x2f800000
10548; GFX9-NEXT:    s_mov_b32 s19, 0xcf800000
10549; GFX9-NEXT:    s_sub_u32 s4, 0, s14
10550; GFX9-NEXT:    v_mac_f32_e32 v0, s16, v1
10551; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
10552; GFX9-NEXT:    s_subb_u32 s5, 0, s15
10553; GFX9-NEXT:    v_mov_b32_e32 v6, 0
10554; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
10555; GFX9-NEXT:    v_mul_f32_e32 v0, s17, v0
10556; GFX9-NEXT:    v_mul_f32_e32 v1, s18, v0
10557; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
10558; GFX9-NEXT:    v_mac_f32_e32 v0, s19, v1
10559; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
10560; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
10561; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
10562; GFX9-NEXT:    s_ashr_i32 s6, s9, 31
10563; GFX9-NEXT:    s_mov_b32 s7, s6
10564; GFX9-NEXT:    v_mul_hi_u32 v3, s4, v0
10565; GFX9-NEXT:    v_mul_lo_u32 v2, s4, v1
10566; GFX9-NEXT:    v_mul_lo_u32 v5, s5, v0
10567; GFX9-NEXT:    v_mul_lo_u32 v4, s4, v0
10568; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
10569; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
10570; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
10571; GFX9-NEXT:    v_mul_lo_u32 v5, v0, v2
10572; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
10573; GFX9-NEXT:    v_mul_hi_u32 v8, v1, v2
10574; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
10575; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
10576; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
10577; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v4
10578; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
10579; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
10580; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
10581; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v6, vcc
10582; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
10583; GFX9-NEXT:    v_mov_b32_e32 v5, 0
10584; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
10585; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
10586; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
10587; GFX9-NEXT:    v_mul_lo_u32 v4, s4, v2
10588; GFX9-NEXT:    v_mul_hi_u32 v7, s4, v0
10589; GFX9-NEXT:    v_mul_lo_u32 v8, s5, v0
10590; GFX9-NEXT:    v_mul_lo_u32 v9, s4, v0
10591; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
10592; GFX9-NEXT:    v_add_u32_e32 v4, v7, v4
10593; GFX9-NEXT:    v_add_u32_e32 v4, v4, v8
10594; GFX9-NEXT:    v_mul_lo_u32 v10, v0, v4
10595; GFX9-NEXT:    v_mul_hi_u32 v11, v0, v9
10596; GFX9-NEXT:    v_mul_hi_u32 v12, v0, v4
10597; GFX9-NEXT:    v_mul_hi_u32 v8, v2, v9
10598; GFX9-NEXT:    v_mul_lo_u32 v9, v2, v9
10599; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
10600; GFX9-NEXT:    v_mul_hi_u32 v7, v2, v4
10601; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v12, vcc
10602; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4
10603; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v10, v9
10604; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v8, vcc
10605; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v6, vcc
10606; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
10607; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v5, v4, vcc
10608; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3]
10609; GFX9-NEXT:    s_add_u32 s2, s8, s6
10610; GFX9-NEXT:    s_addc_u32 s3, s9, s6
10611; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
10612; GFX9-NEXT:    s_xor_b64 s[8:9], s[2:3], s[6:7]
10613; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
10614; GFX9-NEXT:    v_mul_lo_u32 v2, s8, v1
10615; GFX9-NEXT:    v_mul_hi_u32 v3, s8, v0
10616; GFX9-NEXT:    v_mul_hi_u32 v4, s8, v1
10617; GFX9-NEXT:    v_mul_hi_u32 v7, s9, v1
10618; GFX9-NEXT:    v_mul_lo_u32 v1, s9, v1
10619; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
10620; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
10621; GFX9-NEXT:    v_mul_lo_u32 v4, s9, v0
10622; GFX9-NEXT:    v_mul_hi_u32 v0, s9, v0
10623; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
10624; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
10625; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
10626; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v7, v6, vcc
10627; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
10628; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v2, vcc
10629; GFX9-NEXT:    v_mul_lo_u32 v1, s14, v1
10630; GFX9-NEXT:    v_mul_hi_u32 v2, s14, v0
10631; GFX9-NEXT:    v_mul_lo_u32 v3, s15, v0
10632; GFX9-NEXT:    v_mul_lo_u32 v0, s14, v0
10633; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
10634; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
10635; GFX9-NEXT:    v_sub_co_u32_e64 v0, s[0:1], s8, v0
10636; GFX9-NEXT:    v_sub_u32_e32 v2, s9, v1
10637; GFX9-NEXT:    v_mov_b32_e32 v3, s15
10638; GFX9-NEXT:    v_subb_co_u32_e64 v2, vcc, v2, v3, s[0:1]
10639; GFX9-NEXT:    v_subrev_co_u32_e64 v4, s[2:3], s14, v0
10640; GFX9-NEXT:    v_subbrev_co_u32_e64 v7, vcc, 0, v2, s[2:3]
10641; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s15, v7
10642; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
10643; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s14, v4
10644; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
10645; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s15, v7
10646; GFX9-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
10647; GFX9-NEXT:    v_subb_co_u32_e64 v2, vcc, v2, v3, s[2:3]
10648; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s14, v4
10649; GFX9-NEXT:    v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc
10650; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v8
10651; GFX9-NEXT:    v_cndmask_b32_e64 v2, v7, v2, s[2:3]
10652; GFX9-NEXT:    v_mov_b32_e32 v7, s9
10653; GFX9-NEXT:    v_subb_co_u32_e64 v1, vcc, v7, v1, s[0:1]
10654; GFX9-NEXT:    s_ashr_i32 s0, s13, 31
10655; GFX9-NEXT:    s_add_u32 s8, s12, s0
10656; GFX9-NEXT:    s_mov_b32 s1, s0
10657; GFX9-NEXT:    s_addc_u32 s9, s13, s0
10658; GFX9-NEXT:    s_xor_b64 s[8:9], s[8:9], s[0:1]
10659; GFX9-NEXT:    v_cvt_f32_u32_e32 v9, s8
10660; GFX9-NEXT:    v_cvt_f32_u32_e32 v10, s9
10661; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s15, v1
10662; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
10663; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s14, v0
10664; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
10665; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s15, v1
10666; GFX9-NEXT:    v_mac_f32_e32 v9, s16, v10
10667; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
10668; GFX9-NEXT:    v_rcp_f32_e32 v8, v9
10669; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
10670; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
10671; GFX9-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[2:3]
10672; GFX9-NEXT:    v_mul_f32_e32 v3, s17, v8
10673; GFX9-NEXT:    v_mul_f32_e32 v4, s18, v3
10674; GFX9-NEXT:    v_trunc_f32_e32 v4, v4
10675; GFX9-NEXT:    v_mac_f32_e32 v3, s19, v4
10676; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
10677; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
10678; GFX9-NEXT:    s_sub_u32 s2, 0, s8
10679; GFX9-NEXT:    s_subb_u32 s3, 0, s9
10680; GFX9-NEXT:    v_mul_hi_u32 v7, s2, v3
10681; GFX9-NEXT:    v_mul_lo_u32 v8, s2, v4
10682; GFX9-NEXT:    v_mul_lo_u32 v9, s3, v3
10683; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
10684; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v3
10685; GFX9-NEXT:    v_add_u32_e32 v7, v7, v8
10686; GFX9-NEXT:    v_add_u32_e32 v7, v7, v9
10687; GFX9-NEXT:    v_mul_lo_u32 v8, v3, v7
10688; GFX9-NEXT:    v_mul_hi_u32 v9, v3, v2
10689; GFX9-NEXT:    v_mul_hi_u32 v10, v3, v7
10690; GFX9-NEXT:    v_mul_hi_u32 v11, v4, v7
10691; GFX9-NEXT:    v_mul_lo_u32 v7, v4, v7
10692; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v9, v8
10693; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v10, vcc
10694; GFX9-NEXT:    v_mul_lo_u32 v10, v4, v2
10695; GFX9-NEXT:    v_mul_hi_u32 v2, v4, v2
10696; GFX9-NEXT:    s_ashr_i32 s12, s11, 31
10697; GFX9-NEXT:    s_mov_b32 s13, s12
10698; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v10
10699; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v9, v2, vcc
10700; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v6, vcc
10701; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v7
10702; GFX9-NEXT:    v_add_co_u32_e64 v2, s[0:1], v3, v2
10703; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v5, v8, vcc
10704; GFX9-NEXT:    v_addc_co_u32_e64 v3, vcc, v4, v7, s[0:1]
10705; GFX9-NEXT:    v_mul_lo_u32 v8, s2, v3
10706; GFX9-NEXT:    v_mul_hi_u32 v9, s2, v2
10707; GFX9-NEXT:    v_mul_lo_u32 v10, s3, v2
10708; GFX9-NEXT:    v_mul_lo_u32 v11, s2, v2
10709; GFX9-NEXT:    v_add_u32_e32 v4, v4, v7
10710; GFX9-NEXT:    v_add_u32_e32 v8, v9, v8
10711; GFX9-NEXT:    v_add_u32_e32 v8, v8, v10
10712; GFX9-NEXT:    v_mul_lo_u32 v12, v2, v8
10713; GFX9-NEXT:    v_mul_hi_u32 v13, v2, v11
10714; GFX9-NEXT:    v_mul_hi_u32 v14, v2, v8
10715; GFX9-NEXT:    v_mul_hi_u32 v10, v3, v11
10716; GFX9-NEXT:    v_mul_lo_u32 v11, v3, v11
10717; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v13, v12
10718; GFX9-NEXT:    v_mul_hi_u32 v9, v3, v8
10719; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v14, vcc
10720; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v8
10721; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, v12, v11
10722; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v13, v10, vcc
10723; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v9, v6, vcc
10724; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v10, v3
10725; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v5, v8, vcc
10726; GFX9-NEXT:    v_addc_co_u32_e64 v4, vcc, v4, v8, s[0:1]
10727; GFX9-NEXT:    s_add_u32 s0, s10, s12
10728; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
10729; GFX9-NEXT:    s_addc_u32 s1, s11, s12
10730; GFX9-NEXT:    s_xor_b64 s[10:11], s[0:1], s[12:13]
10731; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
10732; GFX9-NEXT:    v_mul_lo_u32 v4, s10, v3
10733; GFX9-NEXT:    v_mul_hi_u32 v7, s10, v2
10734; GFX9-NEXT:    v_mul_hi_u32 v9, s10, v3
10735; GFX9-NEXT:    v_mul_hi_u32 v10, s11, v3
10736; GFX9-NEXT:    v_mul_lo_u32 v3, s11, v3
10737; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v7, v4
10738; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v9, vcc
10739; GFX9-NEXT:    v_mul_lo_u32 v9, s11, v2
10740; GFX9-NEXT:    v_mul_hi_u32 v2, s11, v2
10741; GFX9-NEXT:    v_xor_b32_e32 v0, s6, v0
10742; GFX9-NEXT:    v_xor_b32_e32 v1, s6, v1
10743; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v9
10744; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v7, v2, vcc
10745; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v10, v6, vcc
10746; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
10747; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
10748; GFX9-NEXT:    v_mul_lo_u32 v3, s8, v3
10749; GFX9-NEXT:    v_mul_hi_u32 v4, s8, v2
10750; GFX9-NEXT:    v_mul_lo_u32 v5, s9, v2
10751; GFX9-NEXT:    v_mul_lo_u32 v2, s8, v2
10752; GFX9-NEXT:    v_mov_b32_e32 v8, s6
10753; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
10754; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
10755; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s6, v0
10756; GFX9-NEXT:    v_sub_co_u32_e64 v2, s[0:1], s10, v2
10757; GFX9-NEXT:    v_sub_u32_e32 v4, s11, v3
10758; GFX9-NEXT:    v_mov_b32_e32 v5, s9
10759; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v8, vcc
10760; GFX9-NEXT:    v_subb_co_u32_e64 v4, vcc, v4, v5, s[0:1]
10761; GFX9-NEXT:    v_subrev_co_u32_e64 v7, s[2:3], s8, v2
10762; GFX9-NEXT:    v_subbrev_co_u32_e64 v8, vcc, 0, v4, s[2:3]
10763; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v8
10764; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
10765; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v7
10766; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
10767; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v8
10768; GFX9-NEXT:    v_cndmask_b32_e32 v9, v9, v10, vcc
10769; GFX9-NEXT:    v_subb_co_u32_e64 v4, vcc, v4, v5, s[2:3]
10770; GFX9-NEXT:    v_subrev_co_u32_e32 v5, vcc, s8, v7
10771; GFX9-NEXT:    v_subbrev_co_u32_e32 v4, vcc, 0, v4, vcc
10772; GFX9-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v9
10773; GFX9-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[2:3]
10774; GFX9-NEXT:    v_mov_b32_e32 v8, s11
10775; GFX9-NEXT:    v_subb_co_u32_e64 v3, vcc, v8, v3, s[0:1]
10776; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
10777; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
10778; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
10779; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
10780; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v3
10781; GFX9-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
10782; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
10783; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
10784; GFX9-NEXT:    v_cndmask_b32_e64 v4, v7, v5, s[2:3]
10785; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
10786; GFX9-NEXT:    v_xor_b32_e32 v2, s12, v2
10787; GFX9-NEXT:    v_xor_b32_e32 v3, s12, v3
10788; GFX9-NEXT:    v_mov_b32_e32 v4, s12
10789; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s12, v2
10790; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v4, vcc
10791; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
10792; GFX9-NEXT:    global_store_dwordx4 v6, v[0:3], s[4:5]
10793; GFX9-NEXT:    s_endpgm
10794  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
10795  %r = srem <2 x i64> %x, %shl.y
10796  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
10797  ret void
10798}
10799