1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
3; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare -amdgpu-bypass-slow-div=0 %s | FileCheck %s
4; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GCN %s
5
; udiv_i32: unsigned 32-bit division of kernel arguments %x by %y; the quotient
; is stored to %out.
; Expected opt output below: the udiv is replaced by a reciprocal estimate
; (uitofp, llvm.amdgcn.rcp.f32, fmul by a scale constant, fptoui), one
; refinement step that uses the high 32 bits of 64-bit multiplies, and two
; "icmp uge + select" corrections that each may add 1 to the quotient.
; Expected llc output below: the tahiti instructions for the same kernel.
; The assertions are tool-generated (see the notes at the top of the file).
6define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
7; CHECK-LABEL: @udiv_i32(
8; CHECK-NEXT:    [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float
9; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
10; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000
11; CHECK-NEXT:    [[TMP4:%.*]] = fptoui float [[TMP3]] to i32
12; CHECK-NEXT:    [[TMP5:%.*]] = sub i32 0, [[Y]]
13; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]]
14; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP4]] to i64
15; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
16; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]]
17; CHECK-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
18; CHECK-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP9]], 32
19; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
20; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]]
21; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64
22; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
23; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
24; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
25; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
26; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
27; CHECK-NEXT:    [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]]
28; CHECK-NEXT:    [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]]
29; CHECK-NEXT:    [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]]
30; CHECK-NEXT:    [[TMP23:%.*]] = add i32 [[TMP19]], 1
31; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP19]]
32; CHECK-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP21]], [[Y]]
33; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP22]], i32 [[TMP25]], i32 [[TMP21]]
34; CHECK-NEXT:    [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[Y]]
35; CHECK-NEXT:    [[TMP28:%.*]] = add i32 [[TMP24]], 1
36; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP24]]
37; CHECK-NEXT:    store i32 [[TMP29]], i32 addrspace(1)* [[OUT:%.*]], align 4
38; CHECK-NEXT:    ret void
39;
40; GCN-LABEL: udiv_i32:
41; GCN:       ; %bb.0:
42; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
43; GCN-NEXT:    s_mov_b32 s7, 0xf000
44; GCN-NEXT:    s_mov_b32 s6, -1
45; GCN-NEXT:    s_waitcnt lgkmcnt(0)
46; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s3
47; GCN-NEXT:    s_sub_i32 s4, 0, s3
48; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
49; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
50; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
51; GCN-NEXT:    v_mul_lo_u32 v1, s4, v0
52; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
53; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
54; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
55; GCN-NEXT:    v_mul_hi_u32 v0, s2, v0
56; GCN-NEXT:    v_mul_lo_u32 v1, v0, s3
57; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
58; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s2, v1
59; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
60; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
61; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
62; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
63; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
64; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
65; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
66; GCN-NEXT:    s_waitcnt lgkmcnt(0)
67; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
68; GCN-NEXT:    s_endpgm
; Input IR under test: a single udiv whose result is written to %out.
69  %r = udiv i32 %x, %y
70  store i32 %r, i32 addrspace(1)* %out
71  ret void
72}
73
; urem_i32: unsigned 32-bit remainder of kernel arguments %x and %y; the result
; is stored to %out.
; Expected opt output below: the same reciprocal estimate and refinement as the
; 32-bit unsigned division (uitofp, llvm.amdgcn.rcp.f32, scaled fptoui, high
; halves of 64-bit multiplies), then x - q*y followed by two
; "icmp uge + sub + select" steps that each may subtract %y once more.
; Expected llc output below: the tahiti instructions for the same kernel.
; The assertions are tool-generated (see the notes at the top of the file).
74define amdgpu_kernel void @urem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
75; CHECK-LABEL: @urem_i32(
76; CHECK-NEXT:    [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float
77; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
78; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000
79; CHECK-NEXT:    [[TMP4:%.*]] = fptoui float [[TMP3]] to i32
80; CHECK-NEXT:    [[TMP5:%.*]] = sub i32 0, [[Y]]
81; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]]
82; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP4]] to i64
83; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
84; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]]
85; CHECK-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
86; CHECK-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP9]], 32
87; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
88; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]]
89; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64
90; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
91; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
92; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
93; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
94; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
95; CHECK-NEXT:    [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]]
96; CHECK-NEXT:    [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]]
97; CHECK-NEXT:    [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]]
98; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP21]], [[Y]]
99; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP21]]
100; CHECK-NEXT:    [[TMP25:%.*]] = icmp uge i32 [[TMP24]], [[Y]]
101; CHECK-NEXT:    [[TMP26:%.*]] = sub i32 [[TMP24]], [[Y]]
102; CHECK-NEXT:    [[TMP27:%.*]] = select i1 [[TMP25]], i32 [[TMP26]], i32 [[TMP24]]
103; CHECK-NEXT:    store i32 [[TMP27]], i32 addrspace(1)* [[OUT:%.*]], align 4
104; CHECK-NEXT:    ret void
105;
106; GCN-LABEL: urem_i32:
107; GCN:       ; %bb.0:
108; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
109; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
110; GCN-NEXT:    s_mov_b32 s3, 0xf000
111; GCN-NEXT:    s_waitcnt lgkmcnt(0)
112; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s5
113; GCN-NEXT:    s_sub_i32 s2, 0, s5
114; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
115; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
116; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
117; GCN-NEXT:    v_mul_lo_u32 v1, s2, v0
118; GCN-NEXT:    s_mov_b32 s2, -1
119; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
120; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
121; GCN-NEXT:    v_mul_hi_u32 v0, s4, v0
122; GCN-NEXT:    v_mul_lo_u32 v0, v0, s5
123; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
124; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s5, v0
125; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s5, v0
126; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
127; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s5, v0
128; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s5, v0
129; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
130; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
131; GCN-NEXT:    s_endpgm
; Input IR under test: a single urem whose result is written to %out.
132  %r = urem i32 %x, %y
133  store i32 %r, i32 addrspace(1)* %out
134  ret void
135}
136
; sdiv_i32: signed 32-bit division of kernel arguments %x by %y; the quotient
; is stored to %out.
; Expected opt output below: each operand gets a sign mask (ashr by 31) and is
; turned into a magnitude with add + xor; the unsigned reciprocal-based
; expansion (rcp estimate, refinement, two quotient corrections) runs on the
; magnitudes; the result sign is then applied with xor + sub using the xor of
; the two sign masks.
; Expected llc output below: the tahiti instructions for the same kernel.
; The assertions are tool-generated (see the notes at the top of the file).
137define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
138; CHECK-LABEL: @sdiv_i32(
139; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
140; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
141; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
142; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[X]], [[TMP1]]
143; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[Y]], [[TMP2]]
144; CHECK-NEXT:    [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP1]]
145; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP2]]
146; CHECK-NEXT:    [[TMP8:%.*]] = uitofp i32 [[TMP7]] to float
147; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP8]])
148; CHECK-NEXT:    [[TMP10:%.*]] = fmul fast float [[TMP9]], 0x41EFFFFFC0000000
149; CHECK-NEXT:    [[TMP11:%.*]] = fptoui float [[TMP10]] to i32
150; CHECK-NEXT:    [[TMP12:%.*]] = sub i32 0, [[TMP7]]
151; CHECK-NEXT:    [[TMP13:%.*]] = mul i32 [[TMP12]], [[TMP11]]
152; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP11]] to i64
153; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
154; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
155; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
156; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
157; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
158; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP11]], [[TMP19]]
159; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP6]] to i64
160; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP20]] to i64
161; CHECK-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP21]], [[TMP22]]
162; CHECK-NEXT:    [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
163; CHECK-NEXT:    [[TMP25:%.*]] = lshr i64 [[TMP23]], 32
164; CHECK-NEXT:    [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
165; CHECK-NEXT:    [[TMP27:%.*]] = mul i32 [[TMP26]], [[TMP7]]
166; CHECK-NEXT:    [[TMP28:%.*]] = sub i32 [[TMP6]], [[TMP27]]
167; CHECK-NEXT:    [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP7]]
168; CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[TMP26]], 1
169; CHECK-NEXT:    [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]]
170; CHECK-NEXT:    [[TMP32:%.*]] = sub i32 [[TMP28]], [[TMP7]]
171; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP29]], i32 [[TMP32]], i32 [[TMP28]]
172; CHECK-NEXT:    [[TMP34:%.*]] = icmp uge i32 [[TMP33]], [[TMP7]]
173; CHECK-NEXT:    [[TMP35:%.*]] = add i32 [[TMP31]], 1
174; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP34]], i32 [[TMP35]], i32 [[TMP31]]
175; CHECK-NEXT:    [[TMP37:%.*]] = xor i32 [[TMP36]], [[TMP3]]
176; CHECK-NEXT:    [[TMP38:%.*]] = sub i32 [[TMP37]], [[TMP3]]
177; CHECK-NEXT:    store i32 [[TMP38]], i32 addrspace(1)* [[OUT:%.*]], align 4
178; CHECK-NEXT:    ret void
179;
180; GCN-LABEL: sdiv_i32:
181; GCN:       ; %bb.0:
182; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
183; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
184; GCN-NEXT:    s_mov_b32 s7, 0xf000
185; GCN-NEXT:    s_mov_b32 s6, -1
186; GCN-NEXT:    s_waitcnt lgkmcnt(0)
187; GCN-NEXT:    s_ashr_i32 s8, s3, 31
188; GCN-NEXT:    s_add_i32 s3, s3, s8
189; GCN-NEXT:    s_xor_b32 s9, s3, s8
190; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s9
191; GCN-NEXT:    s_sub_i32 s3, 0, s9
192; GCN-NEXT:    s_ashr_i32 s0, s2, 31
193; GCN-NEXT:    s_add_i32 s1, s2, s0
194; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
195; GCN-NEXT:    s_xor_b32 s1, s1, s0
196; GCN-NEXT:    s_xor_b32 s2, s0, s8
197; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
198; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
199; GCN-NEXT:    v_mul_lo_u32 v1, s3, v0
200; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
201; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
202; GCN-NEXT:    v_mul_hi_u32 v0, s1, v0
203; GCN-NEXT:    v_mul_lo_u32 v1, v0, s9
204; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
205; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
206; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v1
207; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
208; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s9, v1
209; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
210; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
211; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
212; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
213; GCN-NEXT:    v_xor_b32_e32 v0, s2, v0
214; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
215; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
216; GCN-NEXT:    s_endpgm
; Input IR under test: a single sdiv whose result is written to %out.
217  %r = sdiv i32 %x, %y
218  store i32 %r, i32 addrspace(1)* %out
219  ret void
220}
221
; srem_i32: signed 32-bit remainder of kernel arguments %x and %y; the result
; is stored to %out.
; Expected opt output below: both operands are turned into magnitudes with the
; ashr-by-31 / add / xor pattern; the unsigned remainder expansion (rcp
; estimate, refinement, two conditional subtractions) runs on the magnitudes;
; the result sign is applied with xor + sub using only the sign mask of %x.
; Expected llc output below: the tahiti instructions for the same kernel.
; The assertions are tool-generated (see the notes at the top of the file).
222define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
223; CHECK-LABEL: @srem_i32(
224; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
225; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
226; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[X]], [[TMP1]]
227; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[Y]], [[TMP2]]
228; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP1]]
229; CHECK-NEXT:    [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP2]]
230; CHECK-NEXT:    [[TMP7:%.*]] = uitofp i32 [[TMP6]] to float
231; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
232; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP8]], 0x41EFFFFFC0000000
233; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP9]] to i32
234; CHECK-NEXT:    [[TMP11:%.*]] = sub i32 0, [[TMP6]]
235; CHECK-NEXT:    [[TMP12:%.*]] = mul i32 [[TMP11]], [[TMP10]]
236; CHECK-NEXT:    [[TMP13:%.*]] = zext i32 [[TMP10]] to i64
237; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP12]] to i64
238; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP13]], [[TMP14]]
239; CHECK-NEXT:    [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
240; CHECK-NEXT:    [[TMP17:%.*]] = lshr i64 [[TMP15]], 32
241; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
242; CHECK-NEXT:    [[TMP19:%.*]] = add i32 [[TMP10]], [[TMP18]]
243; CHECK-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP5]] to i64
244; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP19]] to i64
245; CHECK-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP20]], [[TMP21]]
246; CHECK-NEXT:    [[TMP23:%.*]] = trunc i64 [[TMP22]] to i32
247; CHECK-NEXT:    [[TMP24:%.*]] = lshr i64 [[TMP22]], 32
248; CHECK-NEXT:    [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
249; CHECK-NEXT:    [[TMP26:%.*]] = mul i32 [[TMP25]], [[TMP6]]
250; CHECK-NEXT:    [[TMP27:%.*]] = sub i32 [[TMP5]], [[TMP26]]
251; CHECK-NEXT:    [[TMP28:%.*]] = icmp uge i32 [[TMP27]], [[TMP6]]
252; CHECK-NEXT:    [[TMP29:%.*]] = sub i32 [[TMP27]], [[TMP6]]
253; CHECK-NEXT:    [[TMP30:%.*]] = select i1 [[TMP28]], i32 [[TMP29]], i32 [[TMP27]]
254; CHECK-NEXT:    [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP6]]
255; CHECK-NEXT:    [[TMP32:%.*]] = sub i32 [[TMP30]], [[TMP6]]
256; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP30]]
257; CHECK-NEXT:    [[TMP34:%.*]] = xor i32 [[TMP33]], [[TMP1]]
258; CHECK-NEXT:    [[TMP35:%.*]] = sub i32 [[TMP34]], [[TMP1]]
259; CHECK-NEXT:    store i32 [[TMP35]], i32 addrspace(1)* [[OUT:%.*]], align 4
260; CHECK-NEXT:    ret void
261;
262; GCN-LABEL: srem_i32:
263; GCN:       ; %bb.0:
264; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
265; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
266; GCN-NEXT:    s_waitcnt lgkmcnt(0)
267; GCN-NEXT:    s_ashr_i32 s4, s3, 31
268; GCN-NEXT:    s_add_i32 s3, s3, s4
269; GCN-NEXT:    s_xor_b32 s4, s3, s4
270; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s4
271; GCN-NEXT:    s_sub_i32 s3, 0, s4
272; GCN-NEXT:    s_ashr_i32 s5, s2, 31
273; GCN-NEXT:    s_add_i32 s2, s2, s5
274; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
275; GCN-NEXT:    s_xor_b32 s6, s2, s5
276; GCN-NEXT:    s_mov_b32 s2, -1
277; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
278; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
279; GCN-NEXT:    v_mul_lo_u32 v1, s3, v0
280; GCN-NEXT:    s_mov_b32 s3, 0xf000
281; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
282; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
283; GCN-NEXT:    v_mul_hi_u32 v0, s6, v0
284; GCN-NEXT:    v_mul_lo_u32 v0, v0, s4
285; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
286; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
287; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
288; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
289; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
290; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
291; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
292; GCN-NEXT:    v_xor_b32_e32 v0, s5, v0
293; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s5, v0
294; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
295; GCN-NEXT:    s_endpgm
; Input IR under test: a single srem whose result is written to %out.
296  %r = srem i32 %x, %y
297  store i32 %r, i32 addrspace(1)* %out
298  ret void
299}
300
; udiv_i16: unsigned 16-bit division of kernel arguments %x by %y; the quotient
; is stored to %out.
; Expected opt output below: both operands are zero-extended to i32 and
; converted to float; the quotient estimate is trunc(x * rcp(y)); a remainder
; is formed with llvm.amdgcn.fmad.ftz.f32 and, when |remainder| >= |y|, 1 is
; added to the estimate; the result is masked to 16 bits and truncated to i16.
; Expected llc output below: the tahiti instructions for the same kernel.
; The assertions are tool-generated (see the notes at the top of the file).
301define amdgpu_kernel void @udiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
302; CHECK-LABEL: @udiv_i16(
303; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
304; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
305; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
306; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
307; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
308; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
309; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
310; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
311; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
312; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
313; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
314; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
315; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
316; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
317; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
318; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 65535
319; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
320; CHECK-NEXT:    store i16 [[TMP17]], i16 addrspace(1)* [[OUT:%.*]], align 2
321; CHECK-NEXT:    ret void
322;
323; GCN-LABEL: udiv_i16:
324; GCN:       ; %bb.0:
325; GCN-NEXT:    s_load_dword s2, s[0:1], 0xb
326; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
327; GCN-NEXT:    s_waitcnt lgkmcnt(0)
328; GCN-NEXT:    s_lshr_b32 s3, s2, 16
329; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s3
330; GCN-NEXT:    s_and_b32 s2, s2, 0xffff
331; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s2
332; GCN-NEXT:    s_mov_b32 s3, 0xf000
333; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
334; GCN-NEXT:    s_mov_b32 s2, -1
335; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
336; GCN-NEXT:    v_trunc_f32_e32 v2, v2
337; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v2
338; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
339; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
340; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
341; GCN-NEXT:    buffer_store_short v0, off, s[0:3], 0
342; GCN-NEXT:    s_endpgm
; Input IR under test: a single i16 udiv whose result is written to %out.
343  %r = udiv i16 %x, %y
344  store i16 %r, i16 addrspace(1)* %out
345  ret void
346}
347
; urem_i16: unsigned 16-bit remainder of kernel arguments %x and %y; the result
; is stored to %out.
; Expected opt output below: the quotient is computed as in the 16-bit unsigned
; division (zext, uitofp, trunc(x * rcp(y)), fmad.ftz remainder test, optional
; +1), then the remainder is x - q*y in i32, masked to 16 bits and truncated
; to i16.
; Expected llc output below: the tahiti instructions for the same kernel.
; The assertions are tool-generated (see the notes at the top of the file).
348define amdgpu_kernel void @urem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
349; CHECK-LABEL: @urem_i16(
350; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
351; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
352; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
353; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
354; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
355; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
356; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
357; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
358; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
359; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
360; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
361; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
362; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
363; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
364; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
365; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
366; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
367; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 65535
368; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
369; CHECK-NEXT:    store i16 [[TMP19]], i16 addrspace(1)* [[OUT:%.*]], align 2
370; CHECK-NEXT:    ret void
371;
372; GCN-LABEL: urem_i16:
373; GCN:       ; %bb.0:
374; GCN-NEXT:    s_load_dword s4, s[0:1], 0xb
375; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
376; GCN-NEXT:    s_waitcnt lgkmcnt(0)
377; GCN-NEXT:    s_lshr_b32 s2, s4, 16
378; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s2
379; GCN-NEXT:    s_and_b32 s3, s4, 0xffff
380; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s3
381; GCN-NEXT:    s_mov_b32 s3, 0xf000
382; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
383; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
384; GCN-NEXT:    v_trunc_f32_e32 v2, v2
385; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v2
386; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
387; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
388; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
389; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
390; GCN-NEXT:    s_mov_b32 s2, -1
391; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
392; GCN-NEXT:    buffer_store_short v0, off, s[0:3], 0
393; GCN-NEXT:    s_endpgm
; Input IR under test: a single i16 urem whose result is written to %out.
394  %r = urem i16 %x, %y
395  store i16 %r, i16 addrspace(1)* %out
396  ret void
397}
398
; sdiv_i16: signed 16-bit division of kernel arguments %x by %y; the quotient
; is stored to %out.
; Expected opt output below: both operands are sign-extended to i32; a
; correction term is built from the operand signs (xor, ashr by 30, or 1); the
; quotient estimate is trunc(x * rcp(y)) on sitofp values; when the
; fmad.ftz remainder satisfies |remainder| >= |y| the correction term is added,
; otherwise 0; the result is sign-extended in place (shl 16 / ashr 16) and
; truncated to i16.
; Expected llc output below: the tahiti instructions for the same kernel.
; The assertions are tool-generated (see the notes at the top of the file).
399define amdgpu_kernel void @sdiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
400; CHECK-LABEL: @sdiv_i16(
401; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
402; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
403; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
404; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
405; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
406; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
407; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
408; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
409; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
410; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
411; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
412; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
413; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
414; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
415; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
416; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
417; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
418; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
419; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 16
420; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 16
421; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
422; CHECK-NEXT:    store i16 [[TMP21]], i16 addrspace(1)* [[OUT:%.*]], align 2
423; CHECK-NEXT:    ret void
424;
425; GCN-LABEL: sdiv_i16:
426; GCN:       ; %bb.0:
427; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
428; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
429; GCN-NEXT:    s_mov_b32 s7, 0xf000
430; GCN-NEXT:    s_mov_b32 s6, -1
431; GCN-NEXT:    s_waitcnt lgkmcnt(0)
432; GCN-NEXT:    s_ashr_i32 s1, s0, 16
433; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s1
434; GCN-NEXT:    s_sext_i32_i16 s0, s0
435; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s0
436; GCN-NEXT:    s_xor_b32 s0, s0, s1
437; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
438; GCN-NEXT:    s_ashr_i32 s0, s0, 30
439; GCN-NEXT:    s_or_b32 s2, s0, 1
440; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
441; GCN-NEXT:    v_trunc_f32_e32 v2, v2
442; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
443; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
444; GCN-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
445; GCN-NEXT:    s_cmp_lg_u32 s0, 0
446; GCN-NEXT:    s_cselect_b32 s0, s2, 0
447; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
448; GCN-NEXT:    buffer_store_short v0, off, s[4:7], 0
449; GCN-NEXT:    s_endpgm
; Input IR under test: a single i16 sdiv whose result is written to %out.
450  %r = sdiv i16 %x, %y
451  store i16 %r, i16 addrspace(1)* %out
452  ret void
453}
454
; srem_i16: signed 16-bit remainder of kernel arguments %x and %y; the result
; is stored to %out.
; Expected opt output below: the quotient is computed as in the 16-bit signed
; division (sext, sign-based correction term, trunc(x * rcp(y)), fmad.ftz
; remainder test, conditional correction), then the remainder is x - q*y in
; i32, sign-extended in place (shl 16 / ashr 16) and truncated to i16.
; Expected llc output below: the tahiti instructions for the same kernel.
; The assertions are tool-generated (see the notes at the top of the file).
455define amdgpu_kernel void @srem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
456; CHECK-LABEL: @srem_i16(
457; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
458; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
459; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
460; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
461; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
462; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
463; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
464; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
465; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
466; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
467; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
468; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
469; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
470; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
471; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
472; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
473; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
474; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
475; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
476; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
477; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 16
478; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
479; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
480; CHECK-NEXT:    store i16 [[TMP23]], i16 addrspace(1)* [[OUT:%.*]], align 2
481; CHECK-NEXT:    ret void
482;
483; GCN-LABEL: srem_i16:
484; GCN:       ; %bb.0:
485; GCN-NEXT:    s_load_dword s4, s[0:1], 0xb
486; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
487; GCN-NEXT:    s_waitcnt lgkmcnt(0)
488; GCN-NEXT:    s_ashr_i32 s5, s4, 16
489; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s5
490; GCN-NEXT:    s_sext_i32_i16 s2, s4
491; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s2
492; GCN-NEXT:    s_xor_b32 s2, s2, s5
493; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
494; GCN-NEXT:    s_ashr_i32 s2, s2, 30
495; GCN-NEXT:    s_or_b32 s6, s2, 1
496; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
497; GCN-NEXT:    v_trunc_f32_e32 v2, v2
498; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
499; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
500; GCN-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
501; GCN-NEXT:    s_cmp_lg_u32 s2, 0
502; GCN-NEXT:    s_cselect_b32 s2, s6, 0
503; GCN-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
504; GCN-NEXT:    v_mul_lo_u32 v0, v0, s5
505; GCN-NEXT:    s_mov_b32 s3, 0xf000
506; GCN-NEXT:    s_mov_b32 s2, -1
507; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
508; GCN-NEXT:    buffer_store_short v0, off, s[0:3], 0
509; GCN-NEXT:    s_endpgm
; Input IR under test: a single i16 srem whose result is written to %out.
510  %r = srem i16 %x, %y
511  store i16 %r, i16 addrspace(1)* %out
512  ret void
513}
514
; udiv_i8: unsigned 8-bit division of kernel arguments %x by %y; the quotient
; is stored to %out.
; Expected opt output below: both operands are zero-extended to i32 and
; converted to float; the quotient estimate is trunc(x * rcp(y)); a remainder
; is formed with llvm.amdgcn.fmad.ftz.f32 and, when |remainder| >= |y|, 1 is
; added to the estimate; the result is masked to 8 bits and truncated to i8.
; Expected llc output below: the tahiti instructions for the same kernel.
; The assertions are tool-generated (see the notes at the top of the file).
515define amdgpu_kernel void @udiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
516; CHECK-LABEL: @udiv_i8(
517; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
518; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
519; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
520; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
521; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
522; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
523; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
524; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
525; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
526; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
527; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
528; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
529; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
530; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
531; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
532; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 255
533; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
534; CHECK-NEXT:    store i8 [[TMP17]], i8 addrspace(1)* [[OUT:%.*]], align 1
535; CHECK-NEXT:    ret void
536;
537; GCN-LABEL: udiv_i8:
538; GCN:       ; %bb.0:
539; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
540; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
541; GCN-NEXT:    s_mov_b32 s7, 0xf000
542; GCN-NEXT:    s_mov_b32 s6, -1
543; GCN-NEXT:    s_waitcnt lgkmcnt(0)
544; GCN-NEXT:    v_cvt_f32_ubyte1_e32 v0, s0
545; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v0
546; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v2, s0
547; GCN-NEXT:    v_mul_f32_e32 v1, v2, v1
548; GCN-NEXT:    v_trunc_f32_e32 v1, v1
549; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v1
550; GCN-NEXT:    v_mad_f32 v1, -v1, v0, v2
551; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
552; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
553; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
554; GCN-NEXT:    s_endpgm
; Input IR under test: a single i8 udiv whose result is written to %out.
555  %r = udiv i8 %x, %y
556  store i8 %r, i8 addrspace(1)* %out
557  ret void
558}
559
; urem_i8: unsigned 8-bit remainder of kernel arguments %x and %y; the result
; is stored to %out.
; Expected opt output below: the quotient is computed as in the 8-bit unsigned
; division (zext, uitofp, trunc(x * rcp(y)), fmad.ftz remainder test, optional
; +1), then the remainder is x - q*y in i32, masked to 8 bits and truncated
; to i8.
; Expected llc output below: the tahiti instructions for the same kernel.
; The assertions are tool-generated (see the notes at the top of the file).
560define amdgpu_kernel void @urem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
561; CHECK-LABEL: @urem_i8(
562; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
563; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
564; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
565; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
566; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
567; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
568; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
569; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
570; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
571; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
572; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
573; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
574; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
575; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
576; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
577; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
578; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
579; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 255
580; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i8
581; CHECK-NEXT:    store i8 [[TMP19]], i8 addrspace(1)* [[OUT:%.*]], align 1
582; CHECK-NEXT:    ret void
583;
584; GCN-LABEL: urem_i8:
585; GCN:       ; %bb.0:
586; GCN-NEXT:    s_load_dword s4, s[0:1], 0xb
587; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
588; GCN-NEXT:    s_mov_b32 s3, 0xf000
589; GCN-NEXT:    s_waitcnt lgkmcnt(0)
590; GCN-NEXT:    v_cvt_f32_ubyte1_e32 v0, s4
591; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v0
592; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v2, s4
593; GCN-NEXT:    s_lshr_b32 s2, s4, 8
594; GCN-NEXT:    v_mul_f32_e32 v1, v2, v1
595; GCN-NEXT:    v_trunc_f32_e32 v1, v1
596; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v1
597; GCN-NEXT:    v_mad_f32 v1, -v1, v0, v2
598; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
599; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
600; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
601; GCN-NEXT:    s_mov_b32 s2, -1
602; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
603; GCN-NEXT:    buffer_store_byte v0, off, s[0:3], 0
604; GCN-NEXT:    s_endpgm
; Input IR under test: a single i8 urem whose result is written to %out.
605  %r = urem i8 %x, %y
606  store i8 %r, i8 addrspace(1)* %out
607  ret void
608}
609
; sdiv_i8: signed 8-bit division of kernel arguments %x by %y; the quotient is
; stored to %out.
; Expected opt output below: both operands are sign-extended to i32; a
; correction term is built from the operand signs (xor, ashr by 30, or 1); the
; quotient estimate is trunc(x * rcp(y)) on sitofp values; when the
; fmad.ftz remainder satisfies |remainder| >= |y| the correction term is added,
; otherwise 0; the result is sign-extended in place (shl 24 / ashr 24) and
; truncated to i8.
; Expected llc output below: the tahiti instructions for the same kernel.
; The assertions are tool-generated (see the notes at the top of the file).
610define amdgpu_kernel void @sdiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
611; CHECK-LABEL: @sdiv_i8(
612; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32
613; CHECK-NEXT:    [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32
614; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
615; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
616; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
617; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
618; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
619; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
620; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
621; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
622; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
623; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
624; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
625; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
626; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
627; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
628; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
629; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
630; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 24
631; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 24
632; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i8
633; CHECK-NEXT:    store i8 [[TMP21]], i8 addrspace(1)* [[OUT:%.*]], align 1
634; CHECK-NEXT:    ret void
635;
636; GCN-LABEL: sdiv_i8:
637; GCN:       ; %bb.0:
638; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
639; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
640; GCN-NEXT:    s_mov_b32 s7, 0xf000
641; GCN-NEXT:    s_mov_b32 s6, -1
642; GCN-NEXT:    s_waitcnt lgkmcnt(0)
643; GCN-NEXT:    s_bfe_i32 s1, s0, 0x80008
644; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s1
645; GCN-NEXT:    s_sext_i32_i8 s0, s0
646; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s0
647; GCN-NEXT:    s_xor_b32 s0, s0, s1
648; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
649; GCN-NEXT:    s_ashr_i32 s0, s0, 30
650; GCN-NEXT:    s_or_b32 s2, s0, 1
651; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
652; GCN-NEXT:    v_trunc_f32_e32 v2, v2
653; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
654; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
655; GCN-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
656; GCN-NEXT:    s_cmp_lg_u32 s0, 0
657; GCN-NEXT:    s_cselect_b32 s0, s2, 0
658; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
659; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
660; GCN-NEXT:    s_endpgm
; Input IR under test: a single i8 sdiv whose result is written to %out.
661  %r = sdiv i8 %x, %y
662  store i8 %r, i8 addrspace(1)* %out
663  ret void
664}
665
; Signed 8-bit remainder test.
; The opt run's assertions (prefix CHECK) expect the same 32-bit float-based
; quotient computation as an i8 signed division (sext operands, rcp / fmul /
; trunc, conditional correction by ((x xor y) ashr 30) or 1). The remainder
; is then x - quotient * y, sign-extended in-register (shl/ashr by 24) and
; truncated to i8.
; The llc run's assertions (prefix GCN) pin the instruction sequence emitted
; for the same function. Both sets are autogenerated (see the NOTE lines at
; the top of the file); regenerate them with the update scripts instead of
; editing them by hand.
666define amdgpu_kernel void @srem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
667; CHECK-LABEL: @srem_i8(
668; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32
669; CHECK-NEXT:    [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32
670; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
671; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
672; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
673; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
674; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
675; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
676; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
677; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
678; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
679; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
680; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
681; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
682; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
683; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
684; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
685; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
686; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
687; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
688; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 24
689; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 24
690; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i8
691; CHECK-NEXT:    store i8 [[TMP23]], i8 addrspace(1)* [[OUT:%.*]], align 1
692; CHECK-NEXT:    ret void
693;
694; GCN-LABEL: srem_i8:
695; GCN:       ; %bb.0:
696; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
697; GCN-NEXT:    s_load_dword s2, s[0:1], 0xb
698; GCN-NEXT:    s_mov_b32 s7, 0xf000
699; GCN-NEXT:    s_waitcnt lgkmcnt(0)
700; GCN-NEXT:    s_bfe_i32 s0, s2, 0x80008
701; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s0
702; GCN-NEXT:    s_sext_i32_i8 s1, s2
703; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s1
704; GCN-NEXT:    s_xor_b32 s0, s1, s0
705; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
706; GCN-NEXT:    s_ashr_i32 s0, s0, 30
707; GCN-NEXT:    s_lshr_b32 s3, s2, 8
708; GCN-NEXT:    s_or_b32 s6, s0, 1
709; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
710; GCN-NEXT:    v_trunc_f32_e32 v2, v2
711; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
712; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
713; GCN-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
714; GCN-NEXT:    s_cmp_lg_u32 s0, 0
715; GCN-NEXT:    s_cselect_b32 s0, s6, 0
716; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
717; GCN-NEXT:    v_mul_lo_u32 v0, v0, s3
718; GCN-NEXT:    s_mov_b32 s6, -1
719; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
720; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
721; GCN-NEXT:    s_endpgm
; Input IR under test: a single i8 srem whose result is stored to %out.
722  %r = srem i8 %x, %y
723  store i8 %r, i8 addrspace(1)* %out
724  ret void
725}
726
; Unsigned <4 x i32> division test.
; The opt run's assertions (prefix CHECK) expect the vector udiv to be
; scalarized: for each of the four lanes the operands are extracted, a
; reciprocal estimate of the divisor is built (uitofp / rcp / fmul by the
; float constant 0x41EFFFFFC0000000 / fptoui) and refined once through a
; 64-bit multiply whose high half is added back. The quotient estimate is the
; high 32 bits of x * reciprocal, followed by two conditional corrections
; (icmp uge + select) that bump the quotient by 1 and reduce the running
; remainder. Each lane result is reassembled with insertelement, starting
; from undef.
; The llc run's assertions (prefix GCN) pin the instruction sequence emitted
; for the same function. Both sets are autogenerated (see the NOTE lines at
; the top of the file); regenerate them with the update scripts instead of
; editing them by hand.
727define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
728; CHECK-LABEL: @udiv_v4i32(
729; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
730; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
731; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
732; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
733; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
734; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
735; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[TMP2]]
736; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
737; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
738; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
739; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
740; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
741; CHECK-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
742; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
743; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
744; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
745; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
746; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
747; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
748; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
749; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
750; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
751; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
752; CHECK-NEXT:    [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
753; CHECK-NEXT:    [[TMP25:%.*]] = add i32 [[TMP21]], 1
754; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]]
755; CHECK-NEXT:    [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]]
756; CHECK-NEXT:    [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]]
757; CHECK-NEXT:    [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]]
758; CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[TMP26]], 1
759; CHECK-NEXT:    [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]]
760; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <4 x i32> undef, i32 [[TMP31]], i64 0
761; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <4 x i32> [[X]], i64 1
762; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <4 x i32> [[Y]], i64 1
763; CHECK-NEXT:    [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float
764; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
765; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000
766; CHECK-NEXT:    [[TMP38:%.*]] = fptoui float [[TMP37]] to i32
767; CHECK-NEXT:    [[TMP39:%.*]] = sub i32 0, [[TMP34]]
768; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]]
769; CHECK-NEXT:    [[TMP41:%.*]] = zext i32 [[TMP38]] to i64
770; CHECK-NEXT:    [[TMP42:%.*]] = zext i32 [[TMP40]] to i64
771; CHECK-NEXT:    [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]]
772; CHECK-NEXT:    [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
773; CHECK-NEXT:    [[TMP45:%.*]] = lshr i64 [[TMP43]], 32
774; CHECK-NEXT:    [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32
775; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]]
776; CHECK-NEXT:    [[TMP48:%.*]] = zext i32 [[TMP33]] to i64
777; CHECK-NEXT:    [[TMP49:%.*]] = zext i32 [[TMP47]] to i64
778; CHECK-NEXT:    [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]]
779; CHECK-NEXT:    [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
780; CHECK-NEXT:    [[TMP52:%.*]] = lshr i64 [[TMP50]], 32
781; CHECK-NEXT:    [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32
782; CHECK-NEXT:    [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]]
783; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]]
784; CHECK-NEXT:    [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]]
785; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP53]], 1
786; CHECK-NEXT:    [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]]
787; CHECK-NEXT:    [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]]
788; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]]
789; CHECK-NEXT:    [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]]
790; CHECK-NEXT:    [[TMP62:%.*]] = add i32 [[TMP58]], 1
791; CHECK-NEXT:    [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]]
792; CHECK-NEXT:    [[TMP64:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP63]], i64 1
793; CHECK-NEXT:    [[TMP65:%.*]] = extractelement <4 x i32> [[X]], i64 2
794; CHECK-NEXT:    [[TMP66:%.*]] = extractelement <4 x i32> [[Y]], i64 2
795; CHECK-NEXT:    [[TMP67:%.*]] = uitofp i32 [[TMP66]] to float
796; CHECK-NEXT:    [[TMP68:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP67]])
797; CHECK-NEXT:    [[TMP69:%.*]] = fmul fast float [[TMP68]], 0x41EFFFFFC0000000
798; CHECK-NEXT:    [[TMP70:%.*]] = fptoui float [[TMP69]] to i32
799; CHECK-NEXT:    [[TMP71:%.*]] = sub i32 0, [[TMP66]]
800; CHECK-NEXT:    [[TMP72:%.*]] = mul i32 [[TMP71]], [[TMP70]]
801; CHECK-NEXT:    [[TMP73:%.*]] = zext i32 [[TMP70]] to i64
802; CHECK-NEXT:    [[TMP74:%.*]] = zext i32 [[TMP72]] to i64
803; CHECK-NEXT:    [[TMP75:%.*]] = mul i64 [[TMP73]], [[TMP74]]
804; CHECK-NEXT:    [[TMP76:%.*]] = trunc i64 [[TMP75]] to i32
805; CHECK-NEXT:    [[TMP77:%.*]] = lshr i64 [[TMP75]], 32
806; CHECK-NEXT:    [[TMP78:%.*]] = trunc i64 [[TMP77]] to i32
807; CHECK-NEXT:    [[TMP79:%.*]] = add i32 [[TMP70]], [[TMP78]]
808; CHECK-NEXT:    [[TMP80:%.*]] = zext i32 [[TMP65]] to i64
809; CHECK-NEXT:    [[TMP81:%.*]] = zext i32 [[TMP79]] to i64
810; CHECK-NEXT:    [[TMP82:%.*]] = mul i64 [[TMP80]], [[TMP81]]
811; CHECK-NEXT:    [[TMP83:%.*]] = trunc i64 [[TMP82]] to i32
812; CHECK-NEXT:    [[TMP84:%.*]] = lshr i64 [[TMP82]], 32
813; CHECK-NEXT:    [[TMP85:%.*]] = trunc i64 [[TMP84]] to i32
814; CHECK-NEXT:    [[TMP86:%.*]] = mul i32 [[TMP85]], [[TMP66]]
815; CHECK-NEXT:    [[TMP87:%.*]] = sub i32 [[TMP65]], [[TMP86]]
816; CHECK-NEXT:    [[TMP88:%.*]] = icmp uge i32 [[TMP87]], [[TMP66]]
817; CHECK-NEXT:    [[TMP89:%.*]] = add i32 [[TMP85]], 1
818; CHECK-NEXT:    [[TMP90:%.*]] = select i1 [[TMP88]], i32 [[TMP89]], i32 [[TMP85]]
819; CHECK-NEXT:    [[TMP91:%.*]] = sub i32 [[TMP87]], [[TMP66]]
820; CHECK-NEXT:    [[TMP92:%.*]] = select i1 [[TMP88]], i32 [[TMP91]], i32 [[TMP87]]
821; CHECK-NEXT:    [[TMP93:%.*]] = icmp uge i32 [[TMP92]], [[TMP66]]
822; CHECK-NEXT:    [[TMP94:%.*]] = add i32 [[TMP90]], 1
823; CHECK-NEXT:    [[TMP95:%.*]] = select i1 [[TMP93]], i32 [[TMP94]], i32 [[TMP90]]
824; CHECK-NEXT:    [[TMP96:%.*]] = insertelement <4 x i32> [[TMP64]], i32 [[TMP95]], i64 2
825; CHECK-NEXT:    [[TMP97:%.*]] = extractelement <4 x i32> [[X]], i64 3
826; CHECK-NEXT:    [[TMP98:%.*]] = extractelement <4 x i32> [[Y]], i64 3
827; CHECK-NEXT:    [[TMP99:%.*]] = uitofp i32 [[TMP98]] to float
828; CHECK-NEXT:    [[TMP100:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP99]])
829; CHECK-NEXT:    [[TMP101:%.*]] = fmul fast float [[TMP100]], 0x41EFFFFFC0000000
830; CHECK-NEXT:    [[TMP102:%.*]] = fptoui float [[TMP101]] to i32
831; CHECK-NEXT:    [[TMP103:%.*]] = sub i32 0, [[TMP98]]
832; CHECK-NEXT:    [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP102]]
833; CHECK-NEXT:    [[TMP105:%.*]] = zext i32 [[TMP102]] to i64
834; CHECK-NEXT:    [[TMP106:%.*]] = zext i32 [[TMP104]] to i64
835; CHECK-NEXT:    [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]]
836; CHECK-NEXT:    [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32
837; CHECK-NEXT:    [[TMP109:%.*]] = lshr i64 [[TMP107]], 32
838; CHECK-NEXT:    [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32
839; CHECK-NEXT:    [[TMP111:%.*]] = add i32 [[TMP102]], [[TMP110]]
840; CHECK-NEXT:    [[TMP112:%.*]] = zext i32 [[TMP97]] to i64
841; CHECK-NEXT:    [[TMP113:%.*]] = zext i32 [[TMP111]] to i64
842; CHECK-NEXT:    [[TMP114:%.*]] = mul i64 [[TMP112]], [[TMP113]]
843; CHECK-NEXT:    [[TMP115:%.*]] = trunc i64 [[TMP114]] to i32
844; CHECK-NEXT:    [[TMP116:%.*]] = lshr i64 [[TMP114]], 32
845; CHECK-NEXT:    [[TMP117:%.*]] = trunc i64 [[TMP116]] to i32
846; CHECK-NEXT:    [[TMP118:%.*]] = mul i32 [[TMP117]], [[TMP98]]
847; CHECK-NEXT:    [[TMP119:%.*]] = sub i32 [[TMP97]], [[TMP118]]
848; CHECK-NEXT:    [[TMP120:%.*]] = icmp uge i32 [[TMP119]], [[TMP98]]
849; CHECK-NEXT:    [[TMP121:%.*]] = add i32 [[TMP117]], 1
850; CHECK-NEXT:    [[TMP122:%.*]] = select i1 [[TMP120]], i32 [[TMP121]], i32 [[TMP117]]
851; CHECK-NEXT:    [[TMP123:%.*]] = sub i32 [[TMP119]], [[TMP98]]
852; CHECK-NEXT:    [[TMP124:%.*]] = select i1 [[TMP120]], i32 [[TMP123]], i32 [[TMP119]]
853; CHECK-NEXT:    [[TMP125:%.*]] = icmp uge i32 [[TMP124]], [[TMP98]]
854; CHECK-NEXT:    [[TMP126:%.*]] = add i32 [[TMP122]], 1
855; CHECK-NEXT:    [[TMP127:%.*]] = select i1 [[TMP125]], i32 [[TMP126]], i32 [[TMP122]]
856; CHECK-NEXT:    [[TMP128:%.*]] = insertelement <4 x i32> [[TMP96]], i32 [[TMP127]], i64 3
857; CHECK-NEXT:    store <4 x i32> [[TMP128]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16
858; CHECK-NEXT:    ret void
859;
860; GCN-LABEL: udiv_v4i32:
861; GCN:       ; %bb.0:
862; GCN-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
863; GCN-NEXT:    s_mov_b32 s12, 0x4f7ffffe
864; GCN-NEXT:    s_mov_b32 s15, 0xf000
865; GCN-NEXT:    s_mov_b32 s14, -1
866; GCN-NEXT:    s_waitcnt lgkmcnt(0)
867; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s8
868; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s9
869; GCN-NEXT:    s_sub_i32 s2, 0, s8
870; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s10
871; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
872; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
873; GCN-NEXT:    v_mul_f32_e32 v0, s12, v0
874; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
875; GCN-NEXT:    v_mul_f32_e32 v1, s12, v1
876; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
877; GCN-NEXT:    v_mul_lo_u32 v2, s2, v0
878; GCN-NEXT:    s_sub_i32 s2, 0, s9
879; GCN-NEXT:    v_mul_lo_u32 v4, s2, v1
880; GCN-NEXT:    v_mul_hi_u32 v2, v0, v2
881; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
882; GCN-NEXT:    v_mul_hi_u32 v0, s4, v0
883; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v3
884; GCN-NEXT:    v_mul_hi_u32 v3, v1, v4
885; GCN-NEXT:    v_mul_lo_u32 v4, v0, s8
886; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v0
887; GCN-NEXT:    v_mul_f32_e32 v2, s12, v2
888; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
889; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s4, v4
890; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s8, v4
891; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v5, s[2:3]
892; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s8, v4
893; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[2:3]
894; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v0
895; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v4
896; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
897; GCN-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
898; GCN-NEXT:    v_mul_hi_u32 v1, s5, v1
899; GCN-NEXT:    s_sub_i32 s4, 0, s10
900; GCN-NEXT:    v_mul_lo_u32 v5, s4, v2
901; GCN-NEXT:    s_sub_i32 s4, 0, s11
902; GCN-NEXT:    v_mul_lo_u32 v3, v1, s9
903; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
904; GCN-NEXT:    v_mul_hi_u32 v5, v2, v5
905; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s5, v3
906; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s9, v3
907; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[2:3]
908; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v3
909; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[2:3]
910; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
911; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
912; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
913; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s11
914; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
915; GCN-NEXT:    v_mul_hi_u32 v2, s6, v2
916; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v4
917; GCN-NEXT:    v_mul_lo_u32 v3, v2, s10
918; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v2
919; GCN-NEXT:    v_mul_f32_e32 v4, s12, v4
920; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v4
921; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s6, v3
922; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s10, v3
923; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[2:3]
924; GCN-NEXT:    v_mul_lo_u32 v6, s4, v4
925; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s10, v3
926; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[2:3]
927; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v2
928; GCN-NEXT:    v_mul_hi_u32 v6, v4, v6
929; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
930; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
931; GCN-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
932; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v4
933; GCN-NEXT:    v_mul_hi_u32 v3, s7, v3
934; GCN-NEXT:    v_mul_lo_u32 v4, v3, s11
935; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v3
936; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s7, v4
937; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v4
938; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
939; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s11, v4
940; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[0:1]
941; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v3
942; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s11, v4
943; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
944; GCN-NEXT:    s_waitcnt lgkmcnt(0)
945; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
946; GCN-NEXT:    s_endpgm
; Input IR under test: a single <4 x i32> udiv whose result is stored to %out.
947  %r = udiv <4 x i32> %x, %y
948  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
949  ret void
950}
951
; Unsigned <4 x i32> remainder test.
; The opt run's assertions (prefix CHECK) expect the vector urem to be
; scalarized: for each of the four lanes the operands are extracted, a
; reciprocal estimate of the divisor is built (uitofp / rcp / fmul by the
; float constant 0x41EFFFFFC0000000 / fptoui) and refined once through a
; 64-bit multiply whose high half is added back. The quotient estimate (high
; 32 bits of x * reciprocal) is multiplied by the divisor and subtracted from
; x, and the resulting remainder is conditionally reduced by the divisor
; twice (icmp uge + sub + select). Each lane result is reassembled with
; insertelement, starting from undef.
; The llc run's assertions (prefix GCN) pin the instruction sequence emitted
; for the same function. Both sets are autogenerated (see the NOTE lines at
; the top of the file); regenerate them with the update scripts instead of
; editing them by hand.
952define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
953; CHECK-LABEL: @urem_v4i32(
954; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
955; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
956; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
957; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
958; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
959; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
960; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[TMP2]]
961; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
962; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
963; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
964; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
965; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
966; CHECK-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
967; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
968; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
969; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
970; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
971; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
972; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
973; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
974; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
975; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
976; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
977; CHECK-NEXT:    [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
978; CHECK-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]]
979; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]]
980; CHECK-NEXT:    [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]]
981; CHECK-NEXT:    [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]]
982; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]]
983; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <4 x i32> undef, i32 [[TMP29]], i64 0
984; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <4 x i32> [[X]], i64 1
985; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <4 x i32> [[Y]], i64 1
986; CHECK-NEXT:    [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float
987; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
988; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000
989; CHECK-NEXT:    [[TMP36:%.*]] = fptoui float [[TMP35]] to i32
990; CHECK-NEXT:    [[TMP37:%.*]] = sub i32 0, [[TMP32]]
991; CHECK-NEXT:    [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]]
992; CHECK-NEXT:    [[TMP39:%.*]] = zext i32 [[TMP36]] to i64
993; CHECK-NEXT:    [[TMP40:%.*]] = zext i32 [[TMP38]] to i64
994; CHECK-NEXT:    [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]]
995; CHECK-NEXT:    [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32
996; CHECK-NEXT:    [[TMP43:%.*]] = lshr i64 [[TMP41]], 32
997; CHECK-NEXT:    [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
998; CHECK-NEXT:    [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]]
999; CHECK-NEXT:    [[TMP46:%.*]] = zext i32 [[TMP31]] to i64
1000; CHECK-NEXT:    [[TMP47:%.*]] = zext i32 [[TMP45]] to i64
1001; CHECK-NEXT:    [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]]
1002; CHECK-NEXT:    [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32
1003; CHECK-NEXT:    [[TMP50:%.*]] = lshr i64 [[TMP48]], 32
1004; CHECK-NEXT:    [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
1005; CHECK-NEXT:    [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]]
1006; CHECK-NEXT:    [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]]
1007; CHECK-NEXT:    [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]]
1008; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]]
1009; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]]
1010; CHECK-NEXT:    [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]]
1011; CHECK-NEXT:    [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]]
1012; CHECK-NEXT:    [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]]
1013; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP59]], i64 1
1014; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <4 x i32> [[X]], i64 2
1015; CHECK-NEXT:    [[TMP62:%.*]] = extractelement <4 x i32> [[Y]], i64 2
1016; CHECK-NEXT:    [[TMP63:%.*]] = uitofp i32 [[TMP62]] to float
1017; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP63]])
1018; CHECK-NEXT:    [[TMP65:%.*]] = fmul fast float [[TMP64]], 0x41EFFFFFC0000000
1019; CHECK-NEXT:    [[TMP66:%.*]] = fptoui float [[TMP65]] to i32
1020; CHECK-NEXT:    [[TMP67:%.*]] = sub i32 0, [[TMP62]]
1021; CHECK-NEXT:    [[TMP68:%.*]] = mul i32 [[TMP67]], [[TMP66]]
1022; CHECK-NEXT:    [[TMP69:%.*]] = zext i32 [[TMP66]] to i64
1023; CHECK-NEXT:    [[TMP70:%.*]] = zext i32 [[TMP68]] to i64
1024; CHECK-NEXT:    [[TMP71:%.*]] = mul i64 [[TMP69]], [[TMP70]]
1025; CHECK-NEXT:    [[TMP72:%.*]] = trunc i64 [[TMP71]] to i32
1026; CHECK-NEXT:    [[TMP73:%.*]] = lshr i64 [[TMP71]], 32
1027; CHECK-NEXT:    [[TMP74:%.*]] = trunc i64 [[TMP73]] to i32
1028; CHECK-NEXT:    [[TMP75:%.*]] = add i32 [[TMP66]], [[TMP74]]
1029; CHECK-NEXT:    [[TMP76:%.*]] = zext i32 [[TMP61]] to i64
1030; CHECK-NEXT:    [[TMP77:%.*]] = zext i32 [[TMP75]] to i64
1031; CHECK-NEXT:    [[TMP78:%.*]] = mul i64 [[TMP76]], [[TMP77]]
1032; CHECK-NEXT:    [[TMP79:%.*]] = trunc i64 [[TMP78]] to i32
1033; CHECK-NEXT:    [[TMP80:%.*]] = lshr i64 [[TMP78]], 32
1034; CHECK-NEXT:    [[TMP81:%.*]] = trunc i64 [[TMP80]] to i32
1035; CHECK-NEXT:    [[TMP82:%.*]] = mul i32 [[TMP81]], [[TMP62]]
1036; CHECK-NEXT:    [[TMP83:%.*]] = sub i32 [[TMP61]], [[TMP82]]
1037; CHECK-NEXT:    [[TMP84:%.*]] = icmp uge i32 [[TMP83]], [[TMP62]]
1038; CHECK-NEXT:    [[TMP85:%.*]] = sub i32 [[TMP83]], [[TMP62]]
1039; CHECK-NEXT:    [[TMP86:%.*]] = select i1 [[TMP84]], i32 [[TMP85]], i32 [[TMP83]]
1040; CHECK-NEXT:    [[TMP87:%.*]] = icmp uge i32 [[TMP86]], [[TMP62]]
1041; CHECK-NEXT:    [[TMP88:%.*]] = sub i32 [[TMP86]], [[TMP62]]
1042; CHECK-NEXT:    [[TMP89:%.*]] = select i1 [[TMP87]], i32 [[TMP88]], i32 [[TMP86]]
1043; CHECK-NEXT:    [[TMP90:%.*]] = insertelement <4 x i32> [[TMP60]], i32 [[TMP89]], i64 2
1044; CHECK-NEXT:    [[TMP91:%.*]] = extractelement <4 x i32> [[X]], i64 3
1045; CHECK-NEXT:    [[TMP92:%.*]] = extractelement <4 x i32> [[Y]], i64 3
1046; CHECK-NEXT:    [[TMP93:%.*]] = uitofp i32 [[TMP92]] to float
1047; CHECK-NEXT:    [[TMP94:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP93]])
1048; CHECK-NEXT:    [[TMP95:%.*]] = fmul fast float [[TMP94]], 0x41EFFFFFC0000000
1049; CHECK-NEXT:    [[TMP96:%.*]] = fptoui float [[TMP95]] to i32
1050; CHECK-NEXT:    [[TMP97:%.*]] = sub i32 0, [[TMP92]]
1051; CHECK-NEXT:    [[TMP98:%.*]] = mul i32 [[TMP97]], [[TMP96]]
1052; CHECK-NEXT:    [[TMP99:%.*]] = zext i32 [[TMP96]] to i64
1053; CHECK-NEXT:    [[TMP100:%.*]] = zext i32 [[TMP98]] to i64
1054; CHECK-NEXT:    [[TMP101:%.*]] = mul i64 [[TMP99]], [[TMP100]]
1055; CHECK-NEXT:    [[TMP102:%.*]] = trunc i64 [[TMP101]] to i32
1056; CHECK-NEXT:    [[TMP103:%.*]] = lshr i64 [[TMP101]], 32
1057; CHECK-NEXT:    [[TMP104:%.*]] = trunc i64 [[TMP103]] to i32
1058; CHECK-NEXT:    [[TMP105:%.*]] = add i32 [[TMP96]], [[TMP104]]
1059; CHECK-NEXT:    [[TMP106:%.*]] = zext i32 [[TMP91]] to i64
1060; CHECK-NEXT:    [[TMP107:%.*]] = zext i32 [[TMP105]] to i64
1061; CHECK-NEXT:    [[TMP108:%.*]] = mul i64 [[TMP106]], [[TMP107]]
1062; CHECK-NEXT:    [[TMP109:%.*]] = trunc i64 [[TMP108]] to i32
1063; CHECK-NEXT:    [[TMP110:%.*]] = lshr i64 [[TMP108]], 32
1064; CHECK-NEXT:    [[TMP111:%.*]] = trunc i64 [[TMP110]] to i32
1065; CHECK-NEXT:    [[TMP112:%.*]] = mul i32 [[TMP111]], [[TMP92]]
1066; CHECK-NEXT:    [[TMP113:%.*]] = sub i32 [[TMP91]], [[TMP112]]
1067; CHECK-NEXT:    [[TMP114:%.*]] = icmp uge i32 [[TMP113]], [[TMP92]]
1068; CHECK-NEXT:    [[TMP115:%.*]] = sub i32 [[TMP113]], [[TMP92]]
1069; CHECK-NEXT:    [[TMP116:%.*]] = select i1 [[TMP114]], i32 [[TMP115]], i32 [[TMP113]]
1070; CHECK-NEXT:    [[TMP117:%.*]] = icmp uge i32 [[TMP116]], [[TMP92]]
1071; CHECK-NEXT:    [[TMP118:%.*]] = sub i32 [[TMP116]], [[TMP92]]
1072; CHECK-NEXT:    [[TMP119:%.*]] = select i1 [[TMP117]], i32 [[TMP118]], i32 [[TMP116]]
1073; CHECK-NEXT:    [[TMP120:%.*]] = insertelement <4 x i32> [[TMP90]], i32 [[TMP119]], i64 3
1074; CHECK-NEXT:    store <4 x i32> [[TMP120]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16
1075; CHECK-NEXT:    ret void
1076;
1077; GCN-LABEL: urem_v4i32:
1078; GCN:       ; %bb.0:
1079; GCN-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
1080; GCN-NEXT:    s_mov_b32 s12, 0x4f7ffffe
1081; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1082; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1083; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s8
1084; GCN-NEXT:    s_sub_i32 s2, 0, s8
1085; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s9
1086; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s11
1087; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1088; GCN-NEXT:    s_sub_i32 s3, 0, s9
1089; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1090; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s10
1091; GCN-NEXT:    v_mul_f32_e32 v0, s12, v0
1092; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
1093; GCN-NEXT:    v_mul_f32_e32 v1, s12, v1
1094; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
1095; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v2
1096; GCN-NEXT:    v_mul_lo_u32 v3, s2, v0
1097; GCN-NEXT:    s_sub_i32 s2, 0, s10
1098; GCN-NEXT:    v_mul_f32_e32 v2, s12, v2
1099; GCN-NEXT:    v_mul_hi_u32 v3, v0, v3
1100; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
1101; GCN-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
1102; GCN-NEXT:    v_mul_hi_u32 v0, s4, v0
1103; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v4
1104; GCN-NEXT:    v_mul_lo_u32 v4, s3, v1
1105; GCN-NEXT:    s_mov_b32 s3, 0xf000
1106; GCN-NEXT:    v_mul_lo_u32 v0, v0, s8
1107; GCN-NEXT:    v_mul_f32_e32 v3, s12, v3
1108; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
1109; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
1110; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
1111; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s8, v0
1112; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
1113; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
1114; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s8, v0
1115; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
1116; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
1117; GCN-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
1118; GCN-NEXT:    v_mul_hi_u32 v1, s5, v1
1119; GCN-NEXT:    v_mul_lo_u32 v4, s2, v2
1120; GCN-NEXT:    s_sub_i32 s2, 0, s11
1121; GCN-NEXT:    v_mul_lo_u32 v1, v1, s9
1122; GCN-NEXT:    v_mul_hi_u32 v4, v2, v4
1123; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
1124; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s9, v1
1125; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
1126; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
1127; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s9, v1
1128; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
1129; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
1130; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
1131; GCN-NEXT:    v_mul_hi_u32 v2, s6, v2
1132; GCN-NEXT:    v_mul_lo_u32 v4, s2, v3
1133; GCN-NEXT:    s_mov_b32 s2, -1
1134; GCN-NEXT:    v_mul_lo_u32 v2, v2, s10
1135; GCN-NEXT:    v_mul_hi_u32 v4, v3, v4
1136; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
1137; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s10, v2
1138; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
1139; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
1140; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s10, v2
1141; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
1142; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
1143; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
1144; GCN-NEXT:    v_mul_hi_u32 v3, s7, v3
1145; GCN-NEXT:    v_mul_lo_u32 v3, v3, s11
1146; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s7, v3
1147; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s11, v3
1148; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
1149; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
1150; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s11, v3
1151; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
1152; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
1153; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1154; GCN-NEXT:    s_endpgm
; Input IR under test: a single <4 x i32> urem whose result is stored to %out.
1155  %r = urem <4 x i32> %x, %y
1156  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
1157  ret void
1158}
1159
; Test: signed division of two <4 x i32> kernel arguments, result stored to %out.
;
; The 'CHECK' assertions (output of the opt invocation at the top of the file)
; expect the vector sdiv to be scalarized. For each of the four lanes:
;   * both operands are extracted and their sign masks taken with `ashr ..., 31`;
;   * the masks are folded in with add/xor before the unsigned expansion
;     (presumably yielding absolute values -- confirm against the pass);
;   * a reciprocal estimate is built from uitofp + llvm.amdgcn.rcp.f32, scaled
;     by 0x41EFFFFFC0000000 and converted back with fptoui, then refined once
;     using the high half of a 64-bit multiply;
;   * the quotient estimate is corrected by two compare/select steps;
;   * the result is xor'ed with / reduced by the xor of both sign masks and
;     inserted into the result vector.
; The 'GCN' assertions (llc output) pin the final instruction sequence, which
; ends in a single buffer_store_dwordx4 of v[0:3].
;
; All assertions in this function are autogenerated (see the notes on the first
; lines of the file); regenerate them with the update scripts rather than
; editing them by hand.
1160define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
1161; CHECK-LABEL: @sdiv_v4i32(
1162; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
1163; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
1164; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
1165; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
1166; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
1167; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]]
1168; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]]
1169; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]]
1170; CHECK-NEXT:    [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]]
1171; CHECK-NEXT:    [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float
1172; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]])
1173; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000
1174; CHECK-NEXT:    [[TMP13:%.*]] = fptoui float [[TMP12]] to i32
1175; CHECK-NEXT:    [[TMP14:%.*]] = sub i32 0, [[TMP9]]
1176; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]]
1177; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP13]] to i64
1178; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
1179; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
1180; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
1181; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
1182; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
1183; CHECK-NEXT:    [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]]
1184; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP8]] to i64
1185; CHECK-NEXT:    [[TMP24:%.*]] = zext i32 [[TMP22]] to i64
1186; CHECK-NEXT:    [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]]
1187; CHECK-NEXT:    [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
1188; CHECK-NEXT:    [[TMP27:%.*]] = lshr i64 [[TMP25]], 32
1189; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
1190; CHECK-NEXT:    [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]]
1191; CHECK-NEXT:    [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]]
1192; CHECK-NEXT:    [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]]
1193; CHECK-NEXT:    [[TMP32:%.*]] = add i32 [[TMP28]], 1
1194; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]]
1195; CHECK-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]]
1196; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]]
1197; CHECK-NEXT:    [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]]
1198; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP33]], 1
1199; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]]
1200; CHECK-NEXT:    [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]]
1201; CHECK-NEXT:    [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]]
1202; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <4 x i32> undef, i32 [[TMP40]], i64 0
1203; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <4 x i32> [[X]], i64 1
1204; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <4 x i32> [[Y]], i64 1
1205; CHECK-NEXT:    [[TMP44:%.*]] = ashr i32 [[TMP42]], 31
1206; CHECK-NEXT:    [[TMP45:%.*]] = ashr i32 [[TMP43]], 31
1207; CHECK-NEXT:    [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]]
1208; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]]
1209; CHECK-NEXT:    [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]]
1210; CHECK-NEXT:    [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]]
1211; CHECK-NEXT:    [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]]
1212; CHECK-NEXT:    [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float
1213; CHECK-NEXT:    [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]])
1214; CHECK-NEXT:    [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000
1215; CHECK-NEXT:    [[TMP54:%.*]] = fptoui float [[TMP53]] to i32
1216; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 0, [[TMP50]]
1217; CHECK-NEXT:    [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]]
1218; CHECK-NEXT:    [[TMP57:%.*]] = zext i32 [[TMP54]] to i64
1219; CHECK-NEXT:    [[TMP58:%.*]] = zext i32 [[TMP56]] to i64
1220; CHECK-NEXT:    [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]]
1221; CHECK-NEXT:    [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32
1222; CHECK-NEXT:    [[TMP61:%.*]] = lshr i64 [[TMP59]], 32
1223; CHECK-NEXT:    [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32
1224; CHECK-NEXT:    [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]]
1225; CHECK-NEXT:    [[TMP64:%.*]] = zext i32 [[TMP49]] to i64
1226; CHECK-NEXT:    [[TMP65:%.*]] = zext i32 [[TMP63]] to i64
1227; CHECK-NEXT:    [[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]]
1228; CHECK-NEXT:    [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32
1229; CHECK-NEXT:    [[TMP68:%.*]] = lshr i64 [[TMP66]], 32
1230; CHECK-NEXT:    [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32
1231; CHECK-NEXT:    [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]]
1232; CHECK-NEXT:    [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]]
1233; CHECK-NEXT:    [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]]
1234; CHECK-NEXT:    [[TMP73:%.*]] = add i32 [[TMP69]], 1
1235; CHECK-NEXT:    [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]]
1236; CHECK-NEXT:    [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]]
1237; CHECK-NEXT:    [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]]
1238; CHECK-NEXT:    [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]]
1239; CHECK-NEXT:    [[TMP78:%.*]] = add i32 [[TMP74]], 1
1240; CHECK-NEXT:    [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]]
1241; CHECK-NEXT:    [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]]
1242; CHECK-NEXT:    [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]]
1243; CHECK-NEXT:    [[TMP82:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[TMP81]], i64 1
1244; CHECK-NEXT:    [[TMP83:%.*]] = extractelement <4 x i32> [[X]], i64 2
1245; CHECK-NEXT:    [[TMP84:%.*]] = extractelement <4 x i32> [[Y]], i64 2
1246; CHECK-NEXT:    [[TMP85:%.*]] = ashr i32 [[TMP83]], 31
1247; CHECK-NEXT:    [[TMP86:%.*]] = ashr i32 [[TMP84]], 31
1248; CHECK-NEXT:    [[TMP87:%.*]] = xor i32 [[TMP85]], [[TMP86]]
1249; CHECK-NEXT:    [[TMP88:%.*]] = add i32 [[TMP83]], [[TMP85]]
1250; CHECK-NEXT:    [[TMP89:%.*]] = add i32 [[TMP84]], [[TMP86]]
1251; CHECK-NEXT:    [[TMP90:%.*]] = xor i32 [[TMP88]], [[TMP85]]
1252; CHECK-NEXT:    [[TMP91:%.*]] = xor i32 [[TMP89]], [[TMP86]]
1253; CHECK-NEXT:    [[TMP92:%.*]] = uitofp i32 [[TMP91]] to float
1254; CHECK-NEXT:    [[TMP93:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP92]])
1255; CHECK-NEXT:    [[TMP94:%.*]] = fmul fast float [[TMP93]], 0x41EFFFFFC0000000
1256; CHECK-NEXT:    [[TMP95:%.*]] = fptoui float [[TMP94]] to i32
1257; CHECK-NEXT:    [[TMP96:%.*]] = sub i32 0, [[TMP91]]
1258; CHECK-NEXT:    [[TMP97:%.*]] = mul i32 [[TMP96]], [[TMP95]]
1259; CHECK-NEXT:    [[TMP98:%.*]] = zext i32 [[TMP95]] to i64
1260; CHECK-NEXT:    [[TMP99:%.*]] = zext i32 [[TMP97]] to i64
1261; CHECK-NEXT:    [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]]
1262; CHECK-NEXT:    [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32
1263; CHECK-NEXT:    [[TMP102:%.*]] = lshr i64 [[TMP100]], 32
1264; CHECK-NEXT:    [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32
1265; CHECK-NEXT:    [[TMP104:%.*]] = add i32 [[TMP95]], [[TMP103]]
1266; CHECK-NEXT:    [[TMP105:%.*]] = zext i32 [[TMP90]] to i64
1267; CHECK-NEXT:    [[TMP106:%.*]] = zext i32 [[TMP104]] to i64
1268; CHECK-NEXT:    [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]]
1269; CHECK-NEXT:    [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32
1270; CHECK-NEXT:    [[TMP109:%.*]] = lshr i64 [[TMP107]], 32
1271; CHECK-NEXT:    [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32
1272; CHECK-NEXT:    [[TMP111:%.*]] = mul i32 [[TMP110]], [[TMP91]]
1273; CHECK-NEXT:    [[TMP112:%.*]] = sub i32 [[TMP90]], [[TMP111]]
1274; CHECK-NEXT:    [[TMP113:%.*]] = icmp uge i32 [[TMP112]], [[TMP91]]
1275; CHECK-NEXT:    [[TMP114:%.*]] = add i32 [[TMP110]], 1
1276; CHECK-NEXT:    [[TMP115:%.*]] = select i1 [[TMP113]], i32 [[TMP114]], i32 [[TMP110]]
1277; CHECK-NEXT:    [[TMP116:%.*]] = sub i32 [[TMP112]], [[TMP91]]
1278; CHECK-NEXT:    [[TMP117:%.*]] = select i1 [[TMP113]], i32 [[TMP116]], i32 [[TMP112]]
1279; CHECK-NEXT:    [[TMP118:%.*]] = icmp uge i32 [[TMP117]], [[TMP91]]
1280; CHECK-NEXT:    [[TMP119:%.*]] = add i32 [[TMP115]], 1
1281; CHECK-NEXT:    [[TMP120:%.*]] = select i1 [[TMP118]], i32 [[TMP119]], i32 [[TMP115]]
1282; CHECK-NEXT:    [[TMP121:%.*]] = xor i32 [[TMP120]], [[TMP87]]
1283; CHECK-NEXT:    [[TMP122:%.*]] = sub i32 [[TMP121]], [[TMP87]]
1284; CHECK-NEXT:    [[TMP123:%.*]] = insertelement <4 x i32> [[TMP82]], i32 [[TMP122]], i64 2
1285; CHECK-NEXT:    [[TMP124:%.*]] = extractelement <4 x i32> [[X]], i64 3
1286; CHECK-NEXT:    [[TMP125:%.*]] = extractelement <4 x i32> [[Y]], i64 3
1287; CHECK-NEXT:    [[TMP126:%.*]] = ashr i32 [[TMP124]], 31
1288; CHECK-NEXT:    [[TMP127:%.*]] = ashr i32 [[TMP125]], 31
1289; CHECK-NEXT:    [[TMP128:%.*]] = xor i32 [[TMP126]], [[TMP127]]
1290; CHECK-NEXT:    [[TMP129:%.*]] = add i32 [[TMP124]], [[TMP126]]
1291; CHECK-NEXT:    [[TMP130:%.*]] = add i32 [[TMP125]], [[TMP127]]
1292; CHECK-NEXT:    [[TMP131:%.*]] = xor i32 [[TMP129]], [[TMP126]]
1293; CHECK-NEXT:    [[TMP132:%.*]] = xor i32 [[TMP130]], [[TMP127]]
1294; CHECK-NEXT:    [[TMP133:%.*]] = uitofp i32 [[TMP132]] to float
1295; CHECK-NEXT:    [[TMP134:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP133]])
1296; CHECK-NEXT:    [[TMP135:%.*]] = fmul fast float [[TMP134]], 0x41EFFFFFC0000000
1297; CHECK-NEXT:    [[TMP136:%.*]] = fptoui float [[TMP135]] to i32
1298; CHECK-NEXT:    [[TMP137:%.*]] = sub i32 0, [[TMP132]]
1299; CHECK-NEXT:    [[TMP138:%.*]] = mul i32 [[TMP137]], [[TMP136]]
1300; CHECK-NEXT:    [[TMP139:%.*]] = zext i32 [[TMP136]] to i64
1301; CHECK-NEXT:    [[TMP140:%.*]] = zext i32 [[TMP138]] to i64
1302; CHECK-NEXT:    [[TMP141:%.*]] = mul i64 [[TMP139]], [[TMP140]]
1303; CHECK-NEXT:    [[TMP142:%.*]] = trunc i64 [[TMP141]] to i32
1304; CHECK-NEXT:    [[TMP143:%.*]] = lshr i64 [[TMP141]], 32
1305; CHECK-NEXT:    [[TMP144:%.*]] = trunc i64 [[TMP143]] to i32
1306; CHECK-NEXT:    [[TMP145:%.*]] = add i32 [[TMP136]], [[TMP144]]
1307; CHECK-NEXT:    [[TMP146:%.*]] = zext i32 [[TMP131]] to i64
1308; CHECK-NEXT:    [[TMP147:%.*]] = zext i32 [[TMP145]] to i64
1309; CHECK-NEXT:    [[TMP148:%.*]] = mul i64 [[TMP146]], [[TMP147]]
1310; CHECK-NEXT:    [[TMP149:%.*]] = trunc i64 [[TMP148]] to i32
1311; CHECK-NEXT:    [[TMP150:%.*]] = lshr i64 [[TMP148]], 32
1312; CHECK-NEXT:    [[TMP151:%.*]] = trunc i64 [[TMP150]] to i32
1313; CHECK-NEXT:    [[TMP152:%.*]] = mul i32 [[TMP151]], [[TMP132]]
1314; CHECK-NEXT:    [[TMP153:%.*]] = sub i32 [[TMP131]], [[TMP152]]
1315; CHECK-NEXT:    [[TMP154:%.*]] = icmp uge i32 [[TMP153]], [[TMP132]]
1316; CHECK-NEXT:    [[TMP155:%.*]] = add i32 [[TMP151]], 1
1317; CHECK-NEXT:    [[TMP156:%.*]] = select i1 [[TMP154]], i32 [[TMP155]], i32 [[TMP151]]
1318; CHECK-NEXT:    [[TMP157:%.*]] = sub i32 [[TMP153]], [[TMP132]]
1319; CHECK-NEXT:    [[TMP158:%.*]] = select i1 [[TMP154]], i32 [[TMP157]], i32 [[TMP153]]
1320; CHECK-NEXT:    [[TMP159:%.*]] = icmp uge i32 [[TMP158]], [[TMP132]]
1321; CHECK-NEXT:    [[TMP160:%.*]] = add i32 [[TMP156]], 1
1322; CHECK-NEXT:    [[TMP161:%.*]] = select i1 [[TMP159]], i32 [[TMP160]], i32 [[TMP156]]
1323; CHECK-NEXT:    [[TMP162:%.*]] = xor i32 [[TMP161]], [[TMP128]]
1324; CHECK-NEXT:    [[TMP163:%.*]] = sub i32 [[TMP162]], [[TMP128]]
1325; CHECK-NEXT:    [[TMP164:%.*]] = insertelement <4 x i32> [[TMP123]], i32 [[TMP163]], i64 3
1326; CHECK-NEXT:    store <4 x i32> [[TMP164]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16
1327; CHECK-NEXT:    ret void
1328;
1329; GCN-LABEL: sdiv_v4i32:
1330; GCN:       ; %bb.0:
1331; GCN-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
1332; GCN-NEXT:    s_mov_b32 s16, 0x4f7ffffe
1333; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1334; GCN-NEXT:    s_ashr_i32 s14, s8, 31
1335; GCN-NEXT:    s_add_i32 s2, s8, s14
1336; GCN-NEXT:    s_xor_b32 s12, s2, s14
1337; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s12
1338; GCN-NEXT:    s_ashr_i32 s8, s9, 31
1339; GCN-NEXT:    s_add_i32 s2, s9, s8
1340; GCN-NEXT:    s_xor_b32 s15, s2, s8
1341; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1342; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s15
1343; GCN-NEXT:    s_sub_i32 s3, 0, s12
1344; GCN-NEXT:    s_ashr_i32 s9, s4, 31
1345; GCN-NEXT:    v_mul_f32_e32 v0, s16, v0
1346; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
1347; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1348; GCN-NEXT:    s_add_i32 s2, s4, s9
1349; GCN-NEXT:    s_xor_b32 s2, s2, s9
1350; GCN-NEXT:    v_mul_lo_u32 v2, s3, v0
1351; GCN-NEXT:    v_mul_f32_e32 v1, s16, v1
1352; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
1353; GCN-NEXT:    s_sub_i32 s3, 0, s15
1354; GCN-NEXT:    v_mul_hi_u32 v2, v0, v2
1355; GCN-NEXT:    s_ashr_i32 s4, s5, 31
1356; GCN-NEXT:    v_mul_lo_u32 v3, s3, v1
1357; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
1358; GCN-NEXT:    v_mul_hi_u32 v0, s2, v0
1359; GCN-NEXT:    v_mul_hi_u32 v2, v1, v3
1360; GCN-NEXT:    v_mul_lo_u32 v3, v0, s12
1361; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
1362; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s2, v3
1363; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v3
1364; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[2:3]
1365; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s12, v3
1366; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[2:3]
1367; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
1368; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v3
1369; GCN-NEXT:    s_add_i32 s2, s5, s4
1370; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
1371; GCN-NEXT:    s_xor_b32 s2, s2, s4
1372; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
1373; GCN-NEXT:    v_mul_hi_u32 v1, s2, v1
1374; GCN-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
1375; GCN-NEXT:    s_xor_b32 s0, s9, s14
1376; GCN-NEXT:    v_xor_b32_e32 v0, s0, v0
1377; GCN-NEXT:    v_mul_lo_u32 v2, v1, s15
1378; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
1379; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
1380; GCN-NEXT:    s_ashr_i32 s3, s6, 31
1381; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s2, v2
1382; GCN-NEXT:    s_ashr_i32 s2, s10, 31
1383; GCN-NEXT:    s_add_i32 s0, s10, s2
1384; GCN-NEXT:    s_xor_b32 s5, s0, s2
1385; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s5
1386; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v2
1387; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
1388; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s15, v2
1389; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v3
1390; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
1391; GCN-NEXT:    s_sub_i32 s0, 0, s5
1392; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
1393; GCN-NEXT:    v_mul_f32_e32 v3, s16, v3
1394; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
1395; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s15, v2
1396; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
1397; GCN-NEXT:    s_xor_b32 s1, s4, s8
1398; GCN-NEXT:    v_mul_lo_u32 v5, s0, v3
1399; GCN-NEXT:    s_add_i32 s0, s6, s3
1400; GCN-NEXT:    s_xor_b32 s0, s0, s3
1401; GCN-NEXT:    s_ashr_i32 s4, s11, 31
1402; GCN-NEXT:    v_mul_hi_u32 v2, v3, v5
1403; GCN-NEXT:    v_xor_b32_e32 v1, s1, v1
1404; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s1, v1
1405; GCN-NEXT:    s_xor_b32 s2, s3, s2
1406; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
1407; GCN-NEXT:    v_mul_hi_u32 v2, s0, v2
1408; GCN-NEXT:    s_mov_b32 s15, 0xf000
1409; GCN-NEXT:    s_mov_b32 s14, -1
1410; GCN-NEXT:    v_mul_lo_u32 v3, v2, s5
1411; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v2
1412; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s0, v3
1413; GCN-NEXT:    s_add_i32 s0, s11, s4
1414; GCN-NEXT:    s_xor_b32 s6, s0, s4
1415; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s6
1416; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s5, v3
1417; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
1418; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s5, v3
1419; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v4
1420; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
1421; GCN-NEXT:    s_sub_i32 s0, 0, s6
1422; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v2
1423; GCN-NEXT:    v_mul_f32_e32 v4, s16, v4
1424; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v4
1425; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
1426; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
1427; GCN-NEXT:    v_xor_b32_e32 v2, s2, v2
1428; GCN-NEXT:    v_mul_lo_u32 v6, s0, v4
1429; GCN-NEXT:    s_ashr_i32 s0, s7, 31
1430; GCN-NEXT:    s_add_i32 s1, s7, s0
1431; GCN-NEXT:    s_xor_b32 s1, s1, s0
1432; GCN-NEXT:    v_mul_hi_u32 v3, v4, v6
1433; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s2, v2
1434; GCN-NEXT:    s_xor_b32 s2, s0, s4
1435; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
1436; GCN-NEXT:    v_mul_hi_u32 v3, s1, v3
1437; GCN-NEXT:    v_mul_lo_u32 v4, v3, s6
1438; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v3
1439; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s1, v4
1440; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s6, v4
1441; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
1442; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s6, v4
1443; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[0:1]
1444; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v3
1445; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s6, v4
1446; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
1447; GCN-NEXT:    v_xor_b32_e32 v3, s2, v3
1448; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v3
1449; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1450; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
1451; GCN-NEXT:    s_endpgm
; Input IR under test: one vector sdiv whose result is stored to %out.
1452  %r = sdiv <4 x i32> %x, %y
1453  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
1454  ret void
1455}
1456
; Test: signed remainder of two <4 x i32> kernel arguments, result stored to %out.
;
; The 'CHECK' assertions (output of the opt invocation at the top of the file)
; expect the vector srem to be scalarized. For each of the four lanes:
;   * both operands are extracted and their sign masks taken with `ashr ..., 31`;
;   * the masks are folded in with add/xor before the unsigned expansion
;     (presumably yielding absolute values -- confirm against the pass);
;   * a reciprocal estimate is built from uitofp + llvm.amdgcn.rcp.f32, scaled
;     by 0x41EFFFFFC0000000 and converted back with fptoui, then refined once
;     using the high half of a 64-bit multiply;
;   * the remainder estimate (dividend minus quotient estimate times divisor)
;     is corrected by two compare/subtract/select steps;
;   * the result is xor'ed with / reduced by the sign mask of the dividend
;     lane only, then inserted into the result vector.
; The 'GCN' assertions (llc output) pin the final instruction sequence, which
; ends in a single buffer_store_dwordx4 of v[0:3].
;
; All assertions in this function are autogenerated (see the notes on the first
; lines of the file); regenerate them with the update scripts rather than
; editing them by hand.
1457define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
1458; CHECK-LABEL: @srem_v4i32(
1459; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
1460; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
1461; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
1462; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
1463; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]]
1464; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]]
1465; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]]
1466; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]]
1467; CHECK-NEXT:    [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float
1468; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
1469; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000
1470; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP11]] to i32
1471; CHECK-NEXT:    [[TMP13:%.*]] = sub i32 0, [[TMP8]]
1472; CHECK-NEXT:    [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]]
1473; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP12]] to i64
1474; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP14]] to i64
1475; CHECK-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]]
1476; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
1477; CHECK-NEXT:    [[TMP19:%.*]] = lshr i64 [[TMP17]], 32
1478; CHECK-NEXT:    [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32
1479; CHECK-NEXT:    [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]]
1480; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP7]] to i64
1481; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP21]] to i64
1482; CHECK-NEXT:    [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]]
1483; CHECK-NEXT:    [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
1484; CHECK-NEXT:    [[TMP26:%.*]] = lshr i64 [[TMP24]], 32
1485; CHECK-NEXT:    [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32
1486; CHECK-NEXT:    [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]]
1487; CHECK-NEXT:    [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]]
1488; CHECK-NEXT:    [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]]
1489; CHECK-NEXT:    [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]]
1490; CHECK-NEXT:    [[TMP32:%.*]] = select i1 [[TMP30]], i32 [[TMP31]], i32 [[TMP29]]
1491; CHECK-NEXT:    [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]]
1492; CHECK-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]]
1493; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]]
1494; CHECK-NEXT:    [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]]
1495; CHECK-NEXT:    [[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]]
1496; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i32> undef, i32 [[TMP37]], i64 0
1497; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <4 x i32> [[X]], i64 1
1498; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <4 x i32> [[Y]], i64 1
1499; CHECK-NEXT:    [[TMP41:%.*]] = ashr i32 [[TMP39]], 31
1500; CHECK-NEXT:    [[TMP42:%.*]] = ashr i32 [[TMP40]], 31
1501; CHECK-NEXT:    [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]]
1502; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]]
1503; CHECK-NEXT:    [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]]
1504; CHECK-NEXT:    [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]]
1505; CHECK-NEXT:    [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float
1506; CHECK-NEXT:    [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]])
1507; CHECK-NEXT:    [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000
1508; CHECK-NEXT:    [[TMP50:%.*]] = fptoui float [[TMP49]] to i32
1509; CHECK-NEXT:    [[TMP51:%.*]] = sub i32 0, [[TMP46]]
1510; CHECK-NEXT:    [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]]
1511; CHECK-NEXT:    [[TMP53:%.*]] = zext i32 [[TMP50]] to i64
1512; CHECK-NEXT:    [[TMP54:%.*]] = zext i32 [[TMP52]] to i64
1513; CHECK-NEXT:    [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]]
1514; CHECK-NEXT:    [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32
1515; CHECK-NEXT:    [[TMP57:%.*]] = lshr i64 [[TMP55]], 32
1516; CHECK-NEXT:    [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32
1517; CHECK-NEXT:    [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]]
1518; CHECK-NEXT:    [[TMP60:%.*]] = zext i32 [[TMP45]] to i64
1519; CHECK-NEXT:    [[TMP61:%.*]] = zext i32 [[TMP59]] to i64
1520; CHECK-NEXT:    [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]]
1521; CHECK-NEXT:    [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32
1522; CHECK-NEXT:    [[TMP64:%.*]] = lshr i64 [[TMP62]], 32
1523; CHECK-NEXT:    [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32
1524; CHECK-NEXT:    [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]]
1525; CHECK-NEXT:    [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]]
1526; CHECK-NEXT:    [[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]]
1527; CHECK-NEXT:    [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]]
1528; CHECK-NEXT:    [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]]
1529; CHECK-NEXT:    [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]]
1530; CHECK-NEXT:    [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]]
1531; CHECK-NEXT:    [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]]
1532; CHECK-NEXT:    [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]]
1533; CHECK-NEXT:    [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]]
1534; CHECK-NEXT:    [[TMP76:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP75]], i64 1
1535; CHECK-NEXT:    [[TMP77:%.*]] = extractelement <4 x i32> [[X]], i64 2
1536; CHECK-NEXT:    [[TMP78:%.*]] = extractelement <4 x i32> [[Y]], i64 2
1537; CHECK-NEXT:    [[TMP79:%.*]] = ashr i32 [[TMP77]], 31
1538; CHECK-NEXT:    [[TMP80:%.*]] = ashr i32 [[TMP78]], 31
1539; CHECK-NEXT:    [[TMP81:%.*]] = add i32 [[TMP77]], [[TMP79]]
1540; CHECK-NEXT:    [[TMP82:%.*]] = add i32 [[TMP78]], [[TMP80]]
1541; CHECK-NEXT:    [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP79]]
1542; CHECK-NEXT:    [[TMP84:%.*]] = xor i32 [[TMP82]], [[TMP80]]
1543; CHECK-NEXT:    [[TMP85:%.*]] = uitofp i32 [[TMP84]] to float
1544; CHECK-NEXT:    [[TMP86:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP85]])
1545; CHECK-NEXT:    [[TMP87:%.*]] = fmul fast float [[TMP86]], 0x41EFFFFFC0000000
1546; CHECK-NEXT:    [[TMP88:%.*]] = fptoui float [[TMP87]] to i32
1547; CHECK-NEXT:    [[TMP89:%.*]] = sub i32 0, [[TMP84]]
1548; CHECK-NEXT:    [[TMP90:%.*]] = mul i32 [[TMP89]], [[TMP88]]
1549; CHECK-NEXT:    [[TMP91:%.*]] = zext i32 [[TMP88]] to i64
1550; CHECK-NEXT:    [[TMP92:%.*]] = zext i32 [[TMP90]] to i64
1551; CHECK-NEXT:    [[TMP93:%.*]] = mul i64 [[TMP91]], [[TMP92]]
1552; CHECK-NEXT:    [[TMP94:%.*]] = trunc i64 [[TMP93]] to i32
1553; CHECK-NEXT:    [[TMP95:%.*]] = lshr i64 [[TMP93]], 32
1554; CHECK-NEXT:    [[TMP96:%.*]] = trunc i64 [[TMP95]] to i32
1555; CHECK-NEXT:    [[TMP97:%.*]] = add i32 [[TMP88]], [[TMP96]]
1556; CHECK-NEXT:    [[TMP98:%.*]] = zext i32 [[TMP83]] to i64
1557; CHECK-NEXT:    [[TMP99:%.*]] = zext i32 [[TMP97]] to i64
1558; CHECK-NEXT:    [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]]
1559; CHECK-NEXT:    [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32
1560; CHECK-NEXT:    [[TMP102:%.*]] = lshr i64 [[TMP100]], 32
1561; CHECK-NEXT:    [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32
1562; CHECK-NEXT:    [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP84]]
1563; CHECK-NEXT:    [[TMP105:%.*]] = sub i32 [[TMP83]], [[TMP104]]
1564; CHECK-NEXT:    [[TMP106:%.*]] = icmp uge i32 [[TMP105]], [[TMP84]]
1565; CHECK-NEXT:    [[TMP107:%.*]] = sub i32 [[TMP105]], [[TMP84]]
1566; CHECK-NEXT:    [[TMP108:%.*]] = select i1 [[TMP106]], i32 [[TMP107]], i32 [[TMP105]]
1567; CHECK-NEXT:    [[TMP109:%.*]] = icmp uge i32 [[TMP108]], [[TMP84]]
1568; CHECK-NEXT:    [[TMP110:%.*]] = sub i32 [[TMP108]], [[TMP84]]
1569; CHECK-NEXT:    [[TMP111:%.*]] = select i1 [[TMP109]], i32 [[TMP110]], i32 [[TMP108]]
1570; CHECK-NEXT:    [[TMP112:%.*]] = xor i32 [[TMP111]], [[TMP79]]
1571; CHECK-NEXT:    [[TMP113:%.*]] = sub i32 [[TMP112]], [[TMP79]]
1572; CHECK-NEXT:    [[TMP114:%.*]] = insertelement <4 x i32> [[TMP76]], i32 [[TMP113]], i64 2
1573; CHECK-NEXT:    [[TMP115:%.*]] = extractelement <4 x i32> [[X]], i64 3
1574; CHECK-NEXT:    [[TMP116:%.*]] = extractelement <4 x i32> [[Y]], i64 3
1575; CHECK-NEXT:    [[TMP117:%.*]] = ashr i32 [[TMP115]], 31
1576; CHECK-NEXT:    [[TMP118:%.*]] = ashr i32 [[TMP116]], 31
1577; CHECK-NEXT:    [[TMP119:%.*]] = add i32 [[TMP115]], [[TMP117]]
1578; CHECK-NEXT:    [[TMP120:%.*]] = add i32 [[TMP116]], [[TMP118]]
1579; CHECK-NEXT:    [[TMP121:%.*]] = xor i32 [[TMP119]], [[TMP117]]
1580; CHECK-NEXT:    [[TMP122:%.*]] = xor i32 [[TMP120]], [[TMP118]]
1581; CHECK-NEXT:    [[TMP123:%.*]] = uitofp i32 [[TMP122]] to float
1582; CHECK-NEXT:    [[TMP124:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP123]])
1583; CHECK-NEXT:    [[TMP125:%.*]] = fmul fast float [[TMP124]], 0x41EFFFFFC0000000
1584; CHECK-NEXT:    [[TMP126:%.*]] = fptoui float [[TMP125]] to i32
1585; CHECK-NEXT:    [[TMP127:%.*]] = sub i32 0, [[TMP122]]
1586; CHECK-NEXT:    [[TMP128:%.*]] = mul i32 [[TMP127]], [[TMP126]]
1587; CHECK-NEXT:    [[TMP129:%.*]] = zext i32 [[TMP126]] to i64
1588; CHECK-NEXT:    [[TMP130:%.*]] = zext i32 [[TMP128]] to i64
1589; CHECK-NEXT:    [[TMP131:%.*]] = mul i64 [[TMP129]], [[TMP130]]
1590; CHECK-NEXT:    [[TMP132:%.*]] = trunc i64 [[TMP131]] to i32
1591; CHECK-NEXT:    [[TMP133:%.*]] = lshr i64 [[TMP131]], 32
1592; CHECK-NEXT:    [[TMP134:%.*]] = trunc i64 [[TMP133]] to i32
1593; CHECK-NEXT:    [[TMP135:%.*]] = add i32 [[TMP126]], [[TMP134]]
1594; CHECK-NEXT:    [[TMP136:%.*]] = zext i32 [[TMP121]] to i64
1595; CHECK-NEXT:    [[TMP137:%.*]] = zext i32 [[TMP135]] to i64
1596; CHECK-NEXT:    [[TMP138:%.*]] = mul i64 [[TMP136]], [[TMP137]]
1597; CHECK-NEXT:    [[TMP139:%.*]] = trunc i64 [[TMP138]] to i32
1598; CHECK-NEXT:    [[TMP140:%.*]] = lshr i64 [[TMP138]], 32
1599; CHECK-NEXT:    [[TMP141:%.*]] = trunc i64 [[TMP140]] to i32
1600; CHECK-NEXT:    [[TMP142:%.*]] = mul i32 [[TMP141]], [[TMP122]]
1601; CHECK-NEXT:    [[TMP143:%.*]] = sub i32 [[TMP121]], [[TMP142]]
1602; CHECK-NEXT:    [[TMP144:%.*]] = icmp uge i32 [[TMP143]], [[TMP122]]
1603; CHECK-NEXT:    [[TMP145:%.*]] = sub i32 [[TMP143]], [[TMP122]]
1604; CHECK-NEXT:    [[TMP146:%.*]] = select i1 [[TMP144]], i32 [[TMP145]], i32 [[TMP143]]
1605; CHECK-NEXT:    [[TMP147:%.*]] = icmp uge i32 [[TMP146]], [[TMP122]]
1606; CHECK-NEXT:    [[TMP148:%.*]] = sub i32 [[TMP146]], [[TMP122]]
1607; CHECK-NEXT:    [[TMP149:%.*]] = select i1 [[TMP147]], i32 [[TMP148]], i32 [[TMP146]]
1608; CHECK-NEXT:    [[TMP150:%.*]] = xor i32 [[TMP149]], [[TMP117]]
1609; CHECK-NEXT:    [[TMP151:%.*]] = sub i32 [[TMP150]], [[TMP117]]
1610; CHECK-NEXT:    [[TMP152:%.*]] = insertelement <4 x i32> [[TMP114]], i32 [[TMP151]], i64 3
1611; CHECK-NEXT:    store <4 x i32> [[TMP152]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16
1612; CHECK-NEXT:    ret void
1613;
1614; GCN-LABEL: srem_v4i32:
1615; GCN:       ; %bb.0:
1616; GCN-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
1617; GCN-NEXT:    s_mov_b32 s14, 0x4f7ffffe
1618; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1619; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1620; GCN-NEXT:    s_ashr_i32 s2, s8, 31
1621; GCN-NEXT:    s_add_i32 s3, s8, s2
1622; GCN-NEXT:    s_xor_b32 s2, s3, s2
1623; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s2
1624; GCN-NEXT:    s_sub_i32 s13, 0, s2
1625; GCN-NEXT:    s_ashr_i32 s12, s9, 31
1626; GCN-NEXT:    s_add_i32 s9, s9, s12
1627; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1628; GCN-NEXT:    s_xor_b32 s9, s9, s12
1629; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s9
1630; GCN-NEXT:    s_ashr_i32 s3, s4, 31
1631; GCN-NEXT:    v_mul_f32_e32 v0, s14, v0
1632; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
1633; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1634; GCN-NEXT:    s_add_i32 s4, s4, s3
1635; GCN-NEXT:    s_xor_b32 s4, s4, s3
1636; GCN-NEXT:    v_mul_lo_u32 v2, s13, v0
1637; GCN-NEXT:    v_mul_f32_e32 v1, s14, v1
1638; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
1639; GCN-NEXT:    s_sub_i32 s13, 0, s9
1640; GCN-NEXT:    v_mul_hi_u32 v2, v0, v2
1641; GCN-NEXT:    s_ashr_i32 s12, s10, 31
1642; GCN-NEXT:    s_ashr_i32 s8, s5, 31
1643; GCN-NEXT:    s_add_i32 s5, s5, s8
1644; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
1645; GCN-NEXT:    v_mul_hi_u32 v0, s4, v0
1646; GCN-NEXT:    v_mul_lo_u32 v2, s13, v1
1647; GCN-NEXT:    s_xor_b32 s5, s5, s8
1648; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
1649; GCN-NEXT:    v_mul_hi_u32 v2, v1, v2
1650; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
1651; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v0
1652; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
1653; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
1654; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v0
1655; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
1656; GCN-NEXT:    s_add_i32 s2, s10, s12
1657; GCN-NEXT:    s_xor_b32 s2, s2, s12
1658; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
1659; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
1660; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s2
1661; GCN-NEXT:    v_mul_hi_u32 v1, s5, v1
1662; GCN-NEXT:    v_xor_b32_e32 v0, s3, v0
1663; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s3, v0
1664; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v2
1665; GCN-NEXT:    v_mul_lo_u32 v1, v1, s9
1666; GCN-NEXT:    s_sub_i32 s3, 0, s2
1667; GCN-NEXT:    s_ashr_i32 s4, s6, 31
1668; GCN-NEXT:    v_mul_f32_e32 v2, s14, v2
1669; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
1670; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
1671; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v1
1672; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
1673; GCN-NEXT:    v_mul_lo_u32 v4, s3, v2
1674; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
1675; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v1
1676; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
1677; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
1678; GCN-NEXT:    v_mul_hi_u32 v3, v2, v4
1679; GCN-NEXT:    s_ashr_i32 s5, s11, 31
1680; GCN-NEXT:    s_add_i32 s3, s6, s4
1681; GCN-NEXT:    s_add_i32 s6, s11, s5
1682; GCN-NEXT:    s_xor_b32 s5, s6, s5
1683; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
1684; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s5
1685; GCN-NEXT:    s_xor_b32 s3, s3, s4
1686; GCN-NEXT:    v_mul_hi_u32 v2, s3, v2
1687; GCN-NEXT:    v_xor_b32_e32 v1, s8, v1
1688; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v3
1689; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s8, v1
1690; GCN-NEXT:    v_mul_lo_u32 v2, v2, s2
1691; GCN-NEXT:    s_ashr_i32 s6, s7, 31
1692; GCN-NEXT:    v_mul_f32_e32 v3, s14, v3
1693; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
1694; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s3, v2
1695; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s2, v2
1696; GCN-NEXT:    s_sub_i32 s3, 0, s5
1697; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
1698; GCN-NEXT:    v_mul_lo_u32 v5, s3, v3
1699; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
1700; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s2, v2
1701; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
1702; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
1703; GCN-NEXT:    v_mul_hi_u32 v4, v3, v5
1704; GCN-NEXT:    s_add_i32 s2, s7, s6
1705; GCN-NEXT:    s_xor_b32 s7, s2, s6
1706; GCN-NEXT:    v_xor_b32_e32 v2, s4, v2
1707; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
1708; GCN-NEXT:    v_mul_hi_u32 v3, s7, v3
1709; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s4, v2
1710; GCN-NEXT:    s_mov_b32 s3, 0xf000
1711; GCN-NEXT:    s_mov_b32 s2, -1
1712; GCN-NEXT:    v_mul_lo_u32 v3, v3, s5
1713; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s7, v3
1714; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s5, v3
1715; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
1716; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
1717; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s5, v3
1718; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
1719; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
1720; GCN-NEXT:    v_xor_b32_e32 v3, s6, v3
1721; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s6, v3
1722; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1723; GCN-NEXT:    s_endpgm
; Input IR under test: one vector srem whose result is stored to %out.
1724  %r = srem <4 x i32> %x, %y
1725  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
1726  ret void
1727}
1728
; udiv_v4i16: unsigned division of <4 x i16> by <4 x i16>, result stored to %out.
; The opt-run assertions below expect each lane to be expanded separately:
; both operands are zero-extended to i32 and converted to float, the quotient
; is trunc(x * rcp(y)), and 1 is added when |fmad(-q, y, x)| >= |y|. The
; corrected quotient is masked with 65535, truncated to i16 and inserted into
; the result vector.
; The llc-run assertions expect the same sequence as machine code
; (v_rcp_iflag_f32, v_mad_f32, v_addc_u32), with two 16-bit lanes packed per
; dword before a single buffer_store_dwordx2.
; All assertion lines are autogenerated (see the NOTE lines at the top of the
; file); regenerate them with the update scripts instead of editing by hand.
1729define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
1730; CHECK-LABEL: @udiv_v4i16(
1731; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
1732; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
1733; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
1734; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
1735; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
1736; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
1737; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
1738; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
1739; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
1740; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
1741; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
1742; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
1743; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
1744; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
1745; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
1746; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
1747; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
1748; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 65535
1749; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
1750; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i16> undef, i16 [[TMP19]], i64 0
1751; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i16> [[X]], i64 1
1752; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i16> [[Y]], i64 1
1753; CHECK-NEXT:    [[TMP23:%.*]] = zext i16 [[TMP21]] to i32
1754; CHECK-NEXT:    [[TMP24:%.*]] = zext i16 [[TMP22]] to i32
1755; CHECK-NEXT:    [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float
1756; CHECK-NEXT:    [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float
1757; CHECK-NEXT:    [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]])
1758; CHECK-NEXT:    [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]]
1759; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]])
1760; CHECK-NEXT:    [[TMP30:%.*]] = fneg fast float [[TMP29]]
1761; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]])
1762; CHECK-NEXT:    [[TMP32:%.*]] = fptoui float [[TMP29]] to i32
1763; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]])
1764; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]])
1765; CHECK-NEXT:    [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]]
1766; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0
1767; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]]
1768; CHECK-NEXT:    [[TMP38:%.*]] = and i32 [[TMP37]], 65535
1769; CHECK-NEXT:    [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16
1770; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <4 x i16> [[TMP20]], i16 [[TMP39]], i64 1
1771; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i16> [[X]], i64 2
1772; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <4 x i16> [[Y]], i64 2
1773; CHECK-NEXT:    [[TMP43:%.*]] = zext i16 [[TMP41]] to i32
1774; CHECK-NEXT:    [[TMP44:%.*]] = zext i16 [[TMP42]] to i32
1775; CHECK-NEXT:    [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float
1776; CHECK-NEXT:    [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float
1777; CHECK-NEXT:    [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]])
1778; CHECK-NEXT:    [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]]
1779; CHECK-NEXT:    [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]])
1780; CHECK-NEXT:    [[TMP50:%.*]] = fneg fast float [[TMP49]]
1781; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]])
1782; CHECK-NEXT:    [[TMP52:%.*]] = fptoui float [[TMP49]] to i32
1783; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]])
1784; CHECK-NEXT:    [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]])
1785; CHECK-NEXT:    [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]]
1786; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0
1787; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]]
1788; CHECK-NEXT:    [[TMP58:%.*]] = and i32 [[TMP57]], 65535
1789; CHECK-NEXT:    [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16
1790; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i16> [[TMP40]], i16 [[TMP59]], i64 2
1791; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <4 x i16> [[X]], i64 3
1792; CHECK-NEXT:    [[TMP62:%.*]] = extractelement <4 x i16> [[Y]], i64 3
1793; CHECK-NEXT:    [[TMP63:%.*]] = zext i16 [[TMP61]] to i32
1794; CHECK-NEXT:    [[TMP64:%.*]] = zext i16 [[TMP62]] to i32
1795; CHECK-NEXT:    [[TMP65:%.*]] = uitofp i32 [[TMP63]] to float
1796; CHECK-NEXT:    [[TMP66:%.*]] = uitofp i32 [[TMP64]] to float
1797; CHECK-NEXT:    [[TMP67:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP66]])
1798; CHECK-NEXT:    [[TMP68:%.*]] = fmul fast float [[TMP65]], [[TMP67]]
1799; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.trunc.f32(float [[TMP68]])
1800; CHECK-NEXT:    [[TMP70:%.*]] = fneg fast float [[TMP69]]
1801; CHECK-NEXT:    [[TMP71:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP70]], float [[TMP66]], float [[TMP65]])
1802; CHECK-NEXT:    [[TMP72:%.*]] = fptoui float [[TMP69]] to i32
1803; CHECK-NEXT:    [[TMP73:%.*]] = call fast float @llvm.fabs.f32(float [[TMP71]])
1804; CHECK-NEXT:    [[TMP74:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
1805; CHECK-NEXT:    [[TMP75:%.*]] = fcmp fast oge float [[TMP73]], [[TMP74]]
1806; CHECK-NEXT:    [[TMP76:%.*]] = select i1 [[TMP75]], i32 1, i32 0
1807; CHECK-NEXT:    [[TMP77:%.*]] = add i32 [[TMP72]], [[TMP76]]
1808; CHECK-NEXT:    [[TMP78:%.*]] = and i32 [[TMP77]], 65535
1809; CHECK-NEXT:    [[TMP79:%.*]] = trunc i32 [[TMP78]] to i16
1810; CHECK-NEXT:    [[TMP80:%.*]] = insertelement <4 x i16> [[TMP60]], i16 [[TMP79]], i64 3
1811; CHECK-NEXT:    store <4 x i16> [[TMP80]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8
1812; CHECK-NEXT:    ret void
1813;
1814; GCN-LABEL: udiv_v4i16:
1815; GCN:       ; %bb.0:
1816; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1817; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
1818; GCN-NEXT:    s_mov_b32 s8, 0xffff
1819; GCN-NEXT:    s_mov_b32 s7, 0xf000
1820; GCN-NEXT:    s_mov_b32 s6, -1
1821; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1822; GCN-NEXT:    s_and_b32 s9, s2, s8
1823; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s9
1824; GCN-NEXT:    s_lshr_b32 s9, s0, 16
1825; GCN-NEXT:    s_and_b32 s0, s0, s8
1826; GCN-NEXT:    s_lshr_b32 s2, s2, 16
1827; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s2
1828; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s0
1829; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
1830; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s9
1831; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v3
1832; GCN-NEXT:    s_and_b32 s2, s3, s8
1833; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
1834; GCN-NEXT:    v_trunc_f32_e32 v2, v2
1835; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
1836; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
1837; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1838; GCN-NEXT:    v_mul_f32_e32 v1, v4, v5
1839; GCN-NEXT:    v_trunc_f32_e32 v1, v1
1840; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
1841; GCN-NEXT:    v_mad_f32 v2, -v1, v3, v4
1842; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s2
1843; GCN-NEXT:    s_lshr_b32 s0, s1, 16
1844; GCN-NEXT:    s_and_b32 s1, s1, s8
1845; GCN-NEXT:    s_lshr_b32 s10, s3, 16
1846; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
1847; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
1848; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s10
1849; GCN-NEXT:    v_cvt_f32_u32_e32 v5, s1
1850; GCN-NEXT:    v_rcp_iflag_f32_e32 v6, v4
1851; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v1, vcc
1852; GCN-NEXT:    v_rcp_iflag_f32_e32 v7, v3
1853; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
1854; GCN-NEXT:    v_mul_f32_e32 v1, v5, v6
1855; GCN-NEXT:    v_cvt_f32_u32_e32 v6, s0
1856; GCN-NEXT:    v_trunc_f32_e32 v1, v1
1857; GCN-NEXT:    v_mad_f32 v5, -v1, v4, v5
1858; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, v4
1859; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
1860; GCN-NEXT:    v_mul_f32_e32 v4, v6, v7
1861; GCN-NEXT:    v_trunc_f32_e32 v4, v4
1862; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v4
1863; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1864; GCN-NEXT:    v_mad_f32 v4, -v4, v3, v6
1865; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v3
1866; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
1867; GCN-NEXT:    v_and_b32_e32 v0, s8, v0
1868; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1869; GCN-NEXT:    v_and_b32_e32 v1, s8, v1
1870; GCN-NEXT:    v_or_b32_e32 v1, v1, v3
1871; GCN-NEXT:    v_or_b32_e32 v0, v0, v2
1872; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1873; GCN-NEXT:    s_endpgm
1874  %r = udiv <4 x i16> %x, %y
1875  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
1876  ret void
1877}
1878
; urem_v4i16: unsigned remainder of <4 x i16> by <4 x i16>, result stored to %out.
; The opt-run assertions below expect a per-lane expansion: both operands are
; zero-extended to i32 and converted to float, the quotient is
; trunc(x * rcp(y)) plus 1 when |fmad(-q, y, x)| >= |y|, and the remainder is
; then formed in integer arithmetic as x - q * y. The remainder is masked with
; 65535, truncated to i16 and inserted into the result vector.
; The llc-run assertions expect the quotient sequence followed by
; v_mul_lo_u32 / v_sub_i32 for the remainder, with two 16-bit lanes packed per
; dword before a single buffer_store_dwordx2.
; All assertion lines are autogenerated (see the NOTE lines at the top of the
; file); regenerate them with the update scripts instead of editing by hand.
1879define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
1880; CHECK-LABEL: @urem_v4i16(
1881; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
1882; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
1883; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
1884; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
1885; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
1886; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
1887; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
1888; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
1889; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
1890; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
1891; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
1892; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
1893; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
1894; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
1895; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
1896; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
1897; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
1898; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]]
1899; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]]
1900; CHECK-NEXT:    [[TMP20:%.*]] = and i32 [[TMP19]], 65535
1901; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
1902; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i16> undef, i16 [[TMP21]], i64 0
1903; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i16> [[X]], i64 1
1904; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i16> [[Y]], i64 1
1905; CHECK-NEXT:    [[TMP25:%.*]] = zext i16 [[TMP23]] to i32
1906; CHECK-NEXT:    [[TMP26:%.*]] = zext i16 [[TMP24]] to i32
1907; CHECK-NEXT:    [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float
1908; CHECK-NEXT:    [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float
1909; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]])
1910; CHECK-NEXT:    [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]]
1911; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]])
1912; CHECK-NEXT:    [[TMP32:%.*]] = fneg fast float [[TMP31]]
1913; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]])
1914; CHECK-NEXT:    [[TMP34:%.*]] = fptoui float [[TMP31]] to i32
1915; CHECK-NEXT:    [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
1916; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]])
1917; CHECK-NEXT:    [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]]
1918; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0
1919; CHECK-NEXT:    [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]]
1920; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]]
1921; CHECK-NEXT:    [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]]
1922; CHECK-NEXT:    [[TMP42:%.*]] = and i32 [[TMP41]], 65535
1923; CHECK-NEXT:    [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16
1924; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i16> [[TMP22]], i16 [[TMP43]], i64 1
1925; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <4 x i16> [[X]], i64 2
1926; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <4 x i16> [[Y]], i64 2
1927; CHECK-NEXT:    [[TMP47:%.*]] = zext i16 [[TMP45]] to i32
1928; CHECK-NEXT:    [[TMP48:%.*]] = zext i16 [[TMP46]] to i32
1929; CHECK-NEXT:    [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float
1930; CHECK-NEXT:    [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float
1931; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]])
1932; CHECK-NEXT:    [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]]
1933; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]])
1934; CHECK-NEXT:    [[TMP54:%.*]] = fneg fast float [[TMP53]]
1935; CHECK-NEXT:    [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]])
1936; CHECK-NEXT:    [[TMP56:%.*]] = fptoui float [[TMP53]] to i32
1937; CHECK-NEXT:    [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]])
1938; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]])
1939; CHECK-NEXT:    [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]]
1940; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0
1941; CHECK-NEXT:    [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]]
1942; CHECK-NEXT:    [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]]
1943; CHECK-NEXT:    [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]]
1944; CHECK-NEXT:    [[TMP64:%.*]] = and i32 [[TMP63]], 65535
1945; CHECK-NEXT:    [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16
1946; CHECK-NEXT:    [[TMP66:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[TMP65]], i64 2
1947; CHECK-NEXT:    [[TMP67:%.*]] = extractelement <4 x i16> [[X]], i64 3
1948; CHECK-NEXT:    [[TMP68:%.*]] = extractelement <4 x i16> [[Y]], i64 3
1949; CHECK-NEXT:    [[TMP69:%.*]] = zext i16 [[TMP67]] to i32
1950; CHECK-NEXT:    [[TMP70:%.*]] = zext i16 [[TMP68]] to i32
1951; CHECK-NEXT:    [[TMP71:%.*]] = uitofp i32 [[TMP69]] to float
1952; CHECK-NEXT:    [[TMP72:%.*]] = uitofp i32 [[TMP70]] to float
1953; CHECK-NEXT:    [[TMP73:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP72]])
1954; CHECK-NEXT:    [[TMP74:%.*]] = fmul fast float [[TMP71]], [[TMP73]]
1955; CHECK-NEXT:    [[TMP75:%.*]] = call fast float @llvm.trunc.f32(float [[TMP74]])
1956; CHECK-NEXT:    [[TMP76:%.*]] = fneg fast float [[TMP75]]
1957; CHECK-NEXT:    [[TMP77:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP76]], float [[TMP72]], float [[TMP71]])
1958; CHECK-NEXT:    [[TMP78:%.*]] = fptoui float [[TMP75]] to i32
1959; CHECK-NEXT:    [[TMP79:%.*]] = call fast float @llvm.fabs.f32(float [[TMP77]])
1960; CHECK-NEXT:    [[TMP80:%.*]] = call fast float @llvm.fabs.f32(float [[TMP72]])
1961; CHECK-NEXT:    [[TMP81:%.*]] = fcmp fast oge float [[TMP79]], [[TMP80]]
1962; CHECK-NEXT:    [[TMP82:%.*]] = select i1 [[TMP81]], i32 1, i32 0
1963; CHECK-NEXT:    [[TMP83:%.*]] = add i32 [[TMP78]], [[TMP82]]
1964; CHECK-NEXT:    [[TMP84:%.*]] = mul i32 [[TMP83]], [[TMP70]]
1965; CHECK-NEXT:    [[TMP85:%.*]] = sub i32 [[TMP69]], [[TMP84]]
1966; CHECK-NEXT:    [[TMP86:%.*]] = and i32 [[TMP85]], 65535
1967; CHECK-NEXT:    [[TMP87:%.*]] = trunc i32 [[TMP86]] to i16
1968; CHECK-NEXT:    [[TMP88:%.*]] = insertelement <4 x i16> [[TMP66]], i16 [[TMP87]], i64 3
1969; CHECK-NEXT:    store <4 x i16> [[TMP88]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8
1970; CHECK-NEXT:    ret void
1971;
1972; GCN-LABEL: urem_v4i16:
1973; GCN:       ; %bb.0:
1974; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1975; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
1976; GCN-NEXT:    s_mov_b32 s8, 0xffff
1977; GCN-NEXT:    s_mov_b32 s7, 0xf000
1978; GCN-NEXT:    s_mov_b32 s6, -1
1979; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1980; GCN-NEXT:    s_and_b32 s9, s2, s8
1981; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s9
1982; GCN-NEXT:    s_and_b32 s10, s0, s8
1983; GCN-NEXT:    s_lshr_b32 s11, s2, 16
1984; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s10
1985; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
1986; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s11
1987; GCN-NEXT:    s_lshr_b32 s9, s0, 16
1988; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s9
1989; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
1990; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v3
1991; GCN-NEXT:    v_trunc_f32_e32 v2, v2
1992; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
1993; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
1994; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1995; GCN-NEXT:    v_mul_f32_e32 v1, v4, v5
1996; GCN-NEXT:    v_trunc_f32_e32 v1, v1
1997; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
1998; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v1
1999; GCN-NEXT:    v_mad_f32 v1, -v1, v3, v4
2000; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v3
2001; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
2002; GCN-NEXT:    s_and_b32 s2, s3, s8
2003; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
2004; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s2
2005; GCN-NEXT:    s_and_b32 s2, s1, s8
2006; GCN-NEXT:    v_mul_lo_u32 v1, v1, s11
2007; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s2
2008; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
2009; GCN-NEXT:    s_lshr_b32 s12, s3, 16
2010; GCN-NEXT:    v_sub_i32_e32 v5, vcc, s9, v1
2011; GCN-NEXT:    s_lshr_b32 s10, s1, 16
2012; GCN-NEXT:    v_mul_f32_e32 v1, v3, v4
2013; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s12
2014; GCN-NEXT:    v_cvt_f32_u32_e32 v6, s10
2015; GCN-NEXT:    v_trunc_f32_e32 v1, v1
2016; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
2017; GCN-NEXT:    v_rcp_iflag_f32_e32 v7, v4
2018; GCN-NEXT:    v_mad_f32 v3, -v1, v2, v3
2019; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
2020; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
2021; GCN-NEXT:    v_mul_f32_e32 v2, v6, v7
2022; GCN-NEXT:    v_trunc_f32_e32 v2, v2
2023; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v2
2024; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2025; GCN-NEXT:    v_mad_f32 v2, -v2, v4, v6
2026; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
2027; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
2028; GCN-NEXT:    v_mul_lo_u32 v1, v1, s3
2029; GCN-NEXT:    v_mul_lo_u32 v2, v2, s12
2030; GCN-NEXT:    v_and_b32_e32 v0, s8, v0
2031; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
2032; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s10, v2
2033; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2034; GCN-NEXT:    v_and_b32_e32 v1, s8, v1
2035; GCN-NEXT:    v_or_b32_e32 v1, v1, v2
2036; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
2037; GCN-NEXT:    v_or_b32_e32 v0, v0, v2
2038; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2039; GCN-NEXT:    s_endpgm
2040  %r = urem <4 x i16> %x, %y
2041  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
2042  ret void
2043}
2044
; sdiv_v4i16: signed division of <4 x i16> by <4 x i16>, result stored to %out.
; The opt-run assertions below expect a per-lane expansion: both operands are
; sign-extended to i32, and ((x ^ y) >> 30) | 1 is computed as the correction
; term (-1 when the operand signs differ, +1 otherwise). The quotient is
; trunc(x * rcp(y)) converted with fptosi; the correction term is added when
; |fmad(-q, y, x)| >= |y|. The result is sign-extended in place from 16 bits
; (shl 16 / ashr 16), truncated to i16 and inserted into the result vector.
; The llc-run assertions expect the compare result in an SGPR pair, with
; s_cmp_lg_u32 / s_cselect_b32 choosing between the correction term and 0
; before the v_add_i32, and two 16-bit lanes packed per dword for the
; buffer_store_dwordx2.
; All assertion lines are autogenerated (see the NOTE lines at the top of the
; file); regenerate them with the update scripts instead of editing by hand.
2045define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
2046; CHECK-LABEL: @sdiv_v4i16(
2047; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
2048; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
2049; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
2050; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
2051; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
2052; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
2053; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
2054; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
2055; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
2056; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
2057; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
2058; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
2059; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
2060; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
2061; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
2062; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
2063; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
2064; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
2065; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
2066; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
2067; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 16
2068; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
2069; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
2070; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <4 x i16> undef, i16 [[TMP23]], i64 0
2071; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i16> [[X]], i64 1
2072; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i16> [[Y]], i64 1
2073; CHECK-NEXT:    [[TMP27:%.*]] = sext i16 [[TMP25]] to i32
2074; CHECK-NEXT:    [[TMP28:%.*]] = sext i16 [[TMP26]] to i32
2075; CHECK-NEXT:    [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]]
2076; CHECK-NEXT:    [[TMP30:%.*]] = ashr i32 [[TMP29]], 30
2077; CHECK-NEXT:    [[TMP31:%.*]] = or i32 [[TMP30]], 1
2078; CHECK-NEXT:    [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float
2079; CHECK-NEXT:    [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float
2080; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
2081; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]]
2082; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]])
2083; CHECK-NEXT:    [[TMP37:%.*]] = fneg fast float [[TMP36]]
2084; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]])
2085; CHECK-NEXT:    [[TMP39:%.*]] = fptosi float [[TMP36]] to i32
2086; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]])
2087; CHECK-NEXT:    [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
2088; CHECK-NEXT:    [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]]
2089; CHECK-NEXT:    [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0
2090; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]]
2091; CHECK-NEXT:    [[TMP45:%.*]] = shl i32 [[TMP44]], 16
2092; CHECK-NEXT:    [[TMP46:%.*]] = ashr i32 [[TMP45]], 16
2093; CHECK-NEXT:    [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16
2094; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <4 x i16> [[TMP24]], i16 [[TMP47]], i64 1
2095; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <4 x i16> [[X]], i64 2
2096; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <4 x i16> [[Y]], i64 2
2097; CHECK-NEXT:    [[TMP51:%.*]] = sext i16 [[TMP49]] to i32
2098; CHECK-NEXT:    [[TMP52:%.*]] = sext i16 [[TMP50]] to i32
2099; CHECK-NEXT:    [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]]
2100; CHECK-NEXT:    [[TMP54:%.*]] = ashr i32 [[TMP53]], 30
2101; CHECK-NEXT:    [[TMP55:%.*]] = or i32 [[TMP54]], 1
2102; CHECK-NEXT:    [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float
2103; CHECK-NEXT:    [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float
2104; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]])
2105; CHECK-NEXT:    [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]]
2106; CHECK-NEXT:    [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]])
2107; CHECK-NEXT:    [[TMP61:%.*]] = fneg fast float [[TMP60]]
2108; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]])
2109; CHECK-NEXT:    [[TMP63:%.*]] = fptosi float [[TMP60]] to i32
2110; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]])
2111; CHECK-NEXT:    [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]])
2112; CHECK-NEXT:    [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]]
2113; CHECK-NEXT:    [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0
2114; CHECK-NEXT:    [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]]
2115; CHECK-NEXT:    [[TMP69:%.*]] = shl i32 [[TMP68]], 16
2116; CHECK-NEXT:    [[TMP70:%.*]] = ashr i32 [[TMP69]], 16
2117; CHECK-NEXT:    [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16
2118; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <4 x i16> [[TMP48]], i16 [[TMP71]], i64 2
2119; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <4 x i16> [[X]], i64 3
2120; CHECK-NEXT:    [[TMP74:%.*]] = extractelement <4 x i16> [[Y]], i64 3
2121; CHECK-NEXT:    [[TMP75:%.*]] = sext i16 [[TMP73]] to i32
2122; CHECK-NEXT:    [[TMP76:%.*]] = sext i16 [[TMP74]] to i32
2123; CHECK-NEXT:    [[TMP77:%.*]] = xor i32 [[TMP75]], [[TMP76]]
2124; CHECK-NEXT:    [[TMP78:%.*]] = ashr i32 [[TMP77]], 30
2125; CHECK-NEXT:    [[TMP79:%.*]] = or i32 [[TMP78]], 1
2126; CHECK-NEXT:    [[TMP80:%.*]] = sitofp i32 [[TMP75]] to float
2127; CHECK-NEXT:    [[TMP81:%.*]] = sitofp i32 [[TMP76]] to float
2128; CHECK-NEXT:    [[TMP82:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP81]])
2129; CHECK-NEXT:    [[TMP83:%.*]] = fmul fast float [[TMP80]], [[TMP82]]
2130; CHECK-NEXT:    [[TMP84:%.*]] = call fast float @llvm.trunc.f32(float [[TMP83]])
2131; CHECK-NEXT:    [[TMP85:%.*]] = fneg fast float [[TMP84]]
2132; CHECK-NEXT:    [[TMP86:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP85]], float [[TMP81]], float [[TMP80]])
2133; CHECK-NEXT:    [[TMP87:%.*]] = fptosi float [[TMP84]] to i32
2134; CHECK-NEXT:    [[TMP88:%.*]] = call fast float @llvm.fabs.f32(float [[TMP86]])
2135; CHECK-NEXT:    [[TMP89:%.*]] = call fast float @llvm.fabs.f32(float [[TMP81]])
2136; CHECK-NEXT:    [[TMP90:%.*]] = fcmp fast oge float [[TMP88]], [[TMP89]]
2137; CHECK-NEXT:    [[TMP91:%.*]] = select i1 [[TMP90]], i32 [[TMP79]], i32 0
2138; CHECK-NEXT:    [[TMP92:%.*]] = add i32 [[TMP87]], [[TMP91]]
2139; CHECK-NEXT:    [[TMP93:%.*]] = shl i32 [[TMP92]], 16
2140; CHECK-NEXT:    [[TMP94:%.*]] = ashr i32 [[TMP93]], 16
2141; CHECK-NEXT:    [[TMP95:%.*]] = trunc i32 [[TMP94]] to i16
2142; CHECK-NEXT:    [[TMP96:%.*]] = insertelement <4 x i16> [[TMP72]], i16 [[TMP95]], i64 3
2143; CHECK-NEXT:    store <4 x i16> [[TMP96]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8
2144; CHECK-NEXT:    ret void
2145;
2146; GCN-LABEL: sdiv_v4i16:
2147; GCN:       ; %bb.0:
2148; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2149; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
2150; GCN-NEXT:    s_mov_b32 s7, 0xf000
2151; GCN-NEXT:    s_mov_b32 s6, -1
2152; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2153; GCN-NEXT:    s_sext_i32_i16 s8, s2
2154; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s8
2155; GCN-NEXT:    s_sext_i32_i16 s9, s0
2156; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s9
2157; GCN-NEXT:    s_xor_b32 s8, s9, s8
2158; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
2159; GCN-NEXT:    s_ashr_i32 s8, s8, 30
2160; GCN-NEXT:    s_or_b32 s10, s8, 1
2161; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
2162; GCN-NEXT:    v_trunc_f32_e32 v2, v2
2163; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
2164; GCN-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, |v0|
2165; GCN-NEXT:    s_cmp_lg_u32 s8, 0
2166; GCN-NEXT:    s_cselect_b32 s8, s10, 0
2167; GCN-NEXT:    s_ashr_i32 s2, s2, 16
2168; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s2
2169; GCN-NEXT:    s_ashr_i32 s0, s0, 16
2170; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s0
2171; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
2172; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v0
2173; GCN-NEXT:    s_xor_b32 s0, s0, s2
2174; GCN-NEXT:    s_ashr_i32 s0, s0, 30
2175; GCN-NEXT:    s_sext_i32_i16 s2, s3
2176; GCN-NEXT:    v_mul_f32_e32 v3, v1, v3
2177; GCN-NEXT:    v_trunc_f32_e32 v3, v3
2178; GCN-NEXT:    v_mad_f32 v1, -v3, v0, v1
2179; GCN-NEXT:    v_cvt_i32_f32_e32 v3, v3
2180; GCN-NEXT:    v_add_i32_e32 v2, vcc, s8, v2
2181; GCN-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, |v0|
2182; GCN-NEXT:    s_or_b32 s0, s0, 1
2183; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s2
2184; GCN-NEXT:    s_cmp_lg_u32 s8, 0
2185; GCN-NEXT:    s_cselect_b32 s0, s0, 0
2186; GCN-NEXT:    v_add_i32_e32 v3, vcc, s0, v3
2187; GCN-NEXT:    s_sext_i32_i16 s0, s1
2188; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s0
2189; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v0
2190; GCN-NEXT:    s_xor_b32 s0, s0, s2
2191; GCN-NEXT:    s_ashr_i32 s0, s0, 30
2192; GCN-NEXT:    s_or_b32 s0, s0, 1
2193; GCN-NEXT:    v_mul_f32_e32 v4, v1, v4
2194; GCN-NEXT:    v_trunc_f32_e32 v4, v4
2195; GCN-NEXT:    v_mad_f32 v1, -v4, v0, v1
2196; GCN-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, |v0|
2197; GCN-NEXT:    s_cmp_lg_u32 s8, 0
2198; GCN-NEXT:    s_cselect_b32 s0, s0, 0
2199; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v4
2200; GCN-NEXT:    s_ashr_i32 s2, s3, 16
2201; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s2
2202; GCN-NEXT:    v_add_i32_e32 v1, vcc, s0, v4
2203; GCN-NEXT:    s_ashr_i32 s0, s1, 16
2204; GCN-NEXT:    v_cvt_f32_i32_e32 v4, s0
2205; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v0
2206; GCN-NEXT:    s_xor_b32 s0, s0, s2
2207; GCN-NEXT:    s_ashr_i32 s0, s0, 30
2208; GCN-NEXT:    s_or_b32 s2, s0, 1
2209; GCN-NEXT:    v_mul_f32_e32 v5, v4, v5
2210; GCN-NEXT:    v_trunc_f32_e32 v5, v5
2211; GCN-NEXT:    v_mad_f32 v4, -v5, v0, v4
2212; GCN-NEXT:    v_cvt_i32_f32_e32 v5, v5
2213; GCN-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v0|
2214; GCN-NEXT:    s_cmp_lg_u32 s0, 0
2215; GCN-NEXT:    s_cselect_b32 s0, s2, 0
2216; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v5
2217; GCN-NEXT:    s_mov_b32 s0, 0xffff
2218; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2219; GCN-NEXT:    v_and_b32_e32 v1, s0, v1
2220; GCN-NEXT:    v_or_b32_e32 v1, v1, v0
2221; GCN-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
2222; GCN-NEXT:    v_and_b32_e32 v2, s0, v2
2223; GCN-NEXT:    v_or_b32_e32 v0, v2, v0
2224; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2225; GCN-NEXT:    s_endpgm
2226  %r = sdiv <4 x i16> %x, %y
2227  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
2228  ret void
2229}
2230
2231define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
2232; CHECK-LABEL: @srem_v4i16(
2233; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
2234; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
2235; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
2236; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
2237; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
2238; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
2239; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
2240; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
2241; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
2242; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
2243; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
2244; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
2245; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
2246; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
2247; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
2248; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
2249; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
2250; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
2251; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
2252; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
2253; CHECK-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]]
2254; CHECK-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]]
2255; CHECK-NEXT:    [[TMP23:%.*]] = shl i32 [[TMP22]], 16
2256; CHECK-NEXT:    [[TMP24:%.*]] = ashr i32 [[TMP23]], 16
2257; CHECK-NEXT:    [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16
2258; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <4 x i16> undef, i16 [[TMP25]], i64 0
2259; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i16> [[X]], i64 1
2260; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i16> [[Y]], i64 1
2261; CHECK-NEXT:    [[TMP29:%.*]] = sext i16 [[TMP27]] to i32
2262; CHECK-NEXT:    [[TMP30:%.*]] = sext i16 [[TMP28]] to i32
2263; CHECK-NEXT:    [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]]
2264; CHECK-NEXT:    [[TMP32:%.*]] = ashr i32 [[TMP31]], 30
2265; CHECK-NEXT:    [[TMP33:%.*]] = or i32 [[TMP32]], 1
2266; CHECK-NEXT:    [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float
2267; CHECK-NEXT:    [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float
2268; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
2269; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]]
2270; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]])
2271; CHECK-NEXT:    [[TMP39:%.*]] = fneg fast float [[TMP38]]
2272; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]])
2273; CHECK-NEXT:    [[TMP41:%.*]] = fptosi float [[TMP38]] to i32
2274; CHECK-NEXT:    [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]])
2275; CHECK-NEXT:    [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]])
2276; CHECK-NEXT:    [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]]
2277; CHECK-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0
2278; CHECK-NEXT:    [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]]
2279; CHECK-NEXT:    [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]]
2280; CHECK-NEXT:    [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]]
2281; CHECK-NEXT:    [[TMP49:%.*]] = shl i32 [[TMP48]], 16
2282; CHECK-NEXT:    [[TMP50:%.*]] = ashr i32 [[TMP49]], 16
2283; CHECK-NEXT:    [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16
2284; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i16> [[TMP26]], i16 [[TMP51]], i64 1
2285; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <4 x i16> [[X]], i64 2
2286; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <4 x i16> [[Y]], i64 2
2287; CHECK-NEXT:    [[TMP55:%.*]] = sext i16 [[TMP53]] to i32
2288; CHECK-NEXT:    [[TMP56:%.*]] = sext i16 [[TMP54]] to i32
2289; CHECK-NEXT:    [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]]
2290; CHECK-NEXT:    [[TMP58:%.*]] = ashr i32 [[TMP57]], 30
2291; CHECK-NEXT:    [[TMP59:%.*]] = or i32 [[TMP58]], 1
2292; CHECK-NEXT:    [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float
2293; CHECK-NEXT:    [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float
2294; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]])
2295; CHECK-NEXT:    [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]]
2296; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]])
2297; CHECK-NEXT:    [[TMP65:%.*]] = fneg fast float [[TMP64]]
2298; CHECK-NEXT:    [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]])
2299; CHECK-NEXT:    [[TMP67:%.*]] = fptosi float [[TMP64]] to i32
2300; CHECK-NEXT:    [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
2301; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]])
2302; CHECK-NEXT:    [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]]
2303; CHECK-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0
2304; CHECK-NEXT:    [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]]
2305; CHECK-NEXT:    [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]]
2306; CHECK-NEXT:    [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]]
2307; CHECK-NEXT:    [[TMP75:%.*]] = shl i32 [[TMP74]], 16
2308; CHECK-NEXT:    [[TMP76:%.*]] = ashr i32 [[TMP75]], 16
2309; CHECK-NEXT:    [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16
2310; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <4 x i16> [[TMP52]], i16 [[TMP77]], i64 2
2311; CHECK-NEXT:    [[TMP79:%.*]] = extractelement <4 x i16> [[X]], i64 3
2312; CHECK-NEXT:    [[TMP80:%.*]] = extractelement <4 x i16> [[Y]], i64 3
2313; CHECK-NEXT:    [[TMP81:%.*]] = sext i16 [[TMP79]] to i32
2314; CHECK-NEXT:    [[TMP82:%.*]] = sext i16 [[TMP80]] to i32
2315; CHECK-NEXT:    [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP82]]
2316; CHECK-NEXT:    [[TMP84:%.*]] = ashr i32 [[TMP83]], 30
2317; CHECK-NEXT:    [[TMP85:%.*]] = or i32 [[TMP84]], 1
2318; CHECK-NEXT:    [[TMP86:%.*]] = sitofp i32 [[TMP81]] to float
2319; CHECK-NEXT:    [[TMP87:%.*]] = sitofp i32 [[TMP82]] to float
2320; CHECK-NEXT:    [[TMP88:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP87]])
2321; CHECK-NEXT:    [[TMP89:%.*]] = fmul fast float [[TMP86]], [[TMP88]]
2322; CHECK-NEXT:    [[TMP90:%.*]] = call fast float @llvm.trunc.f32(float [[TMP89]])
2323; CHECK-NEXT:    [[TMP91:%.*]] = fneg fast float [[TMP90]]
2324; CHECK-NEXT:    [[TMP92:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP91]], float [[TMP87]], float [[TMP86]])
2325; CHECK-NEXT:    [[TMP93:%.*]] = fptosi float [[TMP90]] to i32
2326; CHECK-NEXT:    [[TMP94:%.*]] = call fast float @llvm.fabs.f32(float [[TMP92]])
2327; CHECK-NEXT:    [[TMP95:%.*]] = call fast float @llvm.fabs.f32(float [[TMP87]])
2328; CHECK-NEXT:    [[TMP96:%.*]] = fcmp fast oge float [[TMP94]], [[TMP95]]
2329; CHECK-NEXT:    [[TMP97:%.*]] = select i1 [[TMP96]], i32 [[TMP85]], i32 0
2330; CHECK-NEXT:    [[TMP98:%.*]] = add i32 [[TMP93]], [[TMP97]]
2331; CHECK-NEXT:    [[TMP99:%.*]] = mul i32 [[TMP98]], [[TMP82]]
2332; CHECK-NEXT:    [[TMP100:%.*]] = sub i32 [[TMP81]], [[TMP99]]
2333; CHECK-NEXT:    [[TMP101:%.*]] = shl i32 [[TMP100]], 16
2334; CHECK-NEXT:    [[TMP102:%.*]] = ashr i32 [[TMP101]], 16
2335; CHECK-NEXT:    [[TMP103:%.*]] = trunc i32 [[TMP102]] to i16
2336; CHECK-NEXT:    [[TMP104:%.*]] = insertelement <4 x i16> [[TMP78]], i16 [[TMP103]], i64 3
2337; CHECK-NEXT:    store <4 x i16> [[TMP104]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8
2338; CHECK-NEXT:    ret void
2339;
2340; GCN-LABEL: srem_v4i16:
2341; GCN:       ; %bb.0:
2342; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2343; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
2344; GCN-NEXT:    s_mov_b32 s7, 0xf000
2345; GCN-NEXT:    s_mov_b32 s6, -1
2346; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2347; GCN-NEXT:    s_sext_i32_i16 s8, s2
2348; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s8
2349; GCN-NEXT:    s_sext_i32_i16 s9, s0
2350; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s9
2351; GCN-NEXT:    s_xor_b32 s8, s9, s8
2352; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
2353; GCN-NEXT:    s_ashr_i32 s8, s8, 30
2354; GCN-NEXT:    s_or_b32 s10, s8, 1
2355; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
2356; GCN-NEXT:    v_trunc_f32_e32 v2, v2
2357; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
2358; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
2359; GCN-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, |v0|
2360; GCN-NEXT:    s_cmp_lg_u32 s8, 0
2361; GCN-NEXT:    s_cselect_b32 s8, s10, 0
2362; GCN-NEXT:    v_add_i32_e32 v0, vcc, s8, v2
2363; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
2364; GCN-NEXT:    s_ashr_i32 s2, s2, 16
2365; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s2
2366; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
2367; GCN-NEXT:    s_ashr_i32 s0, s0, 16
2368; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s0
2369; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v1
2370; GCN-NEXT:    s_xor_b32 s8, s0, s2
2371; GCN-NEXT:    s_ashr_i32 s8, s8, 30
2372; GCN-NEXT:    s_or_b32 s10, s8, 1
2373; GCN-NEXT:    v_mul_f32_e32 v3, v2, v3
2374; GCN-NEXT:    v_trunc_f32_e32 v3, v3
2375; GCN-NEXT:    v_mad_f32 v2, -v3, v1, v2
2376; GCN-NEXT:    v_cvt_i32_f32_e32 v3, v3
2377; GCN-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v2|, |v1|
2378; GCN-NEXT:    s_cmp_lg_u32 s8, 0
2379; GCN-NEXT:    s_cselect_b32 s8, s10, 0
2380; GCN-NEXT:    v_add_i32_e32 v1, vcc, s8, v3
2381; GCN-NEXT:    v_mul_lo_u32 v1, v1, s2
2382; GCN-NEXT:    s_sext_i32_i16 s2, s3
2383; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s2
2384; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s0, v1
2385; GCN-NEXT:    s_sext_i32_i16 s0, s1
2386; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s0
2387; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
2388; GCN-NEXT:    s_xor_b32 s0, s0, s2
2389; GCN-NEXT:    s_ashr_i32 s0, s0, 30
2390; GCN-NEXT:    s_or_b32 s0, s0, 1
2391; GCN-NEXT:    v_mul_f32_e32 v4, v1, v4
2392; GCN-NEXT:    v_trunc_f32_e32 v4, v4
2393; GCN-NEXT:    v_mad_f32 v1, -v4, v2, v1
2394; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v4
2395; GCN-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, |v2|
2396; GCN-NEXT:    s_cmp_lg_u32 s8, 0
2397; GCN-NEXT:    s_cselect_b32 s0, s0, 0
2398; GCN-NEXT:    v_add_i32_e32 v1, vcc, s0, v4
2399; GCN-NEXT:    s_ashr_i32 s0, s3, 16
2400; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s0
2401; GCN-NEXT:    s_ashr_i32 s8, s1, 16
2402; GCN-NEXT:    v_cvt_f32_i32_e32 v4, s8
2403; GCN-NEXT:    s_xor_b32 s2, s8, s0
2404; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v2
2405; GCN-NEXT:    s_ashr_i32 s2, s2, 30
2406; GCN-NEXT:    v_mul_lo_u32 v1, v1, s3
2407; GCN-NEXT:    s_or_b32 s9, s2, 1
2408; GCN-NEXT:    v_mul_f32_e32 v5, v4, v5
2409; GCN-NEXT:    v_trunc_f32_e32 v5, v5
2410; GCN-NEXT:    v_mad_f32 v4, -v5, v2, v4
2411; GCN-NEXT:    v_cvt_i32_f32_e32 v5, v5
2412; GCN-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v4|, |v2|
2413; GCN-NEXT:    s_cmp_lg_u32 s2, 0
2414; GCN-NEXT:    s_cselect_b32 s2, s9, 0
2415; GCN-NEXT:    v_add_i32_e32 v2, vcc, s2, v5
2416; GCN-NEXT:    v_mul_lo_u32 v2, v2, s0
2417; GCN-NEXT:    s_mov_b32 s0, 0xffff
2418; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
2419; GCN-NEXT:    v_and_b32_e32 v1, s0, v1
2420; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s8, v2
2421; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2422; GCN-NEXT:    v_or_b32_e32 v1, v1, v2
2423; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
2424; GCN-NEXT:    v_and_b32_e32 v0, s0, v0
2425; GCN-NEXT:    v_or_b32_e32 v0, v0, v2
2426; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2427; GCN-NEXT:    s_endpgm
2428  %r = srem <4 x i16> %x, %y
2429  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
2430  ret void
2431}
2432
; Unsigned division on the odd-width integer type i3.
; The opt-side assertions below expect both operands to be zero-extended to
; i32, converted to float, and divided via a reciprocal multiply followed by
; trunc. A single correction step (fmad remainder, fabs compare, select of 1
; or 0) is added to the quotient, which is then masked with 7 and truncated
; back to i3 before the store. The llc-side assertions pin the matching
; machine code for the tahiti run line at the top of the file.
2433define amdgpu_kernel void @udiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
2434; CHECK-LABEL: @udiv_i3(
2435; CHECK-NEXT:    [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32
2436; CHECK-NEXT:    [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32
2437; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
2438; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
2439; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
2440; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
2441; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
2442; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
2443; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
2444; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
2445; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
2446; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
2447; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
2448; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
2449; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
2450; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 7
2451; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i3
2452; CHECK-NEXT:    store i3 [[TMP17]], i3 addrspace(1)* [[OUT:%.*]], align 1
2453; CHECK-NEXT:    ret void
2454;
2455; GCN-LABEL: udiv_i3:
2456; GCN:       ; %bb.0:
2457; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2458; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
2459; GCN-NEXT:    s_mov_b32 s7, 0xf000
2460; GCN-NEXT:    s_mov_b32 s6, -1
2461; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2462; GCN-NEXT:    s_bfe_u32 s1, s0, 0x30008
2463; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v0, s1
2464; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v0
2465; GCN-NEXT:    s_and_b32 s0, s0, 7
2466; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v2, s0
2467; GCN-NEXT:    v_mul_f32_e32 v1, v2, v1
2468; GCN-NEXT:    v_trunc_f32_e32 v1, v1
2469; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v1
2470; GCN-NEXT:    v_mad_f32 v1, -v1, v0, v2
2471; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
2472; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
2473; GCN-NEXT:    v_and_b32_e32 v0, 7, v0
2474; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
2475; GCN-NEXT:    s_endpgm
; Input IR under test: a single i3 udiv whose result is stored to %out.
2476  %r = udiv i3 %x, %y
2477  store i3 %r, i3 addrspace(1)* %out
2478  ret void
2479}
2480
; Unsigned remainder on the odd-width integer type i3.
; The opt-side assertions expect the same float-reciprocal quotient sequence
; as the i3 udiv case (zext to i32, rcp, fmul, trunc, one correction step).
; The remainder is then formed as x - quotient * y in i32, masked with 7, and
; truncated back to i3 before the store. The llc-side assertions pin the
; matching machine code for the tahiti run line at the top of the file.
2481define amdgpu_kernel void @urem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
2482; CHECK-LABEL: @urem_i3(
2483; CHECK-NEXT:    [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32
2484; CHECK-NEXT:    [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32
2485; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
2486; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
2487; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
2488; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
2489; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
2490; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
2491; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
2492; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
2493; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
2494; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
2495; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
2496; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
2497; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
2498; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
2499; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
2500; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 7
2501; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i3
2502; CHECK-NEXT:    store i3 [[TMP19]], i3 addrspace(1)* [[OUT:%.*]], align 1
2503; CHECK-NEXT:    ret void
2504;
2505; GCN-LABEL: urem_i3:
2506; GCN:       ; %bb.0:
2507; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2508; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
2509; GCN-NEXT:    s_mov_b32 s7, 0xf000
2510; GCN-NEXT:    s_mov_b32 s6, -1
2511; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2512; GCN-NEXT:    s_bfe_u32 s1, s0, 0x30008
2513; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v0, s1
2514; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v0
2515; GCN-NEXT:    s_and_b32 s2, s0, 7
2516; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v2, s2
2517; GCN-NEXT:    s_lshr_b32 s1, s0, 8
2518; GCN-NEXT:    v_mul_f32_e32 v1, v2, v1
2519; GCN-NEXT:    v_trunc_f32_e32 v1, v1
2520; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v1
2521; GCN-NEXT:    v_mad_f32 v1, -v1, v0, v2
2522; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
2523; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
2524; GCN-NEXT:    v_mul_lo_u32 v0, v0, s1
2525; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
2526; GCN-NEXT:    v_and_b32_e32 v0, 7, v0
2527; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
2528; GCN-NEXT:    s_endpgm
; Input IR under test: a single i3 urem whose result is stored to %out.
2529  %r = urem i3 %x, %y
2530  store i3 %r, i3 addrspace(1)* %out
2531  ret void
2532}
2533
; Signed division on the odd-width integer type i3.
; The opt-side assertions expect both operands to be sign-extended to i32.
; The correction value is built from the operands themselves
; (xor, ashr by 30, or with 1) and is selected against 0 by the
; fabs(remainder) >= fabs(divisor) compare. The corrected quotient is
; re-sign-extended from 3 bits (shl 29 / ashr 29) and truncated to i3 before
; the store. The llc-side assertions pin the matching machine code for the
; tahiti run line at the top of the file.
2534define amdgpu_kernel void @sdiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
2535; CHECK-LABEL: @sdiv_i3(
2536; CHECK-NEXT:    [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32
2537; CHECK-NEXT:    [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32
2538; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
2539; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
2540; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
2541; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
2542; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
2543; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
2544; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
2545; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
2546; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
2547; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
2548; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
2549; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
2550; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
2551; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
2552; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
2553; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
2554; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 29
2555; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 29
2556; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i3
2557; CHECK-NEXT:    store i3 [[TMP21]], i3 addrspace(1)* [[OUT:%.*]], align 1
2558; CHECK-NEXT:    ret void
2559;
2560; GCN-LABEL: sdiv_i3:
2561; GCN:       ; %bb.0:
2562; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2563; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
2564; GCN-NEXT:    s_mov_b32 s7, 0xf000
2565; GCN-NEXT:    s_mov_b32 s6, -1
2566; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2567; GCN-NEXT:    s_bfe_i32 s1, s0, 0x30008
2568; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s1
2569; GCN-NEXT:    s_bfe_i32 s0, s0, 0x30000
2570; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s0
2571; GCN-NEXT:    s_xor_b32 s0, s0, s1
2572; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
2573; GCN-NEXT:    s_ashr_i32 s0, s0, 30
2574; GCN-NEXT:    s_or_b32 s2, s0, 1
2575; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
2576; GCN-NEXT:    v_trunc_f32_e32 v2, v2
2577; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
2578; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
2579; GCN-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
2580; GCN-NEXT:    s_cmp_lg_u32 s0, 0
2581; GCN-NEXT:    s_cselect_b32 s0, s2, 0
2582; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
2583; GCN-NEXT:    v_and_b32_e32 v0, 7, v0
2584; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
2585; GCN-NEXT:    s_endpgm
; Input IR under test: a single i3 sdiv whose result is stored to %out.
2586  %r = sdiv i3 %x, %y
2587  store i3 %r, i3 addrspace(1)* %out
2588  ret void
2589}
2590
; Signed remainder on the odd-width integer type i3.
; The opt-side assertions expect the same signed quotient sequence as the i3
; sdiv case (sext to i32, xor/ashr/or correction value, rcp, fmul, trunc,
; select-based correction). The remainder is then formed as
; x - quotient * y in i32, re-sign-extended from 3 bits (shl 29 / ashr 29),
; and truncated to i3 before the store. The llc-side assertions pin the
; matching machine code for the tahiti run line at the top of the file.
2591define amdgpu_kernel void @srem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
2592; CHECK-LABEL: @srem_i3(
2593; CHECK-NEXT:    [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32
2594; CHECK-NEXT:    [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32
2595; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
2596; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
2597; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
2598; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
2599; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
2600; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
2601; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
2602; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
2603; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
2604; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
2605; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
2606; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
2607; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
2608; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
2609; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
2610; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
2611; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
2612; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
2613; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 29
2614; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 29
2615; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i3
2616; CHECK-NEXT:    store i3 [[TMP23]], i3 addrspace(1)* [[OUT:%.*]], align 1
2617; CHECK-NEXT:    ret void
2618;
2619; GCN-LABEL: srem_i3:
2620; GCN:       ; %bb.0:
2621; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2622; GCN-NEXT:    s_load_dword s2, s[0:1], 0xb
2623; GCN-NEXT:    s_mov_b32 s7, 0xf000
2624; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2625; GCN-NEXT:    s_bfe_i32 s0, s2, 0x30008
2626; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s0
2627; GCN-NEXT:    s_bfe_i32 s1, s2, 0x30000
2628; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s1
2629; GCN-NEXT:    s_xor_b32 s0, s1, s0
2630; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
2631; GCN-NEXT:    s_ashr_i32 s0, s0, 30
2632; GCN-NEXT:    s_lshr_b32 s3, s2, 8
2633; GCN-NEXT:    s_or_b32 s6, s0, 1
2634; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
2635; GCN-NEXT:    v_trunc_f32_e32 v2, v2
2636; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
2637; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
2638; GCN-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
2639; GCN-NEXT:    s_cmp_lg_u32 s0, 0
2640; GCN-NEXT:    s_cselect_b32 s0, s6, 0
2641; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
2642; GCN-NEXT:    v_mul_lo_u32 v0, v0, s3
2643; GCN-NEXT:    s_mov_b32 s6, -1
2644; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
2645; GCN-NEXT:    v_and_b32_e32 v0, 7, v0
2646; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
2647; GCN-NEXT:    s_endpgm
; Input IR under test: a single i3 srem whose result is stored to %out.
2648  %r = srem i3 %x, %y
2649  store i3 %r, i3 addrspace(1)* %out
2650  ret void
2651}
2652
; Unsigned division on a three-element i16 vector.
; The opt-side assertions expect the vector udiv to be scalarized: each of
; the three lanes is extracted, zero-extended to i32, divided with the
; float-reciprocal sequence plus one correction step, masked with 65535,
; truncated to i16, and inserted back into the result vector (starting from
; undef) before a single <3 x i16> store. The llc-side assertions pin the
; tahiti machine code, which writes the result as one dword store plus one
; short store at offset 4.
2653define amdgpu_kernel void @udiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
2654; CHECK-LABEL: @udiv_v3i16(
2655; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
2656; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
2657; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
2658; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
2659; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
2660; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
2661; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
2662; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
2663; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
2664; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
2665; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
2666; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
2667; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
2668; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
2669; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
2670; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
2671; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
2672; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 65535
2673; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
2674; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <3 x i16> undef, i16 [[TMP19]], i64 0
2675; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <3 x i16> [[X]], i64 1
2676; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <3 x i16> [[Y]], i64 1
2677; CHECK-NEXT:    [[TMP23:%.*]] = zext i16 [[TMP21]] to i32
2678; CHECK-NEXT:    [[TMP24:%.*]] = zext i16 [[TMP22]] to i32
2679; CHECK-NEXT:    [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float
2680; CHECK-NEXT:    [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float
2681; CHECK-NEXT:    [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]])
2682; CHECK-NEXT:    [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]]
2683; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]])
2684; CHECK-NEXT:    [[TMP30:%.*]] = fneg fast float [[TMP29]]
2685; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]])
2686; CHECK-NEXT:    [[TMP32:%.*]] = fptoui float [[TMP29]] to i32
2687; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]])
2688; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]])
2689; CHECK-NEXT:    [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]]
2690; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0
2691; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]]
2692; CHECK-NEXT:    [[TMP38:%.*]] = and i32 [[TMP37]], 65535
2693; CHECK-NEXT:    [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16
2694; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <3 x i16> [[TMP20]], i16 [[TMP39]], i64 1
2695; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <3 x i16> [[X]], i64 2
2696; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <3 x i16> [[Y]], i64 2
2697; CHECK-NEXT:    [[TMP43:%.*]] = zext i16 [[TMP41]] to i32
2698; CHECK-NEXT:    [[TMP44:%.*]] = zext i16 [[TMP42]] to i32
2699; CHECK-NEXT:    [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float
2700; CHECK-NEXT:    [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float
2701; CHECK-NEXT:    [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]])
2702; CHECK-NEXT:    [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]]
2703; CHECK-NEXT:    [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]])
2704; CHECK-NEXT:    [[TMP50:%.*]] = fneg fast float [[TMP49]]
2705; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]])
2706; CHECK-NEXT:    [[TMP52:%.*]] = fptoui float [[TMP49]] to i32
2707; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]])
2708; CHECK-NEXT:    [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]])
2709; CHECK-NEXT:    [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]]
2710; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0
2711; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]]
2712; CHECK-NEXT:    [[TMP58:%.*]] = and i32 [[TMP57]], 65535
2713; CHECK-NEXT:    [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16
2714; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <3 x i16> [[TMP40]], i16 [[TMP59]], i64 2
2715; CHECK-NEXT:    store <3 x i16> [[TMP60]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8
2716; CHECK-NEXT:    ret void
2717;
2718; GCN-LABEL: udiv_v3i16:
2719; GCN:       ; %bb.0:
2720; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2721; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
2722; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
2723; GCN-NEXT:    s_mov_b32 s8, 0xffff
2724; GCN-NEXT:    s_mov_b32 s7, 0xf000
2725; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2726; GCN-NEXT:    s_and_b32 s6, s0, s8
2727; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s6
2728; GCN-NEXT:    s_and_b32 s6, s2, s8
2729; GCN-NEXT:    s_lshr_b32 s0, s0, 16
2730; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s0
2731; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s6
2732; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
2733; GCN-NEXT:    s_lshr_b32 s0, s2, 16
2734; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s0
2735; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v3
2736; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
2737; GCN-NEXT:    v_trunc_f32_e32 v2, v2
2738; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
2739; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
2740; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
2741; GCN-NEXT:    v_mul_f32_e32 v1, v4, v5
2742; GCN-NEXT:    v_trunc_f32_e32 v1, v1
2743; GCN-NEXT:    s_and_b32 s0, s1, s8
2744; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
2745; GCN-NEXT:    v_mad_f32 v2, -v1, v3, v4
2746; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s0
2747; GCN-NEXT:    s_and_b32 s0, s3, s8
2748; GCN-NEXT:    v_cvt_f32_u32_e32 v5, s0
2749; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
2750; GCN-NEXT:    v_rcp_iflag_f32_e32 v6, v4
2751; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
2752; GCN-NEXT:    s_mov_b32 s6, -1
2753; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2754; GCN-NEXT:    v_mul_f32_e32 v2, v5, v6
2755; GCN-NEXT:    v_trunc_f32_e32 v2, v2
2756; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v2
2757; GCN-NEXT:    v_mad_f32 v2, -v2, v4, v5
2758; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
2759; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2760; GCN-NEXT:    v_and_b32_e32 v0, s8, v0
2761; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
2762; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
2763; GCN-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
2764; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2765; GCN-NEXT:    s_endpgm
; Input IR under test: a single <3 x i16> udiv whose result is stored to %out.
2766  %r = udiv <3 x i16> %x, %y
2767  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
2768  ret void
2769}
2770
2771define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
2772; CHECK-LABEL: @urem_v3i16(
2773; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
2774; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
2775; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
2776; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
2777; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
2778; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
2779; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
2780; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
2781; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
2782; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
2783; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
2784; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
2785; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
2786; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
2787; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
2788; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
2789; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
2790; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]]
2791; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]]
2792; CHECK-NEXT:    [[TMP20:%.*]] = and i32 [[TMP19]], 65535
2793; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
2794; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <3 x i16> undef, i16 [[TMP21]], i64 0
2795; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <3 x i16> [[X]], i64 1
2796; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <3 x i16> [[Y]], i64 1
2797; CHECK-NEXT:    [[TMP25:%.*]] = zext i16 [[TMP23]] to i32
2798; CHECK-NEXT:    [[TMP26:%.*]] = zext i16 [[TMP24]] to i32
2799; CHECK-NEXT:    [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float
2800; CHECK-NEXT:    [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float
2801; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]])
2802; CHECK-NEXT:    [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]]
2803; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]])
2804; CHECK-NEXT:    [[TMP32:%.*]] = fneg fast float [[TMP31]]
2805; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]])
2806; CHECK-NEXT:    [[TMP34:%.*]] = fptoui float [[TMP31]] to i32
2807; CHECK-NEXT:    [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
2808; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]])
2809; CHECK-NEXT:    [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]]
2810; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0
2811; CHECK-NEXT:    [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]]
2812; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]]
2813; CHECK-NEXT:    [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]]
2814; CHECK-NEXT:    [[TMP42:%.*]] = and i32 [[TMP41]], 65535
2815; CHECK-NEXT:    [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16
2816; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <3 x i16> [[TMP22]], i16 [[TMP43]], i64 1
2817; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <3 x i16> [[X]], i64 2
2818; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <3 x i16> [[Y]], i64 2
2819; CHECK-NEXT:    [[TMP47:%.*]] = zext i16 [[TMP45]] to i32
2820; CHECK-NEXT:    [[TMP48:%.*]] = zext i16 [[TMP46]] to i32
2821; CHECK-NEXT:    [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float
2822; CHECK-NEXT:    [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float
2823; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]])
2824; CHECK-NEXT:    [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]]
2825; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]])
2826; CHECK-NEXT:    [[TMP54:%.*]] = fneg fast float [[TMP53]]
2827; CHECK-NEXT:    [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]])
2828; CHECK-NEXT:    [[TMP56:%.*]] = fptoui float [[TMP53]] to i32
2829; CHECK-NEXT:    [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]])
2830; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]])
2831; CHECK-NEXT:    [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]]
2832; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0
2833; CHECK-NEXT:    [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]]
2834; CHECK-NEXT:    [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]]
2835; CHECK-NEXT:    [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]]
2836; CHECK-NEXT:    [[TMP64:%.*]] = and i32 [[TMP63]], 65535
2837; CHECK-NEXT:    [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16
2838; CHECK-NEXT:    [[TMP66:%.*]] = insertelement <3 x i16> [[TMP44]], i16 [[TMP65]], i64 2
2839; CHECK-NEXT:    store <3 x i16> [[TMP66]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8
2840; CHECK-NEXT:    ret void
2841;
2842; GCN-LABEL: urem_v3i16:
2843; GCN:       ; %bb.0:
2844; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2845; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
2846; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
2847; GCN-NEXT:    s_mov_b32 s8, 0xffff
2848; GCN-NEXT:    s_mov_b32 s7, 0xf000
2849; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2850; GCN-NEXT:    v_mov_b32_e32 v1, s2
2851; GCN-NEXT:    s_and_b32 s6, s0, s8
2852; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s6
2853; GCN-NEXT:    s_and_b32 s6, s2, s8
2854; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s6
2855; GCN-NEXT:    v_mov_b32_e32 v4, s0
2856; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v0
2857; GCN-NEXT:    v_alignbit_b32 v4, s1, v4, 16
2858; GCN-NEXT:    v_and_b32_e32 v5, s8, v4
2859; GCN-NEXT:    v_alignbit_b32 v1, s3, v1, 16
2860; GCN-NEXT:    v_mul_f32_e32 v3, v2, v3
2861; GCN-NEXT:    v_trunc_f32_e32 v3, v3
2862; GCN-NEXT:    v_mad_f32 v2, -v3, v0, v2
2863; GCN-NEXT:    v_cvt_u32_f32_e32 v6, v3
2864; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
2865; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v5
2866; GCN-NEXT:    v_and_b32_e32 v3, s8, v1
2867; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v6, vcc
2868; GCN-NEXT:    v_mul_lo_u32 v0, v0, s0
2869; GCN-NEXT:    s_and_b32 s0, s1, s8
2870; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v3
2871; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v2
2872; GCN-NEXT:    v_cvt_f32_u32_e32 v6, s0
2873; GCN-NEXT:    s_and_b32 s0, s3, s8
2874; GCN-NEXT:    v_cvt_f32_u32_e32 v7, s0
2875; GCN-NEXT:    v_mul_f32_e32 v5, v3, v5
2876; GCN-NEXT:    v_trunc_f32_e32 v5, v5
2877; GCN-NEXT:    v_rcp_iflag_f32_e32 v8, v6
2878; GCN-NEXT:    v_mad_f32 v3, -v5, v2, v3
2879; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v5
2880; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
2881; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
2882; GCN-NEXT:    v_mul_f32_e32 v3, v7, v8
2883; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
2884; GCN-NEXT:    v_trunc_f32_e32 v3, v3
2885; GCN-NEXT:    v_mul_lo_u32 v2, v2, v4
2886; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v3
2887; GCN-NEXT:    v_mad_f32 v3, -v3, v6, v7
2888; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v6
2889; GCN-NEXT:    s_mov_b32 s6, -1
2890; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
2891; GCN-NEXT:    v_mul_lo_u32 v3, v3, s1
2892; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
2893; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2894; GCN-NEXT:    v_and_b32_e32 v0, s8, v0
2895; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s3, v3
2896; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
2897; GCN-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
2898; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2899; GCN-NEXT:    s_endpgm
2900  %r = urem <3 x i16> %x, %y
2901  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
2902  ret void
2903}
2904
; sdiv_v3i16: signed division of two <3 x i16> kernel arguments, with the
; quotient vector stored to %out.
;
; The CHECK lines are the expected IR from the opt invocation in the file
; header (-amdgpu-codegenprepare). They show the vector sdiv split into three
; per-element sequences. Each element is extracted, sign-extended to i32 and
; converted with sitofp; the quotient estimate is fmul(x, rcp(y)) passed
; through llvm.trunc.f32. An fmad.ftz of the negated estimate, the divisor and
; the dividend is compared by magnitude against the divisor (fcmp oge) to
; select a correction of either 0 or "or (ashr (xor x, y), 30), 1", which is
; added to the fptosi of the estimate. The sum goes through shl 16 / ashr 16,
; is truncated to i16 and inserted into the result vector.
;
; The GCN lines are the expected assembly from the llc invocation in the file
; header (-mcpu=tahiti); the result is written with one short store at
; offset 4 and one dword store.
;
; Both sets of assertions are autogenerated (see the notes at the top of the
; file); regenerate them with those scripts instead of editing them by hand.
2905define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
2906; CHECK-LABEL: @sdiv_v3i16(
2907; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
2908; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
2909; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
2910; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
2911; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
2912; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
2913; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
2914; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
2915; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
2916; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
2917; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
2918; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
2919; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
2920; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
2921; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
2922; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
2923; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
2924; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
2925; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
2926; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
2927; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 16
2928; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
2929; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
2930; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <3 x i16> undef, i16 [[TMP23]], i64 0
2931; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <3 x i16> [[X]], i64 1
2932; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <3 x i16> [[Y]], i64 1
2933; CHECK-NEXT:    [[TMP27:%.*]] = sext i16 [[TMP25]] to i32
2934; CHECK-NEXT:    [[TMP28:%.*]] = sext i16 [[TMP26]] to i32
2935; CHECK-NEXT:    [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]]
2936; CHECK-NEXT:    [[TMP30:%.*]] = ashr i32 [[TMP29]], 30
2937; CHECK-NEXT:    [[TMP31:%.*]] = or i32 [[TMP30]], 1
2938; CHECK-NEXT:    [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float
2939; CHECK-NEXT:    [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float
2940; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
2941; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]]
2942; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]])
2943; CHECK-NEXT:    [[TMP37:%.*]] = fneg fast float [[TMP36]]
2944; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]])
2945; CHECK-NEXT:    [[TMP39:%.*]] = fptosi float [[TMP36]] to i32
2946; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]])
2947; CHECK-NEXT:    [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
2948; CHECK-NEXT:    [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]]
2949; CHECK-NEXT:    [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0
2950; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]]
2951; CHECK-NEXT:    [[TMP45:%.*]] = shl i32 [[TMP44]], 16
2952; CHECK-NEXT:    [[TMP46:%.*]] = ashr i32 [[TMP45]], 16
2953; CHECK-NEXT:    [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16
2954; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <3 x i16> [[TMP24]], i16 [[TMP47]], i64 1
2955; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <3 x i16> [[X]], i64 2
2956; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <3 x i16> [[Y]], i64 2
2957; CHECK-NEXT:    [[TMP51:%.*]] = sext i16 [[TMP49]] to i32
2958; CHECK-NEXT:    [[TMP52:%.*]] = sext i16 [[TMP50]] to i32
2959; CHECK-NEXT:    [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]]
2960; CHECK-NEXT:    [[TMP54:%.*]] = ashr i32 [[TMP53]], 30
2961; CHECK-NEXT:    [[TMP55:%.*]] = or i32 [[TMP54]], 1
2962; CHECK-NEXT:    [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float
2963; CHECK-NEXT:    [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float
2964; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]])
2965; CHECK-NEXT:    [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]]
2966; CHECK-NEXT:    [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]])
2967; CHECK-NEXT:    [[TMP61:%.*]] = fneg fast float [[TMP60]]
2968; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]])
2969; CHECK-NEXT:    [[TMP63:%.*]] = fptosi float [[TMP60]] to i32
2970; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]])
2971; CHECK-NEXT:    [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]])
2972; CHECK-NEXT:    [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]]
2973; CHECK-NEXT:    [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0
2974; CHECK-NEXT:    [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]]
2975; CHECK-NEXT:    [[TMP69:%.*]] = shl i32 [[TMP68]], 16
2976; CHECK-NEXT:    [[TMP70:%.*]] = ashr i32 [[TMP69]], 16
2977; CHECK-NEXT:    [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16
2978; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <3 x i16> [[TMP48]], i16 [[TMP71]], i64 2
2979; CHECK-NEXT:    store <3 x i16> [[TMP72]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8
2980; CHECK-NEXT:    ret void
2981;
2982; GCN-LABEL: sdiv_v3i16:
2983; GCN:       ; %bb.0:
2984; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2985; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
2986; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
2987; GCN-NEXT:    s_mov_b32 s7, 0xf000
2988; GCN-NEXT:    s_mov_b32 s6, -1
2989; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2990; GCN-NEXT:    s_sext_i32_i16 s9, s2
2991; GCN-NEXT:    s_sext_i32_i16 s8, s0
2992; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s8
2993; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s9
2994; GCN-NEXT:    s_xor_b32 s8, s9, s8
2995; GCN-NEXT:    s_ashr_i32 s8, s8, 30
2996; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
2997; GCN-NEXT:    s_or_b32 s10, s8, 1
2998; GCN-NEXT:    s_sext_i32_i16 s1, s1
2999; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
3000; GCN-NEXT:    v_trunc_f32_e32 v2, v2
3001; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
3002; GCN-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, |v0|
3003; GCN-NEXT:    s_cmp_lg_u32 s8, 0
3004; GCN-NEXT:    s_cselect_b32 s8, s10, 0
3005; GCN-NEXT:    s_ashr_i32 s0, s0, 16
3006; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
3007; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s0
3008; GCN-NEXT:    s_ashr_i32 s2, s2, 16
3009; GCN-NEXT:    s_xor_b32 s0, s2, s0
3010; GCN-NEXT:    v_add_i32_e32 v1, vcc, s8, v2
3011; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s2
3012; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v0
3013; GCN-NEXT:    s_ashr_i32 s0, s0, 30
3014; GCN-NEXT:    s_or_b32 s0, s0, 1
3015; GCN-NEXT:    v_and_b32_e32 v1, 0xffff, v1
3016; GCN-NEXT:    v_mul_f32_e32 v3, v2, v3
3017; GCN-NEXT:    v_trunc_f32_e32 v3, v3
3018; GCN-NEXT:    v_mad_f32 v2, -v3, v0, v2
3019; GCN-NEXT:    v_cvt_i32_f32_e32 v3, v3
3020; GCN-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v2|, |v0|
3021; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s1
3022; GCN-NEXT:    s_cmp_lg_u32 s8, 0
3023; GCN-NEXT:    s_cselect_b32 s0, s0, 0
3024; GCN-NEXT:    v_add_i32_e32 v2, vcc, s0, v3
3025; GCN-NEXT:    s_sext_i32_i16 s0, s3
3026; GCN-NEXT:    v_cvt_f32_i32_e32 v3, s0
3027; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v0
3028; GCN-NEXT:    s_xor_b32 s0, s0, s1
3029; GCN-NEXT:    s_ashr_i32 s0, s0, 30
3030; GCN-NEXT:    s_or_b32 s2, s0, 1
3031; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
3032; GCN-NEXT:    v_trunc_f32_e32 v4, v4
3033; GCN-NEXT:    v_mad_f32 v3, -v4, v0, v3
3034; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v4
3035; GCN-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, |v0|
3036; GCN-NEXT:    s_cmp_lg_u32 s0, 0
3037; GCN-NEXT:    s_cselect_b32 s0, s2, 0
3038; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
3039; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v4
3040; GCN-NEXT:    v_or_b32_e32 v1, v1, v2
3041; GCN-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
3042; GCN-NEXT:    buffer_store_dword v1, off, s[4:7], 0
3043; GCN-NEXT:    s_endpgm
; Input IR under test: one vector sdiv whose result is stored to %out.
3044  %r = sdiv <3 x i16> %x, %y
3045  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
3046  ret void
3047}
3048
; srem_v3i16: signed remainder of two <3 x i16> kernel arguments, with the
; remainder vector stored to %out.
;
; The CHECK lines are the expected IR from the opt invocation in the file
; header (-amdgpu-codegenprepare). They show the vector srem split into three
; per-element sequences. Each element is extracted, sign-extended to i32 and
; converted with sitofp; a quotient is formed from fmul(x, rcp(y)) passed
; through llvm.trunc.f32, then adjusted by a correction selected from an
; fcmp oge on the magnitudes of an fmad.ftz result and the divisor. The
; remainder is then computed as x - quotient * y (mul + sub), passed through
; shl 16 / ashr 16, truncated to i16 and inserted into the result vector.
;
; The GCN lines are the expected assembly from the llc invocation in the file
; header (-mcpu=tahiti); the result is written with one short store at
; offset 4 and one dword store.
;
; Both sets of assertions are autogenerated (see the notes at the top of the
; file); regenerate them with those scripts instead of editing them by hand.
3049define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
3050; CHECK-LABEL: @srem_v3i16(
3051; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
3052; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
3053; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
3054; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
3055; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
3056; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
3057; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
3058; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
3059; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
3060; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
3061; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
3062; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
3063; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
3064; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
3065; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
3066; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
3067; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
3068; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
3069; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
3070; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
3071; CHECK-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]]
3072; CHECK-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]]
3073; CHECK-NEXT:    [[TMP23:%.*]] = shl i32 [[TMP22]], 16
3074; CHECK-NEXT:    [[TMP24:%.*]] = ashr i32 [[TMP23]], 16
3075; CHECK-NEXT:    [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16
3076; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <3 x i16> undef, i16 [[TMP25]], i64 0
3077; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <3 x i16> [[X]], i64 1
3078; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <3 x i16> [[Y]], i64 1
3079; CHECK-NEXT:    [[TMP29:%.*]] = sext i16 [[TMP27]] to i32
3080; CHECK-NEXT:    [[TMP30:%.*]] = sext i16 [[TMP28]] to i32
3081; CHECK-NEXT:    [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]]
3082; CHECK-NEXT:    [[TMP32:%.*]] = ashr i32 [[TMP31]], 30
3083; CHECK-NEXT:    [[TMP33:%.*]] = or i32 [[TMP32]], 1
3084; CHECK-NEXT:    [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float
3085; CHECK-NEXT:    [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float
3086; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
3087; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]]
3088; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]])
3089; CHECK-NEXT:    [[TMP39:%.*]] = fneg fast float [[TMP38]]
3090; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]])
3091; CHECK-NEXT:    [[TMP41:%.*]] = fptosi float [[TMP38]] to i32
3092; CHECK-NEXT:    [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]])
3093; CHECK-NEXT:    [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]])
3094; CHECK-NEXT:    [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]]
3095; CHECK-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0
3096; CHECK-NEXT:    [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]]
3097; CHECK-NEXT:    [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]]
3098; CHECK-NEXT:    [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]]
3099; CHECK-NEXT:    [[TMP49:%.*]] = shl i32 [[TMP48]], 16
3100; CHECK-NEXT:    [[TMP50:%.*]] = ashr i32 [[TMP49]], 16
3101; CHECK-NEXT:    [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16
3102; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <3 x i16> [[TMP26]], i16 [[TMP51]], i64 1
3103; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <3 x i16> [[X]], i64 2
3104; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <3 x i16> [[Y]], i64 2
3105; CHECK-NEXT:    [[TMP55:%.*]] = sext i16 [[TMP53]] to i32
3106; CHECK-NEXT:    [[TMP56:%.*]] = sext i16 [[TMP54]] to i32
3107; CHECK-NEXT:    [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]]
3108; CHECK-NEXT:    [[TMP58:%.*]] = ashr i32 [[TMP57]], 30
3109; CHECK-NEXT:    [[TMP59:%.*]] = or i32 [[TMP58]], 1
3110; CHECK-NEXT:    [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float
3111; CHECK-NEXT:    [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float
3112; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]])
3113; CHECK-NEXT:    [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]]
3114; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]])
3115; CHECK-NEXT:    [[TMP65:%.*]] = fneg fast float [[TMP64]]
3116; CHECK-NEXT:    [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]])
3117; CHECK-NEXT:    [[TMP67:%.*]] = fptosi float [[TMP64]] to i32
3118; CHECK-NEXT:    [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
3119; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]])
3120; CHECK-NEXT:    [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]]
3121; CHECK-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0
3122; CHECK-NEXT:    [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]]
3123; CHECK-NEXT:    [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]]
3124; CHECK-NEXT:    [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]]
3125; CHECK-NEXT:    [[TMP75:%.*]] = shl i32 [[TMP74]], 16
3126; CHECK-NEXT:    [[TMP76:%.*]] = ashr i32 [[TMP75]], 16
3127; CHECK-NEXT:    [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16
3128; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <3 x i16> [[TMP52]], i16 [[TMP77]], i64 2
3129; CHECK-NEXT:    store <3 x i16> [[TMP78]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8
3130; CHECK-NEXT:    ret void
3131;
3132; GCN-LABEL: srem_v3i16:
3133; GCN:       ; %bb.0:
3134; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3135; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
3136; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
3137; GCN-NEXT:    s_mov_b32 s7, 0xf000
3138; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3139; GCN-NEXT:    s_sext_i32_i16 s8, s2
3140; GCN-NEXT:    s_sext_i32_i16 s6, s0
3141; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s6
3142; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s8
3143; GCN-NEXT:    s_xor_b32 s6, s8, s6
3144; GCN-NEXT:    s_ashr_i32 s6, s6, 30
3145; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
3146; GCN-NEXT:    s_or_b32 s6, s6, 1
3147; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
3148; GCN-NEXT:    v_trunc_f32_e32 v2, v2
3149; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
3150; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
3151; GCN-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v1|, |v0|
3152; GCN-NEXT:    s_cmp_lg_u32 s8, 0
3153; GCN-NEXT:    s_cselect_b32 s6, s6, 0
3154; GCN-NEXT:    v_add_i32_e32 v0, vcc, s6, v2
3155; GCN-NEXT:    v_mov_b32_e32 v2, s0
3156; GCN-NEXT:    v_alignbit_b32 v2, s1, v2, 16
3157; GCN-NEXT:    v_bfe_i32 v3, v2, 0, 16
3158; GCN-NEXT:    v_cvt_f32_i32_e32 v4, v3
3159; GCN-NEXT:    v_mov_b32_e32 v1, s2
3160; GCN-NEXT:    v_alignbit_b32 v1, s3, v1, 16
3161; GCN-NEXT:    v_bfe_i32 v5, v1, 0, 16
3162; GCN-NEXT:    v_cvt_f32_i32_e32 v6, v5
3163; GCN-NEXT:    v_rcp_iflag_f32_e32 v7, v4
3164; GCN-NEXT:    v_mul_lo_u32 v0, v0, s0
3165; GCN-NEXT:    v_xor_b32_e32 v3, v5, v3
3166; GCN-NEXT:    s_sext_i32_i16 s0, s1
3167; GCN-NEXT:    v_mul_f32_e32 v5, v6, v7
3168; GCN-NEXT:    v_trunc_f32_e32 v5, v5
3169; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
3170; GCN-NEXT:    v_mad_f32 v6, -v5, v4, v6
3171; GCN-NEXT:    v_cvt_i32_f32_e32 v5, v5
3172; GCN-NEXT:    v_ashrrev_i32_e32 v3, 30, v3
3173; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, |v4|
3174; GCN-NEXT:    v_cvt_f32_i32_e32 v4, s0
3175; GCN-NEXT:    v_or_b32_e32 v3, 1, v3
3176; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
3177; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
3178; GCN-NEXT:    s_sext_i32_i16 s2, s3
3179; GCN-NEXT:    v_mul_lo_u32 v2, v3, v2
3180; GCN-NEXT:    v_cvt_f32_i32_e32 v3, s2
3181; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v4
3182; GCN-NEXT:    s_xor_b32 s0, s2, s0
3183; GCN-NEXT:    s_ashr_i32 s0, s0, 30
3184; GCN-NEXT:    s_or_b32 s0, s0, 1
3185; GCN-NEXT:    v_mul_f32_e32 v5, v3, v5
3186; GCN-NEXT:    v_trunc_f32_e32 v5, v5
3187; GCN-NEXT:    v_mad_f32 v3, -v5, v4, v3
3188; GCN-NEXT:    v_cvt_i32_f32_e32 v5, v5
3189; GCN-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v3|, |v4|
3190; GCN-NEXT:    s_cmp_lg_u32 s8, 0
3191; GCN-NEXT:    s_cselect_b32 s0, s0, 0
3192; GCN-NEXT:    v_add_i32_e32 v3, vcc, s0, v5
3193; GCN-NEXT:    v_mul_lo_u32 v3, v3, s1
3194; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
3195; GCN-NEXT:    s_mov_b32 s6, -1
3196; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
3197; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
3198; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s3, v3
3199; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
3200; GCN-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
3201; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3202; GCN-NEXT:    s_endpgm
; Input IR under test: one vector srem whose result is stored to %out.
3203  %r = srem <3 x i16> %x, %y
3204  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
3205  ret void
3206}
3207
; udiv_v3i15: unsigned division of two <3 x i15> kernel arguments (a
; non-power-of-two element width), with the quotient vector stored to %out.
;
; The CHECK lines are the expected IR from the opt invocation in the file
; header (-amdgpu-codegenprepare). They show the vector udiv split into three
; per-element sequences. Each element is extracted, zero-extended to i32 and
; converted with uitofp; the quotient estimate is fmul(x, rcp(y)) passed
; through llvm.trunc.f32. An fmad.ftz of the negated estimate, the divisor and
; the dividend is compared by magnitude against the divisor (fcmp oge) to
; select a correction of 1 or 0, which is added to the fptoui of the estimate.
; The sum is masked with 32767, truncated to i15 and inserted into the result
; vector.
;
; The GCN lines are the expected assembly from the llc invocation in the file
; header (-mcpu=tahiti); the result is written with one dword store and one
; short store at offset 4.
;
; Both sets of assertions are autogenerated (see the notes at the top of the
; file); regenerate them with those scripts instead of editing them by hand.
3208define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
3209; CHECK-LABEL: @udiv_v3i15(
3210; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
3211; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
3212; CHECK-NEXT:    [[TMP3:%.*]] = zext i15 [[TMP1]] to i32
3213; CHECK-NEXT:    [[TMP4:%.*]] = zext i15 [[TMP2]] to i32
3214; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
3215; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
3216; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
3217; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
3218; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
3219; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
3220; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
3221; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
3222; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
3223; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
3224; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
3225; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
3226; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
3227; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 32767
3228; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i15
3229; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <3 x i15> undef, i15 [[TMP19]], i64 0
3230; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <3 x i15> [[X]], i64 1
3231; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <3 x i15> [[Y]], i64 1
3232; CHECK-NEXT:    [[TMP23:%.*]] = zext i15 [[TMP21]] to i32
3233; CHECK-NEXT:    [[TMP24:%.*]] = zext i15 [[TMP22]] to i32
3234; CHECK-NEXT:    [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float
3235; CHECK-NEXT:    [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float
3236; CHECK-NEXT:    [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]])
3237; CHECK-NEXT:    [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]]
3238; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]])
3239; CHECK-NEXT:    [[TMP30:%.*]] = fneg fast float [[TMP29]]
3240; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]])
3241; CHECK-NEXT:    [[TMP32:%.*]] = fptoui float [[TMP29]] to i32
3242; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]])
3243; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]])
3244; CHECK-NEXT:    [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]]
3245; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0
3246; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]]
3247; CHECK-NEXT:    [[TMP38:%.*]] = and i32 [[TMP37]], 32767
3248; CHECK-NEXT:    [[TMP39:%.*]] = trunc i32 [[TMP38]] to i15
3249; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <3 x i15> [[TMP20]], i15 [[TMP39]], i64 1
3250; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <3 x i15> [[X]], i64 2
3251; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <3 x i15> [[Y]], i64 2
3252; CHECK-NEXT:    [[TMP43:%.*]] = zext i15 [[TMP41]] to i32
3253; CHECK-NEXT:    [[TMP44:%.*]] = zext i15 [[TMP42]] to i32
3254; CHECK-NEXT:    [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float
3255; CHECK-NEXT:    [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float
3256; CHECK-NEXT:    [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]])
3257; CHECK-NEXT:    [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]]
3258; CHECK-NEXT:    [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]])
3259; CHECK-NEXT:    [[TMP50:%.*]] = fneg fast float [[TMP49]]
3260; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]])
3261; CHECK-NEXT:    [[TMP52:%.*]] = fptoui float [[TMP49]] to i32
3262; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]])
3263; CHECK-NEXT:    [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]])
3264; CHECK-NEXT:    [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]]
3265; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0
3266; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]]
3267; CHECK-NEXT:    [[TMP58:%.*]] = and i32 [[TMP57]], 32767
3268; CHECK-NEXT:    [[TMP59:%.*]] = trunc i32 [[TMP58]] to i15
3269; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <3 x i15> [[TMP40]], i15 [[TMP59]], i64 2
3270; CHECK-NEXT:    store <3 x i15> [[TMP60]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8
3271; CHECK-NEXT:    ret void
3272;
3273; GCN-LABEL: udiv_v3i15:
3274; GCN:       ; %bb.0:
3275; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3276; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
3277; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
3278; GCN-NEXT:    s_mov_b32 s7, 0xf000
3279; GCN-NEXT:    s_mov_b32 s6, -1
3280; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3281; GCN-NEXT:    v_mov_b32_e32 v0, s2
3282; GCN-NEXT:    v_alignbit_b32 v0, s3, v0, 30
3283; GCN-NEXT:    s_movk_i32 s3, 0x7fff
3284; GCN-NEXT:    s_and_b32 s9, s0, s3
3285; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s9
3286; GCN-NEXT:    v_mov_b32_e32 v2, s0
3287; GCN-NEXT:    s_and_b32 s8, s2, s3
3288; GCN-NEXT:    s_bfe_u32 s0, s0, 0xf000f
3289; GCN-NEXT:    v_cvt_f32_u32_e32 v5, s0
3290; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s8
3291; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v1
3292; GCN-NEXT:    s_bfe_u32 s2, s2, 0xf000f
3293; GCN-NEXT:    v_alignbit_b32 v2, s1, v2, 30
3294; GCN-NEXT:    v_cvt_f32_u32_e32 v6, s2
3295; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
3296; GCN-NEXT:    v_rcp_iflag_f32_e32 v7, v5
3297; GCN-NEXT:    v_and_b32_e32 v2, s3, v2
3298; GCN-NEXT:    v_trunc_f32_e32 v4, v4
3299; GCN-NEXT:    v_mad_f32 v3, -v4, v1, v3
3300; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v4
3301; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v2
3302; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
3303; GCN-NEXT:    v_mul_f32_e32 v1, v6, v7
3304; GCN-NEXT:    v_and_b32_e32 v0, s3, v0
3305; GCN-NEXT:    v_trunc_f32_e32 v1, v1
3306; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
3307; GCN-NEXT:    v_mad_f32 v4, -v1, v5, v6
3308; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
3309; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v0
3310; GCN-NEXT:    v_rcp_iflag_f32_e32 v6, v2
3311; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v5
3312; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
3313; GCN-NEXT:    v_mul_f32_e32 v1, v0, v6
3314; GCN-NEXT:    v_trunc_f32_e32 v1, v1
3315; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v1
3316; GCN-NEXT:    v_mad_f32 v0, -v1, v2, v0
3317; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, v2
3318; GCN-NEXT:    v_and_b32_e32 v2, s3, v3
3319; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v5, vcc
3320; GCN-NEXT:    v_and_b32_e32 v3, s3, v4
3321; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
3322; GCN-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
3323; GCN-NEXT:    v_or_b32_e32 v2, v2, v3
3324; GCN-NEXT:    v_or_b32_e32 v0, v2, v0
3325; GCN-NEXT:    v_and_b32_e32 v1, 0x1fff, v1
3326; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3327; GCN-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
3328; GCN-NEXT:    s_endpgm
; Input IR under test: one vector udiv whose result is stored to %out.
3329  %r = udiv <3 x i15> %x, %y
3330  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
3331  ret void
3332}
3333
3334define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
3335; CHECK-LABEL: @urem_v3i15(
3336; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
3337; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
3338; CHECK-NEXT:    [[TMP3:%.*]] = zext i15 [[TMP1]] to i32
3339; CHECK-NEXT:    [[TMP4:%.*]] = zext i15 [[TMP2]] to i32
3340; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
3341; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
3342; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
3343; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
3344; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
3345; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
3346; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
3347; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
3348; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
3349; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
3350; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
3351; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
3352; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
3353; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]]
3354; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]]
3355; CHECK-NEXT:    [[TMP20:%.*]] = and i32 [[TMP19]], 32767
3356; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i15
3357; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <3 x i15> undef, i15 [[TMP21]], i64 0
3358; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <3 x i15> [[X]], i64 1
3359; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <3 x i15> [[Y]], i64 1
3360; CHECK-NEXT:    [[TMP25:%.*]] = zext i15 [[TMP23]] to i32
3361; CHECK-NEXT:    [[TMP26:%.*]] = zext i15 [[TMP24]] to i32
3362; CHECK-NEXT:    [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float
3363; CHECK-NEXT:    [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float
3364; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]])
3365; CHECK-NEXT:    [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]]
3366; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]])
3367; CHECK-NEXT:    [[TMP32:%.*]] = fneg fast float [[TMP31]]
3368; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]])
3369; CHECK-NEXT:    [[TMP34:%.*]] = fptoui float [[TMP31]] to i32
3370; CHECK-NEXT:    [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
3371; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]])
3372; CHECK-NEXT:    [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]]
3373; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0
3374; CHECK-NEXT:    [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]]
3375; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]]
3376; CHECK-NEXT:    [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]]
3377; CHECK-NEXT:    [[TMP42:%.*]] = and i32 [[TMP41]], 32767
3378; CHECK-NEXT:    [[TMP43:%.*]] = trunc i32 [[TMP42]] to i15
3379; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <3 x i15> [[TMP22]], i15 [[TMP43]], i64 1
3380; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <3 x i15> [[X]], i64 2
3381; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <3 x i15> [[Y]], i64 2
3382; CHECK-NEXT:    [[TMP47:%.*]] = zext i15 [[TMP45]] to i32
3383; CHECK-NEXT:    [[TMP48:%.*]] = zext i15 [[TMP46]] to i32
3384; CHECK-NEXT:    [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float
3385; CHECK-NEXT:    [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float
3386; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]])
3387; CHECK-NEXT:    [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]]
3388; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]])
3389; CHECK-NEXT:    [[TMP54:%.*]] = fneg fast float [[TMP53]]
3390; CHECK-NEXT:    [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]])
3391; CHECK-NEXT:    [[TMP56:%.*]] = fptoui float [[TMP53]] to i32
3392; CHECK-NEXT:    [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]])
3393; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]])
3394; CHECK-NEXT:    [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]]
3395; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0
3396; CHECK-NEXT:    [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]]
3397; CHECK-NEXT:    [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]]
3398; CHECK-NEXT:    [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]]
3399; CHECK-NEXT:    [[TMP64:%.*]] = and i32 [[TMP63]], 32767
3400; CHECK-NEXT:    [[TMP65:%.*]] = trunc i32 [[TMP64]] to i15
3401; CHECK-NEXT:    [[TMP66:%.*]] = insertelement <3 x i15> [[TMP44]], i15 [[TMP65]], i64 2
3402; CHECK-NEXT:    store <3 x i15> [[TMP66]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8
3403; CHECK-NEXT:    ret void
3404;
3405; GCN-LABEL: urem_v3i15:
3406; GCN:       ; %bb.0:
3407; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3408; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
3409; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
3410; GCN-NEXT:    s_mov_b32 s7, 0xf000
3411; GCN-NEXT:    s_mov_b32 s6, -1
3412; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3413; GCN-NEXT:    v_mov_b32_e32 v0, s2
3414; GCN-NEXT:    v_alignbit_b32 v0, s3, v0, 30
3415; GCN-NEXT:    s_movk_i32 s3, 0x7fff
3416; GCN-NEXT:    s_and_b32 s10, s0, s3
3417; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s10
3418; GCN-NEXT:    s_and_b32 s9, s2, s3
3419; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s9
3420; GCN-NEXT:    v_mov_b32_e32 v2, s0
3421; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v1
3422; GCN-NEXT:    v_alignbit_b32 v2, s1, v2, 30
3423; GCN-NEXT:    s_bfe_u32 s1, s0, 0xf000f
3424; GCN-NEXT:    v_cvt_f32_u32_e32 v5, s1
3425; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
3426; GCN-NEXT:    v_trunc_f32_e32 v4, v4
3427; GCN-NEXT:    v_mad_f32 v3, -v4, v1, v3
3428; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v4
3429; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
3430; GCN-NEXT:    s_bfe_u32 s10, s2, 0xf000f
3431; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s10
3432; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v4, vcc
3433; GCN-NEXT:    v_mul_lo_u32 v1, v1, s0
3434; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v5
3435; GCN-NEXT:    v_and_b32_e32 v2, s3, v2
3436; GCN-NEXT:    v_and_b32_e32 v0, s3, v0
3437; GCN-NEXT:    v_sub_i32_e32 v6, vcc, s2, v1
3438; GCN-NEXT:    v_mul_f32_e32 v1, v3, v4
3439; GCN-NEXT:    v_cvt_f32_u32_e32 v4, v2
3440; GCN-NEXT:    v_cvt_f32_u32_e32 v7, v0
3441; GCN-NEXT:    v_trunc_f32_e32 v1, v1
3442; GCN-NEXT:    v_mad_f32 v3, -v1, v5, v3
3443; GCN-NEXT:    v_rcp_iflag_f32_e32 v8, v4
3444; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
3445; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
3446; GCN-NEXT:    s_lshr_b32 s0, s0, 15
3447; GCN-NEXT:    v_mul_f32_e32 v3, v7, v8
3448; GCN-NEXT:    v_trunc_f32_e32 v3, v3
3449; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v3
3450; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3451; GCN-NEXT:    v_mad_f32 v3, -v3, v4, v7
3452; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
3453; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
3454; GCN-NEXT:    v_mul_lo_u32 v1, v1, s0
3455; GCN-NEXT:    v_mul_lo_u32 v2, v3, v2
3456; GCN-NEXT:    s_lshr_b32 s8, s2, 15
3457; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s8, v1
3458; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v2, v0
3459; GCN-NEXT:    v_and_b32_e32 v3, s3, v3
3460; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
3461; GCN-NEXT:    v_and_b32_e32 v2, s3, v6
3462; GCN-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
3463; GCN-NEXT:    v_or_b32_e32 v2, v2, v3
3464; GCN-NEXT:    v_or_b32_e32 v0, v2, v0
3465; GCN-NEXT:    v_and_b32_e32 v1, 0x1fff, v1
3466; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3467; GCN-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
3468; GCN-NEXT:    s_endpgm
3469  %r = urem <3 x i15> %x, %y
3470  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
3471  ret void
3472}
3473
; Signed division of <3 x i15> vectors: stores `sdiv <3 x i15> %x, %y` to %out.
; The opt assertions expect the division to be expanded per lane: each lane is
; extracted, sign-extended to i32 and converted to float; the quotient is
; estimated as trunc(x * rcp(y)) and then adjusted by ((x ^ y) ashr 30) | 1
; when |fmad(-q, y, x)| >= |y|. The i32 result is sign-extended in-register
; from 15 bits (shl 17 / ashr 17), truncated to i15 and inserted into the
; result vector.
; The llc assertions expect the three lanes to be packed and written with a
; dword store plus a short store at offset 4 (upper part masked with 0x1fff).
3474define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
3475; CHECK-LABEL: @sdiv_v3i15(
3476; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
3477; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
3478; CHECK-NEXT:    [[TMP3:%.*]] = sext i15 [[TMP1]] to i32
3479; CHECK-NEXT:    [[TMP4:%.*]] = sext i15 [[TMP2]] to i32
3480; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
3481; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
3482; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
3483; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
3484; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
3485; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
3486; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
3487; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
3488; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
3489; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
3490; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
3491; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
3492; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
3493; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
3494; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
3495; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
3496; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 17
3497; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 17
3498; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i15
3499; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <3 x i15> undef, i15 [[TMP23]], i64 0
3500; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <3 x i15> [[X]], i64 1
3501; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <3 x i15> [[Y]], i64 1
3502; CHECK-NEXT:    [[TMP27:%.*]] = sext i15 [[TMP25]] to i32
3503; CHECK-NEXT:    [[TMP28:%.*]] = sext i15 [[TMP26]] to i32
3504; CHECK-NEXT:    [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]]
3505; CHECK-NEXT:    [[TMP30:%.*]] = ashr i32 [[TMP29]], 30
3506; CHECK-NEXT:    [[TMP31:%.*]] = or i32 [[TMP30]], 1
3507; CHECK-NEXT:    [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float
3508; CHECK-NEXT:    [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float
3509; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
3510; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]]
3511; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]])
3512; CHECK-NEXT:    [[TMP37:%.*]] = fneg fast float [[TMP36]]
3513; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]])
3514; CHECK-NEXT:    [[TMP39:%.*]] = fptosi float [[TMP36]] to i32
3515; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]])
3516; CHECK-NEXT:    [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
3517; CHECK-NEXT:    [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]]
3518; CHECK-NEXT:    [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0
3519; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]]
3520; CHECK-NEXT:    [[TMP45:%.*]] = shl i32 [[TMP44]], 17
3521; CHECK-NEXT:    [[TMP46:%.*]] = ashr i32 [[TMP45]], 17
3522; CHECK-NEXT:    [[TMP47:%.*]] = trunc i32 [[TMP46]] to i15
3523; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <3 x i15> [[TMP24]], i15 [[TMP47]], i64 1
3524; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <3 x i15> [[X]], i64 2
3525; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <3 x i15> [[Y]], i64 2
3526; CHECK-NEXT:    [[TMP51:%.*]] = sext i15 [[TMP49]] to i32
3527; CHECK-NEXT:    [[TMP52:%.*]] = sext i15 [[TMP50]] to i32
3528; CHECK-NEXT:    [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]]
3529; CHECK-NEXT:    [[TMP54:%.*]] = ashr i32 [[TMP53]], 30
3530; CHECK-NEXT:    [[TMP55:%.*]] = or i32 [[TMP54]], 1
3531; CHECK-NEXT:    [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float
3532; CHECK-NEXT:    [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float
3533; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]])
3534; CHECK-NEXT:    [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]]
3535; CHECK-NEXT:    [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]])
3536; CHECK-NEXT:    [[TMP61:%.*]] = fneg fast float [[TMP60]]
3537; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]])
3538; CHECK-NEXT:    [[TMP63:%.*]] = fptosi float [[TMP60]] to i32
3539; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]])
3540; CHECK-NEXT:    [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]])
3541; CHECK-NEXT:    [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]]
3542; CHECK-NEXT:    [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0
3543; CHECK-NEXT:    [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]]
3544; CHECK-NEXT:    [[TMP69:%.*]] = shl i32 [[TMP68]], 17
3545; CHECK-NEXT:    [[TMP70:%.*]] = ashr i32 [[TMP69]], 17
3546; CHECK-NEXT:    [[TMP71:%.*]] = trunc i32 [[TMP70]] to i15
3547; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <3 x i15> [[TMP48]], i15 [[TMP71]], i64 2
3548; CHECK-NEXT:    store <3 x i15> [[TMP72]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8
3549; CHECK-NEXT:    ret void
3550;
3551; GCN-LABEL: sdiv_v3i15:
3552; GCN:       ; %bb.0:
3553; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3554; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
3555; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
3556; GCN-NEXT:    s_mov_b32 s7, 0xf000
3557; GCN-NEXT:    s_mov_b32 s6, -1
3558; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3559; GCN-NEXT:    v_mov_b32_e32 v0, s2
3560; GCN-NEXT:    v_alignbit_b32 v0, s3, v0, 30
3561; GCN-NEXT:    s_bfe_i32 s3, s0, 0xf0000
3562; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s3
3563; GCN-NEXT:    v_mov_b32_e32 v1, s0
3564; GCN-NEXT:    v_alignbit_b32 v1, s1, v1, 30
3565; GCN-NEXT:    s_bfe_i32 s1, s2, 0xf0000
3566; GCN-NEXT:    v_cvt_f32_i32_e32 v3, s1
3567; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
3568; GCN-NEXT:    s_xor_b32 s1, s1, s3
3569; GCN-NEXT:    s_ashr_i32 s1, s1, 30
3570; GCN-NEXT:    s_or_b32 s1, s1, 1
3571; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
3572; GCN-NEXT:    v_trunc_f32_e32 v4, v4
3573; GCN-NEXT:    v_mad_f32 v3, -v4, v2, v3
3574; GCN-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v3|, |v2|
3575; GCN-NEXT:    s_cmp_lg_u32 s8, 0
3576; GCN-NEXT:    s_cselect_b32 s1, s1, 0
3577; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v4
3578; GCN-NEXT:    s_bfe_i32 s0, s0, 0xf000f
3579; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s0
3580; GCN-NEXT:    v_bfe_i32 v1, v1, 0, 15
3581; GCN-NEXT:    v_add_i32_e32 v3, vcc, s1, v4
3582; GCN-NEXT:    s_bfe_i32 s1, s2, 0xf000f
3583; GCN-NEXT:    v_cvt_f32_i32_e32 v4, s1
3584; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v2
3585; GCN-NEXT:    s_xor_b32 s0, s1, s0
3586; GCN-NEXT:    s_ashr_i32 s0, s0, 30
3587; GCN-NEXT:    s_or_b32 s2, s0, 1
3588; GCN-NEXT:    v_mul_f32_e32 v5, v4, v5
3589; GCN-NEXT:    v_trunc_f32_e32 v5, v5
3590; GCN-NEXT:    v_mad_f32 v4, -v5, v2, v4
3591; GCN-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v2|
3592; GCN-NEXT:    v_cvt_f32_i32_e32 v2, v1
3593; GCN-NEXT:    v_cvt_i32_f32_e32 v5, v5
3594; GCN-NEXT:    s_cmp_lg_u32 s0, 0
3595; GCN-NEXT:    s_cselect_b32 s0, s2, 0
3596; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 15
3597; GCN-NEXT:    v_add_i32_e32 v4, vcc, s0, v5
3598; GCN-NEXT:    v_cvt_f32_i32_e32 v5, v0
3599; GCN-NEXT:    v_rcp_iflag_f32_e32 v6, v2
3600; GCN-NEXT:    v_xor_b32_e32 v0, v0, v1
3601; GCN-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
3602; GCN-NEXT:    v_or_b32_e32 v0, 1, v0
3603; GCN-NEXT:    v_mul_f32_e32 v1, v5, v6
3604; GCN-NEXT:    v_trunc_f32_e32 v1, v1
3605; GCN-NEXT:    v_mad_f32 v5, -v1, v2, v5
3606; GCN-NEXT:    v_cvt_i32_f32_e32 v1, v1
3607; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, |v2|
3608; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
3609; GCN-NEXT:    s_movk_i32 s0, 0x7fff
3610; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
3611; GCN-NEXT:    v_and_b32_e32 v2, s0, v3
3612; GCN-NEXT:    v_and_b32_e32 v3, s0, v4
3613; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
3614; GCN-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
3615; GCN-NEXT:    v_or_b32_e32 v2, v2, v3
3616; GCN-NEXT:    v_or_b32_e32 v0, v2, v0
3617; GCN-NEXT:    v_and_b32_e32 v1, 0x1fff, v1
3618; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3619; GCN-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
3620; GCN-NEXT:    s_endpgm
  ; Input IR under test: a single vector sdiv whose result is stored to %out.
3621  %r = sdiv <3 x i15> %x, %y
3622  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
3623  ret void
3624}
3625
; Signed remainder of <3 x i15> vectors: stores `srem <3 x i15> %x, %y` to %out.
; The opt assertions expect the same per-lane expansion as a signed division
; (sext to i32, float quotient estimate trunc(x * rcp(y)), adjustment by
; ((x ^ y) ashr 30) | 1 when |fmad(-q, y, x)| >= |y|), followed by
; rem = x - q * y. The i32 remainder is sign-extended in-register from 15 bits
; (shl 17 / ashr 17), truncated to i15 and inserted into the result vector.
; The llc assertions expect the three lanes to be packed and written with a
; dword store plus a short store at offset 4 (upper part masked with 0x1fff).
3626define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
3627; CHECK-LABEL: @srem_v3i15(
3628; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
3629; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
3630; CHECK-NEXT:    [[TMP3:%.*]] = sext i15 [[TMP1]] to i32
3631; CHECK-NEXT:    [[TMP4:%.*]] = sext i15 [[TMP2]] to i32
3632; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
3633; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
3634; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
3635; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
3636; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
3637; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
3638; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
3639; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
3640; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
3641; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
3642; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
3643; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
3644; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
3645; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
3646; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
3647; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
3648; CHECK-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]]
3649; CHECK-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]]
3650; CHECK-NEXT:    [[TMP23:%.*]] = shl i32 [[TMP22]], 17
3651; CHECK-NEXT:    [[TMP24:%.*]] = ashr i32 [[TMP23]], 17
3652; CHECK-NEXT:    [[TMP25:%.*]] = trunc i32 [[TMP24]] to i15
3653; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <3 x i15> undef, i15 [[TMP25]], i64 0
3654; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <3 x i15> [[X]], i64 1
3655; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <3 x i15> [[Y]], i64 1
3656; CHECK-NEXT:    [[TMP29:%.*]] = sext i15 [[TMP27]] to i32
3657; CHECK-NEXT:    [[TMP30:%.*]] = sext i15 [[TMP28]] to i32
3658; CHECK-NEXT:    [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]]
3659; CHECK-NEXT:    [[TMP32:%.*]] = ashr i32 [[TMP31]], 30
3660; CHECK-NEXT:    [[TMP33:%.*]] = or i32 [[TMP32]], 1
3661; CHECK-NEXT:    [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float
3662; CHECK-NEXT:    [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float
3663; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
3664; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]]
3665; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]])
3666; CHECK-NEXT:    [[TMP39:%.*]] = fneg fast float [[TMP38]]
3667; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]])
3668; CHECK-NEXT:    [[TMP41:%.*]] = fptosi float [[TMP38]] to i32
3669; CHECK-NEXT:    [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]])
3670; CHECK-NEXT:    [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]])
3671; CHECK-NEXT:    [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]]
3672; CHECK-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0
3673; CHECK-NEXT:    [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]]
3674; CHECK-NEXT:    [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]]
3675; CHECK-NEXT:    [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]]
3676; CHECK-NEXT:    [[TMP49:%.*]] = shl i32 [[TMP48]], 17
3677; CHECK-NEXT:    [[TMP50:%.*]] = ashr i32 [[TMP49]], 17
3678; CHECK-NEXT:    [[TMP51:%.*]] = trunc i32 [[TMP50]] to i15
3679; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <3 x i15> [[TMP26]], i15 [[TMP51]], i64 1
3680; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <3 x i15> [[X]], i64 2
3681; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <3 x i15> [[Y]], i64 2
3682; CHECK-NEXT:    [[TMP55:%.*]] = sext i15 [[TMP53]] to i32
3683; CHECK-NEXT:    [[TMP56:%.*]] = sext i15 [[TMP54]] to i32
3684; CHECK-NEXT:    [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]]
3685; CHECK-NEXT:    [[TMP58:%.*]] = ashr i32 [[TMP57]], 30
3686; CHECK-NEXT:    [[TMP59:%.*]] = or i32 [[TMP58]], 1
3687; CHECK-NEXT:    [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float
3688; CHECK-NEXT:    [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float
3689; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]])
3690; CHECK-NEXT:    [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]]
3691; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]])
3692; CHECK-NEXT:    [[TMP65:%.*]] = fneg fast float [[TMP64]]
3693; CHECK-NEXT:    [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]])
3694; CHECK-NEXT:    [[TMP67:%.*]] = fptosi float [[TMP64]] to i32
3695; CHECK-NEXT:    [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
3696; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]])
3697; CHECK-NEXT:    [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]]
3698; CHECK-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0
3699; CHECK-NEXT:    [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]]
3700; CHECK-NEXT:    [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]]
3701; CHECK-NEXT:    [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]]
3702; CHECK-NEXT:    [[TMP75:%.*]] = shl i32 [[TMP74]], 17
3703; CHECK-NEXT:    [[TMP76:%.*]] = ashr i32 [[TMP75]], 17
3704; CHECK-NEXT:    [[TMP77:%.*]] = trunc i32 [[TMP76]] to i15
3705; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <3 x i15> [[TMP52]], i15 [[TMP77]], i64 2
3706; CHECK-NEXT:    store <3 x i15> [[TMP78]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8
3707; CHECK-NEXT:    ret void
3708;
3709; GCN-LABEL: srem_v3i15:
3710; GCN:       ; %bb.0:
3711; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3712; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
3713; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
3714; GCN-NEXT:    s_mov_b32 s7, 0xf000
3715; GCN-NEXT:    s_mov_b32 s6, -1
3716; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3717; GCN-NEXT:    v_mov_b32_e32 v0, s2
3718; GCN-NEXT:    v_mov_b32_e32 v1, s0
3719; GCN-NEXT:    v_alignbit_b32 v0, s3, v0, 30
3720; GCN-NEXT:    s_movk_i32 s3, 0x7fff
3721; GCN-NEXT:    v_alignbit_b32 v1, s1, v1, 30
3722; GCN-NEXT:    s_and_b32 s1, s0, s3
3723; GCN-NEXT:    s_bfe_i32 s1, s1, 0xf0000
3724; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s1
3725; GCN-NEXT:    s_and_b32 s8, s2, s3
3726; GCN-NEXT:    s_bfe_i32 s8, s8, 0xf0000
3727; GCN-NEXT:    v_cvt_f32_i32_e32 v3, s8
3728; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
3729; GCN-NEXT:    s_xor_b32 s1, s8, s1
3730; GCN-NEXT:    s_ashr_i32 s1, s1, 30
3731; GCN-NEXT:    s_lshr_b32 s10, s2, 15
3732; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
3733; GCN-NEXT:    v_trunc_f32_e32 v4, v4
3734; GCN-NEXT:    v_mad_f32 v3, -v4, v2, v3
3735; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v4
3736; GCN-NEXT:    s_bfe_u32 s11, s2, 0xf000f
3737; GCN-NEXT:    s_lshr_b32 s12, s0, 15
3738; GCN-NEXT:    s_bfe_u32 s13, s0, 0xf000f
3739; GCN-NEXT:    s_or_b32 s1, s1, 1
3740; GCN-NEXT:    v_cmp_ge_f32_e64 s[8:9], |v3|, |v2|
3741; GCN-NEXT:    s_cmp_lg_u32 s8, 0
3742; GCN-NEXT:    s_cselect_b32 s1, s1, 0
3743; GCN-NEXT:    v_add_i32_e32 v2, vcc, s1, v4
3744; GCN-NEXT:    v_mul_lo_u32 v2, v2, s0
3745; GCN-NEXT:    s_bfe_i32 s0, s13, 0xf0000
3746; GCN-NEXT:    v_cvt_f32_i32_e32 v3, s0
3747; GCN-NEXT:    s_bfe_i32 s1, s11, 0xf0000
3748; GCN-NEXT:    v_cvt_f32_i32_e32 v4, s1
3749; GCN-NEXT:    s_xor_b32 s0, s1, s0
3750; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v3
3751; GCN-NEXT:    s_ashr_i32 s0, s0, 30
3752; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s2, v2
3753; GCN-NEXT:    s_or_b32 s2, s0, 1
3754; GCN-NEXT:    v_mul_f32_e32 v5, v4, v5
3755; GCN-NEXT:    v_trunc_f32_e32 v5, v5
3756; GCN-NEXT:    v_mad_f32 v4, -v5, v3, v4
3757; GCN-NEXT:    v_cvt_i32_f32_e32 v5, v5
3758; GCN-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v3|
3759; GCN-NEXT:    s_cmp_lg_u32 s0, 0
3760; GCN-NEXT:    v_and_b32_e32 v1, s3, v1
3761; GCN-NEXT:    s_cselect_b32 s0, s2, 0
3762; GCN-NEXT:    v_bfe_i32 v4, v1, 0, 15
3763; GCN-NEXT:    v_add_i32_e32 v3, vcc, s0, v5
3764; GCN-NEXT:    v_cvt_f32_i32_e32 v5, v4
3765; GCN-NEXT:    v_and_b32_e32 v0, s3, v0
3766; GCN-NEXT:    v_bfe_i32 v6, v0, 0, 15
3767; GCN-NEXT:    v_cvt_f32_i32_e32 v7, v6
3768; GCN-NEXT:    v_rcp_iflag_f32_e32 v8, v5
3769; GCN-NEXT:    v_xor_b32_e32 v4, v6, v4
3770; GCN-NEXT:    v_ashrrev_i32_e32 v4, 30, v4
3771; GCN-NEXT:    v_or_b32_e32 v4, 1, v4
3772; GCN-NEXT:    v_mul_f32_e32 v6, v7, v8
3773; GCN-NEXT:    v_trunc_f32_e32 v6, v6
3774; GCN-NEXT:    v_mad_f32 v7, -v6, v5, v7
3775; GCN-NEXT:    v_cvt_i32_f32_e32 v6, v6
3776; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v7|, |v5|
3777; GCN-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
3778; GCN-NEXT:    v_mul_lo_u32 v3, v3, s12
3779; GCN-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
3780; GCN-NEXT:    v_mul_lo_u32 v1, v4, v1
3781; GCN-NEXT:    v_and_b32_e32 v2, s3, v2
3782; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s10, v3
3783; GCN-NEXT:    v_and_b32_e32 v3, s3, v3
3784; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v1, v0
3785; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
3786; GCN-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
3787; GCN-NEXT:    v_or_b32_e32 v2, v2, v3
3788; GCN-NEXT:    v_or_b32_e32 v0, v2, v0
3789; GCN-NEXT:    v_and_b32_e32 v1, 0x1fff, v1
3790; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3791; GCN-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
3792; GCN-NEXT:    s_endpgm
  ; Input IR under test: a single vector srem whose result is stored to %out.
3793  %r = srem <3 x i15> %x, %y
3794  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
3795  ret void
3796}
3797
; Unsigned i32 division by the odd constant 1235195.
; The opt assertions expect the udiv to be left untouched in the IR.
; The llc assertions expect a multiply-high by 0xb2a50881 followed by a
; sub / shift-right-1 / add / shift-right-20 fix-up instead of a divide.
3798define amdgpu_kernel void @udiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
3799; CHECK-LABEL: @udiv_i32_oddk_denom(
3800; CHECK-NEXT:    [[R:%.*]] = udiv i32 [[X:%.*]], 1235195
3801; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
3802; CHECK-NEXT:    ret void
3803;
3804; GCN-LABEL: udiv_i32_oddk_denom:
3805; GCN:       ; %bb.0:
3806; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3807; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
3808; GCN-NEXT:    v_mov_b32_e32 v0, 0xb2a50881
3809; GCN-NEXT:    s_mov_b32 s7, 0xf000
3810; GCN-NEXT:    s_mov_b32 s6, -1
3811; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3812; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
3813; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s0, v0
3814; GCN-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
3815; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
3816; GCN-NEXT:    v_lshrrev_b32_e32 v0, 20, v0
3817; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3818; GCN-NEXT:    s_endpgm
  ; 1235195 is odd, so the divisor is not a power of two.
3819  %r = udiv i32 %x, 1235195
3820  store i32 %r, i32 addrspace(1)* %out
3821  ret void
3822}
3823
; Unsigned i32 division by the power-of-two constant 4096.
; The opt assertions expect the udiv to be left untouched in the IR.
; The llc assertions expect a single scalar logical shift right by 12.
3824define amdgpu_kernel void @udiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
3825; CHECK-LABEL: @udiv_i32_pow2k_denom(
3826; CHECK-NEXT:    [[R:%.*]] = udiv i32 [[X:%.*]], 4096
3827; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
3828; CHECK-NEXT:    ret void
3829;
3830; GCN-LABEL: udiv_i32_pow2k_denom:
3831; GCN:       ; %bb.0:
3832; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3833; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
3834; GCN-NEXT:    s_mov_b32 s7, 0xf000
3835; GCN-NEXT:    s_mov_b32 s6, -1
3836; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3837; GCN-NEXT:    s_lshr_b32 s0, s0, 12
3838; GCN-NEXT:    v_mov_b32_e32 v0, s0
3839; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3840; GCN-NEXT:    s_endpgm
  ; 4096 == 1 << 12, matching the shift amount expected from llc.
3841  %r = udiv i32 %x, 4096
3842  store i32 %r, i32 addrspace(1)* %out
3843  ret void
3844}
3845
; Unsigned i32 division by a variable power of two, built as `shl i32 4096, %y`.
; The opt assertions expect both the shl and the udiv to be left untouched.
; The llc assertions expect the division to become a shift: 12 is added to %y
; and %x is logically shifted right by the sum.
3846define amdgpu_kernel void @udiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
3847; CHECK-LABEL: @udiv_i32_pow2_shl_denom(
3848; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
3849; CHECK-NEXT:    [[R:%.*]] = udiv i32 [[X:%.*]], [[SHL_Y]]
3850; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
3851; CHECK-NEXT:    ret void
3852;
3853; GCN-LABEL: udiv_i32_pow2_shl_denom:
3854; GCN:       ; %bb.0:
3855; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3856; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3857; GCN-NEXT:    s_mov_b32 s7, 0xf000
3858; GCN-NEXT:    s_mov_b32 s6, -1
3859; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3860; GCN-NEXT:    s_add_i32 s1, s1, 12
3861; GCN-NEXT:    s_lshr_b32 s0, s0, s1
3862; GCN-NEXT:    v_mov_b32_e32 v0, s0
3863; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3864; GCN-NEXT:    s_endpgm
  ; 4096 << %y == 1 << (12 + %y) as long as the shift does not overflow i32.
3865  %shl.y = shl i32 4096, %y
3866  %r = udiv i32 %x, %shl.y
3867  store i32 %r, i32 addrspace(1)* %out
3868  ret void
3869}
3870
; Unsigned <2 x i32> division by the splat power-of-two constant 4096.
; The opt assertions expect the vector udiv to be scalarized into one
; `udiv i32 ..., 4096` per lane, reassembled with insertelement.
; The llc assertions expect one scalar logical shift right by 12 per lane and
; a single dwordx2 store.
3871define amdgpu_kernel void @udiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
3872; CHECK-LABEL: @udiv_v2i32_pow2k_denom(
3873; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
3874; CHECK-NEXT:    [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096
3875; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
3876; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
3877; CHECK-NEXT:    [[TMP5:%.*]] = udiv i32 [[TMP4]], 4096
3878; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
3879; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
3880; CHECK-NEXT:    ret void
3881;
3882; GCN-LABEL: udiv_v2i32_pow2k_denom:
3883; GCN:       ; %bb.0:
3884; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3885; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3886; GCN-NEXT:    s_mov_b32 s7, 0xf000
3887; GCN-NEXT:    s_mov_b32 s6, -1
3888; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3889; GCN-NEXT:    s_lshr_b32 s0, s0, 12
3890; GCN-NEXT:    s_lshr_b32 s1, s1, 12
3891; GCN-NEXT:    v_mov_b32_e32 v0, s0
3892; GCN-NEXT:    v_mov_b32_e32 v1, s1
3893; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
3894; GCN-NEXT:    s_endpgm
  ; Both lanes divide by 4096 == 1 << 12.
3895  %r = udiv <2 x i32> %x, <i32 4096, i32 4096>
3896  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
3897  ret void
3898}
3899
; Unsigned <2 x i32> division by the non-splat constant <4096, 4095>.
; The opt assertions expect the vector udiv to be scalarized into one constant
; udiv per lane (by 4096 and by 4095), reassembled with insertelement.
; The llc assertions expect lane 0 to become a logical shift right by 12 and
; lane 1 to become a multiply-high by 0x100101 followed by a
; sub / shift-right-1 / add / shift-right-11 fix-up.
3900define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
3901; CHECK-LABEL: @udiv_v2i32_mixed_pow2k_denom(
3902; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
3903; CHECK-NEXT:    [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096
3904; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
3905; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
3906; CHECK-NEXT:    [[TMP5:%.*]] = udiv i32 [[TMP4]], 4095
3907; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
3908; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
3909; CHECK-NEXT:    ret void
3910;
3911; GCN-LABEL: udiv_v2i32_mixed_pow2k_denom:
3912; GCN:       ; %bb.0:
3913; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3914; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3915; GCN-NEXT:    v_mov_b32_e32 v0, 0x100101
3916; GCN-NEXT:    s_mov_b32 s7, 0xf000
3917; GCN-NEXT:    s_mov_b32 s6, -1
3918; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3919; GCN-NEXT:    v_mul_hi_u32 v0, s1, v0
3920; GCN-NEXT:    s_lshr_b32 s0, s0, 12
3921; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s1, v0
3922; GCN-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
3923; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
3924; GCN-NEXT:    v_lshrrev_b32_e32 v1, 11, v0
3925; GCN-NEXT:    v_mov_b32_e32 v0, s0
3926; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
3927; GCN-NEXT:    s_endpgm
  ; Lane 0 divides by 4096 (a power of two); lane 1 divides by 4095 (not one).
3928  %r = udiv <2 x i32> %x, <i32 4096, i32 4095>
3929  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
3930  ret void
3931}
3932
3933define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
3934; CHECK-LABEL: @udiv_v2i32_pow2_shl_denom(
3935; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
3936; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
3937; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
3938; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
3939; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
3940; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
3941; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
3942; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[TMP2]]
3943; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
3944; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
3945; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
3946; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
3947; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
3948; CHECK-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
3949; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
3950; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
3951; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
3952; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
3953; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
3954; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
3955; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
3956; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
3957; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
3958; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
3959; CHECK-NEXT:    [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
3960; CHECK-NEXT:    [[TMP25:%.*]] = add i32 [[TMP21]], 1
3961; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]]
3962; CHECK-NEXT:    [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]]
3963; CHECK-NEXT:    [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]]
3964; CHECK-NEXT:    [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]]
3965; CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[TMP26]], 1
3966; CHECK-NEXT:    [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]]
3967; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <2 x i32> undef, i32 [[TMP31]], i64 0
3968; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <2 x i32> [[X]], i64 1
3969; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
3970; CHECK-NEXT:    [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float
3971; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
3972; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000
3973; CHECK-NEXT:    [[TMP38:%.*]] = fptoui float [[TMP37]] to i32
3974; CHECK-NEXT:    [[TMP39:%.*]] = sub i32 0, [[TMP34]]
3975; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]]
3976; CHECK-NEXT:    [[TMP41:%.*]] = zext i32 [[TMP38]] to i64
3977; CHECK-NEXT:    [[TMP42:%.*]] = zext i32 [[TMP40]] to i64
3978; CHECK-NEXT:    [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]]
3979; CHECK-NEXT:    [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
3980; CHECK-NEXT:    [[TMP45:%.*]] = lshr i64 [[TMP43]], 32
3981; CHECK-NEXT:    [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32
3982; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]]
3983; CHECK-NEXT:    [[TMP48:%.*]] = zext i32 [[TMP33]] to i64
3984; CHECK-NEXT:    [[TMP49:%.*]] = zext i32 [[TMP47]] to i64
3985; CHECK-NEXT:    [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]]
3986; CHECK-NEXT:    [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
3987; CHECK-NEXT:    [[TMP52:%.*]] = lshr i64 [[TMP50]], 32
3988; CHECK-NEXT:    [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32
3989; CHECK-NEXT:    [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]]
3990; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]]
3991; CHECK-NEXT:    [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]]
3992; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP53]], 1
3993; CHECK-NEXT:    [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]]
3994; CHECK-NEXT:    [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]]
3995; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]]
3996; CHECK-NEXT:    [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]]
3997; CHECK-NEXT:    [[TMP62:%.*]] = add i32 [[TMP58]], 1
3998; CHECK-NEXT:    [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]]
3999; CHECK-NEXT:    [[TMP64:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP63]], i64 1
4000; CHECK-NEXT:    store <2 x i32> [[TMP64]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
4001; CHECK-NEXT:    ret void
4002;
4003; GCN-LABEL: udiv_v2i32_pow2_shl_denom:
4004; GCN:       ; %bb.0:
4005; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
4006; GCN-NEXT:    s_movk_i32 s4, 0x1000
4007; GCN-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xb
4008; GCN-NEXT:    s_mov_b32 s7, 0xf000
4009; GCN-NEXT:    s_mov_b32 s6, -1
4010; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4011; GCN-NEXT:    s_lshl_b32 s5, s4, s2
4012; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s5
4013; GCN-NEXT:    s_lshl_b32 s10, s4, s3
4014; GCN-NEXT:    s_mov_b32 s3, 0x4f7ffffe
4015; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s10
4016; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
4017; GCN-NEXT:    s_sub_i32 s2, 0, s5
4018; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
4019; GCN-NEXT:    v_mul_f32_e32 v0, s3, v0
4020; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
4021; GCN-NEXT:    v_mul_f32_e32 v1, s3, v1
4022; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
4023; GCN-NEXT:    v_mul_lo_u32 v2, s2, v0
4024; GCN-NEXT:    s_sub_i32 s2, 0, s10
4025; GCN-NEXT:    v_mul_lo_u32 v3, s2, v1
4026; GCN-NEXT:    v_mul_hi_u32 v2, v0, v2
4027; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
4028; GCN-NEXT:    v_mul_hi_u32 v0, s8, v0
4029; GCN-NEXT:    v_mul_hi_u32 v2, v1, v3
4030; GCN-NEXT:    v_mul_lo_u32 v3, v0, s5
4031; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
4032; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s8, v3
4033; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s5, v3
4034; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[2:3]
4035; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s5, v3
4036; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[2:3]
4037; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
4038; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
4039; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
4040; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
4041; GCN-NEXT:    v_mul_hi_u32 v1, s9, v1
4042; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4043; GCN-NEXT:    v_mul_lo_u32 v2, v1, s10
4044; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
4045; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s9, v2
4046; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v2
4047; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
4048; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s10, v2
4049; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
4050; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
4051; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
4052; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
4053; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4054; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4055; GCN-NEXT:    s_endpgm
4056  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
4057  %r = udiv <2 x i32> %x, %shl.y
4058  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
4059  ret void
4060}
4061
; Unsigned remainder of an i32 kernel argument by the odd constant 1235195.
; IR assertions: the urem is expected to leave opt unchanged.
; ISA assertions: no reciprocal-based division sequence is expected; the
; quotient is formed with v_mul_hi_u32 by 0xb2a50881 followed by a
; sub / shift-by-1 / add / shift-by-20 fix-up, then multiplied by 0x12d8fb
; (= 1235195) and subtracted from x to yield the remainder.
4062define amdgpu_kernel void @urem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
4063; CHECK-LABEL: @urem_i32_oddk_denom(
4064; CHECK-NEXT:    [[R:%.*]] = urem i32 [[X:%.*]], 1235195
4065; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
4066; CHECK-NEXT:    ret void
4067;
4068; GCN-LABEL: urem_i32_oddk_denom:
4069; GCN:       ; %bb.0:
4070; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4071; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
4072; GCN-NEXT:    v_mov_b32_e32 v0, 0xb2a50881
4073; GCN-NEXT:    s_mov_b32 s7, 0xf000
4074; GCN-NEXT:    s_mov_b32 s6, -1
4075; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4076; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
4077; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s0, v0
4078; GCN-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
4079; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
4080; GCN-NEXT:    v_lshrrev_b32_e32 v0, 20, v0
4081; GCN-NEXT:    v_mul_u32_u24_e32 v0, 0x12d8fb, v0
4082; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
4083; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4084; GCN-NEXT:    s_endpgm
; Input IR under test: r = x urem 1235195, stored to %out.
4085  %r = urem i32 %x, 1235195
4086  store i32 %r, i32 addrspace(1)* %out
4087  ret void
4088}
4089
; Unsigned remainder of an i32 kernel argument by the power-of-two constant
; 4096.
; IR assertions: the urem is expected to leave opt unchanged.
; ISA assertions: the remainder is expected to be a single s_and_b32 with the
; mask 0xfff (4096 - 1).
4090define amdgpu_kernel void @urem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
4091; CHECK-LABEL: @urem_i32_pow2k_denom(
4092; CHECK-NEXT:    [[R:%.*]] = urem i32 [[X:%.*]], 4096
4093; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
4094; CHECK-NEXT:    ret void
4095;
4096; GCN-LABEL: urem_i32_pow2k_denom:
4097; GCN:       ; %bb.0:
4098; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4099; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
4100; GCN-NEXT:    s_mov_b32 s7, 0xf000
4101; GCN-NEXT:    s_mov_b32 s6, -1
4102; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4103; GCN-NEXT:    s_and_b32 s0, s0, 0xfff
4104; GCN-NEXT:    v_mov_b32_e32 v0, s0
4105; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4106; GCN-NEXT:    s_endpgm
; Input IR under test: r = x urem 4096, stored to %out.
4107  %r = urem i32 %x, 4096
4108  store i32 %r, i32 addrspace(1)* %out
4109  ret void
4110}
4111
; Unsigned remainder of an i32 kernel argument by (4096 << y), i.e. a divisor
; built by shifting a power-of-two constant by a runtime amount.
; IR assertions: both the shl and the urem are expected to leave opt unchanged.
; ISA assertions: the remainder is expected to be computed as
; x & ((0x1000 << y) - 1) using s_lshl_b32 / s_add_i32 -1 / s_and_b32, with no
; reciprocal-based sequence.
4112define amdgpu_kernel void @urem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
4113; CHECK-LABEL: @urem_i32_pow2_shl_denom(
4114; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
4115; CHECK-NEXT:    [[R:%.*]] = urem i32 [[X:%.*]], [[SHL_Y]]
4116; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
4117; CHECK-NEXT:    ret void
4118;
4119; GCN-LABEL: urem_i32_pow2_shl_denom:
4120; GCN:       ; %bb.0:
4121; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4122; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4123; GCN-NEXT:    s_mov_b32 s7, 0xf000
4124; GCN-NEXT:    s_mov_b32 s6, -1
4125; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4126; GCN-NEXT:    s_lshl_b32 s1, 0x1000, s1
4127; GCN-NEXT:    s_add_i32 s1, s1, -1
4128; GCN-NEXT:    s_and_b32 s0, s0, s1
4129; GCN-NEXT:    v_mov_b32_e32 v0, s0
4130; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4131; GCN-NEXT:    s_endpgm
; Input IR under test: r = x urem (4096 << y), stored to %out.
4132  %shl.y = shl i32 4096, %y
4133  %r = urem i32 %x, %shl.y
4134  store i32 %r, i32 addrspace(1)* %out
4135  ret void
4136}
4137
; Unsigned remainder of a <2 x i32> kernel argument by the splat constant
; <4096, 4096>.
; IR assertions: the vector urem is expected to be scalarized by opt into one
; `urem i32 ..., 4096` per element, glued together with
; extractelement / insertelement.
; ISA assertions: each element is expected to be masked with s_and_b32 against
; 0xfff (materialized once in s2) and the pair stored with a single
; buffer_store_dwordx2.
4138define amdgpu_kernel void @urem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
4139; CHECK-LABEL: @urem_v2i32_pow2k_denom(
4140; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
4141; CHECK-NEXT:    [[TMP2:%.*]] = urem i32 [[TMP1]], 4096
4142; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
4143; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
4144; CHECK-NEXT:    [[TMP5:%.*]] = urem i32 [[TMP4]], 4096
4145; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
4146; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
4147; CHECK-NEXT:    ret void
4148;
4149; GCN-LABEL: urem_v2i32_pow2k_denom:
4150; GCN:       ; %bb.0:
4151; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4152; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4153; GCN-NEXT:    s_movk_i32 s2, 0xfff
4154; GCN-NEXT:    s_mov_b32 s7, 0xf000
4155; GCN-NEXT:    s_mov_b32 s6, -1
4156; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4157; GCN-NEXT:    s_and_b32 s0, s0, s2
4158; GCN-NEXT:    s_and_b32 s1, s1, s2
4159; GCN-NEXT:    v_mov_b32_e32 v0, s0
4160; GCN-NEXT:    v_mov_b32_e32 v1, s1
4161; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4162; GCN-NEXT:    s_endpgm
; Input IR under test: r = x urem <4096, 4096>, stored to %out.
4163  %r = urem <2 x i32> %x, <i32 4096, i32 4096>
4164  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
4165  ret void
4166}
4167
; Unsigned remainder of a <2 x i32> kernel argument by (<4096, 4096> << y).
; IR assertions: opt is expected to scalarize the vector urem and expand each
; element into the float-reciprocal sequence (uitofp, llvm.amdgcn.rcp.f32,
; fmul by 0x41EFFFFFC0000000, fptoui), one refinement of the reciprocal via a
; 64-bit multiply / high-half extraction, and then two conditional
; subtract-the-divisor corrections (icmp uge / sub / select) on the remainder.
; ISA assertions: the same shape is expected in the machine code
; (v_rcp_iflag_f32, v_mul_hi_u32, two v_subrev / v_cmp_le_u32 / v_cndmask
; rounds per element); unlike the scalar @urem_i32_pow2_shl_denom case in this
; file, no and-with-mask form is expected here.
4168define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
4169; CHECK-LABEL: @urem_v2i32_pow2_shl_denom(
4170; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
4171; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
4172; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
4173; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
4174; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
4175; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
4176; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
4177; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[TMP2]]
4178; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
4179; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
4180; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
4181; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
4182; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
4183; CHECK-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
4184; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
4185; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
4186; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
4187; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
4188; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
4189; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
4190; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
4191; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
4192; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
4193; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
4194; CHECK-NEXT:    [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
4195; CHECK-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]]
4196; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]]
4197; CHECK-NEXT:    [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]]
4198; CHECK-NEXT:    [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]]
4199; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]]
4200; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <2 x i32> undef, i32 [[TMP29]], i64 0
4201; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <2 x i32> [[X]], i64 1
4202; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
4203; CHECK-NEXT:    [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float
4204; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
4205; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000
4206; CHECK-NEXT:    [[TMP36:%.*]] = fptoui float [[TMP35]] to i32
4207; CHECK-NEXT:    [[TMP37:%.*]] = sub i32 0, [[TMP32]]
4208; CHECK-NEXT:    [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]]
4209; CHECK-NEXT:    [[TMP39:%.*]] = zext i32 [[TMP36]] to i64
4210; CHECK-NEXT:    [[TMP40:%.*]] = zext i32 [[TMP38]] to i64
4211; CHECK-NEXT:    [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]]
4212; CHECK-NEXT:    [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32
4213; CHECK-NEXT:    [[TMP43:%.*]] = lshr i64 [[TMP41]], 32
4214; CHECK-NEXT:    [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
4215; CHECK-NEXT:    [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]]
4216; CHECK-NEXT:    [[TMP46:%.*]] = zext i32 [[TMP31]] to i64
4217; CHECK-NEXT:    [[TMP47:%.*]] = zext i32 [[TMP45]] to i64
4218; CHECK-NEXT:    [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]]
4219; CHECK-NEXT:    [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32
4220; CHECK-NEXT:    [[TMP50:%.*]] = lshr i64 [[TMP48]], 32
4221; CHECK-NEXT:    [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
4222; CHECK-NEXT:    [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]]
4223; CHECK-NEXT:    [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]]
4224; CHECK-NEXT:    [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]]
4225; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]]
4226; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]]
4227; CHECK-NEXT:    [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]]
4228; CHECK-NEXT:    [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]]
4229; CHECK-NEXT:    [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]]
4230; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <2 x i32> [[TMP30]], i32 [[TMP59]], i64 1
4231; CHECK-NEXT:    store <2 x i32> [[TMP60]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
4232; CHECK-NEXT:    ret void
4233;
4234; GCN-LABEL: urem_v2i32_pow2_shl_denom:
4235; GCN:       ; %bb.0:
4236; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
4237; GCN-NEXT:    s_movk_i32 s4, 0x1000
4238; GCN-NEXT:    s_mov_b32 s7, 0x4f7ffffe
4239; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4240; GCN-NEXT:    s_lshl_b32 s2, s4, s2
4241; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s2
4242; GCN-NEXT:    s_lshl_b32 s6, s4, s3
4243; GCN-NEXT:    s_sub_i32 s3, 0, s2
4244; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s6
4245; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
4246; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
4247; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4248; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
4249; GCN-NEXT:    v_mul_f32_e32 v0, s7, v0
4250; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
4251; GCN-NEXT:    v_mul_f32_e32 v1, s7, v1
4252; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
4253; GCN-NEXT:    v_mul_lo_u32 v2, s3, v0
4254; GCN-NEXT:    s_sub_i32 s3, 0, s6
4255; GCN-NEXT:    v_mul_hi_u32 v2, v0, v2
4256; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
4257; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4258; GCN-NEXT:    v_mul_hi_u32 v0, s4, v0
4259; GCN-NEXT:    v_mul_lo_u32 v2, s3, v1
4260; GCN-NEXT:    s_mov_b32 s3, 0xf000
4261; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
4262; GCN-NEXT:    v_mul_hi_u32 v2, v1, v2
4263; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
4264; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v0
4265; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
4266; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
4267; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v0
4268; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
4269; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
4270; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
4271; GCN-NEXT:    v_mul_hi_u32 v1, s5, v1
4272; GCN-NEXT:    s_mov_b32 s2, -1
4273; GCN-NEXT:    v_mul_lo_u32 v1, v1, s6
4274; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
4275; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s6, v1
4276; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s6, v1
4277; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4278; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s6, v1
4279; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s6, v1
4280; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4281; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4282; GCN-NEXT:    s_endpgm
; Input IR under test: r = x urem (<4096, 4096> << y), stored to %out.
4283  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
4284  %r = urem <2 x i32> %x, %shl.y
4285  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
4286  ret void
4287}
4288
; Signed division of an i32 kernel argument by the odd constant 1235195.
; IR assertions: the sdiv is expected to leave opt unchanged.
; ISA assertions: no reciprocal-based division sequence is expected; the
; quotient is formed with v_mul_hi_i32 by 0xd9528441, an add of x, an
; arithmetic shift right by 20, and a final add of the intermediate's sign bit
; (extracted with a logical shift right by 31).
4289define amdgpu_kernel void @sdiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
4290; CHECK-LABEL: @sdiv_i32_oddk_denom(
4291; CHECK-NEXT:    [[R:%.*]] = sdiv i32 [[X:%.*]], 1235195
4292; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
4293; CHECK-NEXT:    ret void
4294;
4295; GCN-LABEL: sdiv_i32_oddk_denom:
4296; GCN:       ; %bb.0:
4297; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4298; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
4299; GCN-NEXT:    v_mov_b32_e32 v0, 0xd9528441
4300; GCN-NEXT:    s_mov_b32 s7, 0xf000
4301; GCN-NEXT:    s_mov_b32 s6, -1
4302; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4303; GCN-NEXT:    v_mul_hi_i32 v0, s0, v0
4304; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
4305; GCN-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
4306; GCN-NEXT:    v_ashrrev_i32_e32 v0, 20, v0
4307; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
4308; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4309; GCN-NEXT:    s_endpgm
; Input IR under test: r = x sdiv 1235195, stored to %out.
4310  %r = sdiv i32 %x, 1235195
4311  store i32 %r, i32 addrspace(1)* %out
4312  ret void
4313}
4314
; Signed division of an i32 kernel argument by the power-of-two constant 4096.
; IR assertions: the sdiv is expected to leave opt unchanged.
; ISA assertions: the quotient is expected to be computed with scalar shifts
; only: ((x >>s 31) >>u 20) is added to x, and the sum is arithmetically
; shifted right by 12.
4315define amdgpu_kernel void @sdiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
4316; CHECK-LABEL: @sdiv_i32_pow2k_denom(
4317; CHECK-NEXT:    [[R:%.*]] = sdiv i32 [[X:%.*]], 4096
4318; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
4319; CHECK-NEXT:    ret void
4320;
4321; GCN-LABEL: sdiv_i32_pow2k_denom:
4322; GCN:       ; %bb.0:
4323; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4324; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
4325; GCN-NEXT:    s_mov_b32 s7, 0xf000
4326; GCN-NEXT:    s_mov_b32 s6, -1
4327; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4328; GCN-NEXT:    s_ashr_i32 s1, s0, 31
4329; GCN-NEXT:    s_lshr_b32 s1, s1, 20
4330; GCN-NEXT:    s_add_i32 s0, s0, s1
4331; GCN-NEXT:    s_ashr_i32 s0, s0, 12
4332; GCN-NEXT:    v_mov_b32_e32 v0, s0
4333; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4334; GCN-NEXT:    s_endpgm
; Input IR under test: r = x sdiv 4096, stored to %out.
4335  %r = sdiv i32 %x, 4096
4336  store i32 %r, i32 addrspace(1)* %out
4337  ret void
4338}
4339
; Signed division of an i32 kernel argument by (4096 << y).
; IR assertions: both the shl and the sdiv are expected to leave opt unchanged.
; ISA assertions: a full expansion is expected in the machine code: both
; operands are made non-negative with the ashr-31 / add / xor idiom, the
; quotient is estimated through v_rcp_iflag_f32 and v_mul_hi_u32, corrected by
; two compare-and-select rounds, and the sign (xor of the two operand sign
; masks, kept in s2) is re-applied with v_xor / v_subrev.
4340define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
4341; CHECK-LABEL: @sdiv_i32_pow2_shl_denom(
4342; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
4343; CHECK-NEXT:    [[R:%.*]] = sdiv i32 [[X:%.*]], [[SHL_Y]]
4344; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
4345; CHECK-NEXT:    ret void
4346;
4347; GCN-LABEL: sdiv_i32_pow2_shl_denom:
4348; GCN:       ; %bb.0:
4349; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
4350; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4351; GCN-NEXT:    s_mov_b32 s7, 0xf000
4352; GCN-NEXT:    s_mov_b32 s6, -1
4353; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4354; GCN-NEXT:    s_lshl_b32 s3, 0x1000, s3
4355; GCN-NEXT:    s_ashr_i32 s8, s3, 31
4356; GCN-NEXT:    s_add_i32 s3, s3, s8
4357; GCN-NEXT:    s_xor_b32 s9, s3, s8
4358; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s9
4359; GCN-NEXT:    s_sub_i32 s3, 0, s9
4360; GCN-NEXT:    s_ashr_i32 s0, s2, 31
4361; GCN-NEXT:    s_add_i32 s1, s2, s0
4362; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
4363; GCN-NEXT:    s_xor_b32 s1, s1, s0
4364; GCN-NEXT:    s_xor_b32 s2, s0, s8
4365; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
4366; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
4367; GCN-NEXT:    v_mul_lo_u32 v1, s3, v0
4368; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
4369; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
4370; GCN-NEXT:    v_mul_hi_u32 v0, s1, v0
4371; GCN-NEXT:    v_mul_lo_u32 v1, v0, s9
4372; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
4373; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
4374; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v1
4375; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
4376; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s9, v1
4377; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
4378; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
4379; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
4380; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
4381; GCN-NEXT:    v_xor_b32_e32 v0, s2, v0
4382; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
4383; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4384; GCN-NEXT:    s_endpgm
; Input IR under test: r = x sdiv (4096 << y), stored to %out.
4385  %shl.y = shl i32 4096, %y
4386  %r = sdiv i32 %x, %shl.y
4387  store i32 %r, i32 addrspace(1)* %out
4388  ret void
4389}
4390
; Signed division of a <2 x i32> kernel argument by the splat constant
; <4096, 4096>.
; IR assertions: the vector sdiv is expected to be scalarized by opt into one
; `sdiv i32 ..., 4096` per element, glued together with
; extractelement / insertelement.
; ISA assertions: each element is expected to use the scalar shift sequence
; (ashr 31, lshr 20, add, ashr 12), interleaved across the two lanes, and the
; pair is stored with a single buffer_store_dwordx2.
4391define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
4392; CHECK-LABEL: @sdiv_v2i32_pow2k_denom(
4393; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
4394; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096
4395; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
4396; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
4397; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4096
4398; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
4399; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
4400; CHECK-NEXT:    ret void
4401;
4402; GCN-LABEL: sdiv_v2i32_pow2k_denom:
4403; GCN:       ; %bb.0:
4404; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4405; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4406; GCN-NEXT:    s_mov_b32 s7, 0xf000
4407; GCN-NEXT:    s_mov_b32 s6, -1
4408; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4409; GCN-NEXT:    s_ashr_i32 s2, s0, 31
4410; GCN-NEXT:    s_lshr_b32 s2, s2, 20
4411; GCN-NEXT:    s_ashr_i32 s3, s1, 31
4412; GCN-NEXT:    s_add_i32 s0, s0, s2
4413; GCN-NEXT:    s_lshr_b32 s2, s3, 20
4414; GCN-NEXT:    s_add_i32 s1, s1, s2
4415; GCN-NEXT:    s_ashr_i32 s0, s0, 12
4416; GCN-NEXT:    s_ashr_i32 s1, s1, 12
4417; GCN-NEXT:    v_mov_b32_e32 v0, s0
4418; GCN-NEXT:    v_mov_b32_e32 v1, s1
4419; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4420; GCN-NEXT:    s_endpgm
; Input IR under test: r = x sdiv <4096, 4096>, stored to %out.
4421  %r = sdiv <2 x i32> %x, <i32 4096, i32 4096>
4422  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
4423  ret void
4424}
4425
; Signed division of a <2 x i32> kernel argument by the mixed constant vector
; <4096, 4095>: element 0 has a power-of-two divisor, element 1 does not.
; IR assertions: the vector sdiv is expected to be scalarized by opt into
; `sdiv i32 ..., 4096` for element 0 and `sdiv i32 ..., 4095` for element 1.
; ISA assertions: element 0 is expected to use the scalar shift sequence
; (ashr 31, lshr 20, add, ashr 12), while element 1 is expected to use
; v_mul_hi_i32 by 0x80080081, an add of x, an arithmetic shift right by 11 and
; an add of the sign bit.
; NOTE(review): the "ssdiv" prefix looks like a typo for "sdiv"; confirm before
; renaming, since the symbol name is also matched by both label assertions.
4426define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
4427; CHECK-LABEL: @ssdiv_v2i32_mixed_pow2k_denom(
4428; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
4429; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096
4430; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
4431; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
4432; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4095
4433; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
4434; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
4435; CHECK-NEXT:    ret void
4436;
4437; GCN-LABEL: ssdiv_v2i32_mixed_pow2k_denom:
4438; GCN:       ; %bb.0:
4439; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4440; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4441; GCN-NEXT:    v_mov_b32_e32 v0, 0x80080081
4442; GCN-NEXT:    s_mov_b32 s7, 0xf000
4443; GCN-NEXT:    s_mov_b32 s6, -1
4444; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4445; GCN-NEXT:    v_mul_hi_i32 v0, s1, v0
4446; GCN-NEXT:    s_ashr_i32 s2, s0, 31
4447; GCN-NEXT:    s_lshr_b32 s2, s2, 20
4448; GCN-NEXT:    s_add_i32 s0, s0, s2
4449; GCN-NEXT:    v_add_i32_e32 v0, vcc, s1, v0
4450; GCN-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
4451; GCN-NEXT:    v_ashrrev_i32_e32 v0, 11, v0
4452; GCN-NEXT:    s_ashr_i32 s0, s0, 12
4453; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
4454; GCN-NEXT:    v_mov_b32_e32 v0, s0
4455; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4456; GCN-NEXT:    s_endpgm
; Input IR under test: r = x sdiv <4096, 4095>, stored to %out.
4457  %r = sdiv <2 x i32> %x, <i32 4096, i32 4095>
4458  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
4459  ret void
4460}
4461
; Signed division of a <2 x i32> kernel argument by (<4096, 4096> << y).
; IR assertions: opt is expected to scalarize the vector sdiv and expand each
; element as follows:
;   - sign masks of numerator and denominator via `ashr ..., 31`, their xor
;     kept as the result sign mask;
;   - both operands made non-negative with add-mask / xor-mask;
;   - unsigned quotient estimated through uitofp, llvm.amdgcn.rcp.f32, fmul by
;     0x41EFFFFFC0000000 and fptoui, refined once via a 64-bit multiply and
;     high-half extraction;
;   - two correction rounds (icmp uge, quotient + 1, remainder - divisor,
;     select);
;   - sign re-applied with xor / sub against the result sign mask.
; ISA assertions: the machine code is expected to follow the same scheme per
; lane (s_ashr / s_add / s_xor, v_rcp_iflag_f32, v_mul_hi_u32, two
; v_cmp_le_u32 / v_cndmask rounds, v_xor / v_subrev), with the two lanes
; interleaved.
4462define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
4463; CHECK-LABEL: @sdiv_v2i32_pow2_shl_denom(
4464; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
4465; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
4466; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
4467; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
4468; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
4469; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
4470; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]]
4471; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]]
4472; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]]
4473; CHECK-NEXT:    [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]]
4474; CHECK-NEXT:    [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float
4475; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]])
4476; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000
4477; CHECK-NEXT:    [[TMP13:%.*]] = fptoui float [[TMP12]] to i32
4478; CHECK-NEXT:    [[TMP14:%.*]] = sub i32 0, [[TMP9]]
4479; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]]
4480; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP13]] to i64
4481; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
4482; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
4483; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
4484; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
4485; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
4486; CHECK-NEXT:    [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]]
4487; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP8]] to i64
4488; CHECK-NEXT:    [[TMP24:%.*]] = zext i32 [[TMP22]] to i64
4489; CHECK-NEXT:    [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]]
4490; CHECK-NEXT:    [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
4491; CHECK-NEXT:    [[TMP27:%.*]] = lshr i64 [[TMP25]], 32
4492; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
4493; CHECK-NEXT:    [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]]
4494; CHECK-NEXT:    [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]]
4495; CHECK-NEXT:    [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]]
4496; CHECK-NEXT:    [[TMP32:%.*]] = add i32 [[TMP28]], 1
4497; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]]
4498; CHECK-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]]
4499; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]]
4500; CHECK-NEXT:    [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]]
4501; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP33]], 1
4502; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]]
4503; CHECK-NEXT:    [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]]
4504; CHECK-NEXT:    [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]]
4505; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <2 x i32> undef, i32 [[TMP40]], i64 0
4506; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i32> [[X]], i64 1
4507; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
4508; CHECK-NEXT:    [[TMP44:%.*]] = ashr i32 [[TMP42]], 31
4509; CHECK-NEXT:    [[TMP45:%.*]] = ashr i32 [[TMP43]], 31
4510; CHECK-NEXT:    [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]]
4511; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]]
4512; CHECK-NEXT:    [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]]
4513; CHECK-NEXT:    [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]]
4514; CHECK-NEXT:    [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]]
4515; CHECK-NEXT:    [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float
4516; CHECK-NEXT:    [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]])
4517; CHECK-NEXT:    [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000
4518; CHECK-NEXT:    [[TMP54:%.*]] = fptoui float [[TMP53]] to i32
4519; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 0, [[TMP50]]
4520; CHECK-NEXT:    [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]]
4521; CHECK-NEXT:    [[TMP57:%.*]] = zext i32 [[TMP54]] to i64
4522; CHECK-NEXT:    [[TMP58:%.*]] = zext i32 [[TMP56]] to i64
4523; CHECK-NEXT:    [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]]
4524; CHECK-NEXT:    [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32
4525; CHECK-NEXT:    [[TMP61:%.*]] = lshr i64 [[TMP59]], 32
4526; CHECK-NEXT:    [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32
4527; CHECK-NEXT:    [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]]
4528; CHECK-NEXT:    [[TMP64:%.*]] = zext i32 [[TMP49]] to i64
4529; CHECK-NEXT:    [[TMP65:%.*]] = zext i32 [[TMP63]] to i64
4530; CHECK-NEXT:    [[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]]
4531; CHECK-NEXT:    [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32
4532; CHECK-NEXT:    [[TMP68:%.*]] = lshr i64 [[TMP66]], 32
4533; CHECK-NEXT:    [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32
4534; CHECK-NEXT:    [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]]
4535; CHECK-NEXT:    [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]]
4536; CHECK-NEXT:    [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]]
4537; CHECK-NEXT:    [[TMP73:%.*]] = add i32 [[TMP69]], 1
4538; CHECK-NEXT:    [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]]
4539; CHECK-NEXT:    [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]]
4540; CHECK-NEXT:    [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]]
4541; CHECK-NEXT:    [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]]
4542; CHECK-NEXT:    [[TMP78:%.*]] = add i32 [[TMP74]], 1
4543; CHECK-NEXT:    [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]]
4544; CHECK-NEXT:    [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]]
4545; CHECK-NEXT:    [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]]
4546; CHECK-NEXT:    [[TMP82:%.*]] = insertelement <2 x i32> [[TMP41]], i32 [[TMP81]], i64 1
4547; CHECK-NEXT:    store <2 x i32> [[TMP82]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
4548; CHECK-NEXT:    ret void
4549;
4550; GCN-LABEL: sdiv_v2i32_pow2_shl_denom:
4551; GCN:       ; %bb.0:
4552; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
4553; GCN-NEXT:    s_movk_i32 s6, 0x1000
4554; GCN-NEXT:    s_mov_b32 s12, 0x4f7ffffe
4555; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4556; GCN-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xb
4557; GCN-NEXT:    s_mov_b32 s7, 0xf000
4558; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4559; GCN-NEXT:    s_lshl_b32 s2, s6, s2
4560; GCN-NEXT:    s_ashr_i32 s10, s2, 31
4561; GCN-NEXT:    s_add_i32 s2, s2, s10
4562; GCN-NEXT:    s_xor_b32 s11, s2, s10
4563; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s11
4564; GCN-NEXT:    s_sub_i32 s1, 0, s11
4565; GCN-NEXT:    s_lshl_b32 s0, s6, s3
4566; GCN-NEXT:    s_ashr_i32 s3, s0, 31
4567; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
4568; GCN-NEXT:    s_add_i32 s0, s0, s3
4569; GCN-NEXT:    s_xor_b32 s13, s0, s3
4570; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s13
4571; GCN-NEXT:    v_mul_f32_e32 v0, s12, v0
4572; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
4573; GCN-NEXT:    s_ashr_i32 s2, s8, 31
4574; GCN-NEXT:    s_add_i32 s0, s8, s2
4575; GCN-NEXT:    s_xor_b32 s0, s0, s2
4576; GCN-NEXT:    v_mul_lo_u32 v1, s1, v0
4577; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v2
4578; GCN-NEXT:    s_xor_b32 s2, s2, s10
4579; GCN-NEXT:    s_mov_b32 s6, -1
4580; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
4581; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
4582; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
4583; GCN-NEXT:    v_mul_f32_e32 v1, s12, v2
4584; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
4585; GCN-NEXT:    v_mul_lo_u32 v2, v0, s11
4586; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
4587; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s0, v2
4588; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v2
4589; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
4590; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s11, v2
4591; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
4592; GCN-NEXT:    s_sub_i32 s0, 0, s13
4593; GCN-NEXT:    v_mul_lo_u32 v4, s0, v1
4594; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
4595; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s11, v2
4596; GCN-NEXT:    s_ashr_i32 s0, s9, 31
4597; GCN-NEXT:    v_mul_hi_u32 v2, v1, v4
4598; GCN-NEXT:    s_add_i32 s1, s9, s0
4599; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
4600; GCN-NEXT:    s_xor_b32 s1, s1, s0
4601; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
4602; GCN-NEXT:    v_mul_hi_u32 v1, s1, v1
4603; GCN-NEXT:    v_xor_b32_e32 v0, s2, v0
4604; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
4605; GCN-NEXT:    s_xor_b32 s2, s0, s3
4606; GCN-NEXT:    v_mul_lo_u32 v2, v1, s13
4607; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
4608; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s1, v2
4609; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v2
4610; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
4611; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s13, v2
4612; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
4613; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
4614; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s13, v2
4615; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
4616; GCN-NEXT:    v_xor_b32_e32 v1, s2, v1
4617; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s2, v1
4618; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4619; GCN-NEXT:    s_endpgm
; Input IR under test: r = x sdiv (<4096, 4096> << y), stored to %out.
4620  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
4621  %r = sdiv <2 x i32> %x, %shl.y
4622  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
4623  ret void
4624}
4625
; Signed 32-bit remainder of a kernel argument by the odd constant 1235195.
; The opt-run assertions expect the srem to come out of the IR pass unchanged.
; The llc-run assertions expect a multiply-high by the constant 0xd9528441
; with shift/add fix-ups, a multiply by 0x12d8fb (== 1235195) and a final
; subtract from the loaded %x.
4626define amdgpu_kernel void @srem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
4627; CHECK-LABEL: @srem_i32_oddk_denom(
4628; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], 1235195
4629; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
4630; CHECK-NEXT:    ret void
4631;
4632; GCN-LABEL: srem_i32_oddk_denom:
4633; GCN:       ; %bb.0:
4634; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4635; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
4636; GCN-NEXT:    v_mov_b32_e32 v0, 0xd9528441
4637; GCN-NEXT:    s_mov_b32 s7, 0xf000
4638; GCN-NEXT:    s_mov_b32 s6, -1
4639; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4640; GCN-NEXT:    v_mul_hi_i32 v0, s0, v0
4641; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
4642; GCN-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
4643; GCN-NEXT:    v_ashrrev_i32_e32 v0, 20, v0
4644; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
4645; GCN-NEXT:    v_mul_i32_i24_e32 v0, 0x12d8fb, v0
4646; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
4647; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4648; GCN-NEXT:    s_endpgm
4649  %r = srem i32 %x, 1235195
4650  store i32 %r, i32 addrspace(1)* %out
4651  ret void
4652}
4653
; Signed 32-bit remainder of a kernel argument by the power-of-two constant
; 4096. The opt-run assertions expect the srem to stay as-is in the IR.
; The llc-run assertions expect a purely scalar sequence: sign bits shifted
; right by 20, added to %x, masked with 0xfffff000, then subtracted from %x.
4654define amdgpu_kernel void @srem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
4655; CHECK-LABEL: @srem_i32_pow2k_denom(
4656; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], 4096
4657; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
4658; CHECK-NEXT:    ret void
4659;
4660; GCN-LABEL: srem_i32_pow2k_denom:
4661; GCN:       ; %bb.0:
4662; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4663; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
4664; GCN-NEXT:    s_mov_b32 s7, 0xf000
4665; GCN-NEXT:    s_mov_b32 s6, -1
4666; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4667; GCN-NEXT:    s_ashr_i32 s1, s0, 31
4668; GCN-NEXT:    s_lshr_b32 s1, s1, 20
4669; GCN-NEXT:    s_add_i32 s1, s0, s1
4670; GCN-NEXT:    s_and_b32 s1, s1, 0xfffff000
4671; GCN-NEXT:    s_sub_i32 s0, s0, s1
4672; GCN-NEXT:    v_mov_b32_e32 v0, s0
4673; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4674; GCN-NEXT:    s_endpgm
4675  %r = srem i32 %x, 4096
4676  store i32 %r, i32 addrspace(1)* %out
4677  ret void
4678}
4679
; Signed 32-bit remainder where the denominator is 4096 shifted left by a
; second kernel argument, so it is not a compile-time constant.
; The opt-run assertions expect the shl and srem to be left untouched in IR.
; The llc-run assertions expect a reciprocal-based sequence: absolute values
; via ashr/add/xor, v_rcp_iflag_f32 scaled by 0x4f7ffffe, a multiply-high
; refinement, two compare-and-select corrections of the remainder, and a
; final xor/sub with the sign mask taken from the numerator (s5).
4680define amdgpu_kernel void @srem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
4681; CHECK-LABEL: @srem_i32_pow2_shl_denom(
4682; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
4683; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], [[SHL_Y]]
4684; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
4685; CHECK-NEXT:    ret void
4686;
4687; GCN-LABEL: srem_i32_pow2_shl_denom:
4688; GCN:       ; %bb.0:
4689; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
4690; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4691; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4692; GCN-NEXT:    s_lshl_b32 s3, 0x1000, s3
4693; GCN-NEXT:    s_ashr_i32 s4, s3, 31
4694; GCN-NEXT:    s_add_i32 s3, s3, s4
4695; GCN-NEXT:    s_xor_b32 s4, s3, s4
4696; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s4
4697; GCN-NEXT:    s_sub_i32 s3, 0, s4
4698; GCN-NEXT:    s_ashr_i32 s5, s2, 31
4699; GCN-NEXT:    s_add_i32 s2, s2, s5
4700; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
4701; GCN-NEXT:    s_xor_b32 s6, s2, s5
4702; GCN-NEXT:    s_mov_b32 s2, -1
4703; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
4704; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
4705; GCN-NEXT:    v_mul_lo_u32 v1, s3, v0
4706; GCN-NEXT:    s_mov_b32 s3, 0xf000
4707; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
4708; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
4709; GCN-NEXT:    v_mul_hi_u32 v0, s6, v0
4710; GCN-NEXT:    v_mul_lo_u32 v0, v0, s4
4711; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
4712; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
4713; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
4714; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
4715; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
4716; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
4717; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
4718; GCN-NEXT:    v_xor_b32_e32 v0, s5, v0
4719; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s5, v0
4720; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4721; GCN-NEXT:    s_endpgm
; Denominator is built at run time as 4096 << %y.
4722  %shl.y = shl i32 4096, %y
4723  %r = srem i32 %x, %shl.y
4724  store i32 %r, i32 addrspace(1)* %out
4725  ret void
4726}
4727
; Signed remainder of a <2 x i32> kernel argument by the splat constant 4096.
; The opt-run assertions expect the vector srem to be split into one scalar
; srem by 4096 per element, with the results rebuilt via insertelement.
; The llc-run assertions expect the same scalar ashr/lshr/add/and/sub
; sequence once per lane, with the mask held in a shared register (s2).
4728define amdgpu_kernel void @srem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
4729; CHECK-LABEL: @srem_v2i32_pow2k_denom(
4730; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
4731; CHECK-NEXT:    [[TMP2:%.*]] = srem i32 [[TMP1]], 4096
4732; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
4733; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
4734; CHECK-NEXT:    [[TMP5:%.*]] = srem i32 [[TMP4]], 4096
4735; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
4736; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
4737; CHECK-NEXT:    ret void
4738;
4739; GCN-LABEL: srem_v2i32_pow2k_denom:
4740; GCN:       ; %bb.0:
4741; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4742; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4743; GCN-NEXT:    s_movk_i32 s2, 0xf000
4744; GCN-NEXT:    s_mov_b32 s7, 0xf000
4745; GCN-NEXT:    s_mov_b32 s6, -1
4746; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4747; GCN-NEXT:    s_ashr_i32 s3, s0, 31
4748; GCN-NEXT:    s_lshr_b32 s3, s3, 20
4749; GCN-NEXT:    s_add_i32 s3, s0, s3
4750; GCN-NEXT:    s_and_b32 s3, s3, s2
4751; GCN-NEXT:    s_sub_i32 s0, s0, s3
4752; GCN-NEXT:    s_ashr_i32 s3, s1, 31
4753; GCN-NEXT:    s_lshr_b32 s3, s3, 20
4754; GCN-NEXT:    s_add_i32 s3, s1, s3
4755; GCN-NEXT:    s_and_b32 s2, s3, s2
4756; GCN-NEXT:    s_sub_i32 s1, s1, s2
4757; GCN-NEXT:    v_mov_b32_e32 v0, s0
4758; GCN-NEXT:    v_mov_b32_e32 v1, s1
4759; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4760; GCN-NEXT:    s_endpgm
4761  %r = srem <2 x i32> %x, <i32 4096, i32 4096>
4762  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
4763  ret void
4764}
4765
; Signed remainder of a <2 x i32> kernel argument by a per-lane run-time
; denominator, 4096 shifted left by the matching lane of %y.
; The opt-run assertions expect the IR pass to expand each lane inline:
;   - absolute values of numerator and denominator via ashr/add/xor,
;   - a reciprocal estimate from llvm.amdgcn.rcp.f32 scaled by
;     0x41EFFFFFC0000000 and converted to i32,
;   - one refinement step and a quotient estimate, each taking the high
;     32 bits of a 64-bit product,
;   - the remainder, followed by two conditional subtractions of the
;     denominator (icmp uge / select),
;   - xor/sub with the numerator's sign mask to restore the sign.
; The llc-run assertions expect the matching machine sequence for both
; lanes, interleaved, ending in one 64-bit buffer store.
4766define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
4767; CHECK-LABEL: @srem_v2i32_pow2_shl_denom(
4768; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
4769; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
4770; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
4771; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
4772; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
4773; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]]
4774; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]]
4775; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]]
4776; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]]
4777; CHECK-NEXT:    [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float
4778; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
4779; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000
4780; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP11]] to i32
4781; CHECK-NEXT:    [[TMP13:%.*]] = sub i32 0, [[TMP8]]
4782; CHECK-NEXT:    [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]]
4783; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP12]] to i64
4784; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP14]] to i64
4785; CHECK-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]]
4786; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
4787; CHECK-NEXT:    [[TMP19:%.*]] = lshr i64 [[TMP17]], 32
4788; CHECK-NEXT:    [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32
4789; CHECK-NEXT:    [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]]
4790; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP7]] to i64
4791; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP21]] to i64
4792; CHECK-NEXT:    [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]]
4793; CHECK-NEXT:    [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
4794; CHECK-NEXT:    [[TMP26:%.*]] = lshr i64 [[TMP24]], 32
4795; CHECK-NEXT:    [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32
4796; CHECK-NEXT:    [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]]
4797; CHECK-NEXT:    [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]]
4798; CHECK-NEXT:    [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]]
4799; CHECK-NEXT:    [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]]
4800; CHECK-NEXT:    [[TMP32:%.*]] = select i1 [[TMP30]], i32 [[TMP31]], i32 [[TMP29]]
4801; CHECK-NEXT:    [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]]
4802; CHECK-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]]
4803; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]]
4804; CHECK-NEXT:    [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]]
4805; CHECK-NEXT:    [[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]]
4806; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <2 x i32> undef, i32 [[TMP37]], i64 0
4807; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <2 x i32> [[X]], i64 1
4808; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
4809; CHECK-NEXT:    [[TMP41:%.*]] = ashr i32 [[TMP39]], 31
4810; CHECK-NEXT:    [[TMP42:%.*]] = ashr i32 [[TMP40]], 31
4811; CHECK-NEXT:    [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]]
4812; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]]
4813; CHECK-NEXT:    [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]]
4814; CHECK-NEXT:    [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]]
4815; CHECK-NEXT:    [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float
4816; CHECK-NEXT:    [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]])
4817; CHECK-NEXT:    [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000
4818; CHECK-NEXT:    [[TMP50:%.*]] = fptoui float [[TMP49]] to i32
4819; CHECK-NEXT:    [[TMP51:%.*]] = sub i32 0, [[TMP46]]
4820; CHECK-NEXT:    [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]]
4821; CHECK-NEXT:    [[TMP53:%.*]] = zext i32 [[TMP50]] to i64
4822; CHECK-NEXT:    [[TMP54:%.*]] = zext i32 [[TMP52]] to i64
4823; CHECK-NEXT:    [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]]
4824; CHECK-NEXT:    [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32
4825; CHECK-NEXT:    [[TMP57:%.*]] = lshr i64 [[TMP55]], 32
4826; CHECK-NEXT:    [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32
4827; CHECK-NEXT:    [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]]
4828; CHECK-NEXT:    [[TMP60:%.*]] = zext i32 [[TMP45]] to i64
4829; CHECK-NEXT:    [[TMP61:%.*]] = zext i32 [[TMP59]] to i64
4830; CHECK-NEXT:    [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]]
4831; CHECK-NEXT:    [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32
4832; CHECK-NEXT:    [[TMP64:%.*]] = lshr i64 [[TMP62]], 32
4833; CHECK-NEXT:    [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32
4834; CHECK-NEXT:    [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]]
4835; CHECK-NEXT:    [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]]
4836; CHECK-NEXT:    [[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]]
4837; CHECK-NEXT:    [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]]
4838; CHECK-NEXT:    [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]]
4839; CHECK-NEXT:    [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]]
4840; CHECK-NEXT:    [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]]
4841; CHECK-NEXT:    [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]]
4842; CHECK-NEXT:    [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]]
4843; CHECK-NEXT:    [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]]
4844; CHECK-NEXT:    [[TMP76:%.*]] = insertelement <2 x i32> [[TMP38]], i32 [[TMP75]], i64 1
4845; CHECK-NEXT:    store <2 x i32> [[TMP76]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
4846; CHECK-NEXT:    ret void
4847;
4848; GCN-LABEL: srem_v2i32_pow2_shl_denom:
4849; GCN:       ; %bb.0:
4850; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
4851; GCN-NEXT:    s_movk_i32 s6, 0x1000
4852; GCN-NEXT:    s_mov_b32 s7, 0x4f7ffffe
4853; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4854; GCN-NEXT:    s_lshl_b32 s2, s6, s2
4855; GCN-NEXT:    s_ashr_i32 s4, s2, 31
4856; GCN-NEXT:    s_add_i32 s2, s2, s4
4857; GCN-NEXT:    s_xor_b32 s2, s2, s4
4858; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s2
4859; GCN-NEXT:    s_lshl_b32 s3, s6, s3
4860; GCN-NEXT:    s_ashr_i32 s6, s3, 31
4861; GCN-NEXT:    s_add_i32 s3, s3, s6
4862; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
4863; GCN-NEXT:    s_xor_b32 s3, s3, s6
4864; GCN-NEXT:    s_sub_i32 s6, 0, s2
4865; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4866; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4867; GCN-NEXT:    v_mul_f32_e32 v0, s7, v0
4868; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
4869; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s3
4870; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4871; GCN-NEXT:    s_ashr_i32 s8, s0, 31
4872; GCN-NEXT:    v_mul_lo_u32 v2, s6, v0
4873; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
4874; GCN-NEXT:    s_add_i32 s0, s0, s8
4875; GCN-NEXT:    s_xor_b32 s0, s0, s8
4876; GCN-NEXT:    v_mul_hi_u32 v2, v0, v2
4877; GCN-NEXT:    v_mul_f32_e32 v1, s7, v1
4878; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
4879; GCN-NEXT:    s_sub_i32 s6, 0, s3
4880; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
4881; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
4882; GCN-NEXT:    v_mul_lo_u32 v2, s6, v1
4883; GCN-NEXT:    s_ashr_i32 s9, s1, 31
4884; GCN-NEXT:    s_add_i32 s1, s1, s9
4885; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
4886; GCN-NEXT:    v_mul_hi_u32 v2, v1, v2
4887; GCN-NEXT:    s_mov_b32 s7, 0xf000
4888; GCN-NEXT:    s_mov_b32 s6, -1
4889; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
4890; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v0
4891; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
4892; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
4893; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v0
4894; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
4895; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
4896; GCN-NEXT:    s_xor_b32 s0, s1, s9
4897; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
4898; GCN-NEXT:    v_mul_hi_u32 v1, s0, v1
4899; GCN-NEXT:    v_xor_b32_e32 v0, s8, v0
4900; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
4901; GCN-NEXT:    v_mul_lo_u32 v1, v1, s3
4902; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s0, v1
4903; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
4904; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
4905; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4906; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
4907; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
4908; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4909; GCN-NEXT:    v_xor_b32_e32 v1, s9, v1
4910; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s9, v1
4911; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4912; GCN-NEXT:    s_endpgm
; Each lane's denominator is built at run time as 4096 << %y[lane].
4913  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
4914  %r = srem <2 x i32> %x, %shl.y
4915  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
4916  ret void
4917}
4918
; Unsigned 64-bit division of a kernel argument by the odd constant
; 1235195949943 (0x11f976a7377; the llc output below uses the halves 0x11f
; and 0x976a7377 as separate 32-bit operands).
; The opt-run assertions expect the udiv to stay as-is in the IR.
; The llc-run assertions expect a long inline expansion with no call: a
; float reciprocal seed (v_rcp_f32), 32-bit mul_lo/mul_hi products chained
; with add/addc carries, a 64-bit subtract of quotient * denominator, and a
; final compare/select chain that chooses between the estimate, the
; estimate + 1 and the estimate + 2.
4919define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
4920; CHECK-LABEL: @udiv_i64_oddk_denom(
4921; CHECK-NEXT:    [[R:%.*]] = udiv i64 [[X:%.*]], 1235195949943
4922; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
4923; CHECK-NEXT:    ret void
4924;
4925; GCN-LABEL: udiv_i64_oddk_denom:
4926; GCN:       ; %bb.0:
4927; GCN-NEXT:    v_mov_b32_e32 v0, 0x4f176a73
4928; GCN-NEXT:    v_mov_b32_e32 v1, 0x4f800000
4929; GCN-NEXT:    v_madmk_f32 v0, v1, 0x438f8000, v0
4930; GCN-NEXT:    v_rcp_f32_e32 v0, v0
4931; GCN-NEXT:    s_movk_i32 s2, 0xfee0
4932; GCN-NEXT:    s_mov_b32 s3, 0x68958c89
4933; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
4934; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
4935; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
4936; GCN-NEXT:    v_trunc_f32_e32 v1, v1
4937; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
4938; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
4939; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
4940; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4941; GCN-NEXT:    s_mov_b32 s4, s8
4942; GCN-NEXT:    s_movk_i32 s8, 0x11f
4943; GCN-NEXT:    v_mul_lo_u32 v2, v0, s2
4944; GCN-NEXT:    v_mul_hi_u32 v3, v0, s3
4945; GCN-NEXT:    v_mul_lo_u32 v4, v1, s3
4946; GCN-NEXT:    v_mul_lo_u32 v5, v0, s3
4947; GCN-NEXT:    s_mov_b32 s5, s9
4948; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
4949; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
4950; GCN-NEXT:    v_mul_lo_u32 v3, v0, v2
4951; GCN-NEXT:    v_mul_hi_u32 v4, v0, v5
4952; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
4953; GCN-NEXT:    v_mul_hi_u32 v7, v1, v2
4954; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
4955; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
4956; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v6, vcc
4957; GCN-NEXT:    v_mul_lo_u32 v6, v1, v5
4958; GCN-NEXT:    v_mul_hi_u32 v5, v1, v5
4959; GCN-NEXT:    s_movk_i32 s9, 0x11e
4960; GCN-NEXT:    s_mov_b32 s7, 0xf000
4961; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
4962; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
4963; GCN-NEXT:    v_mov_b32_e32 v4, 0
4964; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
4965; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
4966; GCN-NEXT:    v_mov_b32_e32 v6, 0
4967; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
4968; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
4969; GCN-NEXT:    v_mul_lo_u32 v5, v0, s2
4970; GCN-NEXT:    v_mul_hi_u32 v7, v0, s3
4971; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
4972; GCN-NEXT:    v_mul_lo_u32 v8, v2, s3
4973; GCN-NEXT:    s_mov_b32 s2, 0x976a7377
4974; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
4975; GCN-NEXT:    v_mul_lo_u32 v7, v0, s3
4976; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
4977; GCN-NEXT:    v_mul_lo_u32 v8, v0, v5
4978; GCN-NEXT:    v_mul_hi_u32 v10, v0, v5
4979; GCN-NEXT:    v_mul_hi_u32 v9, v0, v7
4980; GCN-NEXT:    v_mul_hi_u32 v11, v2, v5
4981; GCN-NEXT:    s_mov_b32 s6, -1
4982; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
4983; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
4984; GCN-NEXT:    v_mul_lo_u32 v10, v2, v7
4985; GCN-NEXT:    v_mul_hi_u32 v7, v2, v7
4986; GCN-NEXT:    v_mul_lo_u32 v2, v2, v5
4987; GCN-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
4988; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v9, v7, vcc
4989; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v11, v4, vcc
4990; GCN-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
4991; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
4992; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
4993; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[0:1]
4994; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
4995; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4996; GCN-NEXT:    v_mul_lo_u32 v2, s10, v1
4997; GCN-NEXT:    v_mul_hi_u32 v3, s10, v0
4998; GCN-NEXT:    v_mul_hi_u32 v5, s10, v1
4999; GCN-NEXT:    v_mul_hi_u32 v7, s11, v1
5000; GCN-NEXT:    v_mul_lo_u32 v1, s11, v1
5001; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5002; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
5003; GCN-NEXT:    v_mul_lo_u32 v5, s11, v0
5004; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
5005; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
5006; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
5007; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
5008; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
5009; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
5010; GCN-NEXT:    v_mul_lo_u32 v2, v0, s8
5011; GCN-NEXT:    v_mul_hi_u32 v3, v0, s2
5012; GCN-NEXT:    v_mul_lo_u32 v4, v1, s2
5013; GCN-NEXT:    v_mov_b32_e32 v5, s8
5014; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5015; GCN-NEXT:    v_mul_lo_u32 v3, v0, s2
5016; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
5017; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s11, v2
5018; GCN-NEXT:    v_sub_i32_e64 v3, s[0:1], s10, v3
5019; GCN-NEXT:    v_subb_u32_e64 v4, vcc, v4, v5, s[0:1]
5020; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s2, v3
5021; GCN-NEXT:    v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
5022; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s9, v4
5023; GCN-NEXT:    s_mov_b32 s10, 0x976a7376
5024; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
5025; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s10, v5
5026; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
5027; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v4
5028; GCN-NEXT:    v_cndmask_b32_e32 v4, v6, v5, vcc
5029; GCN-NEXT:    v_add_i32_e32 v5, vcc, 2, v0
5030; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
5031; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v0
5032; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
5033; GCN-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v4
5034; GCN-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[2:3]
5035; GCN-NEXT:    v_mov_b32_e32 v6, s11
5036; GCN-NEXT:    v_subb_u32_e64 v2, vcc, v6, v2, s[0:1]
5037; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s9, v2
5038; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
5039; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s10, v3
5040; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
5041; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v2
5042; GCN-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
5043; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
5044; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[2:3]
5045; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
5046; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5047; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5048; GCN-NEXT:    s_endpgm
5049  %r = udiv i64 %x, 1235195949943
5050  store i64 %r, i64 addrspace(1)* %out
5051  ret void
5052}
5053
; Unsigned 64-bit division of a kernel argument by the power-of-two constant
; 4096. The opt-run assertions expect the udiv to stay as-is in the IR.
; The llc-run assertions expect the division to become a single 64-bit
; scalar logical shift right by 12.
5054define amdgpu_kernel void @udiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
5055; CHECK-LABEL: @udiv_i64_pow2k_denom(
5056; CHECK-NEXT:    [[R:%.*]] = udiv i64 [[X:%.*]], 4096
5057; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
5058; CHECK-NEXT:    ret void
5059;
5060; GCN-LABEL: udiv_i64_pow2k_denom:
5061; GCN:       ; %bb.0:
5062; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
5063; GCN-NEXT:    s_mov_b32 s7, 0xf000
5064; GCN-NEXT:    s_mov_b32 s6, -1
5065; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5066; GCN-NEXT:    s_mov_b32 s4, s0
5067; GCN-NEXT:    s_mov_b32 s5, s1
5068; GCN-NEXT:    s_lshr_b64 s[0:1], s[2:3], 12
5069; GCN-NEXT:    v_mov_b32_e32 v0, s0
5070; GCN-NEXT:    v_mov_b32_e32 v1, s1
5071; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5072; GCN-NEXT:    s_endpgm
5073  %r = udiv i64 %x, 4096
5074  store i64 %r, i64 addrspace(1)* %out
5075  ret void
5076}
5077
; Unsigned 64-bit division where the denominator is 4096 shifted left by a
; second kernel argument. The opt-run assertions expect the shl and udiv to
; be left untouched in the IR.
; The llc-run assertions expect the division to fold into a shift: 12 is
; added to the loaded shift amount and the numerator is shifted right by
; the sum with one s_lshr_b64.
5078define amdgpu_kernel void @udiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
5079; CHECK-LABEL: @udiv_i64_pow2_shl_denom(
5080; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
5081; CHECK-NEXT:    [[R:%.*]] = udiv i64 [[X:%.*]], [[SHL_Y]]
5082; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
5083; CHECK-NEXT:    ret void
5084;
5085; GCN-LABEL: udiv_i64_pow2_shl_denom:
5086; GCN:       ; %bb.0:
5087; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
5088; GCN-NEXT:    s_load_dword s8, s[0:1], 0xd
5089; GCN-NEXT:    s_mov_b32 s3, 0xf000
5090; GCN-NEXT:    s_mov_b32 s2, -1
5091; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5092; GCN-NEXT:    s_mov_b32 s0, s4
5093; GCN-NEXT:    s_add_i32 s8, s8, 12
5094; GCN-NEXT:    s_mov_b32 s1, s5
5095; GCN-NEXT:    s_lshr_b64 s[4:5], s[6:7], s8
5096; GCN-NEXT:    v_mov_b32_e32 v0, s4
5097; GCN-NEXT:    v_mov_b32_e32 v1, s5
5098; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5099; GCN-NEXT:    s_endpgm
; Denominator is built at run time as 4096 << %y.
5100  %shl.y = shl i64 4096, %y
5101  %r = udiv i64 %x, %shl.y
5102  store i64 %r, i64 addrspace(1)* %out
5103  ret void
5104}
5105
; Unsigned division of a <2 x i64> kernel argument by the splat constant
; 4096. The opt-run assertions expect the vector udiv to be split into one
; scalar udiv by 4096 per element, with the results rebuilt via
; insertelement.
; The llc-run assertions expect one 64-bit scalar shift right by 12 per lane
; and a single 128-bit buffer store.
5106define amdgpu_kernel void @udiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
5107; CHECK-LABEL: @udiv_v2i64_pow2k_denom(
5108; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
5109; CHECK-NEXT:    [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096
5110; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
5111; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
5112; CHECK-NEXT:    [[TMP5:%.*]] = udiv i64 [[TMP4]], 4096
5113; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
5114; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
5115; CHECK-NEXT:    ret void
5116;
5117; GCN-LABEL: udiv_v2i64_pow2k_denom:
5118; GCN:       ; %bb.0:
5119; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5120; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
5121; GCN-NEXT:    s_mov_b32 s7, 0xf000
5122; GCN-NEXT:    s_mov_b32 s6, -1
5123; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5124; GCN-NEXT:    s_lshr_b64 s[0:1], s[0:1], 12
5125; GCN-NEXT:    s_lshr_b64 s[2:3], s[2:3], 12
5126; GCN-NEXT:    v_mov_b32_e32 v0, s0
5127; GCN-NEXT:    v_mov_b32_e32 v1, s1
5128; GCN-NEXT:    v_mov_b32_e32 v2, s2
5129; GCN-NEXT:    v_mov_b32_e32 v3, s3
5130; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
5131; GCN-NEXT:    s_endpgm
5132  %r = udiv <2 x i64> %x, <i64 4096, i64 4096>
5133  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
5134  ret void
5135}
5136
; Unsigned division of a <2 x i64> kernel argument by a non-uniform
; constant vector: lane 0 divides by 4096 (a power of two), lane 1 by 4095.
; The opt-run assertions expect the vector udiv to be split into two scalar
; udivs, one per constant, rebuilt via insertelement.
; The llc-run assertions expect the lanes to be lowered differently:
;   - lane 0 is a single s_lshr_b64 by 12 whose result is moved into v[0:1],
;   - lane 1 is a long inline expansion (float reciprocal seed, 32-bit
;     mul_lo/mul_hi products with carries, compare/select between the
;     estimate, estimate + 1 and estimate + 2) using 0xfff as the divisor,
;     with the result in v[2:3].
5137define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
5138; CHECK-LABEL: @udiv_v2i64_mixed_pow2k_denom(
5139; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
5140; CHECK-NEXT:    [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096
5141; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
5142; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
5143; CHECK-NEXT:    [[TMP5:%.*]] = udiv i64 [[TMP4]], 4095
5144; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
5145; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
5146; CHECK-NEXT:    ret void
5147;
5148; GCN-LABEL: udiv_v2i64_mixed_pow2k_denom:
5149; GCN:       ; %bb.0:
5150; GCN-NEXT:    v_mov_b32_e32 v0, 0x4f800000
5151; GCN-NEXT:    v_madak_f32 v0, 0, v0, 0x457ff000
5152; GCN-NEXT:    v_rcp_f32_e32 v0, v0
5153; GCN-NEXT:    s_movk_i32 s6, 0xf001
5154; GCN-NEXT:    v_mov_b32_e32 v7, 0
5155; GCN-NEXT:    v_mov_b32_e32 v2, 0
5156; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
5157; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
5158; GCN-NEXT:    v_trunc_f32_e32 v1, v1
5159; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
5160; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
5161; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
5162; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5163; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
5164; GCN-NEXT:    s_movk_i32 s0, 0xfff
5165; GCN-NEXT:    v_mul_hi_u32 v3, v0, s6
5166; GCN-NEXT:    v_mul_lo_u32 v5, v1, s6
5167; GCN-NEXT:    v_mul_lo_u32 v4, v0, s6
5168; GCN-NEXT:    s_mov_b32 s7, 0xf000
5169; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, v0, v3
5170; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
5171; GCN-NEXT:    v_mul_hi_u32 v6, v0, v4
5172; GCN-NEXT:    v_mul_lo_u32 v5, v0, v3
5173; GCN-NEXT:    v_mul_hi_u32 v8, v0, v3
5174; GCN-NEXT:    v_mul_hi_u32 v9, v1, v3
5175; GCN-NEXT:    v_mul_lo_u32 v3, v1, v3
5176; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
5177; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v7, v8, vcc
5178; GCN-NEXT:    v_mul_lo_u32 v8, v1, v4
5179; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
5180; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
5181; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v6, v4, vcc
5182; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v2, vcc
5183; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
5184; GCN-NEXT:    v_add_i32_e64 v0, s[2:3], v0, v3
5185; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v7, v5, vcc
5186; GCN-NEXT:    v_mul_hi_u32 v5, v0, s6
5187; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v1, v4, s[2:3]
5188; GCN-NEXT:    v_mul_lo_u32 v6, v3, s6
5189; GCN-NEXT:    v_mul_lo_u32 v8, v0, s6
5190; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, v0, v5
5191; GCN-NEXT:    s_mov_b32 s6, -1
5192; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
5193; GCN-NEXT:    v_mul_lo_u32 v6, v0, v5
5194; GCN-NEXT:    v_mul_hi_u32 v9, v0, v8
5195; GCN-NEXT:    v_mul_hi_u32 v10, v0, v5
5196; GCN-NEXT:    v_mul_hi_u32 v11, v3, v5
5197; GCN-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
5198; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v7, v10, vcc
5199; GCN-NEXT:    v_mul_lo_u32 v10, v3, v8
5200; GCN-NEXT:    v_mul_hi_u32 v8, v3, v8
5201; GCN-NEXT:    v_mul_lo_u32 v3, v3, v5
5202; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
5203; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v9, v8, vcc
5204; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v11, v2, vcc
5205; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
5206; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
5207; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
5208; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[2:3]
5209; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
5210; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
5211; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5212; GCN-NEXT:    v_mul_lo_u32 v3, s10, v1
5213; GCN-NEXT:    v_mul_hi_u32 v4, s10, v0
5214; GCN-NEXT:    v_mul_hi_u32 v5, s10, v1
5215; GCN-NEXT:    v_mul_hi_u32 v6, s11, v1
5216; GCN-NEXT:    v_mul_lo_u32 v1, s11, v1
5217; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
5218; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v7, v5, vcc
5219; GCN-NEXT:    v_mul_lo_u32 v5, s11, v0
5220; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
5221; GCN-NEXT:    s_lshr_b64 s[2:3], s[8:9], 12
5222; GCN-NEXT:    s_movk_i32 s8, 0xffe
5223; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
5224; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v4, v0, vcc
5225; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v6, v2, vcc
5226; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
5227; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v7, v2, vcc
5228; GCN-NEXT:    v_mul_lo_u32 v2, v1, s0
5229; GCN-NEXT:    v_mul_hi_u32 v3, v0, s0
5230; GCN-NEXT:    v_mul_lo_u32 v4, v0, s0
5231; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5232; GCN-NEXT:    v_mov_b32_e32 v3, s11
5233; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s10, v4
5234; GCN-NEXT:    v_subb_u32_e32 v2, vcc, v3, v2, vcc
5235; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s0, v4
5236; GCN-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v2, vcc
5237; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s8, v3
5238; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
5239; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
5240; GCN-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
5241; GCN-NEXT:    v_add_i32_e32 v5, vcc, 2, v0
5242; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
5243; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v0
5244; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
5245; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s8, v4
5246; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
5247; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
5248; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v3
5249; GCN-NEXT:    v_cndmask_b32_e32 v2, -1, v4, vcc
5250; GCN-NEXT:    v_cndmask_b32_e64 v3, v8, v6, s[0:1]
5251; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
5252; GCN-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
5253; GCN-NEXT:    v_cndmask_b32_e64 v1, v7, v5, s[0:1]
5254; GCN-NEXT:    v_cndmask_b32_e32 v2, v0, v1, vcc
5255; GCN-NEXT:    v_mov_b32_e32 v0, s2
5256; GCN-NEXT:    v_mov_b32_e32 v1, s3
5257; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
5258; GCN-NEXT:    s_endpgm
5259  %r = udiv <2 x i64> %x, <i64 4096, i64 4095>
5260  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
5261  ret void
5262}
5263
; Unsigned division of a <2 x i64> kernel argument by a per-lane run-time
; denominator, 4096 shifted left by the matching lane of %y.
; The opt-run assertions expect the vector shl to be kept and the udiv to be
; split into one scalar udiv per element (extractelement on both operands,
; insertelement on the results).
; The llc-run assertions expect each lane to fold into a shift: 12 is added
; to the lane's shift amount and the numerator lane is shifted right with
; s_lshr_b64, followed by a single 128-bit buffer store.
5264define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
5265; CHECK-LABEL: @udiv_v2i64_pow2_shl_denom(
5266; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
5267; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
5268; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
5269; CHECK-NEXT:    [[TMP3:%.*]] = udiv i64 [[TMP1]], [[TMP2]]
5270; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0
5271; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
5272; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
5273; CHECK-NEXT:    [[TMP7:%.*]] = udiv i64 [[TMP5]], [[TMP6]]
5274; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
5275; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
5276; CHECK-NEXT:    ret void
5277;
5278; GCN-LABEL: udiv_v2i64_pow2_shl_denom:
5279; GCN:       ; %bb.0:
5280; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5281; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
5282; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x11
5283; GCN-NEXT:    s_mov_b32 s7, 0xf000
5284; GCN-NEXT:    s_mov_b32 s6, -1
5285; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5286; GCN-NEXT:    s_add_i32 s0, s0, 12
5287; GCN-NEXT:    s_add_i32 s2, s2, 12
5288; GCN-NEXT:    s_lshr_b64 s[0:1], s[8:9], s0
5289; GCN-NEXT:    s_lshr_b64 s[2:3], s[10:11], s2
5290; GCN-NEXT:    v_mov_b32_e32 v0, s0
5291; GCN-NEXT:    v_mov_b32_e32 v1, s1
5292; GCN-NEXT:    v_mov_b32_e32 v2, s2
5293; GCN-NEXT:    v_mov_b32_e32 v3, s3
5294; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
5295; GCN-NEXT:    s_endpgm
; Each lane's denominator is built at run time as 4096 << %y[lane].
5296  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
5297  %r = udiv <2 x i64> %x, %shl.y
5298  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
5299  ret void
5300}
5301
; Unsigned remainder of i64 %x by the constant 1235195393993, which is not a
; power of two.
; IR assertions (opt run): the urem is left untouched.
; ISA assertions (llc run): a long inline expansion is expected. It starts from
; a float reciprocal (v_rcp_f32), continues with 32-bit multiply and
; add-with-carry sequences, and ends with compare-and-select steps against the
; divisor. The divisor appears split into a high dword (0x11f) and a low dword
; (0x9761f7c9); 0x11e and 0x9761f7c8 are those dwords minus one and are used
; as the first operand of the v_cmp_lt_u32 comparisons.
5302define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
5303; CHECK-LABEL: @urem_i64_oddk_denom(
5304; CHECK-NEXT:    [[R:%.*]] = urem i64 [[X:%.*]], 1235195393993
5305; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
5306; CHECK-NEXT:    ret void
5307;
5308; GCN-LABEL: urem_i64_oddk_denom:
5309; GCN:       ; %bb.0:
5310; GCN-NEXT:    v_mov_b32_e32 v0, 0x4f1761f8
5311; GCN-NEXT:    v_mov_b32_e32 v1, 0x4f800000
5312; GCN-NEXT:    v_madmk_f32 v0, v1, 0x438f8000, v0
5313; GCN-NEXT:    v_rcp_f32_e32 v0, v0
5314; GCN-NEXT:    s_movk_i32 s2, 0xfee0
5315; GCN-NEXT:    s_mov_b32 s3, 0x689e0837
5316; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
5317; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
5318; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
5319; GCN-NEXT:    v_trunc_f32_e32 v1, v1
5320; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
5321; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
5322; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
5323; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5324; GCN-NEXT:    s_mov_b32 s4, s8
5325; GCN-NEXT:    s_movk_i32 s8, 0x11f
5326; GCN-NEXT:    v_mul_lo_u32 v2, v0, s2
5327; GCN-NEXT:    v_mul_hi_u32 v3, v0, s3
5328; GCN-NEXT:    v_mul_lo_u32 v4, v1, s3
5329; GCN-NEXT:    v_mul_lo_u32 v5, v0, s3
5330; GCN-NEXT:    s_mov_b32 s12, 0x9761f7c9
5331; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5332; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
5333; GCN-NEXT:    v_mul_lo_u32 v3, v0, v2
5334; GCN-NEXT:    v_mul_hi_u32 v4, v0, v5
5335; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
5336; GCN-NEXT:    v_mul_hi_u32 v7, v1, v2
5337; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
5338; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
5339; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v6, vcc
5340; GCN-NEXT:    v_mul_lo_u32 v6, v1, v5
5341; GCN-NEXT:    v_mul_hi_u32 v5, v1, v5
5342; GCN-NEXT:    s_mov_b32 s5, s9
5343; GCN-NEXT:    s_movk_i32 s9, 0x11e
5344; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
5345; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
5346; GCN-NEXT:    v_mov_b32_e32 v4, 0
5347; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
5348; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5349; GCN-NEXT:    v_mov_b32_e32 v6, 0
5350; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
5351; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
5352; GCN-NEXT:    v_mul_lo_u32 v5, v0, s2
5353; GCN-NEXT:    v_mul_hi_u32 v7, v0, s3
5354; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
5355; GCN-NEXT:    v_mul_lo_u32 v8, v2, s3
5356; GCN-NEXT:    s_mov_b32 s7, 0xf000
5357; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
5358; GCN-NEXT:    v_mul_lo_u32 v7, v0, s3
5359; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
5360; GCN-NEXT:    v_mul_lo_u32 v8, v0, v5
5361; GCN-NEXT:    v_mul_hi_u32 v10, v0, v5
5362; GCN-NEXT:    v_mul_hi_u32 v9, v0, v7
5363; GCN-NEXT:    v_mul_hi_u32 v11, v2, v5
5364; GCN-NEXT:    s_mov_b32 s6, -1
5365; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
5366; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
5367; GCN-NEXT:    v_mul_lo_u32 v10, v2, v7
5368; GCN-NEXT:    v_mul_hi_u32 v7, v2, v7
5369; GCN-NEXT:    v_mul_lo_u32 v2, v2, v5
5370; GCN-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
5371; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v9, v7, vcc
5372; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v11, v4, vcc
5373; GCN-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
5374; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
5375; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
5376; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[0:1]
5377; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
5378; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
5379; GCN-NEXT:    v_mul_lo_u32 v2, s10, v1
5380; GCN-NEXT:    v_mul_hi_u32 v3, s10, v0
5381; GCN-NEXT:    v_mul_hi_u32 v5, s10, v1
5382; GCN-NEXT:    v_mul_hi_u32 v7, s11, v1
5383; GCN-NEXT:    v_mul_lo_u32 v1, s11, v1
5384; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5385; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
5386; GCN-NEXT:    v_mul_lo_u32 v5, s11, v0
5387; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
5388; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
5389; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
5390; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
5391; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
5392; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
5393; GCN-NEXT:    v_mul_lo_u32 v2, v0, s8
5394; GCN-NEXT:    v_mul_hi_u32 v3, v0, s12
5395; GCN-NEXT:    v_mul_lo_u32 v1, v1, s12
5396; GCN-NEXT:    v_mul_lo_u32 v0, v0, s12
5397; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5398; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
5399; GCN-NEXT:    v_sub_i32_e64 v0, s[0:1], s10, v0
5400; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s11, v1
5401; GCN-NEXT:    v_mov_b32_e32 v3, s8
5402; GCN-NEXT:    v_subb_u32_e64 v2, vcc, v2, v3, s[0:1]
5403; GCN-NEXT:    v_subrev_i32_e64 v4, s[2:3], s12, v0
5404; GCN-NEXT:    v_subbrev_u32_e64 v5, vcc, 0, v2, s[2:3]
5405; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s9, v5
5406; GCN-NEXT:    s_mov_b32 s10, 0x9761f7c8
5407; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
5408; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s10, v4
5409; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
5410; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v5
5411; GCN-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
5412; GCN-NEXT:    v_subb_u32_e64 v2, vcc, v2, v3, s[2:3]
5413; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s12, v4
5414; GCN-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
5415; GCN-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v6
5416; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[2:3]
5417; GCN-NEXT:    v_mov_b32_e32 v5, s11
5418; GCN-NEXT:    v_subb_u32_e64 v1, vcc, v5, v1, s[0:1]
5419; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s9, v1
5420; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
5421; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s10, v0
5422; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
5423; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v1
5424; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
5425; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
5426; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
5427; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[2:3]
5428; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5429; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5430; GCN-NEXT:    s_endpgm
5431  %r = urem i64 %x, 1235195393993
5432  store i64 %r, i64 addrspace(1)* %out
5433  ret void
5434}
5435
; Unsigned remainder of i64 %x by the power-of-two constant 4096.
; IR assertions (opt run): the urem is left untouched.
; ISA assertions (llc run): the result is formed by masking the low dword of
; %x with 0xfff (s_and_b32) and storing 0 as the high dword; no division
; expansion is expected.
5436define amdgpu_kernel void @urem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
5437; CHECK-LABEL: @urem_i64_pow2k_denom(
5438; CHECK-NEXT:    [[R:%.*]] = urem i64 [[X:%.*]], 4096
5439; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
5440; CHECK-NEXT:    ret void
5441;
5442; GCN-LABEL: urem_i64_pow2k_denom:
5443; GCN:       ; %bb.0:
5444; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
5445; GCN-NEXT:    s_mov_b32 s3, 0xf000
5446; GCN-NEXT:    s_mov_b32 s2, -1
5447; GCN-NEXT:    v_mov_b32_e32 v1, 0
5448; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5449; GCN-NEXT:    s_mov_b32 s0, s4
5450; GCN-NEXT:    s_and_b32 s4, s6, 0xfff
5451; GCN-NEXT:    s_mov_b32 s1, s5
5452; GCN-NEXT:    v_mov_b32_e32 v0, s4
5453; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5454; GCN-NEXT:    s_endpgm
5455  %r = urem i64 %x, 4096
5456  store i64 %r, i64 addrspace(1)* %out
5457  ret void
5458}
5459
; Unsigned remainder of i64 %x by the variable divisor (4096 << %y).
; IR assertions (opt run): the shl and the urem are left untouched.
; ISA assertions (llc run): the divisor is materialized with s_lshl_b64, turned
; into a mask by adding -1 across both dwords (s_add_u32 / s_addc_u32), and
; ANDed with %x (s_and_b64); no division expansion is expected.
5460define amdgpu_kernel void @urem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
5461; CHECK-LABEL: @urem_i64_pow2_shl_denom(
5462; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
5463; CHECK-NEXT:    [[R:%.*]] = urem i64 [[X:%.*]], [[SHL_Y]]
5464; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
5465; CHECK-NEXT:    ret void
5466;
5467; GCN-LABEL: urem_i64_pow2_shl_denom:
5468; GCN:       ; %bb.0:
5469; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
5470; GCN-NEXT:    s_load_dword s8, s[0:1], 0xd
5471; GCN-NEXT:    s_mov_b32 s3, 0xf000
5472; GCN-NEXT:    s_mov_b32 s2, -1
5473; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5474; GCN-NEXT:    s_mov_b32 s0, s4
5475; GCN-NEXT:    s_mov_b32 s1, s5
5476; GCN-NEXT:    s_mov_b32 s5, 0
5477; GCN-NEXT:    s_movk_i32 s4, 0x1000
5478; GCN-NEXT:    s_lshl_b64 s[4:5], s[4:5], s8
5479; GCN-NEXT:    s_add_u32 s4, s4, -1
5480; GCN-NEXT:    s_addc_u32 s5, s5, -1
5481; GCN-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
5482; GCN-NEXT:    v_mov_b32_e32 v0, s4
5483; GCN-NEXT:    v_mov_b32_e32 v1, s5
5484; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5485; GCN-NEXT:    s_endpgm
5486  %shl.y = shl i64 4096, %y
5487  %r = urem i64 %x, %shl.y
5488  store i64 %r, i64 addrspace(1)* %out
5489  ret void
5490}
5491
; Unsigned remainder of <2 x i64> %x by the splat constant 4096.
; IR assertions (opt run): the vector urem is split into one scalar i64 urem
; per lane (extractelement / urem / insertelement).
; ISA assertions (llc run): each lane's low dword is masked with 0xfff
; (s_and_b32) and both high dwords are stored as 0; no division expansion is
; expected.
5492define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
5493; CHECK-LABEL: @urem_v2i64_pow2k_denom(
5494; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
5495; CHECK-NEXT:    [[TMP2:%.*]] = urem i64 [[TMP1]], 4096
5496; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
5497; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
5498; CHECK-NEXT:    [[TMP5:%.*]] = urem i64 [[TMP4]], 4096
5499; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
5500; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
5501; CHECK-NEXT:    ret void
5502;
5503; GCN-LABEL: urem_v2i64_pow2k_denom:
5504; GCN:       ; %bb.0:
5505; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5506; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
5507; GCN-NEXT:    s_movk_i32 s8, 0xfff
5508; GCN-NEXT:    v_mov_b32_e32 v1, 0
5509; GCN-NEXT:    s_mov_b32 s7, 0xf000
5510; GCN-NEXT:    s_mov_b32 s6, -1
5511; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5512; GCN-NEXT:    s_and_b32 s0, s0, s8
5513; GCN-NEXT:    s_and_b32 s1, s2, s8
5514; GCN-NEXT:    v_mov_b32_e32 v0, s0
5515; GCN-NEXT:    v_mov_b32_e32 v2, s1
5516; GCN-NEXT:    v_mov_b32_e32 v3, v1
5517; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
5518; GCN-NEXT:    s_endpgm
5519  %r = urem <2 x i64> %x, <i64 4096, i64 4096>
5520  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
5521  ret void
5522}
5523
; Unsigned remainder of <2 x i64> %x by a per-lane divisor of (4096 << %y).
; IR assertions (opt run): the vector urem is split into one scalar i64 urem
; per lane (extractelement / urem / insertelement).
; ISA assertions (llc run): for each lane the divisor is built with s_lshl_b64,
; decremented across both dwords (s_add_u32 / s_addc_u32 with -1) to form a
; mask, and ANDed with the matching lane of %x (s_and_b64); no division
; expansion is expected.
5524define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
5525; CHECK-LABEL: @urem_v2i64_pow2_shl_denom(
5526; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
5527; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
5528; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
5529; CHECK-NEXT:    [[TMP3:%.*]] = urem i64 [[TMP1]], [[TMP2]]
5530; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0
5531; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
5532; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
5533; CHECK-NEXT:    [[TMP7:%.*]] = urem i64 [[TMP5]], [[TMP6]]
5534; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
5535; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
5536; CHECK-NEXT:    ret void
5537;
5538; GCN-LABEL: urem_v2i64_pow2_shl_denom:
5539; GCN:       ; %bb.0:
5540; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5541; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
5542; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x11
5543; GCN-NEXT:    s_mov_b32 s13, 0
5544; GCN-NEXT:    s_movk_i32 s12, 0x1000
5545; GCN-NEXT:    s_mov_b32 s7, 0xf000
5546; GCN-NEXT:    s_mov_b32 s6, -1
5547; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5548; GCN-NEXT:    s_lshl_b64 s[2:3], s[12:13], s2
5549; GCN-NEXT:    s_lshl_b64 s[0:1], s[12:13], s0
5550; GCN-NEXT:    s_add_u32 s0, s0, -1
5551; GCN-NEXT:    s_addc_u32 s1, s1, -1
5552; GCN-NEXT:    s_and_b64 s[0:1], s[8:9], s[0:1]
5553; GCN-NEXT:    s_add_u32 s2, s2, -1
5554; GCN-NEXT:    s_addc_u32 s3, s3, -1
5555; GCN-NEXT:    s_and_b64 s[2:3], s[10:11], s[2:3]
5556; GCN-NEXT:    v_mov_b32_e32 v0, s0
5557; GCN-NEXT:    v_mov_b32_e32 v1, s1
5558; GCN-NEXT:    v_mov_b32_e32 v2, s2
5559; GCN-NEXT:    v_mov_b32_e32 v3, s3
5560; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
5561; GCN-NEXT:    s_endpgm
5562  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
5563  %r = urem <2 x i64> %x, %shl.y
5564  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
5565  ret void
5566}
5567
; Signed division of i64 %x by the constant 1235195 (0x12d8fb), which is not a
; power of two.
; IR assertions (opt run): the sdiv is left untouched.
; ISA assertions (llc run): a long inline expansion is expected. A sign mask of
; %x is produced with s_ashr_i32 ..., 31 and applied with an add plus s_xor_b64
; before the multiply / add-with-carry sequence that follows the v_rcp_f32
; reciprocal. The quotient candidate is then chosen among q, q+1 and q+2 via
; v_cndmask_b32, and the same mask is applied again with xor and subtract
; before the store.
5568define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
5569; CHECK-LABEL: @sdiv_i64_oddk_denom(
5570; CHECK-NEXT:    [[R:%.*]] = sdiv i64 [[X:%.*]], 1235195
5571; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
5572; CHECK-NEXT:    ret void
5573;
5574; GCN-LABEL: sdiv_i64_oddk_denom:
5575; GCN:       ; %bb.0:
5576; GCN-NEXT:    v_mov_b32_e32 v0, 0x4f800000
5577; GCN-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
5578; GCN-NEXT:    v_rcp_f32_e32 v0, v0
5579; GCN-NEXT:    s_mov_b32 s2, 0xffed2705
5580; GCN-NEXT:    v_mov_b32_e32 v8, 0
5581; GCN-NEXT:    v_mov_b32_e32 v7, 0
5582; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
5583; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
5584; GCN-NEXT:    v_trunc_f32_e32 v1, v1
5585; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
5586; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
5587; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
5588; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
5589; GCN-NEXT:    s_mov_b32 s7, 0xf000
5590; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
5591; GCN-NEXT:    v_mul_lo_u32 v2, v1, s2
5592; GCN-NEXT:    v_mul_lo_u32 v4, v0, s2
5593; GCN-NEXT:    s_mov_b32 s6, -1
5594; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5595; GCN-NEXT:    s_mov_b32 s4, s8
5596; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5597; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
5598; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
5599; GCN-NEXT:    v_mul_hi_u32 v6, v0, v4
5600; GCN-NEXT:    v_mul_hi_u32 v3, v0, v2
5601; GCN-NEXT:    v_mul_hi_u32 v9, v1, v2
5602; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
5603; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
5604; GCN-NEXT:    v_mul_lo_u32 v6, v1, v4
5605; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
5606; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v3, vcc
5607; GCN-NEXT:    s_mov_b32 s5, s9
5608; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
5609; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
5610; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
5611; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5612; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
5613; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
5614; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
5615; GCN-NEXT:    v_mul_lo_u32 v4, v2, s2
5616; GCN-NEXT:    v_mul_hi_u32 v5, s2, v0
5617; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
5618; GCN-NEXT:    v_mul_lo_u32 v5, v0, s2
5619; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, v0, v4
5620; GCN-NEXT:    v_mul_lo_u32 v10, v0, v4
5621; GCN-NEXT:    v_mul_hi_u32 v12, v0, v4
5622; GCN-NEXT:    v_mul_hi_u32 v11, v0, v5
5623; GCN-NEXT:    v_mul_hi_u32 v9, v2, v5
5624; GCN-NEXT:    v_mul_lo_u32 v5, v2, v5
5625; GCN-NEXT:    v_mul_hi_u32 v6, v2, v4
5626; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
5627; GCN-NEXT:    v_addc_u32_e32 v11, vcc, v8, v12, vcc
5628; GCN-NEXT:    v_mul_lo_u32 v2, v2, v4
5629; GCN-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
5630; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v11, v9, vcc
5631; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v6, v7, vcc
5632; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
5633; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
5634; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
5635; GCN-NEXT:    s_ashr_i32 s2, s11, 31
5636; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[0:1]
5637; GCN-NEXT:    s_add_u32 s0, s10, s2
5638; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
5639; GCN-NEXT:    s_mov_b32 s3, s2
5640; GCN-NEXT:    s_addc_u32 s1, s11, s2
5641; GCN-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
5642; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
5643; GCN-NEXT:    v_mul_lo_u32 v2, s0, v1
5644; GCN-NEXT:    v_mul_hi_u32 v3, s0, v0
5645; GCN-NEXT:    v_mul_hi_u32 v4, s0, v1
5646; GCN-NEXT:    v_mul_hi_u32 v5, s1, v1
5647; GCN-NEXT:    v_mul_lo_u32 v1, s1, v1
5648; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5649; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
5650; GCN-NEXT:    v_mul_lo_u32 v4, s1, v0
5651; GCN-NEXT:    v_mul_hi_u32 v0, s1, v0
5652; GCN-NEXT:    s_mov_b32 s3, 0x12d8fb
5653; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
5654; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
5655; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
5656; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
5657; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
5658; GCN-NEXT:    v_mul_lo_u32 v2, v1, s3
5659; GCN-NEXT:    v_mul_hi_u32 v3, s3, v0
5660; GCN-NEXT:    v_mul_lo_u32 v4, v0, s3
5661; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5662; GCN-NEXT:    v_mov_b32_e32 v3, s1
5663; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s0, v4
5664; GCN-NEXT:    v_subb_u32_e32 v2, vcc, v3, v2, vcc
5665; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s3, v4
5666; GCN-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v2, vcc
5667; GCN-NEXT:    s_mov_b32 s3, 0x12d8fa
5668; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s3, v3
5669; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
5670; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
5671; GCN-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
5672; GCN-NEXT:    v_add_i32_e32 v5, vcc, 2, v0
5673; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
5674; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v0
5675; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
5676; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s3, v4
5677; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
5678; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
5679; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v3
5680; GCN-NEXT:    v_cndmask_b32_e32 v2, -1, v4, vcc
5681; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
5682; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
5683; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5684; GCN-NEXT:    v_cndmask_b32_e64 v3, v8, v6, s[0:1]
5685; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
5686; GCN-NEXT:    v_xor_b32_e32 v0, s2, v0
5687; GCN-NEXT:    v_xor_b32_e32 v1, s2, v1
5688; GCN-NEXT:    v_mov_b32_e32 v2, s2
5689; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
5690; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
5691; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5692; GCN-NEXT:    s_endpgm
5693  %r = sdiv i64 %x, 1235195
5694  store i64 %r, i64 addrspace(1)* %out
5695  ret void
5696}
5697
; Signed division of i64 %x by the power-of-two constant 4096.
; IR assertions (opt run): the sdiv is left untouched.
; ISA assertions (llc run): no division expansion is expected. The sign of %x
; is broadcast (s_ashr_i32 ..., 31) and logically shifted right by 20, giving a
; bias of 0xfff for negative inputs and 0 otherwise; the bias is added to %x
; with carry into the high dword, and the sum is arithmetically shifted right
; by 12 (s_ashr_i64).
5698define amdgpu_kernel void @sdiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
5699; CHECK-LABEL: @sdiv_i64_pow2k_denom(
5700; CHECK-NEXT:    [[R:%.*]] = sdiv i64 [[X:%.*]], 4096
5701; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
5702; CHECK-NEXT:    ret void
5703;
5704; GCN-LABEL: sdiv_i64_pow2k_denom:
5705; GCN:       ; %bb.0:
5706; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
5707; GCN-NEXT:    s_mov_b32 s7, 0xf000
5708; GCN-NEXT:    s_mov_b32 s6, -1
5709; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5710; GCN-NEXT:    s_mov_b32 s4, s0
5711; GCN-NEXT:    s_ashr_i32 s0, s3, 31
5712; GCN-NEXT:    s_lshr_b32 s0, s0, 20
5713; GCN-NEXT:    s_add_u32 s0, s2, s0
5714; GCN-NEXT:    s_mov_b32 s5, s1
5715; GCN-NEXT:    s_addc_u32 s1, s3, 0
5716; GCN-NEXT:    s_ashr_i64 s[0:1], s[0:1], 12
5717; GCN-NEXT:    v_mov_b32_e32 v0, s0
5718; GCN-NEXT:    v_mov_b32_e32 v1, s1
5719; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5720; GCN-NEXT:    s_endpgm
5721  %r = sdiv i64 %x, 4096
5722  store i64 %r, i64 addrspace(1)* %out
5723  ret void
5724}
5725
; Signed division of i64 %x by the variable divisor (4096 << %y).
; IR assertions (opt run): the shl and the sdiv are left untouched.
; ISA assertions (llc run): no shift-based shortcut appears in the expected
; output; the full inline division expansion is emitted. Sign masks of the
; divisor and of %x are produced with s_ashr_i32 ..., 31 and applied with an
; add plus s_xor_b64, the magnitude division runs through the v_rcp_f32 based
; multiply / add-with-carry sequence, the quotient is chosen among q, q+1 and
; q+2 via v_cndmask_b32, and the result sign is applied using the xor of the
; two sign masks followed by a subtract.
5726define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
5727; CHECK-LABEL: @sdiv_i64_pow2_shl_denom(
5728; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
5729; CHECK-NEXT:    [[R:%.*]] = sdiv i64 [[X:%.*]], [[SHL_Y]]
5730; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
5731; CHECK-NEXT:    ret void
5732;
5733; GCN-LABEL: sdiv_i64_pow2_shl_denom:
5734; GCN:       ; %bb.0:
5735; GCN-NEXT:    s_load_dword s4, s[0:1], 0xd
5736; GCN-NEXT:    s_mov_b32 s3, 0
5737; GCN-NEXT:    s_movk_i32 s2, 0x1000
5738; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
5739; GCN-NEXT:    s_mov_b32 s7, 0xf000
5740; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5741; GCN-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
5742; GCN-NEXT:    s_ashr_i32 s12, s3, 31
5743; GCN-NEXT:    s_add_u32 s2, s2, s12
5744; GCN-NEXT:    s_mov_b32 s13, s12
5745; GCN-NEXT:    s_addc_u32 s3, s3, s12
5746; GCN-NEXT:    s_xor_b64 s[14:15], s[2:3], s[12:13]
5747; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s14
5748; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s15
5749; GCN-NEXT:    s_sub_u32 s2, 0, s14
5750; GCN-NEXT:    s_subb_u32 s3, 0, s15
5751; GCN-NEXT:    s_ashr_i32 s16, s11, 31
5752; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
5753; GCN-NEXT:    v_rcp_f32_e32 v0, v0
5754; GCN-NEXT:    s_mov_b32 s17, s16
5755; GCN-NEXT:    s_mov_b32 s6, -1
5756; GCN-NEXT:    s_mov_b32 s4, s8
5757; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
5758; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
5759; GCN-NEXT:    v_trunc_f32_e32 v1, v1
5760; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
5761; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
5762; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
5763; GCN-NEXT:    s_mov_b32 s5, s9
5764; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
5765; GCN-NEXT:    v_mul_lo_u32 v2, s2, v1
5766; GCN-NEXT:    v_mul_lo_u32 v5, s3, v0
5767; GCN-NEXT:    v_mul_lo_u32 v4, s2, v0
5768; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5769; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
5770; GCN-NEXT:    v_mul_hi_u32 v3, v0, v4
5771; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
5772; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
5773; GCN-NEXT:    v_mul_hi_u32 v7, v1, v2
5774; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
5775; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
5776; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
5777; GCN-NEXT:    v_mul_lo_u32 v6, v1, v4
5778; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
5779; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
5780; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
5781; GCN-NEXT:    v_mov_b32_e32 v4, 0
5782; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
5783; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5784; GCN-NEXT:    v_mov_b32_e32 v6, 0
5785; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
5786; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
5787; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
5788; GCN-NEXT:    v_mul_lo_u32 v5, s2, v2
5789; GCN-NEXT:    v_mul_hi_u32 v7, s2, v0
5790; GCN-NEXT:    v_mul_lo_u32 v8, s3, v0
5791; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
5792; GCN-NEXT:    v_mul_lo_u32 v7, s2, v0
5793; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
5794; GCN-NEXT:    v_mul_lo_u32 v10, v0, v5
5795; GCN-NEXT:    v_mul_hi_u32 v12, v0, v5
5796; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
5797; GCN-NEXT:    v_mul_hi_u32 v9, v2, v7
5798; GCN-NEXT:    v_mul_lo_u32 v7, v2, v7
5799; GCN-NEXT:    v_mul_hi_u32 v8, v2, v5
5800; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
5801; GCN-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
5802; GCN-NEXT:    v_mul_lo_u32 v2, v2, v5
5803; GCN-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
5804; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
5805; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
5806; GCN-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
5807; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
5808; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
5809; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[0:1]
5810; GCN-NEXT:    s_add_u32 s0, s10, s16
5811; GCN-NEXT:    s_addc_u32 s1, s11, s16
5812; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
5813; GCN-NEXT:    s_xor_b64 s[10:11], s[0:1], s[16:17]
5814; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
5815; GCN-NEXT:    v_mul_lo_u32 v2, s10, v1
5816; GCN-NEXT:    v_mul_hi_u32 v3, s10, v0
5817; GCN-NEXT:    v_mul_hi_u32 v5, s10, v1
5818; GCN-NEXT:    v_mul_hi_u32 v7, s11, v1
5819; GCN-NEXT:    v_mul_lo_u32 v1, s11, v1
5820; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5821; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
5822; GCN-NEXT:    v_mul_lo_u32 v5, s11, v0
5823; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
5824; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
5825; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
5826; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
5827; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
5828; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
5829; GCN-NEXT:    v_mul_lo_u32 v2, s14, v1
5830; GCN-NEXT:    v_mul_hi_u32 v3, s14, v0
5831; GCN-NEXT:    v_mul_lo_u32 v4, s15, v0
5832; GCN-NEXT:    v_mov_b32_e32 v5, s15
5833; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5834; GCN-NEXT:    v_mul_lo_u32 v3, s14, v0
5835; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
5836; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s11, v2
5837; GCN-NEXT:    v_sub_i32_e64 v3, s[0:1], s10, v3
5838; GCN-NEXT:    v_subb_u32_e64 v4, vcc, v4, v5, s[0:1]
5839; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s14, v3
5840; GCN-NEXT:    v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
5841; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s15, v4
5842; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
5843; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s14, v5
5844; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
5845; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s15, v4
5846; GCN-NEXT:    v_cndmask_b32_e32 v4, v6, v5, vcc
5847; GCN-NEXT:    v_add_i32_e32 v5, vcc, 2, v0
5848; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
5849; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v0
5850; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
5851; GCN-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v4
5852; GCN-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[2:3]
5853; GCN-NEXT:    v_mov_b32_e32 v6, s11
5854; GCN-NEXT:    v_subb_u32_e64 v2, vcc, v6, v2, s[0:1]
5855; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s15, v2
5856; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
5857; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s14, v3
5858; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
5859; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s15, v2
5860; GCN-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
5861; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
5862; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[2:3]
5863; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5864; GCN-NEXT:    s_xor_b64 s[0:1], s[16:17], s[12:13]
5865; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
5866; GCN-NEXT:    v_xor_b32_e32 v0, s0, v0
5867; GCN-NEXT:    v_xor_b32_e32 v1, s1, v1
5868; GCN-NEXT:    v_mov_b32_e32 v2, s1
5869; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
5870; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
5871; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5872; GCN-NEXT:    s_endpgm
5873  %shl.y = shl i64 4096, %y
5874  %r = sdiv i64 %x, %shl.y
5875  store i64 %r, i64 addrspace(1)* %out
5876  ret void
5877}
5878
; Signed division of <2 x i64> %x by the splat constant 4096.
; IR assertions (opt run): the vector sdiv is split into one scalar i64 sdiv
; per lane (extractelement / sdiv / insertelement).
; ISA assertions (llc run): no division expansion is expected. For each lane
; the sign is broadcast (s_ashr_i32 ..., 31) and logically shifted right by 20
; to form a bias of 0xfff for negative inputs, the bias is added with carry
; into the high dword, and the sum is arithmetically shifted right by 12
; (s_ashr_i64).
5879define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
5880; CHECK-LABEL: @sdiv_v2i64_pow2k_denom(
5881; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
5882; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096
5883; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
5884; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
5885; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4096
5886; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
5887; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
5888; CHECK-NEXT:    ret void
5889;
5890; GCN-LABEL: sdiv_v2i64_pow2k_denom:
5891; GCN:       ; %bb.0:
5892; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5893; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
5894; GCN-NEXT:    s_mov_b32 s7, 0xf000
5895; GCN-NEXT:    s_mov_b32 s6, -1
5896; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5897; GCN-NEXT:    s_ashr_i32 s8, s1, 31
5898; GCN-NEXT:    s_lshr_b32 s8, s8, 20
5899; GCN-NEXT:    s_add_u32 s0, s0, s8
5900; GCN-NEXT:    s_addc_u32 s1, s1, 0
5901; GCN-NEXT:    s_ashr_i32 s8, s3, 31
5902; GCN-NEXT:    s_ashr_i64 s[0:1], s[0:1], 12
5903; GCN-NEXT:    s_lshr_b32 s8, s8, 20
5904; GCN-NEXT:    s_add_u32 s2, s2, s8
5905; GCN-NEXT:    s_addc_u32 s3, s3, 0
5906; GCN-NEXT:    s_ashr_i64 s[2:3], s[2:3], 12
5907; GCN-NEXT:    v_mov_b32_e32 v0, s0
5908; GCN-NEXT:    v_mov_b32_e32 v1, s1
5909; GCN-NEXT:    v_mov_b32_e32 v2, s2
5910; GCN-NEXT:    v_mov_b32_e32 v3, s3
5911; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
5912; GCN-NEXT:    s_endpgm
5913  %r = sdiv <2 x i64> %x, <i64 4096, i64 4096>
5914  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
5915  ret void
5916}
5917
5918define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
5919; CHECK-LABEL: @ssdiv_v2i64_mixed_pow2k_denom(
5920; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
5921; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096
5922; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
5923; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
5924; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4095
5925; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
5926; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
5927; CHECK-NEXT:    ret void
5928;
5929; GCN-LABEL: ssdiv_v2i64_mixed_pow2k_denom:
5930; GCN:       ; %bb.0:
5931; GCN-NEXT:    v_mov_b32_e32 v0, 0x457ff000
5932; GCN-NEXT:    v_mov_b32_e32 v1, 0x4f800000
5933; GCN-NEXT:    v_mac_f32_e32 v0, 0, v1
5934; GCN-NEXT:    v_rcp_f32_e32 v0, v0
5935; GCN-NEXT:    s_movk_i32 s6, 0xf001
5936; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5937; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
5938; GCN-NEXT:    s_mov_b32 s7, 0xf000
5939; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
5940; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
5941; GCN-NEXT:    v_trunc_f32_e32 v1, v1
5942; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
5943; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
5944; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
5945; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5946; GCN-NEXT:    s_ashr_i32 s0, s9, 31
5947; GCN-NEXT:    s_lshr_b32 s0, s0, 20
5948; GCN-NEXT:    v_mul_hi_u32 v2, s6, v0
5949; GCN-NEXT:    v_mul_lo_u32 v3, v1, s6
5950; GCN-NEXT:    s_add_u32 s2, s8, s0
5951; GCN-NEXT:    s_addc_u32 s3, s9, 0
5952; GCN-NEXT:    s_ashr_i32 s8, s11, 31
5953; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
5954; GCN-NEXT:    v_mul_lo_u32 v3, v0, s6
5955; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
5956; GCN-NEXT:    v_mul_lo_u32 v4, v0, v2
5957; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
5958; GCN-NEXT:    v_mul_hi_u32 v5, v0, v3
5959; GCN-NEXT:    v_mul_hi_u32 v7, v1, v2
5960; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
5961; GCN-NEXT:    s_ashr_i64 s[2:3], s[2:3], 12
5962; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
5963; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
5964; GCN-NEXT:    v_mul_lo_u32 v6, v1, v3
5965; GCN-NEXT:    v_mul_hi_u32 v3, v1, v3
5966; GCN-NEXT:    s_mov_b32 s9, s8
5967; GCN-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
5968; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
5969; GCN-NEXT:    v_mov_b32_e32 v4, 0
5970; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
5971; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5972; GCN-NEXT:    v_mov_b32_e32 v6, 0
5973; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
5974; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
5975; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
5976; GCN-NEXT:    v_mul_lo_u32 v5, v2, s6
5977; GCN-NEXT:    v_mul_hi_u32 v7, s6, v0
5978; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
5979; GCN-NEXT:    v_mul_lo_u32 v7, v0, s6
5980; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, v0, v5
5981; GCN-NEXT:    v_mul_lo_u32 v10, v0, v5
5982; GCN-NEXT:    v_mul_hi_u32 v12, v0, v5
5983; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
5984; GCN-NEXT:    v_mul_hi_u32 v9, v2, v7
5985; GCN-NEXT:    v_mul_lo_u32 v7, v2, v7
5986; GCN-NEXT:    v_mul_hi_u32 v8, v2, v5
5987; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
5988; GCN-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
5989; GCN-NEXT:    v_mul_lo_u32 v2, v2, v5
5990; GCN-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
5991; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
5992; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
5993; GCN-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
5994; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
5995; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
5996; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[0:1]
5997; GCN-NEXT:    s_add_u32 s0, s10, s8
5998; GCN-NEXT:    s_addc_u32 s1, s11, s8
5999; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
6000; GCN-NEXT:    s_xor_b64 s[0:1], s[0:1], s[8:9]
6001; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
6002; GCN-NEXT:    v_mul_lo_u32 v2, s0, v1
6003; GCN-NEXT:    v_mul_hi_u32 v3, s0, v0
6004; GCN-NEXT:    v_mul_hi_u32 v5, s0, v1
6005; GCN-NEXT:    v_mul_hi_u32 v7, s1, v1
6006; GCN-NEXT:    v_mul_lo_u32 v1, s1, v1
6007; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6008; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
6009; GCN-NEXT:    v_mul_lo_u32 v5, s1, v0
6010; GCN-NEXT:    v_mul_hi_u32 v0, s1, v0
6011; GCN-NEXT:    s_movk_i32 s9, 0xfff
6012; GCN-NEXT:    s_mov_b32 s6, -1
6013; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
6014; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
6015; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
6016; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
6017; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
6018; GCN-NEXT:    v_mul_lo_u32 v2, v1, s9
6019; GCN-NEXT:    v_mul_hi_u32 v3, s9, v0
6020; GCN-NEXT:    v_mul_lo_u32 v4, v0, s9
6021; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6022; GCN-NEXT:    v_mov_b32_e32 v3, s1
6023; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s0, v4
6024; GCN-NEXT:    v_subb_u32_e32 v2, vcc, v3, v2, vcc
6025; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v4
6026; GCN-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v2, vcc
6027; GCN-NEXT:    s_movk_i32 s9, 0xffe
6028; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s9, v3
6029; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
6030; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
6031; GCN-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
6032; GCN-NEXT:    v_add_i32_e32 v5, vcc, 2, v0
6033; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
6034; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v0
6035; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
6036; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s9, v4
6037; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
6038; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
6039; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v3
6040; GCN-NEXT:    v_cndmask_b32_e32 v2, -1, v4, vcc
6041; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
6042; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
6043; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6044; GCN-NEXT:    v_cndmask_b32_e64 v3, v8, v6, s[0:1]
6045; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
6046; GCN-NEXT:    v_xor_b32_e32 v0, s8, v0
6047; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s8, v0
6048; GCN-NEXT:    v_xor_b32_e32 v1, s8, v1
6049; GCN-NEXT:    v_mov_b32_e32 v3, s8
6050; GCN-NEXT:    v_subb_u32_e32 v3, vcc, v1, v3, vcc
6051; GCN-NEXT:    v_mov_b32_e32 v0, s2
6052; GCN-NEXT:    v_mov_b32_e32 v1, s3
6053; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
6054; GCN-NEXT:    s_endpgm
6055  %r = sdiv <2 x i64> %x, <i64 4096, i64 4095>
6056  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
6057  ret void
6058}
6059
; Test: signed <2 x i64> division whose divisor is the splat constant 4096
; shifted left, lane by lane, by %y. The quotient vector is stored to %out.
;
; Expected opt output (first RUN line): the vector sdiv is split into one
; scalar i64 sdiv per lane (extractelement / sdiv / insertelement) and the
; scalar sdivs themselves are left in the IR.
;
; Expected llc output (second RUN line): both lanes are divided inline; the
; sequence contains two v_rcp_f32 instructions (one per lane) followed by
; chains of v_mul_hi_u32 / v_mul_lo_u32 and add/sub-with-carry, and ends with
; a single buffer_store_dwordx4 of the two 64-bit quotients.
;
; The assertion lines below are autogenerated (see the NOTE lines at the top
; of the file); regenerate them with the update scripts instead of editing
; them by hand.
6060define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
6061; CHECK-LABEL: @sdiv_v2i64_pow2_shl_denom(
6062; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
6063; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
6064; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
6065; CHECK-NEXT:    [[TMP3:%.*]] = sdiv i64 [[TMP1]], [[TMP2]]
6066; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0
6067; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
6068; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
6069; CHECK-NEXT:    [[TMP7:%.*]] = sdiv i64 [[TMP5]], [[TMP6]]
6070; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
6071; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
6072; CHECK-NEXT:    ret void
6073;
6074; GCN-LABEL: sdiv_v2i64_pow2_shl_denom:
6075; GCN:       ; %bb.0:
6076; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x11
6077; GCN-NEXT:    s_mov_b32 s3, 0
6078; GCN-NEXT:    s_movk_i32 s2, 0x1000
6079; GCN-NEXT:    s_mov_b32 s20, 0x4f800000
6080; GCN-NEXT:    s_mov_b32 s21, 0x5f7ffffc
6081; GCN-NEXT:    s_waitcnt lgkmcnt(0)
6082; GCN-NEXT:    s_lshl_b64 s[12:13], s[2:3], s6
6083; GCN-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
6084; GCN-NEXT:    s_ashr_i32 s16, s3, 31
6085; GCN-NEXT:    s_add_u32 s2, s2, s16
6086; GCN-NEXT:    s_mov_b32 s17, s16
6087; GCN-NEXT:    s_addc_u32 s3, s3, s16
6088; GCN-NEXT:    s_xor_b64 s[14:15], s[2:3], s[16:17]
6089; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s14
6090; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s15
6091; GCN-NEXT:    s_mov_b32 s22, 0x2f800000
6092; GCN-NEXT:    s_mov_b32 s23, 0xcf800000
6093; GCN-NEXT:    s_sub_u32 s6, 0, s14
6094; GCN-NEXT:    v_mac_f32_e32 v0, s20, v1
6095; GCN-NEXT:    v_rcp_f32_e32 v0, v0
6096; GCN-NEXT:    s_subb_u32 s7, 0, s15
6097; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6098; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
6099; GCN-NEXT:    v_mul_f32_e32 v0, s21, v0
6100; GCN-NEXT:    v_mul_f32_e32 v1, s22, v0
6101; GCN-NEXT:    v_trunc_f32_e32 v1, v1
6102; GCN-NEXT:    v_mac_f32_e32 v0, s23, v1
6103; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
6104; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
6105; GCN-NEXT:    s_waitcnt lgkmcnt(0)
6106; GCN-NEXT:    s_ashr_i32 s18, s9, 31
6107; GCN-NEXT:    s_add_u32 s0, s8, s18
6108; GCN-NEXT:    v_mul_hi_u32 v3, s6, v0
6109; GCN-NEXT:    v_mul_lo_u32 v2, s6, v1
6110; GCN-NEXT:    v_mul_lo_u32 v4, s7, v0
6111; GCN-NEXT:    v_mul_lo_u32 v5, s6, v0
6112; GCN-NEXT:    s_mov_b32 s19, s18
6113; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6114; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
6115; GCN-NEXT:    v_mul_lo_u32 v3, v0, v2
6116; GCN-NEXT:    v_mul_hi_u32 v4, v0, v5
6117; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
6118; GCN-NEXT:    v_mul_hi_u32 v7, v1, v2
6119; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
6120; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
6121; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v6, vcc
6122; GCN-NEXT:    v_mul_lo_u32 v6, v1, v5
6123; GCN-NEXT:    v_mul_hi_u32 v5, v1, v5
6124; GCN-NEXT:    s_addc_u32 s1, s9, s18
6125; GCN-NEXT:    s_xor_b64 s[8:9], s[0:1], s[18:19]
6126; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
6127; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
6128; GCN-NEXT:    v_mov_b32_e32 v4, 0
6129; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
6130; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6131; GCN-NEXT:    v_mov_b32_e32 v6, 0
6132; GCN-NEXT:    v_add_i32_e64 v0, s[2:3], v0, v2
6133; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
6134; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[2:3]
6135; GCN-NEXT:    v_mul_lo_u32 v5, s6, v2
6136; GCN-NEXT:    v_mul_hi_u32 v7, s6, v0
6137; GCN-NEXT:    v_mul_lo_u32 v8, s7, v0
6138; GCN-NEXT:    s_mov_b32 s7, 0xf000
6139; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
6140; GCN-NEXT:    v_mul_lo_u32 v7, s6, v0
6141; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
6142; GCN-NEXT:    v_mul_lo_u32 v10, v0, v5
6143; GCN-NEXT:    v_mul_hi_u32 v12, v0, v5
6144; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
6145; GCN-NEXT:    v_mul_hi_u32 v9, v2, v7
6146; GCN-NEXT:    v_mul_lo_u32 v7, v2, v7
6147; GCN-NEXT:    v_mul_hi_u32 v8, v2, v5
6148; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
6149; GCN-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
6150; GCN-NEXT:    v_mul_lo_u32 v2, v2, v5
6151; GCN-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
6152; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
6153; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
6154; GCN-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
6155; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
6156; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
6157; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[2:3]
6158; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
6159; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
6160; GCN-NEXT:    v_mul_lo_u32 v2, s8, v1
6161; GCN-NEXT:    v_mul_hi_u32 v3, s8, v0
6162; GCN-NEXT:    v_mul_hi_u32 v5, s8, v1
6163; GCN-NEXT:    v_mul_hi_u32 v7, s9, v1
6164; GCN-NEXT:    v_mul_lo_u32 v1, s9, v1
6165; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6166; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
6167; GCN-NEXT:    v_mul_lo_u32 v5, s9, v0
6168; GCN-NEXT:    v_mul_hi_u32 v0, s9, v0
6169; GCN-NEXT:    s_mov_b32 s6, -1
6170; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
6171; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
6172; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
6173; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
6174; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
6175; GCN-NEXT:    v_mul_lo_u32 v2, s14, v1
6176; GCN-NEXT:    v_mul_hi_u32 v3, s14, v0
6177; GCN-NEXT:    v_mul_lo_u32 v5, s15, v0
6178; GCN-NEXT:    v_mov_b32_e32 v7, s15
6179; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6180; GCN-NEXT:    v_mul_lo_u32 v3, s14, v0
6181; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
6182; GCN-NEXT:    v_sub_i32_e32 v5, vcc, s9, v2
6183; GCN-NEXT:    v_sub_i32_e64 v3, s[0:1], s8, v3
6184; GCN-NEXT:    v_subb_u32_e64 v5, vcc, v5, v7, s[0:1]
6185; GCN-NEXT:    v_subrev_i32_e32 v7, vcc, s14, v3
6186; GCN-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
6187; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s15, v5
6188; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
6189; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s14, v7
6190; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
6191; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s15, v5
6192; GCN-NEXT:    v_cndmask_b32_e32 v5, v8, v7, vcc
6193; GCN-NEXT:    v_add_i32_e32 v7, vcc, 2, v0
6194; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
6195; GCN-NEXT:    v_add_i32_e32 v9, vcc, 1, v0
6196; GCN-NEXT:    v_addc_u32_e32 v10, vcc, 0, v1, vcc
6197; GCN-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v5
6198; GCN-NEXT:    v_cndmask_b32_e64 v5, v10, v8, s[2:3]
6199; GCN-NEXT:    v_mov_b32_e32 v8, s9
6200; GCN-NEXT:    s_xor_b64 s[8:9], s[18:19], s[16:17]
6201; GCN-NEXT:    s_ashr_i32 s16, s13, 31
6202; GCN-NEXT:    v_subb_u32_e64 v2, vcc, v8, v2, s[0:1]
6203; GCN-NEXT:    s_add_u32 s0, s12, s16
6204; GCN-NEXT:    s_mov_b32 s17, s16
6205; GCN-NEXT:    s_addc_u32 s1, s13, s16
6206; GCN-NEXT:    s_xor_b64 s[12:13], s[0:1], s[16:17]
6207; GCN-NEXT:    v_cvt_f32_u32_e32 v10, s12
6208; GCN-NEXT:    v_cvt_f32_u32_e32 v11, s13
6209; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s15, v2
6210; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
6211; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s14, v3
6212; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
6213; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s15, v2
6214; GCN-NEXT:    v_mac_f32_e32 v10, s20, v11
6215; GCN-NEXT:    v_cndmask_b32_e32 v2, v8, v3, vcc
6216; GCN-NEXT:    v_rcp_f32_e32 v3, v10
6217; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
6218; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
6219; GCN-NEXT:    v_cndmask_b32_e64 v2, v9, v7, s[2:3]
6220; GCN-NEXT:    v_mul_f32_e32 v3, s21, v3
6221; GCN-NEXT:    v_mul_f32_e32 v5, s22, v3
6222; GCN-NEXT:    v_trunc_f32_e32 v5, v5
6223; GCN-NEXT:    v_mac_f32_e32 v3, s23, v5
6224; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
6225; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v5
6226; GCN-NEXT:    s_sub_u32 s2, 0, s12
6227; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6228; GCN-NEXT:    v_mul_hi_u32 v2, s2, v3
6229; GCN-NEXT:    v_mul_lo_u32 v7, s2, v5
6230; GCN-NEXT:    s_subb_u32 s3, 0, s13
6231; GCN-NEXT:    v_mul_lo_u32 v8, s3, v3
6232; GCN-NEXT:    s_ashr_i32 s14, s11, 31
6233; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
6234; GCN-NEXT:    v_mul_lo_u32 v7, s2, v3
6235; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
6236; GCN-NEXT:    v_mul_lo_u32 v8, v3, v2
6237; GCN-NEXT:    v_mul_hi_u32 v10, v3, v2
6238; GCN-NEXT:    v_mul_hi_u32 v9, v3, v7
6239; GCN-NEXT:    v_mul_hi_u32 v11, v5, v2
6240; GCN-NEXT:    v_mul_lo_u32 v2, v5, v2
6241; GCN-NEXT:    s_mov_b32 s15, s14
6242; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
6243; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
6244; GCN-NEXT:    v_mul_lo_u32 v10, v5, v7
6245; GCN-NEXT:    v_mul_hi_u32 v7, v5, v7
6246; GCN-NEXT:    v_xor_b32_e32 v0, s8, v0
6247; GCN-NEXT:    v_xor_b32_e32 v1, s9, v1
6248; GCN-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
6249; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v9, v7, vcc
6250; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v11, v4, vcc
6251; GCN-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
6252; GCN-NEXT:    v_add_i32_e64 v2, s[0:1], v3, v2
6253; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v6, v8, vcc
6254; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v5, v7, s[0:1]
6255; GCN-NEXT:    v_mul_lo_u32 v8, s2, v3
6256; GCN-NEXT:    v_mul_hi_u32 v9, s2, v2
6257; GCN-NEXT:    v_mul_lo_u32 v10, s3, v2
6258; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
6259; GCN-NEXT:    v_mul_lo_u32 v9, s2, v2
6260; GCN-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
6261; GCN-NEXT:    v_mul_lo_u32 v12, v2, v8
6262; GCN-NEXT:    v_mul_hi_u32 v14, v2, v8
6263; GCN-NEXT:    v_mul_hi_u32 v13, v2, v9
6264; GCN-NEXT:    v_mul_hi_u32 v11, v3, v9
6265; GCN-NEXT:    v_mul_lo_u32 v9, v3, v9
6266; GCN-NEXT:    v_mul_hi_u32 v10, v3, v8
6267; GCN-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
6268; GCN-NEXT:    v_addc_u32_e32 v13, vcc, 0, v14, vcc
6269; GCN-NEXT:    v_mul_lo_u32 v3, v3, v8
6270; GCN-NEXT:    v_add_i32_e32 v9, vcc, v12, v9
6271; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v13, v11, vcc
6272; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v10, v4, vcc
6273; GCN-NEXT:    v_add_i32_e32 v3, vcc, v9, v3
6274; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v6, v8, vcc
6275; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
6276; GCN-NEXT:    v_addc_u32_e64 v5, vcc, v5, v8, s[0:1]
6277; GCN-NEXT:    s_add_u32 s0, s10, s14
6278; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
6279; GCN-NEXT:    s_addc_u32 s1, s11, s14
6280; GCN-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
6281; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
6282; GCN-NEXT:    v_mul_lo_u32 v5, s10, v3
6283; GCN-NEXT:    v_mul_hi_u32 v7, s10, v2
6284; GCN-NEXT:    v_mul_hi_u32 v9, s10, v3
6285; GCN-NEXT:    v_mul_hi_u32 v10, s11, v3
6286; GCN-NEXT:    v_mul_lo_u32 v3, s11, v3
6287; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
6288; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v9, vcc
6289; GCN-NEXT:    v_mul_lo_u32 v9, s11, v2
6290; GCN-NEXT:    v_mul_hi_u32 v2, s11, v2
6291; GCN-NEXT:    v_mov_b32_e32 v8, s9
6292; GCN-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
6293; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v2, vcc
6294; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v10, v4, vcc
6295; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
6296; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v4, vcc
6297; GCN-NEXT:    v_mul_lo_u32 v4, s12, v3
6298; GCN-NEXT:    v_mul_hi_u32 v5, s12, v2
6299; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
6300; GCN-NEXT:    v_mul_lo_u32 v6, s13, v2
6301; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v8, vcc
6302; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
6303; GCN-NEXT:    v_mul_lo_u32 v5, s12, v2
6304; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
6305; GCN-NEXT:    v_sub_i32_e32 v6, vcc, s11, v4
6306; GCN-NEXT:    v_mov_b32_e32 v7, s13
6307; GCN-NEXT:    v_sub_i32_e64 v5, s[0:1], s10, v5
6308; GCN-NEXT:    v_subb_u32_e64 v6, vcc, v6, v7, s[0:1]
6309; GCN-NEXT:    v_subrev_i32_e32 v7, vcc, s12, v5
6310; GCN-NEXT:    v_subbrev_u32_e32 v6, vcc, 0, v6, vcc
6311; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s13, v6
6312; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
6313; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v7
6314; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
6315; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v6
6316; GCN-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc
6317; GCN-NEXT:    v_add_i32_e32 v7, vcc, 2, v2
6318; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v3, vcc
6319; GCN-NEXT:    v_add_i32_e32 v9, vcc, 1, v2
6320; GCN-NEXT:    v_addc_u32_e32 v10, vcc, 0, v3, vcc
6321; GCN-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v6
6322; GCN-NEXT:    v_cndmask_b32_e64 v6, v10, v8, s[2:3]
6323; GCN-NEXT:    v_mov_b32_e32 v8, s11
6324; GCN-NEXT:    v_subb_u32_e64 v4, vcc, v8, v4, s[0:1]
6325; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s13, v4
6326; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
6327; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v5
6328; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
6329; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v4
6330; GCN-NEXT:    v_cndmask_b32_e32 v4, v8, v5, vcc
6331; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
6332; GCN-NEXT:    v_cndmask_b32_e64 v4, v9, v7, s[2:3]
6333; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
6334; GCN-NEXT:    s_xor_b64 s[0:1], s[14:15], s[16:17]
6335; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
6336; GCN-NEXT:    v_xor_b32_e32 v2, s0, v2
6337; GCN-NEXT:    v_xor_b32_e32 v3, s1, v3
6338; GCN-NEXT:    v_mov_b32_e32 v4, s1
6339; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s0, v2
6340; GCN-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
6341; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
6342; GCN-NEXT:    s_endpgm
  ; Divisor is computed from %y at run time: each lane is 4096 << %y[lane].
6343  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
6344  %r = sdiv <2 x i64> %x, %shl.y
6345  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
6346  ret void
6347}
6348
; Test: signed i64 remainder of %x by the odd constant 1235195 (0x12d8fb);
; the result is stored to %out.
;
; Expected opt output (first RUN line): the srem is left unchanged.
;
; Expected llc output (second RUN line): the remainder is computed inline.
; The sequence starts from v_rcp_f32, uses the constants 0xffed2705 (the
; 32-bit two's complement of 1235195) and 0x12d8fb, and finishes with a
; buffer_store_dwordx2 of the 64-bit result.
;
; The assertion lines below are autogenerated (see the NOTE lines at the top
; of the file); regenerate them with the update scripts instead of editing
; them by hand.
6349define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
6350; CHECK-LABEL: @srem_i64_oddk_denom(
6351; CHECK-NEXT:    [[R:%.*]] = srem i64 [[X:%.*]], 1235195
6352; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
6353; CHECK-NEXT:    ret void
6354;
6355; GCN-LABEL: srem_i64_oddk_denom:
6356; GCN:       ; %bb.0:
6357; GCN-NEXT:    v_mov_b32_e32 v0, 0x4f800000
6358; GCN-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
6359; GCN-NEXT:    v_rcp_f32_e32 v0, v0
6360; GCN-NEXT:    s_mov_b32 s2, 0xffed2705
6361; GCN-NEXT:    v_mov_b32_e32 v8, 0
6362; GCN-NEXT:    v_mov_b32_e32 v7, 0
6363; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
6364; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
6365; GCN-NEXT:    v_trunc_f32_e32 v1, v1
6366; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
6367; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
6368; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
6369; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
6370; GCN-NEXT:    s_mov_b32 s7, 0xf000
6371; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
6372; GCN-NEXT:    v_mul_lo_u32 v2, v1, s2
6373; GCN-NEXT:    v_mul_lo_u32 v4, v0, s2
6374; GCN-NEXT:    s_mov_b32 s6, -1
6375; GCN-NEXT:    s_waitcnt lgkmcnt(0)
6376; GCN-NEXT:    s_mov_b32 s4, s8
6377; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6378; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
6379; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
6380; GCN-NEXT:    v_mul_hi_u32 v6, v0, v4
6381; GCN-NEXT:    v_mul_hi_u32 v3, v0, v2
6382; GCN-NEXT:    v_mul_hi_u32 v9, v1, v2
6383; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
6384; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
6385; GCN-NEXT:    v_mul_lo_u32 v6, v1, v4
6386; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
6387; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v3, vcc
6388; GCN-NEXT:    s_mov_b32 s5, s9
6389; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
6390; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
6391; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
6392; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6393; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
6394; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
6395; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
6396; GCN-NEXT:    v_mul_lo_u32 v4, v2, s2
6397; GCN-NEXT:    v_mul_hi_u32 v5, s2, v0
6398; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
6399; GCN-NEXT:    v_mul_lo_u32 v5, v0, s2
6400; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, v0, v4
6401; GCN-NEXT:    v_mul_lo_u32 v10, v0, v4
6402; GCN-NEXT:    v_mul_hi_u32 v12, v0, v4
6403; GCN-NEXT:    v_mul_hi_u32 v11, v0, v5
6404; GCN-NEXT:    v_mul_hi_u32 v9, v2, v5
6405; GCN-NEXT:    v_mul_lo_u32 v5, v2, v5
6406; GCN-NEXT:    v_mul_hi_u32 v6, v2, v4
6407; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
6408; GCN-NEXT:    v_addc_u32_e32 v11, vcc, v8, v12, vcc
6409; GCN-NEXT:    v_mul_lo_u32 v2, v2, v4
6410; GCN-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
6411; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v11, v9, vcc
6412; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v6, v7, vcc
6413; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
6414; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
6415; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
6416; GCN-NEXT:    s_ashr_i32 s2, s11, 31
6417; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[0:1]
6418; GCN-NEXT:    s_add_u32 s0, s10, s2
6419; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
6420; GCN-NEXT:    s_mov_b32 s3, s2
6421; GCN-NEXT:    s_addc_u32 s1, s11, s2
6422; GCN-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
6423; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
6424; GCN-NEXT:    v_mul_lo_u32 v2, s0, v1
6425; GCN-NEXT:    v_mul_hi_u32 v3, s0, v0
6426; GCN-NEXT:    v_mul_hi_u32 v4, s0, v1
6427; GCN-NEXT:    v_mul_hi_u32 v5, s1, v1
6428; GCN-NEXT:    v_mul_lo_u32 v1, s1, v1
6429; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6430; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
6431; GCN-NEXT:    v_mul_lo_u32 v4, s1, v0
6432; GCN-NEXT:    v_mul_hi_u32 v0, s1, v0
6433; GCN-NEXT:    s_mov_b32 s3, 0x12d8fb
6434; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
6435; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
6436; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
6437; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
6438; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
6439; GCN-NEXT:    v_mul_hi_u32 v2, s3, v0
6440; GCN-NEXT:    v_mul_lo_u32 v1, v1, s3
6441; GCN-NEXT:    v_mul_lo_u32 v0, v0, s3
6442; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
6443; GCN-NEXT:    v_mov_b32_e32 v2, s1
6444; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
6445; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
6446; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v0
6447; GCN-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v1, vcc
6448; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s3, v2
6449; GCN-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v3, vcc
6450; GCN-NEXT:    s_mov_b32 s3, 0x12d8fa
6451; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s3, v2
6452; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
6453; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
6454; GCN-NEXT:    v_cndmask_b32_e32 v6, -1, v6, vcc
6455; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
6456; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s3, v0
6457; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
6458; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
6459; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
6460; GCN-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
6461; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
6462; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
6463; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6464; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
6465; GCN-NEXT:    v_xor_b32_e32 v0, s2, v0
6466; GCN-NEXT:    v_xor_b32_e32 v1, s2, v1
6467; GCN-NEXT:    v_mov_b32_e32 v2, s2
6468; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
6469; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
6470; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
6471; GCN-NEXT:    s_endpgm
  ; Constant, odd divisor: 1235195 == 0x12d8fb.
6472  %r = srem i64 %x, 1235195
6473  store i64 %r, i64 addrspace(1)* %out
6474  ret void
6475}
6476
; Test: signed i64 remainder of %x by the constant 4096 (0x1000); the result
; is stored to %out.
;
; Expected opt output (first RUN line): the srem is left unchanged.
;
; Expected llc output (second RUN line): a short scalar-only sequence with no
; multiplies and no v_rcp_f32. It shifts the sign word (s_ashr_i32 by 31,
; s_lshr_b32 by 20), adds that to %x with carry, masks the low word with
; 0xfffff000, and subtracts the result from %x.
;
; The assertion lines below are autogenerated (see the NOTE lines at the top
; of the file); regenerate them with the update scripts instead of editing
; them by hand.
6477define amdgpu_kernel void @srem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
6478; CHECK-LABEL: @srem_i64_pow2k_denom(
6479; CHECK-NEXT:    [[R:%.*]] = srem i64 [[X:%.*]], 4096
6480; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
6481; CHECK-NEXT:    ret void
6482;
6483; GCN-LABEL: srem_i64_pow2k_denom:
6484; GCN:       ; %bb.0:
6485; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
6486; GCN-NEXT:    s_mov_b32 s3, 0xf000
6487; GCN-NEXT:    s_mov_b32 s2, -1
6488; GCN-NEXT:    s_waitcnt lgkmcnt(0)
6489; GCN-NEXT:    s_mov_b32 s0, s4
6490; GCN-NEXT:    s_ashr_i32 s4, s7, 31
6491; GCN-NEXT:    s_lshr_b32 s4, s4, 20
6492; GCN-NEXT:    s_add_u32 s4, s6, s4
6493; GCN-NEXT:    s_mov_b32 s1, s5
6494; GCN-NEXT:    s_addc_u32 s5, s7, 0
6495; GCN-NEXT:    s_and_b32 s4, s4, 0xfffff000
6496; GCN-NEXT:    s_sub_u32 s4, s6, s4
6497; GCN-NEXT:    s_subb_u32 s5, s7, s5
6498; GCN-NEXT:    v_mov_b32_e32 v0, s4
6499; GCN-NEXT:    v_mov_b32_e32 v1, s5
6500; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6501; GCN-NEXT:    s_endpgm
  ; Constant power-of-two divisor: 4096 == 1 << 12.
6502  %r = srem i64 %x, 4096
6503  store i64 %r, i64 addrspace(1)* %out
6504  ret void
6505}
6506
; Test: signed i64 remainder of %x by (4096 << %y); the result is stored to
; %out.
;
; Expected opt output (first RUN line): both the shl and the srem are left
; unchanged.
;
; Expected llc output (second RUN line): the divisor is formed with
; s_lshl_b64 and the remainder is computed inline. The sequence contains
; v_rcp_f32 followed by chains of v_mul_hi_u32 / v_mul_lo_u32 and
; add/sub-with-carry, and finishes with a buffer_store_dwordx2 of the 64-bit
; result.
;
; The assertion lines below are autogenerated (see the NOTE lines at the top
; of the file); regenerate them with the update scripts instead of editing
; them by hand.
6507define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
6508; CHECK-LABEL: @srem_i64_pow2_shl_denom(
6509; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
6510; CHECK-NEXT:    [[R:%.*]] = srem i64 [[X:%.*]], [[SHL_Y]]
6511; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
6512; CHECK-NEXT:    ret void
6513;
6514; GCN-LABEL: srem_i64_pow2_shl_denom:
6515; GCN:       ; %bb.0:
6516; GCN-NEXT:    s_load_dword s4, s[0:1], 0xd
6517; GCN-NEXT:    s_mov_b32 s3, 0
6518; GCN-NEXT:    s_movk_i32 s2, 0x1000
6519; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
6520; GCN-NEXT:    s_mov_b32 s7, 0xf000
6521; GCN-NEXT:    s_waitcnt lgkmcnt(0)
6522; GCN-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
6523; GCN-NEXT:    s_ashr_i32 s4, s3, 31
6524; GCN-NEXT:    s_add_u32 s2, s2, s4
6525; GCN-NEXT:    s_mov_b32 s5, s4
6526; GCN-NEXT:    s_addc_u32 s3, s3, s4
6527; GCN-NEXT:    s_xor_b64 s[12:13], s[2:3], s[4:5]
6528; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s12
6529; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s13
6530; GCN-NEXT:    s_sub_u32 s2, 0, s12
6531; GCN-NEXT:    s_subb_u32 s3, 0, s13
6532; GCN-NEXT:    s_ashr_i32 s14, s11, 31
6533; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
6534; GCN-NEXT:    v_rcp_f32_e32 v0, v0
6535; GCN-NEXT:    s_mov_b32 s15, s14
6536; GCN-NEXT:    s_mov_b32 s6, -1
6537; GCN-NEXT:    s_mov_b32 s4, s8
6538; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
6539; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
6540; GCN-NEXT:    v_trunc_f32_e32 v1, v1
6541; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
6542; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
6543; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
6544; GCN-NEXT:    s_mov_b32 s5, s9
6545; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
6546; GCN-NEXT:    v_mul_lo_u32 v2, s2, v1
6547; GCN-NEXT:    v_mul_lo_u32 v5, s3, v0
6548; GCN-NEXT:    v_mul_lo_u32 v4, s2, v0
6549; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6550; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
6551; GCN-NEXT:    v_mul_hi_u32 v3, v0, v4
6552; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
6553; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
6554; GCN-NEXT:    v_mul_hi_u32 v7, v1, v2
6555; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
6556; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
6557; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
6558; GCN-NEXT:    v_mul_lo_u32 v6, v1, v4
6559; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
6560; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
6561; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
6562; GCN-NEXT:    v_mov_b32_e32 v4, 0
6563; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
6564; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6565; GCN-NEXT:    v_mov_b32_e32 v6, 0
6566; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
6567; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
6568; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
6569; GCN-NEXT:    v_mul_lo_u32 v5, s2, v2
6570; GCN-NEXT:    v_mul_hi_u32 v7, s2, v0
6571; GCN-NEXT:    v_mul_lo_u32 v8, s3, v0
6572; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
6573; GCN-NEXT:    v_mul_lo_u32 v7, s2, v0
6574; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
6575; GCN-NEXT:    v_mul_lo_u32 v10, v0, v5
6576; GCN-NEXT:    v_mul_hi_u32 v12, v0, v5
6577; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
6578; GCN-NEXT:    v_mul_hi_u32 v9, v2, v7
6579; GCN-NEXT:    v_mul_lo_u32 v7, v2, v7
6580; GCN-NEXT:    v_mul_hi_u32 v8, v2, v5
6581; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
6582; GCN-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
6583; GCN-NEXT:    v_mul_lo_u32 v2, v2, v5
6584; GCN-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
6585; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
6586; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
6587; GCN-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
6588; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
6589; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
6590; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[0:1]
6591; GCN-NEXT:    s_add_u32 s0, s10, s14
6592; GCN-NEXT:    s_addc_u32 s1, s11, s14
6593; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
6594; GCN-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
6595; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
6596; GCN-NEXT:    v_mul_lo_u32 v2, s10, v1
6597; GCN-NEXT:    v_mul_hi_u32 v3, s10, v0
6598; GCN-NEXT:    v_mul_hi_u32 v5, s10, v1
6599; GCN-NEXT:    v_mul_hi_u32 v7, s11, v1
6600; GCN-NEXT:    v_mul_lo_u32 v1, s11, v1
6601; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6602; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
6603; GCN-NEXT:    v_mul_lo_u32 v5, s11, v0
6604; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
6605; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
6606; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
6607; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
6608; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
6609; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
6610; GCN-NEXT:    v_mul_lo_u32 v1, s12, v1
6611; GCN-NEXT:    v_mul_hi_u32 v2, s12, v0
6612; GCN-NEXT:    v_mul_lo_u32 v3, s13, v0
6613; GCN-NEXT:    v_mul_lo_u32 v0, s12, v0
6614; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
6615; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
6616; GCN-NEXT:    v_sub_i32_e64 v0, s[0:1], s10, v0
6617; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s11, v1
6618; GCN-NEXT:    v_mov_b32_e32 v3, s13
6619; GCN-NEXT:    v_subb_u32_e64 v2, vcc, v2, v3, s[0:1]
6620; GCN-NEXT:    v_subrev_i32_e64 v4, s[2:3], s12, v0
6621; GCN-NEXT:    v_subbrev_u32_e64 v5, vcc, 0, v2, s[2:3]
6622; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s13, v5
6623; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
6624; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v4
6625; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
6626; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v5
6627; GCN-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
6628; GCN-NEXT:    v_subb_u32_e64 v2, vcc, v2, v3, s[2:3]
6629; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s12, v4
6630; GCN-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
6631; GCN-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v6
6632; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[2:3]
6633; GCN-NEXT:    v_mov_b32_e32 v5, s11
6634; GCN-NEXT:    v_subb_u32_e64 v1, vcc, v5, v1, s[0:1]
6635; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s13, v1
6636; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
6637; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
6638; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
6639; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v1
6640; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
6641; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
6642; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6643; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[2:3]
6644; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6645; GCN-NEXT:    v_xor_b32_e32 v0, s14, v0
6646; GCN-NEXT:    v_xor_b32_e32 v1, s14, v1
6647; GCN-NEXT:    v_mov_b32_e32 v2, s14
6648; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s14, v0
6649; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
6650; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
6651; GCN-NEXT:    s_endpgm
  ; Divisor is computed from %y at run time: 4096 << %y.
6652  %shl.y = shl i64 4096, %y
6653  %r = srem i64 %x, %shl.y
6654  store i64 %r, i64 addrspace(1)* %out
6655  ret void
6656}
6657
; Test: signed remainder of a <2 x i64> kernel argument by the constant splat
; <4096, 4096> (4096 = 2^12); the result is stored to %out.
;   CHECK lines (opt run): the vector srem is split into one scalar
;     `srem i64 ..., 4096` per element and reassembled with insertelement.
;   GCN lines (llc run): each element is handled by a short scalar
;     ashr/lshr/add/addc/and/sub/subb sequence; no reciprocal-based division
;     sequence appears in the expected output.
; The assertions are autogenerated (see the NOTE lines at the top of the file);
; regenerate them with the update scripts instead of editing them by hand.
6658define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
6659; CHECK-LABEL: @srem_v2i64_pow2k_denom(
6660; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
6661; CHECK-NEXT:    [[TMP2:%.*]] = srem i64 [[TMP1]], 4096
6662; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
6663; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
6664; CHECK-NEXT:    [[TMP5:%.*]] = srem i64 [[TMP4]], 4096
6665; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
6666; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
6667; CHECK-NEXT:    ret void
6668;
6669; GCN-LABEL: srem_v2i64_pow2k_denom:
6670; GCN:       ; %bb.0:
6671; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6672; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
6673; GCN-NEXT:    s_movk_i32 s8, 0xf000
6674; GCN-NEXT:    s_mov_b32 s7, 0xf000
6675; GCN-NEXT:    s_mov_b32 s6, -1
6676; GCN-NEXT:    s_waitcnt lgkmcnt(0)
6677; GCN-NEXT:    s_ashr_i32 s9, s1, 31
6678; GCN-NEXT:    s_lshr_b32 s9, s9, 20
6679; GCN-NEXT:    s_add_u32 s9, s0, s9
6680; GCN-NEXT:    s_addc_u32 s10, s1, 0
6681; GCN-NEXT:    s_and_b32 s9, s9, s8
6682; GCN-NEXT:    s_sub_u32 s0, s0, s9
6683; GCN-NEXT:    s_subb_u32 s1, s1, s10
6684; GCN-NEXT:    s_ashr_i32 s9, s3, 31
6685; GCN-NEXT:    s_lshr_b32 s9, s9, 20
6686; GCN-NEXT:    s_add_u32 s9, s2, s9
6687; GCN-NEXT:    s_addc_u32 s10, s3, 0
6688; GCN-NEXT:    s_and_b32 s8, s9, s8
6689; GCN-NEXT:    s_sub_u32 s2, s2, s8
6690; GCN-NEXT:    s_subb_u32 s3, s3, s10
6691; GCN-NEXT:    v_mov_b32_e32 v0, s0
6692; GCN-NEXT:    v_mov_b32_e32 v1, s1
6693; GCN-NEXT:    v_mov_b32_e32 v2, s2
6694; GCN-NEXT:    v_mov_b32_e32 v3, s3
6695; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
6696; GCN-NEXT:    s_endpgm
; IR under test: both lanes of %x use the same constant divisor 4096.
6697  %r = srem <2 x i64> %x, <i64 4096, i64 4096>
6698  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
6699  ret void
6700}
6701
; Test: signed remainder of a <2 x i64> kernel argument by a divisor that is
; only known at run time, computed as `shl <4096, 4096>, %y`; the result is
; stored to %out.
;   CHECK lines (opt run): the vector shl is kept, and the srem is split into
;     one scalar `srem i64` per element, each fed by extractelement from %x and
;     from the shifted divisor.
;   GCN lines (llc run): the expected ISA contains a full 64-bit remainder
;     sequence for each of the two elements (v_rcp_f32 reciprocal estimate,
;     v_mul_hi/v_mul_lo refinement, conditional subtracts via v_cndmask),
;     followed by an xor/subtract pair using the sign word derived from %x
;     (s12 for element 0, s14 for element 1). The two sequences are interleaved
;     in the expected output.
; The assertions are autogenerated (see the NOTE lines at the top of the file);
; regenerate them with the update scripts instead of editing them by hand.
6702define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
6703; CHECK-LABEL: @srem_v2i64_pow2_shl_denom(
6704; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
6705; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
6706; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
6707; CHECK-NEXT:    [[TMP3:%.*]] = srem i64 [[TMP1]], [[TMP2]]
6708; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0
6709; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
6710; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
6711; CHECK-NEXT:    [[TMP7:%.*]] = srem i64 [[TMP5]], [[TMP6]]
6712; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
6713; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
6714; CHECK-NEXT:    ret void
6715;
6716; GCN-LABEL: srem_v2i64_pow2_shl_denom:
6717; GCN:       ; %bb.0:
6718; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x11
6719; GCN-NEXT:    s_mov_b32 s3, 0
6720; GCN-NEXT:    s_movk_i32 s2, 0x1000
6721; GCN-NEXT:    s_mov_b32 s18, 0x4f800000
6722; GCN-NEXT:    s_mov_b32 s19, 0x5f7ffffc
6723; GCN-NEXT:    s_waitcnt lgkmcnt(0)
6724; GCN-NEXT:    s_lshl_b64 s[14:15], s[2:3], s6
6725; GCN-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
6726; GCN-NEXT:    s_ashr_i32 s4, s3, 31
6727; GCN-NEXT:    s_add_u32 s2, s2, s4
6728; GCN-NEXT:    s_mov_b32 s5, s4
6729; GCN-NEXT:    s_addc_u32 s3, s3, s4
6730; GCN-NEXT:    s_xor_b64 s[16:17], s[2:3], s[4:5]
6731; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s16
6732; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s17
6733; GCN-NEXT:    s_mov_b32 s20, 0x2f800000
6734; GCN-NEXT:    s_mov_b32 s21, 0xcf800000
6735; GCN-NEXT:    s_sub_u32 s6, 0, s16
6736; GCN-NEXT:    v_mac_f32_e32 v0, s18, v1
6737; GCN-NEXT:    v_rcp_f32_e32 v0, v0
6738; GCN-NEXT:    s_subb_u32 s7, 0, s17
6739; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6740; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
6741; GCN-NEXT:    v_mul_f32_e32 v0, s19, v0
6742; GCN-NEXT:    v_mul_f32_e32 v1, s20, v0
6743; GCN-NEXT:    v_trunc_f32_e32 v1, v1
6744; GCN-NEXT:    v_mac_f32_e32 v0, s21, v1
6745; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
6746; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
6747; GCN-NEXT:    s_waitcnt lgkmcnt(0)
6748; GCN-NEXT:    s_ashr_i32 s12, s9, 31
6749; GCN-NEXT:    s_add_u32 s0, s8, s12
6750; GCN-NEXT:    v_mul_hi_u32 v3, s6, v0
6751; GCN-NEXT:    v_mul_lo_u32 v2, s6, v1
6752; GCN-NEXT:    v_mul_lo_u32 v4, s7, v0
6753; GCN-NEXT:    v_mul_lo_u32 v5, s6, v0
6754; GCN-NEXT:    s_mov_b32 s13, s12
6755; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6756; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
6757; GCN-NEXT:    v_mul_lo_u32 v3, v0, v2
6758; GCN-NEXT:    v_mul_hi_u32 v4, v0, v5
6759; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
6760; GCN-NEXT:    v_mul_hi_u32 v7, v1, v2
6761; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
6762; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
6763; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v6, vcc
6764; GCN-NEXT:    v_mul_lo_u32 v6, v1, v5
6765; GCN-NEXT:    v_mul_hi_u32 v5, v1, v5
6766; GCN-NEXT:    s_addc_u32 s1, s9, s12
6767; GCN-NEXT:    s_xor_b64 s[8:9], s[0:1], s[12:13]
6768; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
6769; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
6770; GCN-NEXT:    v_mov_b32_e32 v4, 0
6771; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
6772; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6773; GCN-NEXT:    v_mov_b32_e32 v6, 0
6774; GCN-NEXT:    v_add_i32_e64 v0, s[2:3], v0, v2
6775; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
6776; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[2:3]
6777; GCN-NEXT:    v_mul_lo_u32 v5, s6, v2
6778; GCN-NEXT:    v_mul_hi_u32 v7, s6, v0
6779; GCN-NEXT:    v_mul_lo_u32 v8, s7, v0
6780; GCN-NEXT:    s_mov_b32 s7, 0xf000
6781; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
6782; GCN-NEXT:    v_mul_lo_u32 v7, s6, v0
6783; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
6784; GCN-NEXT:    v_mul_lo_u32 v10, v0, v5
6785; GCN-NEXT:    v_mul_hi_u32 v12, v0, v5
6786; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
6787; GCN-NEXT:    v_mul_hi_u32 v9, v2, v7
6788; GCN-NEXT:    v_mul_lo_u32 v7, v2, v7
6789; GCN-NEXT:    v_mul_hi_u32 v8, v2, v5
6790; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
6791; GCN-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
6792; GCN-NEXT:    v_mul_lo_u32 v2, v2, v5
6793; GCN-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
6794; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
6795; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
6796; GCN-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
6797; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
6798; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
6799; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[2:3]
6800; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
6801; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
6802; GCN-NEXT:    v_mul_lo_u32 v2, s8, v1
6803; GCN-NEXT:    v_mul_hi_u32 v3, s8, v0
6804; GCN-NEXT:    v_mul_hi_u32 v5, s8, v1
6805; GCN-NEXT:    v_mul_hi_u32 v7, s9, v1
6806; GCN-NEXT:    v_mul_lo_u32 v1, s9, v1
6807; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6808; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
6809; GCN-NEXT:    v_mul_lo_u32 v5, s9, v0
6810; GCN-NEXT:    v_mul_hi_u32 v0, s9, v0
6811; GCN-NEXT:    s_mov_b32 s6, -1
6812; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
6813; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
6814; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
6815; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
6816; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
6817; GCN-NEXT:    v_mul_lo_u32 v1, s16, v1
6818; GCN-NEXT:    v_mul_hi_u32 v2, s16, v0
6819; GCN-NEXT:    v_mul_lo_u32 v3, s17, v0
6820; GCN-NEXT:    v_mul_lo_u32 v0, s16, v0
6821; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
6822; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
6823; GCN-NEXT:    v_sub_i32_e64 v0, s[0:1], s8, v0
6824; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s9, v1
6825; GCN-NEXT:    v_mov_b32_e32 v3, s17
6826; GCN-NEXT:    v_subb_u32_e64 v2, vcc, v2, v3, s[0:1]
6827; GCN-NEXT:    v_subrev_i32_e64 v5, s[2:3], s16, v0
6828; GCN-NEXT:    v_subbrev_u32_e64 v7, vcc, 0, v2, s[2:3]
6829; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s17, v7
6830; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
6831; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s16, v5
6832; GCN-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
6833; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s17, v7
6834; GCN-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
6835; GCN-NEXT:    v_subb_u32_e64 v2, vcc, v2, v3, s[2:3]
6836; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s16, v5
6837; GCN-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
6838; GCN-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v8
6839; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v2, s[2:3]
6840; GCN-NEXT:    v_mov_b32_e32 v7, s9
6841; GCN-NEXT:    v_subb_u32_e64 v1, vcc, v7, v1, s[0:1]
6842; GCN-NEXT:    s_ashr_i32 s0, s15, 31
6843; GCN-NEXT:    s_add_u32 s8, s14, s0
6844; GCN-NEXT:    s_mov_b32 s1, s0
6845; GCN-NEXT:    s_addc_u32 s9, s15, s0
6846; GCN-NEXT:    s_xor_b64 s[8:9], s[8:9], s[0:1]
6847; GCN-NEXT:    v_cvt_f32_u32_e32 v9, s8
6848; GCN-NEXT:    v_cvt_f32_u32_e32 v10, s9
6849; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s17, v1
6850; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
6851; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s16, v0
6852; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
6853; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s17, v1
6854; GCN-NEXT:    v_mac_f32_e32 v9, s18, v10
6855; GCN-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
6856; GCN-NEXT:    v_rcp_f32_e32 v8, v9
6857; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
6858; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6859; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v3, s[2:3]
6860; GCN-NEXT:    v_mul_f32_e32 v3, s19, v8
6861; GCN-NEXT:    v_mul_f32_e32 v5, s20, v3
6862; GCN-NEXT:    v_trunc_f32_e32 v5, v5
6863; GCN-NEXT:    v_mac_f32_e32 v3, s21, v5
6864; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
6865; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v5
6866; GCN-NEXT:    s_sub_u32 s2, 0, s8
6867; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6868; GCN-NEXT:    v_mul_hi_u32 v2, s2, v3
6869; GCN-NEXT:    v_mul_lo_u32 v7, s2, v5
6870; GCN-NEXT:    s_subb_u32 s3, 0, s9
6871; GCN-NEXT:    v_mul_lo_u32 v8, s3, v3
6872; GCN-NEXT:    s_ashr_i32 s14, s11, 31
6873; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
6874; GCN-NEXT:    v_mul_lo_u32 v7, s2, v3
6875; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
6876; GCN-NEXT:    v_mul_lo_u32 v8, v3, v2
6877; GCN-NEXT:    v_mul_hi_u32 v10, v3, v2
6878; GCN-NEXT:    v_mul_hi_u32 v9, v3, v7
6879; GCN-NEXT:    v_mul_hi_u32 v11, v5, v2
6880; GCN-NEXT:    v_mul_lo_u32 v2, v5, v2
6881; GCN-NEXT:    s_mov_b32 s15, s14
6882; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
6883; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
6884; GCN-NEXT:    v_mul_lo_u32 v10, v5, v7
6885; GCN-NEXT:    v_mul_hi_u32 v7, v5, v7
6886; GCN-NEXT:    v_xor_b32_e32 v0, s12, v0
6887; GCN-NEXT:    v_xor_b32_e32 v1, s12, v1
6888; GCN-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
6889; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v9, v7, vcc
6890; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v11, v4, vcc
6891; GCN-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
6892; GCN-NEXT:    v_add_i32_e64 v2, s[0:1], v3, v2
6893; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v6, v8, vcc
6894; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v5, v7, s[0:1]
6895; GCN-NEXT:    v_mul_lo_u32 v8, s2, v3
6896; GCN-NEXT:    v_mul_hi_u32 v9, s2, v2
6897; GCN-NEXT:    v_mul_lo_u32 v10, s3, v2
6898; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
6899; GCN-NEXT:    v_mul_lo_u32 v9, s2, v2
6900; GCN-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
6901; GCN-NEXT:    v_mul_lo_u32 v12, v2, v8
6902; GCN-NEXT:    v_mul_hi_u32 v14, v2, v8
6903; GCN-NEXT:    v_mul_hi_u32 v13, v2, v9
6904; GCN-NEXT:    v_mul_hi_u32 v11, v3, v9
6905; GCN-NEXT:    v_mul_lo_u32 v9, v3, v9
6906; GCN-NEXT:    v_mul_hi_u32 v10, v3, v8
6907; GCN-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
6908; GCN-NEXT:    v_addc_u32_e32 v13, vcc, 0, v14, vcc
6909; GCN-NEXT:    v_mul_lo_u32 v3, v3, v8
6910; GCN-NEXT:    v_add_i32_e32 v9, vcc, v12, v9
6911; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v13, v11, vcc
6912; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v10, v4, vcc
6913; GCN-NEXT:    v_add_i32_e32 v3, vcc, v9, v3
6914; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v6, v8, vcc
6915; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
6916; GCN-NEXT:    v_addc_u32_e64 v5, vcc, v5, v8, s[0:1]
6917; GCN-NEXT:    s_add_u32 s0, s10, s14
6918; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
6919; GCN-NEXT:    s_addc_u32 s1, s11, s14
6920; GCN-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
6921; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
6922; GCN-NEXT:    v_mul_lo_u32 v5, s10, v3
6923; GCN-NEXT:    v_mul_hi_u32 v7, s10, v2
6924; GCN-NEXT:    v_mul_hi_u32 v9, s10, v3
6925; GCN-NEXT:    v_mul_hi_u32 v10, s11, v3
6926; GCN-NEXT:    v_mul_lo_u32 v3, s11, v3
6927; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
6928; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v9, vcc
6929; GCN-NEXT:    v_mul_lo_u32 v9, s11, v2
6930; GCN-NEXT:    v_mul_hi_u32 v2, s11, v2
6931; GCN-NEXT:    v_mov_b32_e32 v8, s12
6932; GCN-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
6933; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v2, vcc
6934; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v10, v4, vcc
6935; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
6936; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v4, vcc
6937; GCN-NEXT:    v_mul_lo_u32 v3, s8, v3
6938; GCN-NEXT:    v_mul_hi_u32 v4, s8, v2
6939; GCN-NEXT:    v_mul_lo_u32 v5, s9, v2
6940; GCN-NEXT:    v_mul_lo_u32 v2, s8, v2
6941; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s12, v0
6942; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v8, vcc
6943; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
6944; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
6945; GCN-NEXT:    v_sub_i32_e64 v2, s[0:1], s10, v2
6946; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s11, v3
6947; GCN-NEXT:    v_mov_b32_e32 v5, s9
6948; GCN-NEXT:    v_subb_u32_e64 v4, vcc, v4, v5, s[0:1]
6949; GCN-NEXT:    v_subrev_i32_e64 v6, s[2:3], s8, v2
6950; GCN-NEXT:    v_subbrev_u32_e64 v7, vcc, 0, v4, s[2:3]
6951; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v7
6952; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
6953; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v6
6954; GCN-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
6955; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v7
6956; GCN-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
6957; GCN-NEXT:    v_subb_u32_e64 v4, vcc, v4, v5, s[2:3]
6958; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s8, v6
6959; GCN-NEXT:    v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
6960; GCN-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v8
6961; GCN-NEXT:    v_cndmask_b32_e64 v4, v7, v4, s[2:3]
6962; GCN-NEXT:    v_mov_b32_e32 v7, s11
6963; GCN-NEXT:    v_subb_u32_e64 v3, vcc, v7, v3, s[0:1]
6964; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
6965; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
6966; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
6967; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
6968; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v3
6969; GCN-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
6970; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
6971; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
6972; GCN-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[2:3]
6973; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
6974; GCN-NEXT:    v_xor_b32_e32 v2, s14, v2
6975; GCN-NEXT:    v_xor_b32_e32 v3, s14, v3
6976; GCN-NEXT:    v_mov_b32_e32 v4, s14
6977; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s14, v2
6978; GCN-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
6979; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
6980; GCN-NEXT:    s_endpgm
; IR under test: the divisor is 4096 shifted left per lane by the runtime
; value %y, so it is not a compile-time constant.
6981  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
6982  %r = srem <2 x i64> %x, %shl.y
6983  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
6984  ret void
6985}
6986