1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
3; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare -amdgpu-bypass-slow-div=0 %s | FileCheck %s
4; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GCN %s
5
; udiv_i32: unsigned 32-bit division of %x by %y; the quotient is stored to %out.
; The opt-side assertions pin the integer expansion: a float reciprocal estimate
; of %y (uitofp, amdgcn.rcp, scale, fptoui), one refinement step that uses the
; high half of a 64-bit multiply, then two compare/select rounds that bump the
; quotient while the running remainder is still uge %y.
; The llc-side assertions pin the ISA emitted for the same function.
6define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
7; CHECK-LABEL: @udiv_i32(
8; CHECK-NEXT:    [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float
9; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
10; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000
11; CHECK-NEXT:    [[TMP4:%.*]] = fptoui float [[TMP3]] to i32
12; CHECK-NEXT:    [[TMP5:%.*]] = sub i32 0, [[Y]]
13; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]]
14; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP4]] to i64
15; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
16; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]]
17; CHECK-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
18; CHECK-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP9]], 32
19; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
20; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]]
21; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64
22; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
23; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
24; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
25; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
26; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
27; CHECK-NEXT:    [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]]
28; CHECK-NEXT:    [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]]
29; CHECK-NEXT:    [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]]
30; CHECK-NEXT:    [[TMP23:%.*]] = add i32 [[TMP19]], 1
31; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP19]]
32; CHECK-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP21]], [[Y]]
33; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP22]], i32 [[TMP25]], i32 [[TMP21]]
34; CHECK-NEXT:    [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[Y]]
35; CHECK-NEXT:    [[TMP28:%.*]] = add i32 [[TMP24]], 1
36; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP24]]
37; CHECK-NEXT:    store i32 [[TMP29]], i32 addrspace(1)* [[OUT:%.*]], align 4
38; CHECK-NEXT:    ret void
39;
40; GCN-LABEL: udiv_i32:
41; GCN:       ; %bb.0:
42; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
43; GCN-NEXT:    s_mov_b32 s7, 0xf000
44; GCN-NEXT:    s_mov_b32 s6, -1
45; GCN-NEXT:    s_waitcnt lgkmcnt(0)
46; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s3
47; GCN-NEXT:    s_sub_i32 s4, 0, s3
48; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
49; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
50; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
51; GCN-NEXT:    v_mul_lo_u32 v1, s4, v0
52; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
53; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
54; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
55; GCN-NEXT:    v_mul_hi_u32 v0, s2, v0
56; GCN-NEXT:    v_mul_lo_u32 v1, v0, s3
57; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
58; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s2, v1
59; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
60; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
61; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
62; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
63; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
64; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
65; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
66; GCN-NEXT:    s_waitcnt lgkmcnt(0)
67; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
68; GCN-NEXT:    s_endpgm
; Input under test; everything above in this function is generated assertions.
69  %r = udiv i32 %x, %y
70  store i32 %r, i32 addrspace(1)* %out
71  ret void
72}
73
; urem_i32: unsigned 32-bit remainder of %x by %y; the result is stored to %out.
; The opt-side assertions pin the same reciprocal-estimate and refinement
; sequence as the 32-bit unsigned division case, but only the remainder is
; carried forward: x - q*y followed by two conditional subtractions of %y
; (each guarded by an icmp uge against %y).
; The llc-side assertions pin the ISA emitted for the same function.
74define amdgpu_kernel void @urem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
75; CHECK-LABEL: @urem_i32(
76; CHECK-NEXT:    [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float
77; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
78; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000
79; CHECK-NEXT:    [[TMP4:%.*]] = fptoui float [[TMP3]] to i32
80; CHECK-NEXT:    [[TMP5:%.*]] = sub i32 0, [[Y]]
81; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]]
82; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP4]] to i64
83; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
84; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]]
85; CHECK-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
86; CHECK-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP9]], 32
87; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
88; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]]
89; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64
90; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
91; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
92; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
93; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
94; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
95; CHECK-NEXT:    [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]]
96; CHECK-NEXT:    [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]]
97; CHECK-NEXT:    [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]]
98; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP21]], [[Y]]
99; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP21]]
100; CHECK-NEXT:    [[TMP25:%.*]] = icmp uge i32 [[TMP24]], [[Y]]
101; CHECK-NEXT:    [[TMP26:%.*]] = sub i32 [[TMP24]], [[Y]]
102; CHECK-NEXT:    [[TMP27:%.*]] = select i1 [[TMP25]], i32 [[TMP26]], i32 [[TMP24]]
103; CHECK-NEXT:    store i32 [[TMP27]], i32 addrspace(1)* [[OUT:%.*]], align 4
104; CHECK-NEXT:    ret void
105;
106; GCN-LABEL: urem_i32:
107; GCN:       ; %bb.0:
108; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
109; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
110; GCN-NEXT:    s_mov_b32 s3, 0xf000
111; GCN-NEXT:    s_waitcnt lgkmcnt(0)
112; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s5
113; GCN-NEXT:    s_sub_i32 s2, 0, s5
114; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
115; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
116; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
117; GCN-NEXT:    v_mul_lo_u32 v1, s2, v0
118; GCN-NEXT:    s_mov_b32 s2, -1
119; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
120; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
121; GCN-NEXT:    v_mul_hi_u32 v0, s4, v0
122; GCN-NEXT:    v_mul_lo_u32 v0, v0, s5
123; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
124; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s5, v0
125; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s5, v0
126; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
127; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s5, v0
128; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s5, v0
129; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
130; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
131; GCN-NEXT:    s_endpgm
; Input under test; everything above in this function is generated assertions.
132  %r = urem i32 %x, %y
133  store i32 %r, i32 addrspace(1)* %out
134  ret void
135}
136
; sdiv_i32: signed 32-bit division of %x by %y; the quotient is stored to %out.
; The opt-side assertions pin a sign-handling wrapper around the unsigned
; expansion: sign masks come from ashr 31, each operand is turned into a
; magnitude with add + xor against its own mask, the unsigned
; reciprocal/refine/correct sequence runs on those magnitudes, and the quotient
; is finally xor'ed with and reduced by the xor of the two sign masks.
; The llc-side assertions pin the ISA emitted for the same function.
137define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
138; CHECK-LABEL: @sdiv_i32(
139; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
140; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
141; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
142; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[X]], [[TMP1]]
143; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[Y]], [[TMP2]]
144; CHECK-NEXT:    [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP1]]
145; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP2]]
146; CHECK-NEXT:    [[TMP8:%.*]] = uitofp i32 [[TMP7]] to float
147; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP8]])
148; CHECK-NEXT:    [[TMP10:%.*]] = fmul fast float [[TMP9]], 0x41EFFFFFC0000000
149; CHECK-NEXT:    [[TMP11:%.*]] = fptoui float [[TMP10]] to i32
150; CHECK-NEXT:    [[TMP12:%.*]] = sub i32 0, [[TMP7]]
151; CHECK-NEXT:    [[TMP13:%.*]] = mul i32 [[TMP12]], [[TMP11]]
152; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP11]] to i64
153; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
154; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
155; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
156; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
157; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
158; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP11]], [[TMP19]]
159; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP6]] to i64
160; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP20]] to i64
161; CHECK-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP21]], [[TMP22]]
162; CHECK-NEXT:    [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
163; CHECK-NEXT:    [[TMP25:%.*]] = lshr i64 [[TMP23]], 32
164; CHECK-NEXT:    [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
165; CHECK-NEXT:    [[TMP27:%.*]] = mul i32 [[TMP26]], [[TMP7]]
166; CHECK-NEXT:    [[TMP28:%.*]] = sub i32 [[TMP6]], [[TMP27]]
167; CHECK-NEXT:    [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP7]]
168; CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[TMP26]], 1
169; CHECK-NEXT:    [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]]
170; CHECK-NEXT:    [[TMP32:%.*]] = sub i32 [[TMP28]], [[TMP7]]
171; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP29]], i32 [[TMP32]], i32 [[TMP28]]
172; CHECK-NEXT:    [[TMP34:%.*]] = icmp uge i32 [[TMP33]], [[TMP7]]
173; CHECK-NEXT:    [[TMP35:%.*]] = add i32 [[TMP31]], 1
174; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP34]], i32 [[TMP35]], i32 [[TMP31]]
175; CHECK-NEXT:    [[TMP37:%.*]] = xor i32 [[TMP36]], [[TMP3]]
176; CHECK-NEXT:    [[TMP38:%.*]] = sub i32 [[TMP37]], [[TMP3]]
177; CHECK-NEXT:    store i32 [[TMP38]], i32 addrspace(1)* [[OUT:%.*]], align 4
178; CHECK-NEXT:    ret void
179;
180; GCN-LABEL: sdiv_i32:
181; GCN:       ; %bb.0:
182; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
183; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
184; GCN-NEXT:    s_mov_b32 s7, 0xf000
185; GCN-NEXT:    s_mov_b32 s6, -1
186; GCN-NEXT:    s_waitcnt lgkmcnt(0)
187; GCN-NEXT:    s_ashr_i32 s8, s3, 31
188; GCN-NEXT:    s_add_i32 s3, s3, s8
189; GCN-NEXT:    s_xor_b32 s9, s3, s8
190; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s9
191; GCN-NEXT:    s_sub_i32 s3, 0, s9
192; GCN-NEXT:    s_ashr_i32 s0, s2, 31
193; GCN-NEXT:    s_add_i32 s1, s2, s0
194; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
195; GCN-NEXT:    s_xor_b32 s1, s1, s0
196; GCN-NEXT:    s_xor_b32 s2, s0, s8
197; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
198; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
199; GCN-NEXT:    v_mul_lo_u32 v1, s3, v0
200; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
201; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
202; GCN-NEXT:    v_mul_hi_u32 v0, s1, v0
203; GCN-NEXT:    v_mul_lo_u32 v1, v0, s9
204; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
205; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
206; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v1
207; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
208; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s9, v1
209; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
210; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
211; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
212; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
213; GCN-NEXT:    v_xor_b32_e32 v0, s2, v0
214; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
215; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
216; GCN-NEXT:    s_endpgm
; Input under test; everything above in this function is generated assertions.
217  %r = sdiv i32 %x, %y
218  store i32 %r, i32 addrspace(1)* %out
219  ret void
220}
221
; srem_i32: signed 32-bit remainder of %x by %y; the result is stored to %out.
; The opt-side assertions pin the signed wrapper around the unsigned remainder
; expansion: operands are turned into magnitudes with ashr 31 / add / xor, the
; remainder is computed with two conditional subtractions of the divisor
; magnitude, and the result is xor'ed with and reduced by the sign mask of %x
; only (the divisor's sign mask is not applied to the result).
; The llc-side assertions pin the ISA emitted for the same function.
222define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
223; CHECK-LABEL: @srem_i32(
224; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
225; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
226; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[X]], [[TMP1]]
227; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[Y]], [[TMP2]]
228; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP1]]
229; CHECK-NEXT:    [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP2]]
230; CHECK-NEXT:    [[TMP7:%.*]] = uitofp i32 [[TMP6]] to float
231; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
232; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP8]], 0x41EFFFFFC0000000
233; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP9]] to i32
234; CHECK-NEXT:    [[TMP11:%.*]] = sub i32 0, [[TMP6]]
235; CHECK-NEXT:    [[TMP12:%.*]] = mul i32 [[TMP11]], [[TMP10]]
236; CHECK-NEXT:    [[TMP13:%.*]] = zext i32 [[TMP10]] to i64
237; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP12]] to i64
238; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP13]], [[TMP14]]
239; CHECK-NEXT:    [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
240; CHECK-NEXT:    [[TMP17:%.*]] = lshr i64 [[TMP15]], 32
241; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
242; CHECK-NEXT:    [[TMP19:%.*]] = add i32 [[TMP10]], [[TMP18]]
243; CHECK-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP5]] to i64
244; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP19]] to i64
245; CHECK-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP20]], [[TMP21]]
246; CHECK-NEXT:    [[TMP23:%.*]] = trunc i64 [[TMP22]] to i32
247; CHECK-NEXT:    [[TMP24:%.*]] = lshr i64 [[TMP22]], 32
248; CHECK-NEXT:    [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
249; CHECK-NEXT:    [[TMP26:%.*]] = mul i32 [[TMP25]], [[TMP6]]
250; CHECK-NEXT:    [[TMP27:%.*]] = sub i32 [[TMP5]], [[TMP26]]
251; CHECK-NEXT:    [[TMP28:%.*]] = icmp uge i32 [[TMP27]], [[TMP6]]
252; CHECK-NEXT:    [[TMP29:%.*]] = sub i32 [[TMP27]], [[TMP6]]
253; CHECK-NEXT:    [[TMP30:%.*]] = select i1 [[TMP28]], i32 [[TMP29]], i32 [[TMP27]]
254; CHECK-NEXT:    [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP6]]
255; CHECK-NEXT:    [[TMP32:%.*]] = sub i32 [[TMP30]], [[TMP6]]
256; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP30]]
257; CHECK-NEXT:    [[TMP34:%.*]] = xor i32 [[TMP33]], [[TMP1]]
258; CHECK-NEXT:    [[TMP35:%.*]] = sub i32 [[TMP34]], [[TMP1]]
259; CHECK-NEXT:    store i32 [[TMP35]], i32 addrspace(1)* [[OUT:%.*]], align 4
260; CHECK-NEXT:    ret void
261;
262; GCN-LABEL: srem_i32:
263; GCN:       ; %bb.0:
264; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
265; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
266; GCN-NEXT:    s_waitcnt lgkmcnt(0)
267; GCN-NEXT:    s_ashr_i32 s4, s3, 31
268; GCN-NEXT:    s_add_i32 s3, s3, s4
269; GCN-NEXT:    s_xor_b32 s6, s3, s4
270; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s6
271; GCN-NEXT:    s_sub_i32 s3, 0, s6
272; GCN-NEXT:    s_ashr_i32 s4, s2, 31
273; GCN-NEXT:    s_add_i32 s2, s2, s4
274; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
275; GCN-NEXT:    s_xor_b32 s5, s2, s4
276; GCN-NEXT:    s_mov_b32 s2, -1
277; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
278; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
279; GCN-NEXT:    v_mul_lo_u32 v1, s3, v0
280; GCN-NEXT:    s_mov_b32 s3, 0xf000
281; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
282; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
283; GCN-NEXT:    v_mul_hi_u32 v0, s5, v0
284; GCN-NEXT:    v_mul_lo_u32 v0, v0, s6
285; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s5, v0
286; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s6, v0
287; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
288; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
289; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s6, v0
290; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
291; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
292; GCN-NEXT:    v_xor_b32_e32 v0, s4, v0
293; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s4, v0
294; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
295; GCN-NEXT:    s_endpgm
; Input under test; everything above in this function is generated assertions.
296  %r = srem i32 %x, %y
297  store i32 %r, i32 addrspace(1)* %out
298  ret void
299}
300
; udiv_i16: unsigned 16-bit division of %x by %y; the quotient is stored to %out.
; The opt-side assertions pin a float-based expansion: both operands are
; zero-extended to i32 and converted with uitofp, the quotient estimate is
; trunc(x * rcp(y)), a residual is formed with amdgcn.fmad.ftz (-q*y + x), and
; the integer quotient is incremented by 1 when |residual| is oge |y|. The
; result is masked with 65535 before being truncated back to i16.
; The llc-side assertions pin the ISA emitted for the same function.
301define amdgpu_kernel void @udiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
302; CHECK-LABEL: @udiv_i16(
303; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
304; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
305; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
306; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
307; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
308; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
309; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
310; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
311; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
312; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
313; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
314; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
315; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
316; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
317; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
318; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 65535
319; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
320; CHECK-NEXT:    store i16 [[TMP17]], i16 addrspace(1)* [[OUT:%.*]], align 2
321; CHECK-NEXT:    ret void
322;
323; GCN-LABEL: udiv_i16:
324; GCN:       ; %bb.0:
325; GCN-NEXT:    s_load_dword s2, s[0:1], 0xb
326; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
327; GCN-NEXT:    s_waitcnt lgkmcnt(0)
328; GCN-NEXT:    s_lshr_b32 s3, s2, 16
329; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s3
330; GCN-NEXT:    s_and_b32 s2, s2, 0xffff
331; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s2
332; GCN-NEXT:    s_mov_b32 s3, 0xf000
333; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
334; GCN-NEXT:    s_mov_b32 s2, -1
335; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
336; GCN-NEXT:    v_trunc_f32_e32 v2, v2
337; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v2
338; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
339; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
340; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
341; GCN-NEXT:    buffer_store_short v0, off, s[0:3], 0
342; GCN-NEXT:    s_endpgm
; Input under test; everything above in this function is generated assertions.
343  %r = udiv i16 %x, %y
344  store i16 %r, i16 addrspace(1)* %out
345  ret void
346}
347
; urem_i16: unsigned 16-bit remainder of %x by %y; the result is stored to %out.
; The opt-side assertions pin the float-based quotient computation (zext,
; uitofp, rcp, trunc, fmad.ftz residual, +1 when |residual| is oge |y|) and
; then derive the remainder as x - q*y in i32, masked with 65535 before the
; truncation back to i16.
; The llc-side assertions pin the ISA emitted for the same function.
348define amdgpu_kernel void @urem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
349; CHECK-LABEL: @urem_i16(
350; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
351; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
352; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
353; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
354; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
355; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
356; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
357; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
358; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
359; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
360; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
361; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
362; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
363; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
364; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
365; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
366; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
367; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 65535
368; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
369; CHECK-NEXT:    store i16 [[TMP19]], i16 addrspace(1)* [[OUT:%.*]], align 2
370; CHECK-NEXT:    ret void
371;
372; GCN-LABEL: urem_i16:
373; GCN:       ; %bb.0:
374; GCN-NEXT:    s_load_dword s4, s[0:1], 0xb
375; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
376; GCN-NEXT:    s_waitcnt lgkmcnt(0)
377; GCN-NEXT:    s_lshr_b32 s2, s4, 16
378; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s2
379; GCN-NEXT:    s_and_b32 s3, s4, 0xffff
380; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s3
381; GCN-NEXT:    s_mov_b32 s3, 0xf000
382; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
383; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
384; GCN-NEXT:    v_trunc_f32_e32 v2, v2
385; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v2
386; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
387; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
388; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
389; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
390; GCN-NEXT:    s_mov_b32 s2, -1
391; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
392; GCN-NEXT:    buffer_store_short v0, off, s[0:3], 0
393; GCN-NEXT:    s_endpgm
; Input under test; everything above in this function is generated assertions.
394  %r = urem i16 %x, %y
395  store i16 %r, i16 addrspace(1)* %out
396  ret void
397}
398
; sdiv_i16: signed 16-bit division of %x by %y; the quotient is stored to %out.
; The opt-side assertions pin a float-based expansion on sign-extended
; operands: a correction term is built from (x xor y) via ashr 30 and or 1,
; the quotient estimate is trunc(x * rcp(y)) using sitofp inputs, a residual is
; formed with amdgcn.fmad.ftz, and the correction term (or 0) is added to the
; fptosi quotient depending on whether |residual| is oge |y|. The result is
; re-sign-extended from 16 bits with shl 16 / ashr 16 before the trunc to i16.
; The llc-side assertions pin the ISA emitted for the same function.
399define amdgpu_kernel void @sdiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
400; CHECK-LABEL: @sdiv_i16(
401; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
402; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
403; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
404; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
405; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
406; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
407; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
408; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
409; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
410; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
411; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
412; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
413; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
414; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
415; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
416; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
417; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
418; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
419; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 16
420; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 16
421; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
422; CHECK-NEXT:    store i16 [[TMP21]], i16 addrspace(1)* [[OUT:%.*]], align 2
423; CHECK-NEXT:    ret void
424;
425; GCN-LABEL: sdiv_i16:
426; GCN:       ; %bb.0:
427; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
428; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
429; GCN-NEXT:    s_mov_b32 s7, 0xf000
430; GCN-NEXT:    s_mov_b32 s6, -1
431; GCN-NEXT:    s_waitcnt lgkmcnt(0)
432; GCN-NEXT:    s_ashr_i32 s1, s0, 16
433; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s1
434; GCN-NEXT:    s_sext_i32_i16 s0, s0
435; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s0
436; GCN-NEXT:    s_xor_b32 s0, s0, s1
437; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
438; GCN-NEXT:    s_ashr_i32 s0, s0, 30
439; GCN-NEXT:    s_or_b32 s0, s0, 1
440; GCN-NEXT:    v_mov_b32_e32 v3, s0
441; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
442; GCN-NEXT:    v_trunc_f32_e32 v2, v2
443; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
444; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
445; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
446; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
447; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
448; GCN-NEXT:    buffer_store_short v0, off, s[4:7], 0
449; GCN-NEXT:    s_endpgm
; Input under test; everything above in this function is generated assertions.
450  %r = sdiv i16 %x, %y
451  store i16 %r, i16 addrspace(1)* %out
452  ret void
453}
454
; srem_i16: signed 16-bit remainder of %x by %y; the result is stored to %out.
; The opt-side assertions pin the float-based signed quotient computation on
; sign-extended operands (correction term from xor / ashr 30 / or 1, sitofp,
; rcp, trunc, fmad.ftz residual, conditional correction added to the fptosi
; quotient), then derive the remainder as x - q*y in i32 and re-sign-extend it
; from 16 bits with shl 16 / ashr 16 before the trunc to i16.
; The llc-side assertions pin the ISA emitted for the same function.
455define amdgpu_kernel void @srem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
456; CHECK-LABEL: @srem_i16(
457; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
458; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
459; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
460; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
461; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
462; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
463; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
464; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
465; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
466; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
467; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
468; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
469; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
470; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
471; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
472; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
473; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
474; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
475; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
476; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
477; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 16
478; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
479; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
480; CHECK-NEXT:    store i16 [[TMP23]], i16 addrspace(1)* [[OUT:%.*]], align 2
481; CHECK-NEXT:    ret void
482;
483; GCN-LABEL: srem_i16:
484; GCN:       ; %bb.0:
485; GCN-NEXT:    s_load_dword s4, s[0:1], 0xb
486; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
487; GCN-NEXT:    s_waitcnt lgkmcnt(0)
488; GCN-NEXT:    s_ashr_i32 s2, s4, 16
489; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s2
490; GCN-NEXT:    s_sext_i32_i16 s3, s4
491; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s3
492; GCN-NEXT:    s_xor_b32 s3, s3, s2
493; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
494; GCN-NEXT:    s_ashr_i32 s3, s3, 30
495; GCN-NEXT:    s_or_b32 s3, s3, 1
496; GCN-NEXT:    v_mov_b32_e32 v3, s3
497; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
498; GCN-NEXT:    v_trunc_f32_e32 v2, v2
499; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
500; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
501; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
502; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
503; GCN-NEXT:    s_mov_b32 s3, 0xf000
504; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
505; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
506; GCN-NEXT:    s_mov_b32 s2, -1
507; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
508; GCN-NEXT:    buffer_store_short v0, off, s[0:3], 0
509; GCN-NEXT:    s_endpgm
; Input under test; everything above in this function is generated assertions.
510  %r = srem i16 %x, %y
511  store i16 %r, i16 addrspace(1)* %out
512  ret void
513}
514
; udiv_i8: unsigned 8-bit division of %x by %y; the quotient is stored to %out.
; The opt-side assertions pin the float-based expansion on zero-extended
; operands (uitofp, rcp, trunc, fmad.ftz residual, +1 when |residual| is
; oge |y|); the result is masked with 255 before being truncated to i8.
; The llc-side assertions pin the ISA emitted for the same function, which
; converts the two operand bytes with v_cvt_f32_ubyte0 / v_cvt_f32_ubyte1.
515define amdgpu_kernel void @udiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
516; CHECK-LABEL: @udiv_i8(
517; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
518; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
519; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
520; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
521; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
522; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
523; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
524; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
525; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
526; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
527; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
528; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
529; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
530; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
531; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
532; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 255
533; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
534; CHECK-NEXT:    store i8 [[TMP17]], i8 addrspace(1)* [[OUT:%.*]], align 1
535; CHECK-NEXT:    ret void
536;
537; GCN-LABEL: udiv_i8:
538; GCN:       ; %bb.0:
539; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
540; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
541; GCN-NEXT:    s_mov_b32 s7, 0xf000
542; GCN-NEXT:    s_mov_b32 s6, -1
543; GCN-NEXT:    s_waitcnt lgkmcnt(0)
544; GCN-NEXT:    v_cvt_f32_ubyte1_e32 v0, s0
545; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v0
546; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v2, s0
547; GCN-NEXT:    v_mul_f32_e32 v1, v2, v1
548; GCN-NEXT:    v_trunc_f32_e32 v1, v1
549; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v1
550; GCN-NEXT:    v_mad_f32 v1, -v1, v0, v2
551; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
552; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
553; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
554; GCN-NEXT:    s_endpgm
; Input under test; everything above in this function is generated assertions.
555  %r = udiv i8 %x, %y
556  store i8 %r, i8 addrspace(1)* %out
557  ret void
558}
559
; urem_i8: unsigned 8-bit remainder of %x by %y; the result is stored to %out.
; The opt-side assertions pin the float-based quotient computation on
; zero-extended operands (uitofp, rcp, trunc, fmad.ftz residual, +1 when
; |residual| is oge |y|), then derive the remainder as x - q*y in i32, masked
; with 255 before the truncation to i8.
; The llc-side assertions pin the ISA emitted for the same function.
560define amdgpu_kernel void @urem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
561; CHECK-LABEL: @urem_i8(
562; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
563; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
564; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
565; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
566; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
567; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
568; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
569; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
570; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
571; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
572; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
573; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
574; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
575; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
576; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
577; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
578; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
579; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 255
580; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i8
581; CHECK-NEXT:    store i8 [[TMP19]], i8 addrspace(1)* [[OUT:%.*]], align 1
582; CHECK-NEXT:    ret void
583;
584; GCN-LABEL: urem_i8:
585; GCN:       ; %bb.0:
586; GCN-NEXT:    s_load_dword s4, s[0:1], 0xb
587; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
588; GCN-NEXT:    s_mov_b32 s3, 0xf000
589; GCN-NEXT:    s_waitcnt lgkmcnt(0)
590; GCN-NEXT:    v_cvt_f32_ubyte1_e32 v0, s4
591; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v0
592; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v2, s4
593; GCN-NEXT:    s_lshr_b32 s2, s4, 8
594; GCN-NEXT:    v_mul_f32_e32 v1, v2, v1
595; GCN-NEXT:    v_trunc_f32_e32 v1, v1
596; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v1
597; GCN-NEXT:    v_mad_f32 v1, -v1, v0, v2
598; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
599; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
600; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
601; GCN-NEXT:    s_mov_b32 s2, -1
602; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
603; GCN-NEXT:    buffer_store_byte v0, off, s[0:3], 0
604; GCN-NEXT:    s_endpgm
; Input under test; everything above in this function is generated assertions.
605  %r = urem i8 %x, %y
606  store i8 %r, i8 addrspace(1)* %out
607  ret void
608}
609
; sdiv_i8: signed 8-bit division of %x by %y; the quotient is stored to %out.
; The opt-side assertions pin the float-based signed expansion on
; sign-extended operands (correction term from xor / ashr 30 / or 1, sitofp,
; rcp, trunc, fmad.ftz residual, conditional correction added to the fptosi
; quotient); the result is re-sign-extended from 8 bits with shl 24 / ashr 24
; before the trunc to i8.
; The llc-side assertions pin the ISA emitted for the same function.
610define amdgpu_kernel void @sdiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
611; CHECK-LABEL: @sdiv_i8(
612; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32
613; CHECK-NEXT:    [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32
614; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
615; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
616; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
617; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
618; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
619; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
620; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
621; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
622; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
623; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
624; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
625; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
626; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
627; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
628; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
629; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
630; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 24
631; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 24
632; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i8
633; CHECK-NEXT:    store i8 [[TMP21]], i8 addrspace(1)* [[OUT:%.*]], align 1
634; CHECK-NEXT:    ret void
635;
636; GCN-LABEL: sdiv_i8:
637; GCN:       ; %bb.0:
638; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
639; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
640; GCN-NEXT:    s_mov_b32 s7, 0xf000
641; GCN-NEXT:    s_mov_b32 s6, -1
642; GCN-NEXT:    s_waitcnt lgkmcnt(0)
643; GCN-NEXT:    s_bfe_i32 s1, s0, 0x80008
644; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s1
645; GCN-NEXT:    s_sext_i32_i8 s0, s0
646; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s0
647; GCN-NEXT:    s_xor_b32 s0, s0, s1
648; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
649; GCN-NEXT:    s_ashr_i32 s0, s0, 30
650; GCN-NEXT:    s_or_b32 s0, s0, 1
651; GCN-NEXT:    v_mov_b32_e32 v3, s0
652; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
653; GCN-NEXT:    v_trunc_f32_e32 v2, v2
654; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
655; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
656; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
657; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
658; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
659; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
660; GCN-NEXT:    s_endpgm
; Input under test; everything above in this function is generated assertions.
661  %r = sdiv i8 %x, %y
662  store i8 %r, i8 addrspace(1)* %out
663  ret void
664}
665
; srem_i8: kernel that stores the signed 8-bit remainder %x % %y to %out.
;
; CHECK lines pin the IR expected from the `opt -amdgpu-codegenprepare` RUN line:
;   * both operands are sign-extended to i32;
;   * a quotient is computed in f32 (sitofp, rcp, fmul, trunc) and adjusted by
;     the fmad/fabs/fcmp/select sequence, which adds either 0 or the +/-1 value
;     derived from the operand signs (xor, ashr 30, or 1);
;   * the remainder is then formed as x - quotient * y (mul + sub);
;   * the i32 result is sign-extended from its low 8 bits (shl/ashr 24) and
;     truncated back to i8 before the store.
; GCN lines pin the ISA emitted by the `llc -mcpu=tahiti` RUN line.
;
; All assertions in this function are autogenerated (see the NOTE lines at the
; top of the file); regenerate them with the update scripts rather than editing
; them by hand.
666define amdgpu_kernel void @srem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
667; CHECK-LABEL: @srem_i8(
668; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32
669; CHECK-NEXT:    [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32
670; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
671; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
672; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
673; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
674; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
675; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
676; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
677; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
678; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
679; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
680; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
681; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
682; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
683; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
684; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
685; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
686; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
687; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
688; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 24
689; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 24
690; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i8
691; CHECK-NEXT:    store i8 [[TMP23]], i8 addrspace(1)* [[OUT:%.*]], align 1
692; CHECK-NEXT:    ret void
693;
694; GCN-LABEL: srem_i8:
695; GCN:       ; %bb.0:
696; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
697; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
698; GCN-NEXT:    s_mov_b32 s7, 0xf000
699; GCN-NEXT:    s_mov_b32 s6, -1
700; GCN-NEXT:    s_waitcnt lgkmcnt(0)
701; GCN-NEXT:    s_bfe_i32 s1, s0, 0x80008
702; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s1
703; GCN-NEXT:    s_sext_i32_i8 s3, s0
704; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s3
705; GCN-NEXT:    s_xor_b32 s1, s3, s1
706; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
707; GCN-NEXT:    s_ashr_i32 s1, s1, 30
708; GCN-NEXT:    s_or_b32 s1, s1, 1
709; GCN-NEXT:    v_mov_b32_e32 v3, s1
710; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
711; GCN-NEXT:    v_trunc_f32_e32 v2, v2
712; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
713; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
714; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
715; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
716; GCN-NEXT:    s_lshr_b32 s2, s0, 8
717; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
718; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
719; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
720; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
721; GCN-NEXT:    s_endpgm
; Input IR under test: a single i8 srem whose result is stored to %out.
722  %r = srem i8 %x, %y
723  store i8 %r, i8 addrspace(1)* %out
724  ret void
725}
726
; udiv_v4i32: kernel that stores the element-wise unsigned quotient of two
; <4 x i32> arguments (%x / %y) to %out.
;
; CHECK lines pin the IR expected from the `opt -amdgpu-codegenprepare` RUN line.
; The vector divide is expected to be expanded one element at a time
; (extractelement ... insertelement for indices 0..3). For each element:
;   * the divisor is converted to f32, passed through llvm.amdgcn.rcp.f32,
;     scaled by the constant 0x41EFFFFFC0000000 and converted back to i32;
;   * that estimate is refined by adding the high 32 bits of
;     estimate * ((0 - divisor) * estimate);
;   * the quotient is the high 32 bits of dividend * refined estimate, and the
;     remainder is dividend - quotient * divisor;
;   * two compare/select rounds each add 1 to the quotient when the running
;     remainder is still uge the divisor (the first round also subtracts the
;     divisor from the remainder).
; GCN lines pin the ISA emitted by the `llc -mcpu=tahiti` RUN line, ending in a
; single buffer_store_dwordx4 of the four quotients.
;
; All assertions in this function are autogenerated (see the NOTE lines at the
; top of the file); regenerate them with the update scripts rather than editing
; them by hand.
727define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
728; CHECK-LABEL: @udiv_v4i32(
729; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
730; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
731; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
732; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
733; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
734; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
735; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[TMP2]]
736; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
737; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
738; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
739; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
740; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
741; CHECK-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
742; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
743; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
744; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
745; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
746; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
747; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
748; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
749; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
750; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
751; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
752; CHECK-NEXT:    [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
753; CHECK-NEXT:    [[TMP25:%.*]] = add i32 [[TMP21]], 1
754; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]]
755; CHECK-NEXT:    [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]]
756; CHECK-NEXT:    [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]]
757; CHECK-NEXT:    [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]]
758; CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[TMP26]], 1
759; CHECK-NEXT:    [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]]
760; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <4 x i32> undef, i32 [[TMP31]], i64 0
761; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <4 x i32> [[X]], i64 1
762; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <4 x i32> [[Y]], i64 1
763; CHECK-NEXT:    [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float
764; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
765; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000
766; CHECK-NEXT:    [[TMP38:%.*]] = fptoui float [[TMP37]] to i32
767; CHECK-NEXT:    [[TMP39:%.*]] = sub i32 0, [[TMP34]]
768; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]]
769; CHECK-NEXT:    [[TMP41:%.*]] = zext i32 [[TMP38]] to i64
770; CHECK-NEXT:    [[TMP42:%.*]] = zext i32 [[TMP40]] to i64
771; CHECK-NEXT:    [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]]
772; CHECK-NEXT:    [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
773; CHECK-NEXT:    [[TMP45:%.*]] = lshr i64 [[TMP43]], 32
774; CHECK-NEXT:    [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32
775; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]]
776; CHECK-NEXT:    [[TMP48:%.*]] = zext i32 [[TMP33]] to i64
777; CHECK-NEXT:    [[TMP49:%.*]] = zext i32 [[TMP47]] to i64
778; CHECK-NEXT:    [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]]
779; CHECK-NEXT:    [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
780; CHECK-NEXT:    [[TMP52:%.*]] = lshr i64 [[TMP50]], 32
781; CHECK-NEXT:    [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32
782; CHECK-NEXT:    [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]]
783; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]]
784; CHECK-NEXT:    [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]]
785; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP53]], 1
786; CHECK-NEXT:    [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]]
787; CHECK-NEXT:    [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]]
788; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]]
789; CHECK-NEXT:    [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]]
790; CHECK-NEXT:    [[TMP62:%.*]] = add i32 [[TMP58]], 1
791; CHECK-NEXT:    [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]]
792; CHECK-NEXT:    [[TMP64:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP63]], i64 1
793; CHECK-NEXT:    [[TMP65:%.*]] = extractelement <4 x i32> [[X]], i64 2
794; CHECK-NEXT:    [[TMP66:%.*]] = extractelement <4 x i32> [[Y]], i64 2
795; CHECK-NEXT:    [[TMP67:%.*]] = uitofp i32 [[TMP66]] to float
796; CHECK-NEXT:    [[TMP68:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP67]])
797; CHECK-NEXT:    [[TMP69:%.*]] = fmul fast float [[TMP68]], 0x41EFFFFFC0000000
798; CHECK-NEXT:    [[TMP70:%.*]] = fptoui float [[TMP69]] to i32
799; CHECK-NEXT:    [[TMP71:%.*]] = sub i32 0, [[TMP66]]
800; CHECK-NEXT:    [[TMP72:%.*]] = mul i32 [[TMP71]], [[TMP70]]
801; CHECK-NEXT:    [[TMP73:%.*]] = zext i32 [[TMP70]] to i64
802; CHECK-NEXT:    [[TMP74:%.*]] = zext i32 [[TMP72]] to i64
803; CHECK-NEXT:    [[TMP75:%.*]] = mul i64 [[TMP73]], [[TMP74]]
804; CHECK-NEXT:    [[TMP76:%.*]] = trunc i64 [[TMP75]] to i32
805; CHECK-NEXT:    [[TMP77:%.*]] = lshr i64 [[TMP75]], 32
806; CHECK-NEXT:    [[TMP78:%.*]] = trunc i64 [[TMP77]] to i32
807; CHECK-NEXT:    [[TMP79:%.*]] = add i32 [[TMP70]], [[TMP78]]
808; CHECK-NEXT:    [[TMP80:%.*]] = zext i32 [[TMP65]] to i64
809; CHECK-NEXT:    [[TMP81:%.*]] = zext i32 [[TMP79]] to i64
810; CHECK-NEXT:    [[TMP82:%.*]] = mul i64 [[TMP80]], [[TMP81]]
811; CHECK-NEXT:    [[TMP83:%.*]] = trunc i64 [[TMP82]] to i32
812; CHECK-NEXT:    [[TMP84:%.*]] = lshr i64 [[TMP82]], 32
813; CHECK-NEXT:    [[TMP85:%.*]] = trunc i64 [[TMP84]] to i32
814; CHECK-NEXT:    [[TMP86:%.*]] = mul i32 [[TMP85]], [[TMP66]]
815; CHECK-NEXT:    [[TMP87:%.*]] = sub i32 [[TMP65]], [[TMP86]]
816; CHECK-NEXT:    [[TMP88:%.*]] = icmp uge i32 [[TMP87]], [[TMP66]]
817; CHECK-NEXT:    [[TMP89:%.*]] = add i32 [[TMP85]], 1
818; CHECK-NEXT:    [[TMP90:%.*]] = select i1 [[TMP88]], i32 [[TMP89]], i32 [[TMP85]]
819; CHECK-NEXT:    [[TMP91:%.*]] = sub i32 [[TMP87]], [[TMP66]]
820; CHECK-NEXT:    [[TMP92:%.*]] = select i1 [[TMP88]], i32 [[TMP91]], i32 [[TMP87]]
821; CHECK-NEXT:    [[TMP93:%.*]] = icmp uge i32 [[TMP92]], [[TMP66]]
822; CHECK-NEXT:    [[TMP94:%.*]] = add i32 [[TMP90]], 1
823; CHECK-NEXT:    [[TMP95:%.*]] = select i1 [[TMP93]], i32 [[TMP94]], i32 [[TMP90]]
824; CHECK-NEXT:    [[TMP96:%.*]] = insertelement <4 x i32> [[TMP64]], i32 [[TMP95]], i64 2
825; CHECK-NEXT:    [[TMP97:%.*]] = extractelement <4 x i32> [[X]], i64 3
826; CHECK-NEXT:    [[TMP98:%.*]] = extractelement <4 x i32> [[Y]], i64 3
827; CHECK-NEXT:    [[TMP99:%.*]] = uitofp i32 [[TMP98]] to float
828; CHECK-NEXT:    [[TMP100:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP99]])
829; CHECK-NEXT:    [[TMP101:%.*]] = fmul fast float [[TMP100]], 0x41EFFFFFC0000000
830; CHECK-NEXT:    [[TMP102:%.*]] = fptoui float [[TMP101]] to i32
831; CHECK-NEXT:    [[TMP103:%.*]] = sub i32 0, [[TMP98]]
832; CHECK-NEXT:    [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP102]]
833; CHECK-NEXT:    [[TMP105:%.*]] = zext i32 [[TMP102]] to i64
834; CHECK-NEXT:    [[TMP106:%.*]] = zext i32 [[TMP104]] to i64
835; CHECK-NEXT:    [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]]
836; CHECK-NEXT:    [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32
837; CHECK-NEXT:    [[TMP109:%.*]] = lshr i64 [[TMP107]], 32
838; CHECK-NEXT:    [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32
839; CHECK-NEXT:    [[TMP111:%.*]] = add i32 [[TMP102]], [[TMP110]]
840; CHECK-NEXT:    [[TMP112:%.*]] = zext i32 [[TMP97]] to i64
841; CHECK-NEXT:    [[TMP113:%.*]] = zext i32 [[TMP111]] to i64
842; CHECK-NEXT:    [[TMP114:%.*]] = mul i64 [[TMP112]], [[TMP113]]
843; CHECK-NEXT:    [[TMP115:%.*]] = trunc i64 [[TMP114]] to i32
844; CHECK-NEXT:    [[TMP116:%.*]] = lshr i64 [[TMP114]], 32
845; CHECK-NEXT:    [[TMP117:%.*]] = trunc i64 [[TMP116]] to i32
846; CHECK-NEXT:    [[TMP118:%.*]] = mul i32 [[TMP117]], [[TMP98]]
847; CHECK-NEXT:    [[TMP119:%.*]] = sub i32 [[TMP97]], [[TMP118]]
848; CHECK-NEXT:    [[TMP120:%.*]] = icmp uge i32 [[TMP119]], [[TMP98]]
849; CHECK-NEXT:    [[TMP121:%.*]] = add i32 [[TMP117]], 1
850; CHECK-NEXT:    [[TMP122:%.*]] = select i1 [[TMP120]], i32 [[TMP121]], i32 [[TMP117]]
851; CHECK-NEXT:    [[TMP123:%.*]] = sub i32 [[TMP119]], [[TMP98]]
852; CHECK-NEXT:    [[TMP124:%.*]] = select i1 [[TMP120]], i32 [[TMP123]], i32 [[TMP119]]
853; CHECK-NEXT:    [[TMP125:%.*]] = icmp uge i32 [[TMP124]], [[TMP98]]
854; CHECK-NEXT:    [[TMP126:%.*]] = add i32 [[TMP122]], 1
855; CHECK-NEXT:    [[TMP127:%.*]] = select i1 [[TMP125]], i32 [[TMP126]], i32 [[TMP122]]
856; CHECK-NEXT:    [[TMP128:%.*]] = insertelement <4 x i32> [[TMP96]], i32 [[TMP127]], i64 3
857; CHECK-NEXT:    store <4 x i32> [[TMP128]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16
858; CHECK-NEXT:    ret void
859;
860; GCN-LABEL: udiv_v4i32:
861; GCN:       ; %bb.0:
862; GCN-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
863; GCN-NEXT:    s_mov_b32 s3, 0x4f7ffffe
864; GCN-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
865; GCN-NEXT:    s_mov_b32 s15, 0xf000
866; GCN-NEXT:    s_mov_b32 s14, -1
867; GCN-NEXT:    s_waitcnt lgkmcnt(0)
868; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s8
869; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s9
870; GCN-NEXT:    s_sub_i32 s2, 0, s8
871; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s10
872; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
873; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
874; GCN-NEXT:    v_cvt_f32_u32_e32 v6, s11
875; GCN-NEXT:    v_mul_f32_e32 v0, s3, v0
876; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
877; GCN-NEXT:    v_mul_f32_e32 v1, s3, v1
878; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
879; GCN-NEXT:    v_mul_lo_u32 v2, s2, v0
880; GCN-NEXT:    s_sub_i32 s2, 0, s9
881; GCN-NEXT:    v_mul_lo_u32 v3, s2, v1
882; GCN-NEXT:    s_sub_i32 s2, 0, s10
883; GCN-NEXT:    v_mul_hi_u32 v2, v0, v2
884; GCN-NEXT:    v_mul_hi_u32 v3, v1, v3
885; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
886; GCN-NEXT:    v_mul_hi_u32 v0, s4, v0
887; GCN-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
888; GCN-NEXT:    v_mul_hi_u32 v1, s5, v1
889; GCN-NEXT:    v_mul_lo_u32 v2, v0, s8
890; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
891; GCN-NEXT:    v_mul_lo_u32 v5, v1, s9
892; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s4, v2
893; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v2
894; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
895; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v2
896; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
897; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
898; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
899; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v4
900; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
901; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s5, v5
902; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
903; GCN-NEXT:    v_mul_f32_e32 v2, s3, v2
904; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
905; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v3
906; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
907; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s9, v3
908; GCN-NEXT:    v_mul_lo_u32 v4, s2, v2
909; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
910; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v1
911; GCN-NEXT:    s_sub_i32 s0, 0, s11
912; GCN-NEXT:    v_mul_hi_u32 v4, v2, v4
913; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
914; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v6
915; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
916; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
917; GCN-NEXT:    v_mul_hi_u32 v2, s6, v2
918; GCN-NEXT:    v_mul_f32_e32 v4, s3, v4
919; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v4
920; GCN-NEXT:    v_mul_lo_u32 v3, v2, s10
921; GCN-NEXT:    v_add_i32_e32 v6, vcc, 1, v2
922; GCN-NEXT:    v_mul_lo_u32 v5, s0, v4
923; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s6, v3
924; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v3
925; GCN-NEXT:    v_mul_hi_u32 v5, v4, v5
926; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
927; GCN-NEXT:    v_subrev_i32_e32 v6, vcc, s10, v3
928; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
929; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
930; GCN-NEXT:    v_mul_hi_u32 v4, s7, v4
931; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v2
932; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
933; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
934; GCN-NEXT:    v_mul_lo_u32 v6, v4, s11
935; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
936; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s7, v6
937; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v3
938; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[0:1]
939; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s11, v3
940; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
941; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
942; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
943; GCN-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
944; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
945; GCN-NEXT:    s_endpgm
; Input IR under test: a single <4 x i32> udiv whose result is stored to %out.
946  %r = udiv <4 x i32> %x, %y
947  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
948  ret void
949}
950
; urem_v4i32: kernel that stores the element-wise unsigned remainder of two
; <4 x i32> arguments (%x % %y) to %out.
;
; CHECK lines pin the IR expected from the `opt -amdgpu-codegenprepare` RUN line.
; The vector remainder is expected to be expanded one element at a time
; (extractelement ... insertelement for indices 0..3). For each element:
;   * the divisor is converted to f32, passed through llvm.amdgcn.rcp.f32,
;     scaled by the constant 0x41EFFFFFC0000000 and converted back to i32;
;   * that estimate is refined by adding the high 32 bits of
;     estimate * ((0 - divisor) * estimate);
;   * a quotient is taken as the high 32 bits of dividend * refined estimate,
;     and the initial remainder is dividend - quotient * divisor;
;   * two compare/select rounds each subtract the divisor from the remainder
;     when it is still uge the divisor.
; GCN lines pin the ISA emitted by the `llc -mcpu=tahiti` RUN line, ending in a
; single buffer_store_dwordx4 of the four remainders.
;
; All assertions in this function are autogenerated (see the NOTE lines at the
; top of the file); regenerate them with the update scripts rather than editing
; them by hand.
951define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
952; CHECK-LABEL: @urem_v4i32(
953; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
954; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
955; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
956; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
957; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
958; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
959; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[TMP2]]
960; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
961; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
962; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
963; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
964; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
965; CHECK-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
966; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
967; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
968; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
969; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
970; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
971; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
972; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
973; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
974; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
975; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
976; CHECK-NEXT:    [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
977; CHECK-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]]
978; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]]
979; CHECK-NEXT:    [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]]
980; CHECK-NEXT:    [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]]
981; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]]
982; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <4 x i32> undef, i32 [[TMP29]], i64 0
983; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <4 x i32> [[X]], i64 1
984; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <4 x i32> [[Y]], i64 1
985; CHECK-NEXT:    [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float
986; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
987; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000
988; CHECK-NEXT:    [[TMP36:%.*]] = fptoui float [[TMP35]] to i32
989; CHECK-NEXT:    [[TMP37:%.*]] = sub i32 0, [[TMP32]]
990; CHECK-NEXT:    [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]]
991; CHECK-NEXT:    [[TMP39:%.*]] = zext i32 [[TMP36]] to i64
992; CHECK-NEXT:    [[TMP40:%.*]] = zext i32 [[TMP38]] to i64
993; CHECK-NEXT:    [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]]
994; CHECK-NEXT:    [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32
995; CHECK-NEXT:    [[TMP43:%.*]] = lshr i64 [[TMP41]], 32
996; CHECK-NEXT:    [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
997; CHECK-NEXT:    [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]]
998; CHECK-NEXT:    [[TMP46:%.*]] = zext i32 [[TMP31]] to i64
999; CHECK-NEXT:    [[TMP47:%.*]] = zext i32 [[TMP45]] to i64
1000; CHECK-NEXT:    [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]]
1001; CHECK-NEXT:    [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32
1002; CHECK-NEXT:    [[TMP50:%.*]] = lshr i64 [[TMP48]], 32
1003; CHECK-NEXT:    [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
1004; CHECK-NEXT:    [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]]
1005; CHECK-NEXT:    [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]]
1006; CHECK-NEXT:    [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]]
1007; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]]
1008; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]]
1009; CHECK-NEXT:    [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]]
1010; CHECK-NEXT:    [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]]
1011; CHECK-NEXT:    [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]]
1012; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP59]], i64 1
1013; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <4 x i32> [[X]], i64 2
1014; CHECK-NEXT:    [[TMP62:%.*]] = extractelement <4 x i32> [[Y]], i64 2
1015; CHECK-NEXT:    [[TMP63:%.*]] = uitofp i32 [[TMP62]] to float
1016; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP63]])
1017; CHECK-NEXT:    [[TMP65:%.*]] = fmul fast float [[TMP64]], 0x41EFFFFFC0000000
1018; CHECK-NEXT:    [[TMP66:%.*]] = fptoui float [[TMP65]] to i32
1019; CHECK-NEXT:    [[TMP67:%.*]] = sub i32 0, [[TMP62]]
1020; CHECK-NEXT:    [[TMP68:%.*]] = mul i32 [[TMP67]], [[TMP66]]
1021; CHECK-NEXT:    [[TMP69:%.*]] = zext i32 [[TMP66]] to i64
1022; CHECK-NEXT:    [[TMP70:%.*]] = zext i32 [[TMP68]] to i64
1023; CHECK-NEXT:    [[TMP71:%.*]] = mul i64 [[TMP69]], [[TMP70]]
1024; CHECK-NEXT:    [[TMP72:%.*]] = trunc i64 [[TMP71]] to i32
1025; CHECK-NEXT:    [[TMP73:%.*]] = lshr i64 [[TMP71]], 32
1026; CHECK-NEXT:    [[TMP74:%.*]] = trunc i64 [[TMP73]] to i32
1027; CHECK-NEXT:    [[TMP75:%.*]] = add i32 [[TMP66]], [[TMP74]]
1028; CHECK-NEXT:    [[TMP76:%.*]] = zext i32 [[TMP61]] to i64
1029; CHECK-NEXT:    [[TMP77:%.*]] = zext i32 [[TMP75]] to i64
1030; CHECK-NEXT:    [[TMP78:%.*]] = mul i64 [[TMP76]], [[TMP77]]
1031; CHECK-NEXT:    [[TMP79:%.*]] = trunc i64 [[TMP78]] to i32
1032; CHECK-NEXT:    [[TMP80:%.*]] = lshr i64 [[TMP78]], 32
1033; CHECK-NEXT:    [[TMP81:%.*]] = trunc i64 [[TMP80]] to i32
1034; CHECK-NEXT:    [[TMP82:%.*]] = mul i32 [[TMP81]], [[TMP62]]
1035; CHECK-NEXT:    [[TMP83:%.*]] = sub i32 [[TMP61]], [[TMP82]]
1036; CHECK-NEXT:    [[TMP84:%.*]] = icmp uge i32 [[TMP83]], [[TMP62]]
1037; CHECK-NEXT:    [[TMP85:%.*]] = sub i32 [[TMP83]], [[TMP62]]
1038; CHECK-NEXT:    [[TMP86:%.*]] = select i1 [[TMP84]], i32 [[TMP85]], i32 [[TMP83]]
1039; CHECK-NEXT:    [[TMP87:%.*]] = icmp uge i32 [[TMP86]], [[TMP62]]
1040; CHECK-NEXT:    [[TMP88:%.*]] = sub i32 [[TMP86]], [[TMP62]]
1041; CHECK-NEXT:    [[TMP89:%.*]] = select i1 [[TMP87]], i32 [[TMP88]], i32 [[TMP86]]
1042; CHECK-NEXT:    [[TMP90:%.*]] = insertelement <4 x i32> [[TMP60]], i32 [[TMP89]], i64 2
1043; CHECK-NEXT:    [[TMP91:%.*]] = extractelement <4 x i32> [[X]], i64 3
1044; CHECK-NEXT:    [[TMP92:%.*]] = extractelement <4 x i32> [[Y]], i64 3
1045; CHECK-NEXT:    [[TMP93:%.*]] = uitofp i32 [[TMP92]] to float
1046; CHECK-NEXT:    [[TMP94:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP93]])
1047; CHECK-NEXT:    [[TMP95:%.*]] = fmul fast float [[TMP94]], 0x41EFFFFFC0000000
1048; CHECK-NEXT:    [[TMP96:%.*]] = fptoui float [[TMP95]] to i32
1049; CHECK-NEXT:    [[TMP97:%.*]] = sub i32 0, [[TMP92]]
1050; CHECK-NEXT:    [[TMP98:%.*]] = mul i32 [[TMP97]], [[TMP96]]
1051; CHECK-NEXT:    [[TMP99:%.*]] = zext i32 [[TMP96]] to i64
1052; CHECK-NEXT:    [[TMP100:%.*]] = zext i32 [[TMP98]] to i64
1053; CHECK-NEXT:    [[TMP101:%.*]] = mul i64 [[TMP99]], [[TMP100]]
1054; CHECK-NEXT:    [[TMP102:%.*]] = trunc i64 [[TMP101]] to i32
1055; CHECK-NEXT:    [[TMP103:%.*]] = lshr i64 [[TMP101]], 32
1056; CHECK-NEXT:    [[TMP104:%.*]] = trunc i64 [[TMP103]] to i32
1057; CHECK-NEXT:    [[TMP105:%.*]] = add i32 [[TMP96]], [[TMP104]]
1058; CHECK-NEXT:    [[TMP106:%.*]] = zext i32 [[TMP91]] to i64
1059; CHECK-NEXT:    [[TMP107:%.*]] = zext i32 [[TMP105]] to i64
1060; CHECK-NEXT:    [[TMP108:%.*]] = mul i64 [[TMP106]], [[TMP107]]
1061; CHECK-NEXT:    [[TMP109:%.*]] = trunc i64 [[TMP108]] to i32
1062; CHECK-NEXT:    [[TMP110:%.*]] = lshr i64 [[TMP108]], 32
1063; CHECK-NEXT:    [[TMP111:%.*]] = trunc i64 [[TMP110]] to i32
1064; CHECK-NEXT:    [[TMP112:%.*]] = mul i32 [[TMP111]], [[TMP92]]
1065; CHECK-NEXT:    [[TMP113:%.*]] = sub i32 [[TMP91]], [[TMP112]]
1066; CHECK-NEXT:    [[TMP114:%.*]] = icmp uge i32 [[TMP113]], [[TMP92]]
1067; CHECK-NEXT:    [[TMP115:%.*]] = sub i32 [[TMP113]], [[TMP92]]
1068; CHECK-NEXT:    [[TMP116:%.*]] = select i1 [[TMP114]], i32 [[TMP115]], i32 [[TMP113]]
1069; CHECK-NEXT:    [[TMP117:%.*]] = icmp uge i32 [[TMP116]], [[TMP92]]
1070; CHECK-NEXT:    [[TMP118:%.*]] = sub i32 [[TMP116]], [[TMP92]]
1071; CHECK-NEXT:    [[TMP119:%.*]] = select i1 [[TMP117]], i32 [[TMP118]], i32 [[TMP116]]
1072; CHECK-NEXT:    [[TMP120:%.*]] = insertelement <4 x i32> [[TMP90]], i32 [[TMP119]], i64 3
1073; CHECK-NEXT:    store <4 x i32> [[TMP120]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16
1074; CHECK-NEXT:    ret void
1075;
1076; GCN-LABEL: urem_v4i32:
1077; GCN:       ; %bb.0:
1078; GCN-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
1079; GCN-NEXT:    s_mov_b32 s13, 0x4f7ffffe
1080; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1081; GCN-NEXT:    s_mov_b32 s3, 0xf000
1082; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1083; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s8
1084; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s9
1085; GCN-NEXT:    s_sub_i32 s2, 0, s8
1086; GCN-NEXT:    s_sub_i32 s12, 0, s9
1087; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1088; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1089; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s10
1090; GCN-NEXT:    v_cvt_f32_u32_e32 v5, s11
1091; GCN-NEXT:    v_mul_f32_e32 v0, s13, v0
1092; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
1093; GCN-NEXT:    v_mul_f32_e32 v1, s13, v1
1094; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
1095; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v3
1096; GCN-NEXT:    v_mul_lo_u32 v2, s2, v0
1097; GCN-NEXT:    s_mov_b32 s2, -1
1098; GCN-NEXT:    v_mul_lo_u32 v4, s12, v1
1099; GCN-NEXT:    v_mul_hi_u32 v2, v0, v2
1100; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
1101; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
1102; GCN-NEXT:    v_mul_hi_u32 v0, s4, v0
1103; GCN-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
1104; GCN-NEXT:    v_mul_hi_u32 v1, s5, v1
1105; GCN-NEXT:    v_mul_f32_e32 v2, s13, v3
1106; GCN-NEXT:    v_mul_lo_u32 v0, v0, s8
1107; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
1108; GCN-NEXT:    v_mul_lo_u32 v1, v1, s9
1109; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
1110; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
1111; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
1112; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
1113; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v0
1114; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
1115; GCN-NEXT:    s_sub_i32 s4, 0, s10
1116; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
1117; GCN-NEXT:    v_mul_lo_u32 v3, s4, v2
1118; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s5, v1
1119; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v1
1120; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
1121; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
1122; GCN-NEXT:    v_mul_hi_u32 v3, v2, v3
1123; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v5
1124; GCN-NEXT:    s_sub_i32 s4, 0, s11
1125; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
1126; GCN-NEXT:    v_mul_f32_e32 v3, s13, v4
1127; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
1128; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s9, v1
1129; GCN-NEXT:    v_mul_hi_u32 v2, s6, v2
1130; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
1131; GCN-NEXT:    v_mul_lo_u32 v5, s4, v3
1132; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
1133; GCN-NEXT:    v_mul_lo_u32 v2, v2, s10
1134; GCN-NEXT:    v_mul_hi_u32 v4, v3, v5
1135; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s6, v2
1136; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s10, v2
1137; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
1138; GCN-NEXT:    v_mul_hi_u32 v3, s7, v3
1139; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
1140; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
1141; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s10, v2
1142; GCN-NEXT:    v_mul_lo_u32 v3, v3, s11
1143; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
1144; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
1145; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s7, v3
1146; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s11, v3
1147; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
1148; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
1149; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s11, v3
1150; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
1151; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
1152; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1153; GCN-NEXT:    s_endpgm
; Input IR under test: a single <4 x i32> urem whose result is stored to %out.
1154  %r = urem <4 x i32> %x, %y
1155  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
1156  ret void
1157}
1158
; Test: signed division of two <4 x i32> vectors (`sdiv <4 x i32> %x, %y`),
; with the result stored to %out.
;
; CHECK lines cover the IR produced by the `opt -amdgpu-codegenprepare` RUN
; line: the vector sdiv is replaced by four scalar expansions, one per lane
; (extractelement ... insertelement). Each lane:
;   * derives sign masks with `ashr ..., 31` and forms the magnitudes of both
;     operands via add + xor with the mask;
;   * estimates the reciprocal of the divisor magnitude with uitofp +
;     llvm.amdgcn.rcp.f32, scales it by a float constant and converts back
;     with fptoui;
;   * refines the estimate once using the high 32 bits of a 64-bit multiply;
;   * forms a quotient estimate and applies two `icmp uge` / select
;     correction steps;
;   * restores the sign using the xor of the two sign masks (xor, then sub).
; GCN lines cover the llc RUN line output for the same kernel.
;
; All assertions below are autogenerated (see the NOTE lines at the top of the
; file); regenerate them with the update scripts instead of editing by hand.
1159define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
1160; CHECK-LABEL: @sdiv_v4i32(
1161; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
1162; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
1163; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
1164; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
1165; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
1166; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]]
1167; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]]
1168; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]]
1169; CHECK-NEXT:    [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]]
1170; CHECK-NEXT:    [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float
1171; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]])
1172; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000
1173; CHECK-NEXT:    [[TMP13:%.*]] = fptoui float [[TMP12]] to i32
1174; CHECK-NEXT:    [[TMP14:%.*]] = sub i32 0, [[TMP9]]
1175; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]]
1176; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP13]] to i64
1177; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
1178; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
1179; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
1180; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
1181; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
1182; CHECK-NEXT:    [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]]
1183; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP8]] to i64
1184; CHECK-NEXT:    [[TMP24:%.*]] = zext i32 [[TMP22]] to i64
1185; CHECK-NEXT:    [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]]
1186; CHECK-NEXT:    [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
1187; CHECK-NEXT:    [[TMP27:%.*]] = lshr i64 [[TMP25]], 32
1188; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
1189; CHECK-NEXT:    [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]]
1190; CHECK-NEXT:    [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]]
1191; CHECK-NEXT:    [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]]
1192; CHECK-NEXT:    [[TMP32:%.*]] = add i32 [[TMP28]], 1
1193; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]]
1194; CHECK-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]]
1195; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]]
1196; CHECK-NEXT:    [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]]
1197; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP33]], 1
1198; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]]
1199; CHECK-NEXT:    [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]]
1200; CHECK-NEXT:    [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]]
1201; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <4 x i32> undef, i32 [[TMP40]], i64 0
1202; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <4 x i32> [[X]], i64 1
1203; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <4 x i32> [[Y]], i64 1
1204; CHECK-NEXT:    [[TMP44:%.*]] = ashr i32 [[TMP42]], 31
1205; CHECK-NEXT:    [[TMP45:%.*]] = ashr i32 [[TMP43]], 31
1206; CHECK-NEXT:    [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]]
1207; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]]
1208; CHECK-NEXT:    [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]]
1209; CHECK-NEXT:    [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]]
1210; CHECK-NEXT:    [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]]
1211; CHECK-NEXT:    [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float
1212; CHECK-NEXT:    [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]])
1213; CHECK-NEXT:    [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000
1214; CHECK-NEXT:    [[TMP54:%.*]] = fptoui float [[TMP53]] to i32
1215; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 0, [[TMP50]]
1216; CHECK-NEXT:    [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]]
1217; CHECK-NEXT:    [[TMP57:%.*]] = zext i32 [[TMP54]] to i64
1218; CHECK-NEXT:    [[TMP58:%.*]] = zext i32 [[TMP56]] to i64
1219; CHECK-NEXT:    [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]]
1220; CHECK-NEXT:    [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32
1221; CHECK-NEXT:    [[TMP61:%.*]] = lshr i64 [[TMP59]], 32
1222; CHECK-NEXT:    [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32
1223; CHECK-NEXT:    [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]]
1224; CHECK-NEXT:    [[TMP64:%.*]] = zext i32 [[TMP49]] to i64
1225; CHECK-NEXT:    [[TMP65:%.*]] = zext i32 [[TMP63]] to i64
1226; CHECK-NEXT:    [[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]]
1227; CHECK-NEXT:    [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32
1228; CHECK-NEXT:    [[TMP68:%.*]] = lshr i64 [[TMP66]], 32
1229; CHECK-NEXT:    [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32
1230; CHECK-NEXT:    [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]]
1231; CHECK-NEXT:    [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]]
1232; CHECK-NEXT:    [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]]
1233; CHECK-NEXT:    [[TMP73:%.*]] = add i32 [[TMP69]], 1
1234; CHECK-NEXT:    [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]]
1235; CHECK-NEXT:    [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]]
1236; CHECK-NEXT:    [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]]
1237; CHECK-NEXT:    [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]]
1238; CHECK-NEXT:    [[TMP78:%.*]] = add i32 [[TMP74]], 1
1239; CHECK-NEXT:    [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]]
1240; CHECK-NEXT:    [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]]
1241; CHECK-NEXT:    [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]]
1242; CHECK-NEXT:    [[TMP82:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[TMP81]], i64 1
1243; CHECK-NEXT:    [[TMP83:%.*]] = extractelement <4 x i32> [[X]], i64 2
1244; CHECK-NEXT:    [[TMP84:%.*]] = extractelement <4 x i32> [[Y]], i64 2
1245; CHECK-NEXT:    [[TMP85:%.*]] = ashr i32 [[TMP83]], 31
1246; CHECK-NEXT:    [[TMP86:%.*]] = ashr i32 [[TMP84]], 31
1247; CHECK-NEXT:    [[TMP87:%.*]] = xor i32 [[TMP85]], [[TMP86]]
1248; CHECK-NEXT:    [[TMP88:%.*]] = add i32 [[TMP83]], [[TMP85]]
1249; CHECK-NEXT:    [[TMP89:%.*]] = add i32 [[TMP84]], [[TMP86]]
1250; CHECK-NEXT:    [[TMP90:%.*]] = xor i32 [[TMP88]], [[TMP85]]
1251; CHECK-NEXT:    [[TMP91:%.*]] = xor i32 [[TMP89]], [[TMP86]]
1252; CHECK-NEXT:    [[TMP92:%.*]] = uitofp i32 [[TMP91]] to float
1253; CHECK-NEXT:    [[TMP93:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP92]])
1254; CHECK-NEXT:    [[TMP94:%.*]] = fmul fast float [[TMP93]], 0x41EFFFFFC0000000
1255; CHECK-NEXT:    [[TMP95:%.*]] = fptoui float [[TMP94]] to i32
1256; CHECK-NEXT:    [[TMP96:%.*]] = sub i32 0, [[TMP91]]
1257; CHECK-NEXT:    [[TMP97:%.*]] = mul i32 [[TMP96]], [[TMP95]]
1258; CHECK-NEXT:    [[TMP98:%.*]] = zext i32 [[TMP95]] to i64
1259; CHECK-NEXT:    [[TMP99:%.*]] = zext i32 [[TMP97]] to i64
1260; CHECK-NEXT:    [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]]
1261; CHECK-NEXT:    [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32
1262; CHECK-NEXT:    [[TMP102:%.*]] = lshr i64 [[TMP100]], 32
1263; CHECK-NEXT:    [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32
1264; CHECK-NEXT:    [[TMP104:%.*]] = add i32 [[TMP95]], [[TMP103]]
1265; CHECK-NEXT:    [[TMP105:%.*]] = zext i32 [[TMP90]] to i64
1266; CHECK-NEXT:    [[TMP106:%.*]] = zext i32 [[TMP104]] to i64
1267; CHECK-NEXT:    [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]]
1268; CHECK-NEXT:    [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32
1269; CHECK-NEXT:    [[TMP109:%.*]] = lshr i64 [[TMP107]], 32
1270; CHECK-NEXT:    [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32
1271; CHECK-NEXT:    [[TMP111:%.*]] = mul i32 [[TMP110]], [[TMP91]]
1272; CHECK-NEXT:    [[TMP112:%.*]] = sub i32 [[TMP90]], [[TMP111]]
1273; CHECK-NEXT:    [[TMP113:%.*]] = icmp uge i32 [[TMP112]], [[TMP91]]
1274; CHECK-NEXT:    [[TMP114:%.*]] = add i32 [[TMP110]], 1
1275; CHECK-NEXT:    [[TMP115:%.*]] = select i1 [[TMP113]], i32 [[TMP114]], i32 [[TMP110]]
1276; CHECK-NEXT:    [[TMP116:%.*]] = sub i32 [[TMP112]], [[TMP91]]
1277; CHECK-NEXT:    [[TMP117:%.*]] = select i1 [[TMP113]], i32 [[TMP116]], i32 [[TMP112]]
1278; CHECK-NEXT:    [[TMP118:%.*]] = icmp uge i32 [[TMP117]], [[TMP91]]
1279; CHECK-NEXT:    [[TMP119:%.*]] = add i32 [[TMP115]], 1
1280; CHECK-NEXT:    [[TMP120:%.*]] = select i1 [[TMP118]], i32 [[TMP119]], i32 [[TMP115]]
1281; CHECK-NEXT:    [[TMP121:%.*]] = xor i32 [[TMP120]], [[TMP87]]
1282; CHECK-NEXT:    [[TMP122:%.*]] = sub i32 [[TMP121]], [[TMP87]]
1283; CHECK-NEXT:    [[TMP123:%.*]] = insertelement <4 x i32> [[TMP82]], i32 [[TMP122]], i64 2
1284; CHECK-NEXT:    [[TMP124:%.*]] = extractelement <4 x i32> [[X]], i64 3
1285; CHECK-NEXT:    [[TMP125:%.*]] = extractelement <4 x i32> [[Y]], i64 3
1286; CHECK-NEXT:    [[TMP126:%.*]] = ashr i32 [[TMP124]], 31
1287; CHECK-NEXT:    [[TMP127:%.*]] = ashr i32 [[TMP125]], 31
1288; CHECK-NEXT:    [[TMP128:%.*]] = xor i32 [[TMP126]], [[TMP127]]
1289; CHECK-NEXT:    [[TMP129:%.*]] = add i32 [[TMP124]], [[TMP126]]
1290; CHECK-NEXT:    [[TMP130:%.*]] = add i32 [[TMP125]], [[TMP127]]
1291; CHECK-NEXT:    [[TMP131:%.*]] = xor i32 [[TMP129]], [[TMP126]]
1292; CHECK-NEXT:    [[TMP132:%.*]] = xor i32 [[TMP130]], [[TMP127]]
1293; CHECK-NEXT:    [[TMP133:%.*]] = uitofp i32 [[TMP132]] to float
1294; CHECK-NEXT:    [[TMP134:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP133]])
1295; CHECK-NEXT:    [[TMP135:%.*]] = fmul fast float [[TMP134]], 0x41EFFFFFC0000000
1296; CHECK-NEXT:    [[TMP136:%.*]] = fptoui float [[TMP135]] to i32
1297; CHECK-NEXT:    [[TMP137:%.*]] = sub i32 0, [[TMP132]]
1298; CHECK-NEXT:    [[TMP138:%.*]] = mul i32 [[TMP137]], [[TMP136]]
1299; CHECK-NEXT:    [[TMP139:%.*]] = zext i32 [[TMP136]] to i64
1300; CHECK-NEXT:    [[TMP140:%.*]] = zext i32 [[TMP138]] to i64
1301; CHECK-NEXT:    [[TMP141:%.*]] = mul i64 [[TMP139]], [[TMP140]]
1302; CHECK-NEXT:    [[TMP142:%.*]] = trunc i64 [[TMP141]] to i32
1303; CHECK-NEXT:    [[TMP143:%.*]] = lshr i64 [[TMP141]], 32
1304; CHECK-NEXT:    [[TMP144:%.*]] = trunc i64 [[TMP143]] to i32
1305; CHECK-NEXT:    [[TMP145:%.*]] = add i32 [[TMP136]], [[TMP144]]
1306; CHECK-NEXT:    [[TMP146:%.*]] = zext i32 [[TMP131]] to i64
1307; CHECK-NEXT:    [[TMP147:%.*]] = zext i32 [[TMP145]] to i64
1308; CHECK-NEXT:    [[TMP148:%.*]] = mul i64 [[TMP146]], [[TMP147]]
1309; CHECK-NEXT:    [[TMP149:%.*]] = trunc i64 [[TMP148]] to i32
1310; CHECK-NEXT:    [[TMP150:%.*]] = lshr i64 [[TMP148]], 32
1311; CHECK-NEXT:    [[TMP151:%.*]] = trunc i64 [[TMP150]] to i32
1312; CHECK-NEXT:    [[TMP152:%.*]] = mul i32 [[TMP151]], [[TMP132]]
1313; CHECK-NEXT:    [[TMP153:%.*]] = sub i32 [[TMP131]], [[TMP152]]
1314; CHECK-NEXT:    [[TMP154:%.*]] = icmp uge i32 [[TMP153]], [[TMP132]]
1315; CHECK-NEXT:    [[TMP155:%.*]] = add i32 [[TMP151]], 1
1316; CHECK-NEXT:    [[TMP156:%.*]] = select i1 [[TMP154]], i32 [[TMP155]], i32 [[TMP151]]
1317; CHECK-NEXT:    [[TMP157:%.*]] = sub i32 [[TMP153]], [[TMP132]]
1318; CHECK-NEXT:    [[TMP158:%.*]] = select i1 [[TMP154]], i32 [[TMP157]], i32 [[TMP153]]
1319; CHECK-NEXT:    [[TMP159:%.*]] = icmp uge i32 [[TMP158]], [[TMP132]]
1320; CHECK-NEXT:    [[TMP160:%.*]] = add i32 [[TMP156]], 1
1321; CHECK-NEXT:    [[TMP161:%.*]] = select i1 [[TMP159]], i32 [[TMP160]], i32 [[TMP156]]
1322; CHECK-NEXT:    [[TMP162:%.*]] = xor i32 [[TMP161]], [[TMP128]]
1323; CHECK-NEXT:    [[TMP163:%.*]] = sub i32 [[TMP162]], [[TMP128]]
1324; CHECK-NEXT:    [[TMP164:%.*]] = insertelement <4 x i32> [[TMP123]], i32 [[TMP163]], i64 3
1325; CHECK-NEXT:    store <4 x i32> [[TMP164]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16
1326; CHECK-NEXT:    ret void
1327;
1328; GCN-LABEL: sdiv_v4i32:
1329; GCN:       ; %bb.0:
1330; GCN-NEXT:    s_load_dwordx8 s[8:15], s[0:1], 0xd
1331; GCN-NEXT:    s_mov_b32 s16, 0x4f7ffffe
1332; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1333; GCN-NEXT:    s_mov_b32 s7, 0xf000
1334; GCN-NEXT:    s_mov_b32 s6, -1
1335; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1336; GCN-NEXT:    s_ashr_i32 s2, s12, 31
1337; GCN-NEXT:    s_add_i32 s3, s12, s2
1338; GCN-NEXT:    s_xor_b32 s12, s3, s2
1339; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s12
1340; GCN-NEXT:    s_ashr_i32 s3, s13, 31
1341; GCN-NEXT:    s_add_i32 s0, s13, s3
1342; GCN-NEXT:    s_xor_b32 s13, s0, s3
1343; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1344; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s13
1345; GCN-NEXT:    s_sub_i32 s1, 0, s12
1346; GCN-NEXT:    s_ashr_i32 s0, s8, 31
1347; GCN-NEXT:    v_mul_f32_e32 v0, s16, v0
1348; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
1349; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1350; GCN-NEXT:    s_xor_b32 s2, s0, s2
1351; GCN-NEXT:    v_mul_lo_u32 v2, s1, v0
1352; GCN-NEXT:    s_add_i32 s1, s8, s0
1353; GCN-NEXT:    v_mul_f32_e32 v1, s16, v1
1354; GCN-NEXT:    s_xor_b32 s1, s1, s0
1355; GCN-NEXT:    v_mul_hi_u32 v2, v0, v2
1356; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
1357; GCN-NEXT:    s_sub_i32 s0, 0, s13
1358; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
1359; GCN-NEXT:    v_mul_hi_u32 v0, s1, v0
1360; GCN-NEXT:    v_mul_lo_u32 v2, s0, v1
1361; GCN-NEXT:    v_mul_lo_u32 v3, v0, s12
1362; GCN-NEXT:    v_mul_hi_u32 v2, v1, v2
1363; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
1364; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s1, v3
1365; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v3
1366; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
1367; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s12, v3
1368; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
1369; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
1370; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
1371; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v3
1372; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
1373; GCN-NEXT:    v_xor_b32_e32 v0, s2, v0
1374; GCN-NEXT:    s_ashr_i32 s0, s9, 31
1375; GCN-NEXT:    s_add_i32 s1, s9, s0
1376; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
1377; GCN-NEXT:    s_xor_b32 s2, s0, s3
1378; GCN-NEXT:    s_ashr_i32 s3, s14, 31
1379; GCN-NEXT:    s_xor_b32 s1, s1, s0
1380; GCN-NEXT:    s_add_i32 s0, s14, s3
1381; GCN-NEXT:    s_xor_b32 s9, s0, s3
1382; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s9
1383; GCN-NEXT:    v_mul_hi_u32 v1, s1, v1
1384; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v3
1385; GCN-NEXT:    v_mul_lo_u32 v2, v1, s13
1386; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
1387; GCN-NEXT:    v_mul_f32_e32 v3, s16, v3
1388; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s1, v2
1389; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
1390; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v2
1391; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s[0:1]
1392; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s13, v2
1393; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
1394; GCN-NEXT:    s_sub_i32 s0, 0, s9
1395; GCN-NEXT:    v_mul_lo_u32 v5, s0, v3
1396; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
1397; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s13, v2
1398; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
1399; GCN-NEXT:    v_mul_hi_u32 v2, v3, v5
1400; GCN-NEXT:    v_xor_b32_e32 v1, s2, v1
1401; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s2, v1
1402; GCN-NEXT:    s_ashr_i32 s2, s15, 31
1403; GCN-NEXT:    s_ashr_i32 s0, s10, 31
1404; GCN-NEXT:    s_add_i32 s8, s15, s2
1405; GCN-NEXT:    s_add_i32 s1, s10, s0
1406; GCN-NEXT:    s_xor_b32 s8, s8, s2
1407; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s8
1408; GCN-NEXT:    s_xor_b32 s1, s1, s0
1409; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
1410; GCN-NEXT:    v_mul_hi_u32 v2, s1, v2
1411; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v4
1412; GCN-NEXT:    s_xor_b32 s3, s0, s3
1413; GCN-NEXT:    v_mul_lo_u32 v3, v2, s9
1414; GCN-NEXT:    v_mul_f32_e32 v4, s16, v4
1415; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v4
1416; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v2
1417; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s1, v3
1418; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v3
1419; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
1420; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s9, v3
1421; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
1422; GCN-NEXT:    s_sub_i32 s0, 0, s8
1423; GCN-NEXT:    v_mul_lo_u32 v5, s0, v4
1424; GCN-NEXT:    s_ashr_i32 s0, s11, 31
1425; GCN-NEXT:    s_add_i32 s1, s11, s0
1426; GCN-NEXT:    s_xor_b32 s1, s1, s0
1427; GCN-NEXT:    v_mul_hi_u32 v5, v4, v5
1428; GCN-NEXT:    v_add_i32_e32 v6, vcc, 1, v2
1429; GCN-NEXT:    s_xor_b32 s2, s0, s2
1430; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
1431; GCN-NEXT:    v_mul_hi_u32 v4, s1, v4
1432; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
1433; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
1434; GCN-NEXT:    v_xor_b32_e32 v2, s3, v2
1435; GCN-NEXT:    v_mul_lo_u32 v3, v4, s8
1436; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
1437; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v2
1438; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s1, v3
1439; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v3
1440; GCN-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[0:1]
1441; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s8, v3
1442; GCN-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
1443; GCN-NEXT:    v_add_i32_e32 v5, vcc, 1, v4
1444; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
1445; GCN-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
1446; GCN-NEXT:    v_xor_b32_e32 v3, s2, v3
1447; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s2, v3
1448; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1449; GCN-NEXT:    s_endpgm
1450  %r = sdiv <4 x i32> %x, %y
1451  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
1452  ret void
1453}
1454
; Test: signed remainder of two <4 x i32> vectors (`srem <4 x i32> %x, %y`),
; with the result stored to %out.
;
; CHECK lines cover the IR produced by the `opt -amdgpu-codegenprepare` RUN
; line: the vector srem is replaced by four scalar expansions, one per lane
; (extractelement ... insertelement). Each lane:
;   * derives sign masks with `ashr ..., 31` and forms the magnitudes of both
;     operands via add + xor with the mask;
;   * estimates the reciprocal of the divisor magnitude with uitofp +
;     llvm.amdgcn.rcp.f32, scales it by a float constant and converts back
;     with fptoui;
;   * refines the estimate once using the high 32 bits of a 64-bit multiply;
;   * computes a remainder estimate (dividend magnitude minus quotient
;     estimate times divisor magnitude) and applies two `icmp uge` / select
;     conditional-subtract steps;
;   * applies the sign mask of the dividend lane to the result (xor, then
;     sub); the divisor's sign mask is not used for the final sign.
; GCN lines cover the llc RUN line output for the same kernel.
;
; All assertions below are autogenerated (see the NOTE lines at the top of the
; file); regenerate them with the update scripts instead of editing by hand.
1455define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
1456; CHECK-LABEL: @srem_v4i32(
1457; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
1458; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
1459; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
1460; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
1461; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]]
1462; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]]
1463; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]]
1464; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]]
1465; CHECK-NEXT:    [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float
1466; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
1467; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000
1468; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP11]] to i32
1469; CHECK-NEXT:    [[TMP13:%.*]] = sub i32 0, [[TMP8]]
1470; CHECK-NEXT:    [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]]
1471; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP12]] to i64
1472; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP14]] to i64
1473; CHECK-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]]
1474; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
1475; CHECK-NEXT:    [[TMP19:%.*]] = lshr i64 [[TMP17]], 32
1476; CHECK-NEXT:    [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32
1477; CHECK-NEXT:    [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]]
1478; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP7]] to i64
1479; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP21]] to i64
1480; CHECK-NEXT:    [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]]
1481; CHECK-NEXT:    [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
1482; CHECK-NEXT:    [[TMP26:%.*]] = lshr i64 [[TMP24]], 32
1483; CHECK-NEXT:    [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32
1484; CHECK-NEXT:    [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]]
1485; CHECK-NEXT:    [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]]
1486; CHECK-NEXT:    [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]]
1487; CHECK-NEXT:    [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]]
1488; CHECK-NEXT:    [[TMP32:%.*]] = select i1 [[TMP30]], i32 [[TMP31]], i32 [[TMP29]]
1489; CHECK-NEXT:    [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]]
1490; CHECK-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]]
1491; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]]
1492; CHECK-NEXT:    [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]]
1493; CHECK-NEXT:    [[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]]
1494; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i32> undef, i32 [[TMP37]], i64 0
1495; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <4 x i32> [[X]], i64 1
1496; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <4 x i32> [[Y]], i64 1
1497; CHECK-NEXT:    [[TMP41:%.*]] = ashr i32 [[TMP39]], 31
1498; CHECK-NEXT:    [[TMP42:%.*]] = ashr i32 [[TMP40]], 31
1499; CHECK-NEXT:    [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]]
1500; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]]
1501; CHECK-NEXT:    [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]]
1502; CHECK-NEXT:    [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]]
1503; CHECK-NEXT:    [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float
1504; CHECK-NEXT:    [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]])
1505; CHECK-NEXT:    [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000
1506; CHECK-NEXT:    [[TMP50:%.*]] = fptoui float [[TMP49]] to i32
1507; CHECK-NEXT:    [[TMP51:%.*]] = sub i32 0, [[TMP46]]
1508; CHECK-NEXT:    [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]]
1509; CHECK-NEXT:    [[TMP53:%.*]] = zext i32 [[TMP50]] to i64
1510; CHECK-NEXT:    [[TMP54:%.*]] = zext i32 [[TMP52]] to i64
1511; CHECK-NEXT:    [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]]
1512; CHECK-NEXT:    [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32
1513; CHECK-NEXT:    [[TMP57:%.*]] = lshr i64 [[TMP55]], 32
1514; CHECK-NEXT:    [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32
1515; CHECK-NEXT:    [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]]
1516; CHECK-NEXT:    [[TMP60:%.*]] = zext i32 [[TMP45]] to i64
1517; CHECK-NEXT:    [[TMP61:%.*]] = zext i32 [[TMP59]] to i64
1518; CHECK-NEXT:    [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]]
1519; CHECK-NEXT:    [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32
1520; CHECK-NEXT:    [[TMP64:%.*]] = lshr i64 [[TMP62]], 32
1521; CHECK-NEXT:    [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32
1522; CHECK-NEXT:    [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]]
1523; CHECK-NEXT:    [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]]
1524; CHECK-NEXT:    [[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]]
1525; CHECK-NEXT:    [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]]
1526; CHECK-NEXT:    [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]]
1527; CHECK-NEXT:    [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]]
1528; CHECK-NEXT:    [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]]
1529; CHECK-NEXT:    [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]]
1530; CHECK-NEXT:    [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]]
1531; CHECK-NEXT:    [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]]
1532; CHECK-NEXT:    [[TMP76:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP75]], i64 1
1533; CHECK-NEXT:    [[TMP77:%.*]] = extractelement <4 x i32> [[X]], i64 2
1534; CHECK-NEXT:    [[TMP78:%.*]] = extractelement <4 x i32> [[Y]], i64 2
1535; CHECK-NEXT:    [[TMP79:%.*]] = ashr i32 [[TMP77]], 31
1536; CHECK-NEXT:    [[TMP80:%.*]] = ashr i32 [[TMP78]], 31
1537; CHECK-NEXT:    [[TMP81:%.*]] = add i32 [[TMP77]], [[TMP79]]
1538; CHECK-NEXT:    [[TMP82:%.*]] = add i32 [[TMP78]], [[TMP80]]
1539; CHECK-NEXT:    [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP79]]
1540; CHECK-NEXT:    [[TMP84:%.*]] = xor i32 [[TMP82]], [[TMP80]]
1541; CHECK-NEXT:    [[TMP85:%.*]] = uitofp i32 [[TMP84]] to float
1542; CHECK-NEXT:    [[TMP86:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP85]])
1543; CHECK-NEXT:    [[TMP87:%.*]] = fmul fast float [[TMP86]], 0x41EFFFFFC0000000
1544; CHECK-NEXT:    [[TMP88:%.*]] = fptoui float [[TMP87]] to i32
1545; CHECK-NEXT:    [[TMP89:%.*]] = sub i32 0, [[TMP84]]
1546; CHECK-NEXT:    [[TMP90:%.*]] = mul i32 [[TMP89]], [[TMP88]]
1547; CHECK-NEXT:    [[TMP91:%.*]] = zext i32 [[TMP88]] to i64
1548; CHECK-NEXT:    [[TMP92:%.*]] = zext i32 [[TMP90]] to i64
1549; CHECK-NEXT:    [[TMP93:%.*]] = mul i64 [[TMP91]], [[TMP92]]
1550; CHECK-NEXT:    [[TMP94:%.*]] = trunc i64 [[TMP93]] to i32
1551; CHECK-NEXT:    [[TMP95:%.*]] = lshr i64 [[TMP93]], 32
1552; CHECK-NEXT:    [[TMP96:%.*]] = trunc i64 [[TMP95]] to i32
1553; CHECK-NEXT:    [[TMP97:%.*]] = add i32 [[TMP88]], [[TMP96]]
1554; CHECK-NEXT:    [[TMP98:%.*]] = zext i32 [[TMP83]] to i64
1555; CHECK-NEXT:    [[TMP99:%.*]] = zext i32 [[TMP97]] to i64
1556; CHECK-NEXT:    [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]]
1557; CHECK-NEXT:    [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32
1558; CHECK-NEXT:    [[TMP102:%.*]] = lshr i64 [[TMP100]], 32
1559; CHECK-NEXT:    [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32
1560; CHECK-NEXT:    [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP84]]
1561; CHECK-NEXT:    [[TMP105:%.*]] = sub i32 [[TMP83]], [[TMP104]]
1562; CHECK-NEXT:    [[TMP106:%.*]] = icmp uge i32 [[TMP105]], [[TMP84]]
1563; CHECK-NEXT:    [[TMP107:%.*]] = sub i32 [[TMP105]], [[TMP84]]
1564; CHECK-NEXT:    [[TMP108:%.*]] = select i1 [[TMP106]], i32 [[TMP107]], i32 [[TMP105]]
1565; CHECK-NEXT:    [[TMP109:%.*]] = icmp uge i32 [[TMP108]], [[TMP84]]
1566; CHECK-NEXT:    [[TMP110:%.*]] = sub i32 [[TMP108]], [[TMP84]]
1567; CHECK-NEXT:    [[TMP111:%.*]] = select i1 [[TMP109]], i32 [[TMP110]], i32 [[TMP108]]
1568; CHECK-NEXT:    [[TMP112:%.*]] = xor i32 [[TMP111]], [[TMP79]]
1569; CHECK-NEXT:    [[TMP113:%.*]] = sub i32 [[TMP112]], [[TMP79]]
1570; CHECK-NEXT:    [[TMP114:%.*]] = insertelement <4 x i32> [[TMP76]], i32 [[TMP113]], i64 2
1571; CHECK-NEXT:    [[TMP115:%.*]] = extractelement <4 x i32> [[X]], i64 3
1572; CHECK-NEXT:    [[TMP116:%.*]] = extractelement <4 x i32> [[Y]], i64 3
1573; CHECK-NEXT:    [[TMP117:%.*]] = ashr i32 [[TMP115]], 31
1574; CHECK-NEXT:    [[TMP118:%.*]] = ashr i32 [[TMP116]], 31
1575; CHECK-NEXT:    [[TMP119:%.*]] = add i32 [[TMP115]], [[TMP117]]
1576; CHECK-NEXT:    [[TMP120:%.*]] = add i32 [[TMP116]], [[TMP118]]
1577; CHECK-NEXT:    [[TMP121:%.*]] = xor i32 [[TMP119]], [[TMP117]]
1578; CHECK-NEXT:    [[TMP122:%.*]] = xor i32 [[TMP120]], [[TMP118]]
1579; CHECK-NEXT:    [[TMP123:%.*]] = uitofp i32 [[TMP122]] to float
1580; CHECK-NEXT:    [[TMP124:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP123]])
1581; CHECK-NEXT:    [[TMP125:%.*]] = fmul fast float [[TMP124]], 0x41EFFFFFC0000000
1582; CHECK-NEXT:    [[TMP126:%.*]] = fptoui float [[TMP125]] to i32
1583; CHECK-NEXT:    [[TMP127:%.*]] = sub i32 0, [[TMP122]]
1584; CHECK-NEXT:    [[TMP128:%.*]] = mul i32 [[TMP127]], [[TMP126]]
1585; CHECK-NEXT:    [[TMP129:%.*]] = zext i32 [[TMP126]] to i64
1586; CHECK-NEXT:    [[TMP130:%.*]] = zext i32 [[TMP128]] to i64
1587; CHECK-NEXT:    [[TMP131:%.*]] = mul i64 [[TMP129]], [[TMP130]]
1588; CHECK-NEXT:    [[TMP132:%.*]] = trunc i64 [[TMP131]] to i32
1589; CHECK-NEXT:    [[TMP133:%.*]] = lshr i64 [[TMP131]], 32
1590; CHECK-NEXT:    [[TMP134:%.*]] = trunc i64 [[TMP133]] to i32
1591; CHECK-NEXT:    [[TMP135:%.*]] = add i32 [[TMP126]], [[TMP134]]
1592; CHECK-NEXT:    [[TMP136:%.*]] = zext i32 [[TMP121]] to i64
1593; CHECK-NEXT:    [[TMP137:%.*]] = zext i32 [[TMP135]] to i64
1594; CHECK-NEXT:    [[TMP138:%.*]] = mul i64 [[TMP136]], [[TMP137]]
1595; CHECK-NEXT:    [[TMP139:%.*]] = trunc i64 [[TMP138]] to i32
1596; CHECK-NEXT:    [[TMP140:%.*]] = lshr i64 [[TMP138]], 32
1597; CHECK-NEXT:    [[TMP141:%.*]] = trunc i64 [[TMP140]] to i32
1598; CHECK-NEXT:    [[TMP142:%.*]] = mul i32 [[TMP141]], [[TMP122]]
1599; CHECK-NEXT:    [[TMP143:%.*]] = sub i32 [[TMP121]], [[TMP142]]
1600; CHECK-NEXT:    [[TMP144:%.*]] = icmp uge i32 [[TMP143]], [[TMP122]]
1601; CHECK-NEXT:    [[TMP145:%.*]] = sub i32 [[TMP143]], [[TMP122]]
1602; CHECK-NEXT:    [[TMP146:%.*]] = select i1 [[TMP144]], i32 [[TMP145]], i32 [[TMP143]]
1603; CHECK-NEXT:    [[TMP147:%.*]] = icmp uge i32 [[TMP146]], [[TMP122]]
1604; CHECK-NEXT:    [[TMP148:%.*]] = sub i32 [[TMP146]], [[TMP122]]
1605; CHECK-NEXT:    [[TMP149:%.*]] = select i1 [[TMP147]], i32 [[TMP148]], i32 [[TMP146]]
1606; CHECK-NEXT:    [[TMP150:%.*]] = xor i32 [[TMP149]], [[TMP117]]
1607; CHECK-NEXT:    [[TMP151:%.*]] = sub i32 [[TMP150]], [[TMP117]]
1608; CHECK-NEXT:    [[TMP152:%.*]] = insertelement <4 x i32> [[TMP114]], i32 [[TMP151]], i64 3
1609; CHECK-NEXT:    store <4 x i32> [[TMP152]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16
1610; CHECK-NEXT:    ret void
1611;
1612; GCN-LABEL: srem_v4i32:
1613; GCN:       ; %bb.0:
1614; GCN-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0xd
1615; GCN-NEXT:    s_mov_b32 s13, 0x4f7ffffe
1616; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1617; GCN-NEXT:    s_mov_b32 s3, 0xf000
1618; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1619; GCN-NEXT:    s_ashr_i32 s2, s8, 31
1620; GCN-NEXT:    s_add_i32 s8, s8, s2
1621; GCN-NEXT:    s_xor_b32 s12, s8, s2
1622; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s12
1623; GCN-NEXT:    s_ashr_i32 s8, s9, 31
1624; GCN-NEXT:    s_add_i32 s9, s9, s8
1625; GCN-NEXT:    s_xor_b32 s14, s9, s8
1626; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
1627; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s14
1628; GCN-NEXT:    s_sub_i32 s9, 0, s12
1629; GCN-NEXT:    s_ashr_i32 s8, s4, 31
1630; GCN-NEXT:    v_mul_f32_e32 v0, s13, v0
1631; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
1632; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
1633; GCN-NEXT:    s_add_i32 s4, s4, s8
1634; GCN-NEXT:    s_xor_b32 s4, s4, s8
1635; GCN-NEXT:    v_mul_lo_u32 v2, s9, v0
1636; GCN-NEXT:    v_mul_f32_e32 v1, s13, v1
1637; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
1638; GCN-NEXT:    s_sub_i32 s9, 0, s14
1639; GCN-NEXT:    v_mul_hi_u32 v2, v0, v2
1640; GCN-NEXT:    s_mov_b32 s2, -1
1641; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
1642; GCN-NEXT:    v_mul_hi_u32 v0, s4, v0
1643; GCN-NEXT:    v_mul_lo_u32 v2, s9, v1
1644; GCN-NEXT:    s_ashr_i32 s9, s5, 31
1645; GCN-NEXT:    s_add_i32 s5, s5, s9
1646; GCN-NEXT:    v_mul_lo_u32 v0, v0, s12
1647; GCN-NEXT:    v_mul_hi_u32 v2, v1, v2
1648; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
1649; GCN-NEXT:    s_xor_b32 s4, s5, s9
1650; GCN-NEXT:    s_ashr_i32 s5, s10, 31
1651; GCN-NEXT:    s_add_i32 s10, s10, s5
1652; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s12, v0
1653; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
1654; GCN-NEXT:    s_xor_b32 s10, s10, s5
1655; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
1656; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
1657; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s10
1658; GCN-NEXT:    v_mul_hi_u32 v1, s4, v1
1659; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s12, v0
1660; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
1661; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v2
1662; GCN-NEXT:    v_mul_lo_u32 v1, v1, s14
1663; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
1664; GCN-NEXT:    v_xor_b32_e32 v0, s8, v0
1665; GCN-NEXT:    v_mul_f32_e32 v2, s13, v2
1666; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
1667; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s4, v1
1668; GCN-NEXT:    s_sub_i32 s4, 0, s10
1669; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
1670; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s14, v1
1671; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s14, v1
1672; GCN-NEXT:    v_mul_lo_u32 v4, s4, v2
1673; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
1674; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s14, v1
1675; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s14, v1
1676; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
1677; GCN-NEXT:    v_mul_hi_u32 v3, v2, v4
1678; GCN-NEXT:    s_ashr_i32 s4, s6, 31
1679; GCN-NEXT:    s_add_i32 s5, s6, s4
1680; GCN-NEXT:    s_ashr_i32 s6, s11, 31
1681; GCN-NEXT:    s_add_i32 s8, s11, s6
1682; GCN-NEXT:    s_xor_b32 s8, s8, s6
1683; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
1684; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s8
1685; GCN-NEXT:    s_xor_b32 s5, s5, s4
1686; GCN-NEXT:    v_mul_hi_u32 v2, s5, v2
1687; GCN-NEXT:    v_xor_b32_e32 v1, s9, v1
1688; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v3
1689; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s9, v1
1690; GCN-NEXT:    v_mul_lo_u32 v2, v2, s10
1691; GCN-NEXT:    v_mul_f32_e32 v3, s13, v3
1692; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
1693; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s5, v2
1694; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s10, v2
1695; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
1696; GCN-NEXT:    s_sub_i32 s5, 0, s8
1697; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
1698; GCN-NEXT:    v_mul_lo_u32 v4, s5, v3
1699; GCN-NEXT:    s_ashr_i32 s5, s7, 31
1700; GCN-NEXT:    s_add_i32 s6, s7, s5
1701; GCN-NEXT:    s_xor_b32 s6, s6, s5
1702; GCN-NEXT:    v_mul_hi_u32 v4, v3, v4
1703; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, s10, v2
1704; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
1705; GCN-NEXT:    v_mul_hi_u32 v3, s6, v3
1706; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
1707; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
1708; GCN-NEXT:    v_xor_b32_e32 v2, s4, v2
1709; GCN-NEXT:    v_mul_lo_u32 v3, v3, s8
1710; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s4, v2
1711; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s6, v3
1712; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s8, v3
1713; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
1714; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
1715; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s8, v3
1716; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
1717; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
1718; GCN-NEXT:    v_xor_b32_e32 v3, s5, v3
1719; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s5, v3
1720; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1721; GCN-NEXT:    s_endpgm
1722  %r = srem <4 x i32> %x, %y
1723  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
1724  ret void
1725}
1726
; Unsigned division of <4 x i16> vectors, with the quotient stored to %out.
; The IR assertions (opt run) expect the vector udiv to be scalarized: each of
; the four lanes is extracted, zero-extended to i32 and converted to float; the
; quotient is estimated as trunc(x * rcp(y)), bumped by 1 when the fmad-computed
; residual |x - q*y| is >= |y|, masked to 16 bits, truncated to i16 and inserted
; back into the result vector.
; The ISA assertions (llc run) expect the same per-lane float sequence, with
; the four 16-bit results packed into two dwords and written by one store.
1727define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
1728; CHECK-LABEL: @udiv_v4i16(
1729; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
1730; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
1731; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
1732; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
1733; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
1734; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
1735; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
1736; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
1737; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
1738; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
1739; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
1740; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
1741; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
1742; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
1743; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
1744; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
1745; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
1746; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 65535
1747; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
1748; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i16> undef, i16 [[TMP19]], i64 0
1749; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i16> [[X]], i64 1
1750; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i16> [[Y]], i64 1
1751; CHECK-NEXT:    [[TMP23:%.*]] = zext i16 [[TMP21]] to i32
1752; CHECK-NEXT:    [[TMP24:%.*]] = zext i16 [[TMP22]] to i32
1753; CHECK-NEXT:    [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float
1754; CHECK-NEXT:    [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float
1755; CHECK-NEXT:    [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]])
1756; CHECK-NEXT:    [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]]
1757; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]])
1758; CHECK-NEXT:    [[TMP30:%.*]] = fneg fast float [[TMP29]]
1759; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]])
1760; CHECK-NEXT:    [[TMP32:%.*]] = fptoui float [[TMP29]] to i32
1761; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]])
1762; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]])
1763; CHECK-NEXT:    [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]]
1764; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0
1765; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]]
1766; CHECK-NEXT:    [[TMP38:%.*]] = and i32 [[TMP37]], 65535
1767; CHECK-NEXT:    [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16
1768; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <4 x i16> [[TMP20]], i16 [[TMP39]], i64 1
1769; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i16> [[X]], i64 2
1770; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <4 x i16> [[Y]], i64 2
1771; CHECK-NEXT:    [[TMP43:%.*]] = zext i16 [[TMP41]] to i32
1772; CHECK-NEXT:    [[TMP44:%.*]] = zext i16 [[TMP42]] to i32
1773; CHECK-NEXT:    [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float
1774; CHECK-NEXT:    [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float
1775; CHECK-NEXT:    [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]])
1776; CHECK-NEXT:    [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]]
1777; CHECK-NEXT:    [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]])
1778; CHECK-NEXT:    [[TMP50:%.*]] = fneg fast float [[TMP49]]
1779; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]])
1780; CHECK-NEXT:    [[TMP52:%.*]] = fptoui float [[TMP49]] to i32
1781; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]])
1782; CHECK-NEXT:    [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]])
1783; CHECK-NEXT:    [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]]
1784; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0
1785; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]]
1786; CHECK-NEXT:    [[TMP58:%.*]] = and i32 [[TMP57]], 65535
1787; CHECK-NEXT:    [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16
1788; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <4 x i16> [[TMP40]], i16 [[TMP59]], i64 2
1789; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <4 x i16> [[X]], i64 3
1790; CHECK-NEXT:    [[TMP62:%.*]] = extractelement <4 x i16> [[Y]], i64 3
1791; CHECK-NEXT:    [[TMP63:%.*]] = zext i16 [[TMP61]] to i32
1792; CHECK-NEXT:    [[TMP64:%.*]] = zext i16 [[TMP62]] to i32
1793; CHECK-NEXT:    [[TMP65:%.*]] = uitofp i32 [[TMP63]] to float
1794; CHECK-NEXT:    [[TMP66:%.*]] = uitofp i32 [[TMP64]] to float
1795; CHECK-NEXT:    [[TMP67:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP66]])
1796; CHECK-NEXT:    [[TMP68:%.*]] = fmul fast float [[TMP65]], [[TMP67]]
1797; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.trunc.f32(float [[TMP68]])
1798; CHECK-NEXT:    [[TMP70:%.*]] = fneg fast float [[TMP69]]
1799; CHECK-NEXT:    [[TMP71:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP70]], float [[TMP66]], float [[TMP65]])
1800; CHECK-NEXT:    [[TMP72:%.*]] = fptoui float [[TMP69]] to i32
1801; CHECK-NEXT:    [[TMP73:%.*]] = call fast float @llvm.fabs.f32(float [[TMP71]])
1802; CHECK-NEXT:    [[TMP74:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
1803; CHECK-NEXT:    [[TMP75:%.*]] = fcmp fast oge float [[TMP73]], [[TMP74]]
1804; CHECK-NEXT:    [[TMP76:%.*]] = select i1 [[TMP75]], i32 1, i32 0
1805; CHECK-NEXT:    [[TMP77:%.*]] = add i32 [[TMP72]], [[TMP76]]
1806; CHECK-NEXT:    [[TMP78:%.*]] = and i32 [[TMP77]], 65535
1807; CHECK-NEXT:    [[TMP79:%.*]] = trunc i32 [[TMP78]] to i16
1808; CHECK-NEXT:    [[TMP80:%.*]] = insertelement <4 x i16> [[TMP60]], i16 [[TMP79]], i64 3
1809; CHECK-NEXT:    store <4 x i16> [[TMP80]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8
1810; CHECK-NEXT:    ret void
1811;
1812; GCN-LABEL: udiv_v4i16:
1813; GCN:       ; %bb.0:
1814; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1815; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
1816; GCN-NEXT:    s_mov_b32 s8, 0xffff
1817; GCN-NEXT:    s_mov_b32 s7, 0xf000
1818; GCN-NEXT:    s_mov_b32 s6, -1
1819; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1820; GCN-NEXT:    s_and_b32 s9, s2, s8
1821; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s9
1822; GCN-NEXT:    s_lshr_b32 s9, s0, 16
1823; GCN-NEXT:    s_and_b32 s0, s0, s8
1824; GCN-NEXT:    s_lshr_b32 s2, s2, 16
1825; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s2
1826; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s0
1827; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
1828; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s9
1829; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v3
1830; GCN-NEXT:    s_and_b32 s2, s3, s8
1831; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
1832; GCN-NEXT:    v_trunc_f32_e32 v2, v2
1833; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
1834; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
1835; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1836; GCN-NEXT:    v_mul_f32_e32 v1, v4, v5
1837; GCN-NEXT:    v_trunc_f32_e32 v1, v1
1838; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
1839; GCN-NEXT:    v_mad_f32 v2, -v1, v3, v4
1840; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s2
1841; GCN-NEXT:    s_lshr_b32 s0, s1, 16
1842; GCN-NEXT:    s_and_b32 s1, s1, s8
1843; GCN-NEXT:    s_lshr_b32 s10, s3, 16
1844; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
1845; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
1846; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s10
1847; GCN-NEXT:    v_cvt_f32_u32_e32 v5, s1
1848; GCN-NEXT:    v_rcp_iflag_f32_e32 v6, v4
1849; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v1, vcc
1850; GCN-NEXT:    v_rcp_iflag_f32_e32 v7, v3
1851; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
1852; GCN-NEXT:    v_mul_f32_e32 v1, v5, v6
1853; GCN-NEXT:    v_cvt_f32_u32_e32 v6, s0
1854; GCN-NEXT:    v_trunc_f32_e32 v1, v1
1855; GCN-NEXT:    v_mad_f32 v5, -v1, v4, v5
1856; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, v4
1857; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
1858; GCN-NEXT:    v_mul_f32_e32 v4, v6, v7
1859; GCN-NEXT:    v_trunc_f32_e32 v4, v4
1860; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v4
1861; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1862; GCN-NEXT:    v_mad_f32 v4, -v4, v3, v6
1863; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v3
1864; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
1865; GCN-NEXT:    v_and_b32_e32 v0, s8, v0
1866; GCN-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1867; GCN-NEXT:    v_and_b32_e32 v1, s8, v1
1868; GCN-NEXT:    v_or_b32_e32 v1, v1, v3
1869; GCN-NEXT:    v_or_b32_e32 v0, v0, v2
1870; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1871; GCN-NEXT:    s_endpgm
1872  %r = udiv <4 x i16> %x, %y
1873  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
1874  ret void
1875}
1876
; Unsigned remainder of <4 x i16> vectors, with the result stored to %out.
; The IR assertions (opt run) expect the vector urem to be scalarized: each of
; the four lanes is extracted, zero-extended to i32 and converted to float; the
; quotient is estimated as trunc(x * rcp(y)) and bumped by 1 when the
; fmad-computed residual |x - q*y| is >= |y|. The remainder is then formed in
; integer arithmetic as x - q*y, masked to 16 bits, truncated to i16 and
; inserted back into the result vector.
; The ISA assertions (llc run) expect the same per-lane sequence, with the four
; 16-bit results packed into two dwords and written by one store.
1877define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
1878; CHECK-LABEL: @urem_v4i16(
1879; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
1880; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
1881; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
1882; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
1883; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
1884; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
1885; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
1886; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
1887; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
1888; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
1889; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
1890; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
1891; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
1892; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
1893; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
1894; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
1895; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
1896; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]]
1897; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]]
1898; CHECK-NEXT:    [[TMP20:%.*]] = and i32 [[TMP19]], 65535
1899; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
1900; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i16> undef, i16 [[TMP21]], i64 0
1901; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i16> [[X]], i64 1
1902; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i16> [[Y]], i64 1
1903; CHECK-NEXT:    [[TMP25:%.*]] = zext i16 [[TMP23]] to i32
1904; CHECK-NEXT:    [[TMP26:%.*]] = zext i16 [[TMP24]] to i32
1905; CHECK-NEXT:    [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float
1906; CHECK-NEXT:    [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float
1907; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]])
1908; CHECK-NEXT:    [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]]
1909; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]])
1910; CHECK-NEXT:    [[TMP32:%.*]] = fneg fast float [[TMP31]]
1911; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]])
1912; CHECK-NEXT:    [[TMP34:%.*]] = fptoui float [[TMP31]] to i32
1913; CHECK-NEXT:    [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
1914; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]])
1915; CHECK-NEXT:    [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]]
1916; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0
1917; CHECK-NEXT:    [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]]
1918; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]]
1919; CHECK-NEXT:    [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]]
1920; CHECK-NEXT:    [[TMP42:%.*]] = and i32 [[TMP41]], 65535
1921; CHECK-NEXT:    [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16
1922; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i16> [[TMP22]], i16 [[TMP43]], i64 1
1923; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <4 x i16> [[X]], i64 2
1924; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <4 x i16> [[Y]], i64 2
1925; CHECK-NEXT:    [[TMP47:%.*]] = zext i16 [[TMP45]] to i32
1926; CHECK-NEXT:    [[TMP48:%.*]] = zext i16 [[TMP46]] to i32
1927; CHECK-NEXT:    [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float
1928; CHECK-NEXT:    [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float
1929; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]])
1930; CHECK-NEXT:    [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]]
1931; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]])
1932; CHECK-NEXT:    [[TMP54:%.*]] = fneg fast float [[TMP53]]
1933; CHECK-NEXT:    [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]])
1934; CHECK-NEXT:    [[TMP56:%.*]] = fptoui float [[TMP53]] to i32
1935; CHECK-NEXT:    [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]])
1936; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]])
1937; CHECK-NEXT:    [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]]
1938; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0
1939; CHECK-NEXT:    [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]]
1940; CHECK-NEXT:    [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]]
1941; CHECK-NEXT:    [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]]
1942; CHECK-NEXT:    [[TMP64:%.*]] = and i32 [[TMP63]], 65535
1943; CHECK-NEXT:    [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16
1944; CHECK-NEXT:    [[TMP66:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[TMP65]], i64 2
1945; CHECK-NEXT:    [[TMP67:%.*]] = extractelement <4 x i16> [[X]], i64 3
1946; CHECK-NEXT:    [[TMP68:%.*]] = extractelement <4 x i16> [[Y]], i64 3
1947; CHECK-NEXT:    [[TMP69:%.*]] = zext i16 [[TMP67]] to i32
1948; CHECK-NEXT:    [[TMP70:%.*]] = zext i16 [[TMP68]] to i32
1949; CHECK-NEXT:    [[TMP71:%.*]] = uitofp i32 [[TMP69]] to float
1950; CHECK-NEXT:    [[TMP72:%.*]] = uitofp i32 [[TMP70]] to float
1951; CHECK-NEXT:    [[TMP73:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP72]])
1952; CHECK-NEXT:    [[TMP74:%.*]] = fmul fast float [[TMP71]], [[TMP73]]
1953; CHECK-NEXT:    [[TMP75:%.*]] = call fast float @llvm.trunc.f32(float [[TMP74]])
1954; CHECK-NEXT:    [[TMP76:%.*]] = fneg fast float [[TMP75]]
1955; CHECK-NEXT:    [[TMP77:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP76]], float [[TMP72]], float [[TMP71]])
1956; CHECK-NEXT:    [[TMP78:%.*]] = fptoui float [[TMP75]] to i32
1957; CHECK-NEXT:    [[TMP79:%.*]] = call fast float @llvm.fabs.f32(float [[TMP77]])
1958; CHECK-NEXT:    [[TMP80:%.*]] = call fast float @llvm.fabs.f32(float [[TMP72]])
1959; CHECK-NEXT:    [[TMP81:%.*]] = fcmp fast oge float [[TMP79]], [[TMP80]]
1960; CHECK-NEXT:    [[TMP82:%.*]] = select i1 [[TMP81]], i32 1, i32 0
1961; CHECK-NEXT:    [[TMP83:%.*]] = add i32 [[TMP78]], [[TMP82]]
1962; CHECK-NEXT:    [[TMP84:%.*]] = mul i32 [[TMP83]], [[TMP70]]
1963; CHECK-NEXT:    [[TMP85:%.*]] = sub i32 [[TMP69]], [[TMP84]]
1964; CHECK-NEXT:    [[TMP86:%.*]] = and i32 [[TMP85]], 65535
1965; CHECK-NEXT:    [[TMP87:%.*]] = trunc i32 [[TMP86]] to i16
1966; CHECK-NEXT:    [[TMP88:%.*]] = insertelement <4 x i16> [[TMP66]], i16 [[TMP87]], i64 3
1967; CHECK-NEXT:    store <4 x i16> [[TMP88]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8
1968; CHECK-NEXT:    ret void
1969;
1970; GCN-LABEL: urem_v4i16:
1971; GCN:       ; %bb.0:
1972; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1973; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
1974; GCN-NEXT:    s_mov_b32 s8, 0xffff
1975; GCN-NEXT:    s_mov_b32 s7, 0xf000
1976; GCN-NEXT:    s_mov_b32 s6, -1
1977; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1978; GCN-NEXT:    s_and_b32 s9, s2, s8
1979; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s9
1980; GCN-NEXT:    s_and_b32 s10, s0, s8
1981; GCN-NEXT:    s_lshr_b32 s11, s2, 16
1982; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s10
1983; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
1984; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s11
1985; GCN-NEXT:    s_lshr_b32 s9, s0, 16
1986; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s9
1987; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
1988; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v3
1989; GCN-NEXT:    v_trunc_f32_e32 v2, v2
1990; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
1991; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
1992; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
1993; GCN-NEXT:    v_mul_f32_e32 v1, v4, v5
1994; GCN-NEXT:    v_trunc_f32_e32 v1, v1
1995; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
1996; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v1
1997; GCN-NEXT:    v_mad_f32 v1, -v1, v3, v4
1998; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v3
1999; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
2000; GCN-NEXT:    s_and_b32 s2, s3, s8
2001; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v2, vcc
2002; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s2
2003; GCN-NEXT:    s_and_b32 s2, s1, s8
2004; GCN-NEXT:    v_mul_lo_u32 v1, v1, s11
2005; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s2
2006; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
2007; GCN-NEXT:    s_lshr_b32 s12, s3, 16
2008; GCN-NEXT:    v_sub_i32_e32 v5, vcc, s9, v1
2009; GCN-NEXT:    s_lshr_b32 s10, s1, 16
2010; GCN-NEXT:    v_mul_f32_e32 v1, v3, v4
2011; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s12
2012; GCN-NEXT:    v_cvt_f32_u32_e32 v6, s10
2013; GCN-NEXT:    v_trunc_f32_e32 v1, v1
2014; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
2015; GCN-NEXT:    v_rcp_iflag_f32_e32 v7, v4
2016; GCN-NEXT:    v_mad_f32 v3, -v1, v2, v3
2017; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
2018; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
2019; GCN-NEXT:    v_mul_f32_e32 v2, v6, v7
2020; GCN-NEXT:    v_trunc_f32_e32 v2, v2
2021; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v2
2022; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2023; GCN-NEXT:    v_mad_f32 v2, -v2, v4, v6
2024; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
2025; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
2026; GCN-NEXT:    v_mul_lo_u32 v1, v1, s3
2027; GCN-NEXT:    v_mul_lo_u32 v2, v2, s12
2028; GCN-NEXT:    v_and_b32_e32 v0, s8, v0
2029; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
2030; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s10, v2
2031; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2032; GCN-NEXT:    v_and_b32_e32 v1, s8, v1
2033; GCN-NEXT:    v_or_b32_e32 v1, v1, v2
2034; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
2035; GCN-NEXT:    v_or_b32_e32 v0, v0, v2
2036; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2037; GCN-NEXT:    s_endpgm
2038  %r = urem <4 x i16> %x, %y
2039  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
2040  ret void
2041}
2042
; Signed division of <4 x i16> vectors, with the quotient stored to %out.
; The IR assertions (opt run) expect the vector sdiv to be scalarized: each of
; the four lanes is extracted, sign-extended to i32 and converted to float with
; sitofp. A correction term is built as ((x ^ y) ashr 30) | 1. The quotient is
; estimated as trunc(x * rcp(y)), converted with fptosi, and the correction
; term is added when the fmad-computed residual |x - q*y| is >= |y|. The result
; is sign-extended in-register from 16 bits (shl 16 / ashr 16), truncated to
; i16 and inserted back into the result vector.
; The ISA assertions (llc run) expect the same per-lane sequence, with the four
; 16-bit results packed into two dwords and written by one store.
2043define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
2044; CHECK-LABEL: @sdiv_v4i16(
2045; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
2046; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
2047; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
2048; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
2049; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
2050; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
2051; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
2052; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
2053; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
2054; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
2055; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
2056; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
2057; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
2058; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
2059; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
2060; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
2061; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
2062; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
2063; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
2064; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
2065; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 16
2066; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
2067; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
2068; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <4 x i16> undef, i16 [[TMP23]], i64 0
2069; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i16> [[X]], i64 1
2070; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i16> [[Y]], i64 1
2071; CHECK-NEXT:    [[TMP27:%.*]] = sext i16 [[TMP25]] to i32
2072; CHECK-NEXT:    [[TMP28:%.*]] = sext i16 [[TMP26]] to i32
2073; CHECK-NEXT:    [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]]
2074; CHECK-NEXT:    [[TMP30:%.*]] = ashr i32 [[TMP29]], 30
2075; CHECK-NEXT:    [[TMP31:%.*]] = or i32 [[TMP30]], 1
2076; CHECK-NEXT:    [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float
2077; CHECK-NEXT:    [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float
2078; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
2079; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]]
2080; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]])
2081; CHECK-NEXT:    [[TMP37:%.*]] = fneg fast float [[TMP36]]
2082; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]])
2083; CHECK-NEXT:    [[TMP39:%.*]] = fptosi float [[TMP36]] to i32
2084; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]])
2085; CHECK-NEXT:    [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
2086; CHECK-NEXT:    [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]]
2087; CHECK-NEXT:    [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0
2088; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]]
2089; CHECK-NEXT:    [[TMP45:%.*]] = shl i32 [[TMP44]], 16
2090; CHECK-NEXT:    [[TMP46:%.*]] = ashr i32 [[TMP45]], 16
2091; CHECK-NEXT:    [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16
2092; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <4 x i16> [[TMP24]], i16 [[TMP47]], i64 1
2093; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <4 x i16> [[X]], i64 2
2094; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <4 x i16> [[Y]], i64 2
2095; CHECK-NEXT:    [[TMP51:%.*]] = sext i16 [[TMP49]] to i32
2096; CHECK-NEXT:    [[TMP52:%.*]] = sext i16 [[TMP50]] to i32
2097; CHECK-NEXT:    [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]]
2098; CHECK-NEXT:    [[TMP54:%.*]] = ashr i32 [[TMP53]], 30
2099; CHECK-NEXT:    [[TMP55:%.*]] = or i32 [[TMP54]], 1
2100; CHECK-NEXT:    [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float
2101; CHECK-NEXT:    [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float
2102; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]])
2103; CHECK-NEXT:    [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]]
2104; CHECK-NEXT:    [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]])
2105; CHECK-NEXT:    [[TMP61:%.*]] = fneg fast float [[TMP60]]
2106; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]])
2107; CHECK-NEXT:    [[TMP63:%.*]] = fptosi float [[TMP60]] to i32
2108; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]])
2109; CHECK-NEXT:    [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]])
2110; CHECK-NEXT:    [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]]
2111; CHECK-NEXT:    [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0
2112; CHECK-NEXT:    [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]]
2113; CHECK-NEXT:    [[TMP69:%.*]] = shl i32 [[TMP68]], 16
2114; CHECK-NEXT:    [[TMP70:%.*]] = ashr i32 [[TMP69]], 16
2115; CHECK-NEXT:    [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16
2116; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <4 x i16> [[TMP48]], i16 [[TMP71]], i64 2
2117; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <4 x i16> [[X]], i64 3
2118; CHECK-NEXT:    [[TMP74:%.*]] = extractelement <4 x i16> [[Y]], i64 3
2119; CHECK-NEXT:    [[TMP75:%.*]] = sext i16 [[TMP73]] to i32
2120; CHECK-NEXT:    [[TMP76:%.*]] = sext i16 [[TMP74]] to i32
2121; CHECK-NEXT:    [[TMP77:%.*]] = xor i32 [[TMP75]], [[TMP76]]
2122; CHECK-NEXT:    [[TMP78:%.*]] = ashr i32 [[TMP77]], 30
2123; CHECK-NEXT:    [[TMP79:%.*]] = or i32 [[TMP78]], 1
2124; CHECK-NEXT:    [[TMP80:%.*]] = sitofp i32 [[TMP75]] to float
2125; CHECK-NEXT:    [[TMP81:%.*]] = sitofp i32 [[TMP76]] to float
2126; CHECK-NEXT:    [[TMP82:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP81]])
2127; CHECK-NEXT:    [[TMP83:%.*]] = fmul fast float [[TMP80]], [[TMP82]]
2128; CHECK-NEXT:    [[TMP84:%.*]] = call fast float @llvm.trunc.f32(float [[TMP83]])
2129; CHECK-NEXT:    [[TMP85:%.*]] = fneg fast float [[TMP84]]
2130; CHECK-NEXT:    [[TMP86:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP85]], float [[TMP81]], float [[TMP80]])
2131; CHECK-NEXT:    [[TMP87:%.*]] = fptosi float [[TMP84]] to i32
2132; CHECK-NEXT:    [[TMP88:%.*]] = call fast float @llvm.fabs.f32(float [[TMP86]])
2133; CHECK-NEXT:    [[TMP89:%.*]] = call fast float @llvm.fabs.f32(float [[TMP81]])
2134; CHECK-NEXT:    [[TMP90:%.*]] = fcmp fast oge float [[TMP88]], [[TMP89]]
2135; CHECK-NEXT:    [[TMP91:%.*]] = select i1 [[TMP90]], i32 [[TMP79]], i32 0
2136; CHECK-NEXT:    [[TMP92:%.*]] = add i32 [[TMP87]], [[TMP91]]
2137; CHECK-NEXT:    [[TMP93:%.*]] = shl i32 [[TMP92]], 16
2138; CHECK-NEXT:    [[TMP94:%.*]] = ashr i32 [[TMP93]], 16
2139; CHECK-NEXT:    [[TMP95:%.*]] = trunc i32 [[TMP94]] to i16
2140; CHECK-NEXT:    [[TMP96:%.*]] = insertelement <4 x i16> [[TMP72]], i16 [[TMP95]], i64 3
2141; CHECK-NEXT:    store <4 x i16> [[TMP96]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8
2142; CHECK-NEXT:    ret void
2143;
2144; GCN-LABEL: sdiv_v4i16:
2145; GCN:       ; %bb.0:
2146; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2147; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
2148; GCN-NEXT:    s_mov_b32 s7, 0xf000
2149; GCN-NEXT:    s_mov_b32 s6, -1
2150; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2151; GCN-NEXT:    s_sext_i32_i16 s8, s2
2152; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s8
2153; GCN-NEXT:    s_sext_i32_i16 s9, s0
2154; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s9
2155; GCN-NEXT:    s_xor_b32 s8, s9, s8
2156; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
2157; GCN-NEXT:    s_ashr_i32 s2, s2, 16
2158; GCN-NEXT:    s_ashr_i32 s8, s8, 30
2159; GCN-NEXT:    s_or_b32 s8, s8, 1
2160; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
2161; GCN-NEXT:    v_trunc_f32_e32 v2, v2
2162; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
2163; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
2164; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
2165; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s2
2166; GCN-NEXT:    v_mov_b32_e32 v3, s8
2167; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
2168; GCN-NEXT:    s_ashr_i32 s0, s0, 16
2169; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
2170; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s0
2171; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v1
2172; GCN-NEXT:    s_xor_b32 s0, s0, s2
2173; GCN-NEXT:    s_ashr_i32 s0, s0, 30
2174; GCN-NEXT:    s_or_b32 s0, s0, 1
2175; GCN-NEXT:    v_mul_f32_e32 v3, v2, v3
2176; GCN-NEXT:    v_trunc_f32_e32 v3, v3
2177; GCN-NEXT:    v_mad_f32 v2, -v3, v1, v2
2178; GCN-NEXT:    v_mov_b32_e32 v4, s0
2179; GCN-NEXT:    s_sext_i32_i16 s0, s3
2180; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
2181; GCN-NEXT:    v_cvt_i32_f32_e32 v3, v3
2182; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s0
2183; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
2184; GCN-NEXT:    s_sext_i32_i16 s2, s1
2185; GCN-NEXT:    v_add_i32_e32 v3, vcc, v1, v3
2186; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s2
2187; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
2188; GCN-NEXT:    s_xor_b32 s0, s2, s0
2189; GCN-NEXT:    s_ashr_i32 s0, s0, 30
2190; GCN-NEXT:    s_or_b32 s0, s0, 1
2191; GCN-NEXT:    v_mul_f32_e32 v4, v1, v4
2192; GCN-NEXT:    v_trunc_f32_e32 v4, v4
2193; GCN-NEXT:    v_mad_f32 v1, -v4, v2, v1
2194; GCN-NEXT:    v_mov_b32_e32 v5, s0
2195; GCN-NEXT:    s_ashr_i32 s0, s3, 16
2196; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v2|
2197; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v4
2198; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s0
2199; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
2200; GCN-NEXT:    s_ashr_i32 s1, s1, 16
2201; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
2202; GCN-NEXT:    v_cvt_f32_i32_e32 v4, s1
2203; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v2
2204; GCN-NEXT:    s_xor_b32 s0, s1, s0
2205; GCN-NEXT:    s_ashr_i32 s0, s0, 30
2206; GCN-NEXT:    s_or_b32 s0, s0, 1
2207; GCN-NEXT:    v_mul_f32_e32 v5, v4, v5
2208; GCN-NEXT:    v_trunc_f32_e32 v5, v5
2209; GCN-NEXT:    v_mad_f32 v4, -v5, v2, v4
2210; GCN-NEXT:    v_cvt_i32_f32_e32 v5, v5
2211; GCN-NEXT:    v_mov_b32_e32 v6, s0
2212; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v2|
2213; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v6, vcc
2214; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
2215; GCN-NEXT:    s_mov_b32 s0, 0xffff
2216; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2217; GCN-NEXT:    v_and_b32_e32 v1, s0, v1
2218; GCN-NEXT:    v_or_b32_e32 v1, v1, v2
2219; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
2220; GCN-NEXT:    v_and_b32_e32 v0, s0, v0
2221; GCN-NEXT:    v_or_b32_e32 v0, v0, v2
2222; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2223; GCN-NEXT:    s_endpgm
2224  %r = sdiv <4 x i16> %x, %y
2225  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
2226  ret void
2227}
2228
2229define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
2230; CHECK-LABEL: @srem_v4i16(
2231; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
2232; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
2233; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
2234; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
2235; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
2236; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
2237; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
2238; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
2239; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
2240; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
2241; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
2242; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
2243; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
2244; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
2245; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
2246; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
2247; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
2248; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
2249; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
2250; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
2251; CHECK-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]]
2252; CHECK-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]]
2253; CHECK-NEXT:    [[TMP23:%.*]] = shl i32 [[TMP22]], 16
2254; CHECK-NEXT:    [[TMP24:%.*]] = ashr i32 [[TMP23]], 16
2255; CHECK-NEXT:    [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16
2256; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <4 x i16> undef, i16 [[TMP25]], i64 0
2257; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i16> [[X]], i64 1
2258; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i16> [[Y]], i64 1
2259; CHECK-NEXT:    [[TMP29:%.*]] = sext i16 [[TMP27]] to i32
2260; CHECK-NEXT:    [[TMP30:%.*]] = sext i16 [[TMP28]] to i32
2261; CHECK-NEXT:    [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]]
2262; CHECK-NEXT:    [[TMP32:%.*]] = ashr i32 [[TMP31]], 30
2263; CHECK-NEXT:    [[TMP33:%.*]] = or i32 [[TMP32]], 1
2264; CHECK-NEXT:    [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float
2265; CHECK-NEXT:    [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float
2266; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
2267; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]]
2268; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]])
2269; CHECK-NEXT:    [[TMP39:%.*]] = fneg fast float [[TMP38]]
2270; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]])
2271; CHECK-NEXT:    [[TMP41:%.*]] = fptosi float [[TMP38]] to i32
2272; CHECK-NEXT:    [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]])
2273; CHECK-NEXT:    [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]])
2274; CHECK-NEXT:    [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]]
2275; CHECK-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0
2276; CHECK-NEXT:    [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]]
2277; CHECK-NEXT:    [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]]
2278; CHECK-NEXT:    [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]]
2279; CHECK-NEXT:    [[TMP49:%.*]] = shl i32 [[TMP48]], 16
2280; CHECK-NEXT:    [[TMP50:%.*]] = ashr i32 [[TMP49]], 16
2281; CHECK-NEXT:    [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16
2282; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i16> [[TMP26]], i16 [[TMP51]], i64 1
2283; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <4 x i16> [[X]], i64 2
2284; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <4 x i16> [[Y]], i64 2
2285; CHECK-NEXT:    [[TMP55:%.*]] = sext i16 [[TMP53]] to i32
2286; CHECK-NEXT:    [[TMP56:%.*]] = sext i16 [[TMP54]] to i32
2287; CHECK-NEXT:    [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]]
2288; CHECK-NEXT:    [[TMP58:%.*]] = ashr i32 [[TMP57]], 30
2289; CHECK-NEXT:    [[TMP59:%.*]] = or i32 [[TMP58]], 1
2290; CHECK-NEXT:    [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float
2291; CHECK-NEXT:    [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float
2292; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]])
2293; CHECK-NEXT:    [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]]
2294; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]])
2295; CHECK-NEXT:    [[TMP65:%.*]] = fneg fast float [[TMP64]]
2296; CHECK-NEXT:    [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]])
2297; CHECK-NEXT:    [[TMP67:%.*]] = fptosi float [[TMP64]] to i32
2298; CHECK-NEXT:    [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
2299; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]])
2300; CHECK-NEXT:    [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]]
2301; CHECK-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0
2302; CHECK-NEXT:    [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]]
2303; CHECK-NEXT:    [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]]
2304; CHECK-NEXT:    [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]]
2305; CHECK-NEXT:    [[TMP75:%.*]] = shl i32 [[TMP74]], 16
2306; CHECK-NEXT:    [[TMP76:%.*]] = ashr i32 [[TMP75]], 16
2307; CHECK-NEXT:    [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16
2308; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <4 x i16> [[TMP52]], i16 [[TMP77]], i64 2
2309; CHECK-NEXT:    [[TMP79:%.*]] = extractelement <4 x i16> [[X]], i64 3
2310; CHECK-NEXT:    [[TMP80:%.*]] = extractelement <4 x i16> [[Y]], i64 3
2311; CHECK-NEXT:    [[TMP81:%.*]] = sext i16 [[TMP79]] to i32
2312; CHECK-NEXT:    [[TMP82:%.*]] = sext i16 [[TMP80]] to i32
2313; CHECK-NEXT:    [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP82]]
2314; CHECK-NEXT:    [[TMP84:%.*]] = ashr i32 [[TMP83]], 30
2315; CHECK-NEXT:    [[TMP85:%.*]] = or i32 [[TMP84]], 1
2316; CHECK-NEXT:    [[TMP86:%.*]] = sitofp i32 [[TMP81]] to float
2317; CHECK-NEXT:    [[TMP87:%.*]] = sitofp i32 [[TMP82]] to float
2318; CHECK-NEXT:    [[TMP88:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP87]])
2319; CHECK-NEXT:    [[TMP89:%.*]] = fmul fast float [[TMP86]], [[TMP88]]
2320; CHECK-NEXT:    [[TMP90:%.*]] = call fast float @llvm.trunc.f32(float [[TMP89]])
2321; CHECK-NEXT:    [[TMP91:%.*]] = fneg fast float [[TMP90]]
2322; CHECK-NEXT:    [[TMP92:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP91]], float [[TMP87]], float [[TMP86]])
2323; CHECK-NEXT:    [[TMP93:%.*]] = fptosi float [[TMP90]] to i32
2324; CHECK-NEXT:    [[TMP94:%.*]] = call fast float @llvm.fabs.f32(float [[TMP92]])
2325; CHECK-NEXT:    [[TMP95:%.*]] = call fast float @llvm.fabs.f32(float [[TMP87]])
2326; CHECK-NEXT:    [[TMP96:%.*]] = fcmp fast oge float [[TMP94]], [[TMP95]]
2327; CHECK-NEXT:    [[TMP97:%.*]] = select i1 [[TMP96]], i32 [[TMP85]], i32 0
2328; CHECK-NEXT:    [[TMP98:%.*]] = add i32 [[TMP93]], [[TMP97]]
2329; CHECK-NEXT:    [[TMP99:%.*]] = mul i32 [[TMP98]], [[TMP82]]
2330; CHECK-NEXT:    [[TMP100:%.*]] = sub i32 [[TMP81]], [[TMP99]]
2331; CHECK-NEXT:    [[TMP101:%.*]] = shl i32 [[TMP100]], 16
2332; CHECK-NEXT:    [[TMP102:%.*]] = ashr i32 [[TMP101]], 16
2333; CHECK-NEXT:    [[TMP103:%.*]] = trunc i32 [[TMP102]] to i16
2334; CHECK-NEXT:    [[TMP104:%.*]] = insertelement <4 x i16> [[TMP78]], i16 [[TMP103]], i64 3
2335; CHECK-NEXT:    store <4 x i16> [[TMP104]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8
2336; CHECK-NEXT:    ret void
2337;
2338; GCN-LABEL: srem_v4i16:
2339; GCN:       ; %bb.0:
2340; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2341; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xb
2342; GCN-NEXT:    s_mov_b32 s7, 0xf000
2343; GCN-NEXT:    s_mov_b32 s6, -1
2344; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2345; GCN-NEXT:    s_sext_i32_i16 s8, s2
2346; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s8
2347; GCN-NEXT:    s_sext_i32_i16 s9, s0
2348; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s9
2349; GCN-NEXT:    s_xor_b32 s8, s9, s8
2350; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
2351; GCN-NEXT:    s_ashr_i32 s8, s8, 30
2352; GCN-NEXT:    s_or_b32 s8, s8, 1
2353; GCN-NEXT:    v_mov_b32_e32 v3, s8
2354; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
2355; GCN-NEXT:    v_trunc_f32_e32 v2, v2
2356; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
2357; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
2358; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
2359; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
2360; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
2361; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
2362; GCN-NEXT:    s_ashr_i32 s2, s2, 16
2363; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s2
2364; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
2365; GCN-NEXT:    s_ashr_i32 s0, s0, 16
2366; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s0
2367; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v1
2368; GCN-NEXT:    s_xor_b32 s8, s0, s2
2369; GCN-NEXT:    s_ashr_i32 s8, s8, 30
2370; GCN-NEXT:    s_or_b32 s8, s8, 1
2371; GCN-NEXT:    v_mul_f32_e32 v3, v2, v3
2372; GCN-NEXT:    v_trunc_f32_e32 v3, v3
2373; GCN-NEXT:    v_mad_f32 v2, -v3, v1, v2
2374; GCN-NEXT:    v_cvt_i32_f32_e32 v3, v3
2375; GCN-NEXT:    v_mov_b32_e32 v4, s8
2376; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
2377; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
2378; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
2379; GCN-NEXT:    v_mul_lo_u32 v1, v1, s2
2380; GCN-NEXT:    s_sext_i32_i16 s2, s3
2381; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s2
2382; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s0, v1
2383; GCN-NEXT:    s_sext_i32_i16 s0, s1
2384; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s0
2385; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
2386; GCN-NEXT:    s_xor_b32 s0, s0, s2
2387; GCN-NEXT:    s_ashr_i32 s0, s0, 30
2388; GCN-NEXT:    s_or_b32 s0, s0, 1
2389; GCN-NEXT:    v_mul_f32_e32 v4, v1, v4
2390; GCN-NEXT:    v_trunc_f32_e32 v4, v4
2391; GCN-NEXT:    v_mad_f32 v1, -v4, v2, v1
2392; GCN-NEXT:    v_mov_b32_e32 v5, s0
2393; GCN-NEXT:    s_ashr_i32 s0, s3, 16
2394; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v2|
2395; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v4
2396; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s0
2397; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v5, vcc
2398; GCN-NEXT:    s_ashr_i32 s2, s1, 16
2399; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
2400; GCN-NEXT:    v_cvt_f32_i32_e32 v4, s2
2401; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v2
2402; GCN-NEXT:    v_mul_lo_u32 v1, v1, s3
2403; GCN-NEXT:    s_xor_b32 s3, s2, s0
2404; GCN-NEXT:    s_ashr_i32 s3, s3, 30
2405; GCN-NEXT:    v_mul_f32_e32 v5, v4, v5
2406; GCN-NEXT:    v_trunc_f32_e32 v5, v5
2407; GCN-NEXT:    v_mad_f32 v4, -v5, v2, v4
2408; GCN-NEXT:    v_cvt_i32_f32_e32 v5, v5
2409; GCN-NEXT:    s_or_b32 s3, s3, 1
2410; GCN-NEXT:    v_mov_b32_e32 v6, s3
2411; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v2|
2412; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v6, vcc
2413; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
2414; GCN-NEXT:    v_mul_lo_u32 v2, v2, s0
2415; GCN-NEXT:    s_mov_b32 s0, 0xffff
2416; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
2417; GCN-NEXT:    v_and_b32_e32 v1, s0, v1
2418; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s2, v2
2419; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2420; GCN-NEXT:    v_or_b32_e32 v1, v1, v2
2421; GCN-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
2422; GCN-NEXT:    v_and_b32_e32 v0, s0, v0
2423; GCN-NEXT:    v_or_b32_e32 v0, v0, v2
2424; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2425; GCN-NEXT:    s_endpgm
2426  %r = srem <4 x i16> %x, %y
2427  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
2428  ret void
2429}
2430
; udiv_i3: unsigned division on the 3-bit integer type i3.
; CHECK lines (opt -amdgpu-codegenprepare output): both operands are
; zero-extended to i32, the quotient is formed through
; uitofp / rcp / fmul / trunc followed by a one-step correction
; (select of 1 or 0 added to the converted quotient), and the result is
; masked with 7 before being truncated back to i3.
; GCN lines: the llc output expected for the same kernel.
; All assertions are autogenerated (see the NOTE lines at the top of the
; file); regenerate them with the update scripts instead of editing by hand.
2431define amdgpu_kernel void @udiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
2432; CHECK-LABEL: @udiv_i3(
2433; CHECK-NEXT:    [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32
2434; CHECK-NEXT:    [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32
2435; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
2436; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
2437; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
2438; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
2439; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
2440; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
2441; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
2442; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
2443; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
2444; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
2445; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
2446; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
2447; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
2448; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 7
2449; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i3
2450; CHECK-NEXT:    store i3 [[TMP17]], i3 addrspace(1)* [[OUT:%.*]], align 1
2451; CHECK-NEXT:    ret void
2452;
2453; GCN-LABEL: udiv_i3:
2454; GCN:       ; %bb.0:
2455; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2456; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
2457; GCN-NEXT:    s_mov_b32 s7, 0xf000
2458; GCN-NEXT:    s_mov_b32 s6, -1
2459; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2460; GCN-NEXT:    s_bfe_u32 s1, s0, 0x30008
2461; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v0, s1
2462; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v0
2463; GCN-NEXT:    s_and_b32 s0, s0, 7
2464; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v2, s0
2465; GCN-NEXT:    v_mul_f32_e32 v1, v2, v1
2466; GCN-NEXT:    v_trunc_f32_e32 v1, v1
2467; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v1
2468; GCN-NEXT:    v_mad_f32 v1, -v1, v0, v2
2469; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
2470; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
2471; GCN-NEXT:    v_and_b32_e32 v0, 7, v0
2472; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
2473; GCN-NEXT:    s_endpgm
; Input IR under test: a single i3 udiv whose result is stored to %out.
2474  %r = udiv i3 %x, %y
2475  store i3 %r, i3 addrspace(1)* %out
2476  ret void
2477}
2478
; urem_i3: unsigned remainder on the 3-bit integer type i3.
; CHECK lines (opt -amdgpu-codegenprepare output): the operands are
; zero-extended to i32 and a quotient is computed with the same
; uitofp / rcp / fmul / trunc sequence plus a 1-or-0 correction; the
; remainder is then formed as x - quotient * y (mul + sub), masked with 7
; and truncated back to i3.
; GCN lines: the llc output expected for the same kernel.
; All assertions are autogenerated (see the NOTE lines at the top of the
; file); regenerate them with the update scripts instead of editing by hand.
2479define amdgpu_kernel void @urem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
2480; CHECK-LABEL: @urem_i3(
2481; CHECK-NEXT:    [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32
2482; CHECK-NEXT:    [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32
2483; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
2484; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
2485; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
2486; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
2487; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
2488; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
2489; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
2490; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
2491; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
2492; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
2493; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
2494; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
2495; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
2496; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
2497; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
2498; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 7
2499; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i3
2500; CHECK-NEXT:    store i3 [[TMP19]], i3 addrspace(1)* [[OUT:%.*]], align 1
2501; CHECK-NEXT:    ret void
2502;
2503; GCN-LABEL: urem_i3:
2504; GCN:       ; %bb.0:
2505; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2506; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
2507; GCN-NEXT:    s_mov_b32 s7, 0xf000
2508; GCN-NEXT:    s_mov_b32 s6, -1
2509; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2510; GCN-NEXT:    s_bfe_u32 s1, s0, 0x30008
2511; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v0, s1
2512; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v0
2513; GCN-NEXT:    s_and_b32 s2, s0, 7
2514; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v2, s2
2515; GCN-NEXT:    s_lshr_b32 s1, s0, 8
2516; GCN-NEXT:    v_mul_f32_e32 v1, v2, v1
2517; GCN-NEXT:    v_trunc_f32_e32 v1, v1
2518; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v1
2519; GCN-NEXT:    v_mad_f32 v1, -v1, v0, v2
2520; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
2521; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
2522; GCN-NEXT:    v_mul_lo_u32 v0, v0, s1
2523; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
2524; GCN-NEXT:    v_and_b32_e32 v0, 7, v0
2525; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
2526; GCN-NEXT:    s_endpgm
; Input IR under test: a single i3 urem whose result is stored to %out.
2527  %r = urem i3 %x, %y
2528  store i3 %r, i3 addrspace(1)* %out
2529  ret void
2530}
2531
; sdiv_i3: signed division on the 3-bit integer type i3.
; CHECK lines (opt -amdgpu-codegenprepare output): both operands are
; sign-extended to i32; a correction term is built from them with
; xor / ashr 30 / or 1; the quotient is computed through
; sitofp / rcp / fmul / trunc / fptosi and the correction term (or 0) is
; added depending on the |remainder| >= |divisor| compare; finally the
; result is sign-extended from 3 bits (shl 29 + ashr 29) and truncated to i3.
; GCN lines: the llc output expected for the same kernel.
; All assertions are autogenerated (see the NOTE lines at the top of the
; file); regenerate them with the update scripts instead of editing by hand.
2532define amdgpu_kernel void @sdiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
2533; CHECK-LABEL: @sdiv_i3(
2534; CHECK-NEXT:    [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32
2535; CHECK-NEXT:    [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32
2536; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
2537; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
2538; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
2539; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
2540; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
2541; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
2542; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
2543; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
2544; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
2545; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
2546; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
2547; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
2548; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
2549; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
2550; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
2551; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
2552; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 29
2553; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 29
2554; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i3
2555; CHECK-NEXT:    store i3 [[TMP21]], i3 addrspace(1)* [[OUT:%.*]], align 1
2556; CHECK-NEXT:    ret void
2557;
2558; GCN-LABEL: sdiv_i3:
2559; GCN:       ; %bb.0:
2560; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2561; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
2562; GCN-NEXT:    s_mov_b32 s7, 0xf000
2563; GCN-NEXT:    s_mov_b32 s6, -1
2564; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2565; GCN-NEXT:    s_bfe_i32 s1, s0, 0x30008
2566; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s1
2567; GCN-NEXT:    s_bfe_i32 s0, s0, 0x30000
2568; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s0
2569; GCN-NEXT:    s_xor_b32 s0, s0, s1
2570; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
2571; GCN-NEXT:    s_ashr_i32 s0, s0, 30
2572; GCN-NEXT:    s_or_b32 s0, s0, 1
2573; GCN-NEXT:    v_mov_b32_e32 v3, s0
2574; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
2575; GCN-NEXT:    v_trunc_f32_e32 v2, v2
2576; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
2577; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
2578; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
2579; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
2580; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
2581; GCN-NEXT:    v_and_b32_e32 v0, 7, v0
2582; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
2583; GCN-NEXT:    s_endpgm
; Input IR under test: a single i3 sdiv whose result is stored to %out.
2584  %r = sdiv i3 %x, %y
2585  store i3 %r, i3 addrspace(1)* %out
2586  ret void
2587}
2588
; srem_i3: signed remainder on the 3-bit integer type i3.
; CHECK lines (opt -amdgpu-codegenprepare output): both operands are
; sign-extended to i32 and a quotient is computed with the
; sitofp / rcp / fmul / trunc / fptosi sequence plus a correction term
; built from xor / ashr 30 / or 1; the remainder is then formed as
; x - quotient * y (mul + sub), sign-extended from 3 bits
; (shl 29 + ashr 29) and truncated back to i3.
; GCN lines: the llc output expected for the same kernel.
; All assertions are autogenerated (see the NOTE lines at the top of the
; file); regenerate them with the update scripts instead of editing by hand.
2589define amdgpu_kernel void @srem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
2590; CHECK-LABEL: @srem_i3(
2591; CHECK-NEXT:    [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32
2592; CHECK-NEXT:    [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32
2593; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
2594; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
2595; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
2596; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
2597; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
2598; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
2599; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
2600; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
2601; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
2602; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
2603; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
2604; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
2605; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
2606; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
2607; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
2608; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
2609; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
2610; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
2611; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 29
2612; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 29
2613; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i3
2614; CHECK-NEXT:    store i3 [[TMP23]], i3 addrspace(1)* [[OUT:%.*]], align 1
2615; CHECK-NEXT:    ret void
2616;
2617; GCN-LABEL: srem_i3:
2618; GCN:       ; %bb.0:
2619; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2620; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
2621; GCN-NEXT:    s_mov_b32 s7, 0xf000
2622; GCN-NEXT:    s_mov_b32 s6, -1
2623; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2624; GCN-NEXT:    s_bfe_i32 s1, s0, 0x30008
2625; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s1
2626; GCN-NEXT:    s_bfe_i32 s3, s0, 0x30000
2627; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s3
2628; GCN-NEXT:    s_xor_b32 s1, s3, s1
2629; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
2630; GCN-NEXT:    s_ashr_i32 s1, s1, 30
2631; GCN-NEXT:    s_or_b32 s1, s1, 1
2632; GCN-NEXT:    v_mov_b32_e32 v3, s1
2633; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
2634; GCN-NEXT:    v_trunc_f32_e32 v2, v2
2635; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
2636; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
2637; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
2638; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
2639; GCN-NEXT:    s_lshr_b32 s2, s0, 8
2640; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
2641; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
2642; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
2643; GCN-NEXT:    v_and_b32_e32 v0, 7, v0
2644; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
2645; GCN-NEXT:    s_endpgm
; Input IR under test: a single i3 srem whose result is stored to %out.
2646  %r = srem i3 %x, %y
2647  store i3 %r, i3 addrspace(1)* %out
2648  ret void
2649}
2650
; udiv_v3i16: unsigned division on the three-element vector type <3 x i16>.
; CHECK lines (opt -amdgpu-codegenprepare output): the vector udiv is
; expanded one element at a time. For each of the lanes 0, 1 and 2 the
; operands are extracted, zero-extended to i32, divided through the
; uitofp / rcp / fmul / trunc sequence with a 1-or-0 correction, masked with
; 65535, truncated to i16 and inserted into the result vector, which is
; finally stored with align 8.
; GCN lines: the llc output expected for the same kernel; the result is
; written with one buffer_store_short at offset 4 and one buffer_store_dword.
; All assertions are autogenerated (see the NOTE lines at the top of the
; file); regenerate them with the update scripts instead of editing by hand.
2651define amdgpu_kernel void @udiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
2652; CHECK-LABEL: @udiv_v3i16(
2653; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
2654; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
2655; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
2656; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
2657; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
2658; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
2659; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
2660; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
2661; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
2662; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
2663; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
2664; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
2665; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
2666; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
2667; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
2668; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
2669; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
2670; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 65535
2671; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
2672; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <3 x i16> undef, i16 [[TMP19]], i64 0
2673; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <3 x i16> [[X]], i64 1
2674; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <3 x i16> [[Y]], i64 1
2675; CHECK-NEXT:    [[TMP23:%.*]] = zext i16 [[TMP21]] to i32
2676; CHECK-NEXT:    [[TMP24:%.*]] = zext i16 [[TMP22]] to i32
2677; CHECK-NEXT:    [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float
2678; CHECK-NEXT:    [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float
2679; CHECK-NEXT:    [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]])
2680; CHECK-NEXT:    [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]]
2681; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]])
2682; CHECK-NEXT:    [[TMP30:%.*]] = fneg fast float [[TMP29]]
2683; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]])
2684; CHECK-NEXT:    [[TMP32:%.*]] = fptoui float [[TMP29]] to i32
2685; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]])
2686; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]])
2687; CHECK-NEXT:    [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]]
2688; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0
2689; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]]
2690; CHECK-NEXT:    [[TMP38:%.*]] = and i32 [[TMP37]], 65535
2691; CHECK-NEXT:    [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16
2692; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <3 x i16> [[TMP20]], i16 [[TMP39]], i64 1
2693; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <3 x i16> [[X]], i64 2
2694; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <3 x i16> [[Y]], i64 2
2695; CHECK-NEXT:    [[TMP43:%.*]] = zext i16 [[TMP41]] to i32
2696; CHECK-NEXT:    [[TMP44:%.*]] = zext i16 [[TMP42]] to i32
2697; CHECK-NEXT:    [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float
2698; CHECK-NEXT:    [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float
2699; CHECK-NEXT:    [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]])
2700; CHECK-NEXT:    [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]]
2701; CHECK-NEXT:    [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]])
2702; CHECK-NEXT:    [[TMP50:%.*]] = fneg fast float [[TMP49]]
2703; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]])
2704; CHECK-NEXT:    [[TMP52:%.*]] = fptoui float [[TMP49]] to i32
2705; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]])
2706; CHECK-NEXT:    [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]])
2707; CHECK-NEXT:    [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]]
2708; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0
2709; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]]
2710; CHECK-NEXT:    [[TMP58:%.*]] = and i32 [[TMP57]], 65535
2711; CHECK-NEXT:    [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16
2712; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <3 x i16> [[TMP40]], i16 [[TMP59]], i64 2
2713; CHECK-NEXT:    store <3 x i16> [[TMP60]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8
2714; CHECK-NEXT:    ret void
2715;
2716; GCN-LABEL: udiv_v3i16:
2717; GCN:       ; %bb.0:
2718; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2719; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
2720; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
2721; GCN-NEXT:    s_mov_b32 s8, 0xffff
2722; GCN-NEXT:    s_mov_b32 s7, 0xf000
2723; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2724; GCN-NEXT:    s_and_b32 s6, s0, s8
2725; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s6
2726; GCN-NEXT:    s_and_b32 s6, s2, s8
2727; GCN-NEXT:    s_lshr_b32 s0, s0, 16
2728; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s0
2729; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s6
2730; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
2731; GCN-NEXT:    s_lshr_b32 s0, s2, 16
2732; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s0
2733; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v3
2734; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
2735; GCN-NEXT:    v_trunc_f32_e32 v2, v2
2736; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
2737; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
2738; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
2739; GCN-NEXT:    v_mul_f32_e32 v1, v4, v5
2740; GCN-NEXT:    v_trunc_f32_e32 v1, v1
2741; GCN-NEXT:    s_and_b32 s0, s1, s8
2742; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
2743; GCN-NEXT:    v_mad_f32 v2, -v1, v3, v4
2744; GCN-NEXT:    v_cvt_f32_u32_e32 v4, s0
2745; GCN-NEXT:    s_and_b32 s0, s3, s8
2746; GCN-NEXT:    v_cvt_f32_u32_e32 v5, s0
2747; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
2748; GCN-NEXT:    v_rcp_iflag_f32_e32 v6, v4
2749; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v3
2750; GCN-NEXT:    s_mov_b32 s6, -1
2751; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2752; GCN-NEXT:    v_mul_f32_e32 v2, v5, v6
2753; GCN-NEXT:    v_trunc_f32_e32 v2, v2
2754; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v2
2755; GCN-NEXT:    v_mad_f32 v2, -v2, v4, v5
2756; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
2757; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2758; GCN-NEXT:    v_and_b32_e32 v0, s8, v0
2759; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
2760; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
2761; GCN-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
2762; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2763; GCN-NEXT:    s_endpgm
; Input IR under test: a single <3 x i16> udiv whose result is stored to %out.
2764  %r = udiv <3 x i16> %x, %y
2765  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
2766  ret void
2767}
2768
2769define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
2770; CHECK-LABEL: @urem_v3i16(
2771; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
2772; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
2773; CHECK-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
2774; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
2775; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
2776; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
2777; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
2778; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
2779; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
2780; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
2781; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
2782; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
2783; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
2784; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
2785; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
2786; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
2787; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
2788; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]]
2789; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]]
2790; CHECK-NEXT:    [[TMP20:%.*]] = and i32 [[TMP19]], 65535
2791; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
2792; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <3 x i16> undef, i16 [[TMP21]], i64 0
2793; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <3 x i16> [[X]], i64 1
2794; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <3 x i16> [[Y]], i64 1
2795; CHECK-NEXT:    [[TMP25:%.*]] = zext i16 [[TMP23]] to i32
2796; CHECK-NEXT:    [[TMP26:%.*]] = zext i16 [[TMP24]] to i32
2797; CHECK-NEXT:    [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float
2798; CHECK-NEXT:    [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float
2799; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]])
2800; CHECK-NEXT:    [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]]
2801; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]])
2802; CHECK-NEXT:    [[TMP32:%.*]] = fneg fast float [[TMP31]]
2803; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]])
2804; CHECK-NEXT:    [[TMP34:%.*]] = fptoui float [[TMP31]] to i32
2805; CHECK-NEXT:    [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
2806; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]])
2807; CHECK-NEXT:    [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]]
2808; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0
2809; CHECK-NEXT:    [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]]
2810; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]]
2811; CHECK-NEXT:    [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]]
2812; CHECK-NEXT:    [[TMP42:%.*]] = and i32 [[TMP41]], 65535
2813; CHECK-NEXT:    [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16
2814; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <3 x i16> [[TMP22]], i16 [[TMP43]], i64 1
2815; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <3 x i16> [[X]], i64 2
2816; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <3 x i16> [[Y]], i64 2
2817; CHECK-NEXT:    [[TMP47:%.*]] = zext i16 [[TMP45]] to i32
2818; CHECK-NEXT:    [[TMP48:%.*]] = zext i16 [[TMP46]] to i32
2819; CHECK-NEXT:    [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float
2820; CHECK-NEXT:    [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float
2821; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]])
2822; CHECK-NEXT:    [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]]
2823; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]])
2824; CHECK-NEXT:    [[TMP54:%.*]] = fneg fast float [[TMP53]]
2825; CHECK-NEXT:    [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]])
2826; CHECK-NEXT:    [[TMP56:%.*]] = fptoui float [[TMP53]] to i32
2827; CHECK-NEXT:    [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]])
2828; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]])
2829; CHECK-NEXT:    [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]]
2830; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0
2831; CHECK-NEXT:    [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]]
2832; CHECK-NEXT:    [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]]
2833; CHECK-NEXT:    [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]]
2834; CHECK-NEXT:    [[TMP64:%.*]] = and i32 [[TMP63]], 65535
2835; CHECK-NEXT:    [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16
2836; CHECK-NEXT:    [[TMP66:%.*]] = insertelement <3 x i16> [[TMP44]], i16 [[TMP65]], i64 2
2837; CHECK-NEXT:    store <3 x i16> [[TMP66]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8
2838; CHECK-NEXT:    ret void
2839;
2840; GCN-LABEL: urem_v3i16:
2841; GCN:       ; %bb.0:
2842; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2843; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
2844; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
2845; GCN-NEXT:    s_mov_b32 s8, 0xffff
2846; GCN-NEXT:    s_mov_b32 s7, 0xf000
2847; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2848; GCN-NEXT:    v_mov_b32_e32 v1, s2
2849; GCN-NEXT:    s_and_b32 s6, s0, s8
2850; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s6
2851; GCN-NEXT:    s_and_b32 s6, s2, s8
2852; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s6
2853; GCN-NEXT:    v_mov_b32_e32 v4, s0
2854; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v0
2855; GCN-NEXT:    v_alignbit_b32 v4, s1, v4, 16
2856; GCN-NEXT:    v_and_b32_e32 v5, s8, v4
2857; GCN-NEXT:    v_alignbit_b32 v1, s3, v1, 16
2858; GCN-NEXT:    v_mul_f32_e32 v3, v2, v3
2859; GCN-NEXT:    v_trunc_f32_e32 v3, v3
2860; GCN-NEXT:    v_mad_f32 v2, -v3, v0, v2
2861; GCN-NEXT:    v_cvt_u32_f32_e32 v6, v3
2862; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
2863; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v5
2864; GCN-NEXT:    v_and_b32_e32 v3, s8, v1
2865; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v6, vcc
2866; GCN-NEXT:    v_mul_lo_u32 v0, v0, s0
2867; GCN-NEXT:    s_and_b32 s0, s1, s8
2868; GCN-NEXT:    v_cvt_f32_u32_e32 v3, v3
2869; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v2
2870; GCN-NEXT:    v_cvt_f32_u32_e32 v6, s0
2871; GCN-NEXT:    s_and_b32 s0, s3, s8
2872; GCN-NEXT:    v_cvt_f32_u32_e32 v7, s0
2873; GCN-NEXT:    v_mul_f32_e32 v5, v3, v5
2874; GCN-NEXT:    v_trunc_f32_e32 v5, v5
2875; GCN-NEXT:    v_rcp_iflag_f32_e32 v8, v6
2876; GCN-NEXT:    v_mad_f32 v3, -v5, v2, v3
2877; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v5
2878; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
2879; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
2880; GCN-NEXT:    v_mul_f32_e32 v3, v7, v8
2881; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
2882; GCN-NEXT:    v_trunc_f32_e32 v3, v3
2883; GCN-NEXT:    v_mul_lo_u32 v2, v2, v4
2884; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v3
2885; GCN-NEXT:    v_mad_f32 v3, -v3, v6, v7
2886; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v6
2887; GCN-NEXT:    s_mov_b32 s6, -1
2888; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
2889; GCN-NEXT:    v_mul_lo_u32 v3, v3, s1
2890; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
2891; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2892; GCN-NEXT:    v_and_b32_e32 v0, s8, v0
2893; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s3, v3
2894; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
2895; GCN-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
2896; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2897; GCN-NEXT:    s_endpgm
2898  %r = urem <3 x i16> %x, %y
2899  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
2900  ret void
2901}
2902
; Kernel under test: element-wise signed division of two <3 x i16> vectors,
; with the quotient vector stored to %out.
;
; The opt-side assertions below expect each of the three lanes (index 0, 1, 2)
; to be expanded on its own into the same sequence:
;   * both operands are sign-extended to i32 and converted to float;
;   * a float quotient is formed with llvm.amdgcn.rcp.f32 + fmul and then
;     truncated toward zero with llvm.trunc.f32;
;   * llvm.amdgcn.fmad.ftz.f32 computes (-q * y + x), and when its magnitude
;     is >= |y| a correction is added to the integer quotient;
;   * the correction is ((x ^ y) ashr 30) | 1, which is -1 when the operand
;     signs differ and +1 otherwise;
;   * the sum is sign-extended from its low 16 bits (shl 16 / ashr 16) and
;     truncated back to i16 before being inserted into the result vector.
; The llc-side assertions pin the final instruction sequence for the same
; kernel. Both assertion groups are autogenerated (see the NOTE lines at the
; top of the file); regenerate them rather than editing them by hand.
2903define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
2904; CHECK-LABEL: @sdiv_v3i16(
2905; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
2906; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
2907; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
2908; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
2909; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
2910; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
2911; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
2912; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
2913; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
2914; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
2915; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
2916; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
2917; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
2918; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
2919; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
2920; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
2921; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
2922; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
2923; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
2924; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
2925; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 16
2926; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
2927; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
2928; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <3 x i16> undef, i16 [[TMP23]], i64 0
2929; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <3 x i16> [[X]], i64 1
2930; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <3 x i16> [[Y]], i64 1
2931; CHECK-NEXT:    [[TMP27:%.*]] = sext i16 [[TMP25]] to i32
2932; CHECK-NEXT:    [[TMP28:%.*]] = sext i16 [[TMP26]] to i32
2933; CHECK-NEXT:    [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]]
2934; CHECK-NEXT:    [[TMP30:%.*]] = ashr i32 [[TMP29]], 30
2935; CHECK-NEXT:    [[TMP31:%.*]] = or i32 [[TMP30]], 1
2936; CHECK-NEXT:    [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float
2937; CHECK-NEXT:    [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float
2938; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
2939; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]]
2940; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]])
2941; CHECK-NEXT:    [[TMP37:%.*]] = fneg fast float [[TMP36]]
2942; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]])
2943; CHECK-NEXT:    [[TMP39:%.*]] = fptosi float [[TMP36]] to i32
2944; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]])
2945; CHECK-NEXT:    [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
2946; CHECK-NEXT:    [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]]
2947; CHECK-NEXT:    [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0
2948; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]]
2949; CHECK-NEXT:    [[TMP45:%.*]] = shl i32 [[TMP44]], 16
2950; CHECK-NEXT:    [[TMP46:%.*]] = ashr i32 [[TMP45]], 16
2951; CHECK-NEXT:    [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16
2952; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <3 x i16> [[TMP24]], i16 [[TMP47]], i64 1
2953; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <3 x i16> [[X]], i64 2
2954; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <3 x i16> [[Y]], i64 2
2955; CHECK-NEXT:    [[TMP51:%.*]] = sext i16 [[TMP49]] to i32
2956; CHECK-NEXT:    [[TMP52:%.*]] = sext i16 [[TMP50]] to i32
2957; CHECK-NEXT:    [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]]
2958; CHECK-NEXT:    [[TMP54:%.*]] = ashr i32 [[TMP53]], 30
2959; CHECK-NEXT:    [[TMP55:%.*]] = or i32 [[TMP54]], 1
2960; CHECK-NEXT:    [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float
2961; CHECK-NEXT:    [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float
2962; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]])
2963; CHECK-NEXT:    [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]]
2964; CHECK-NEXT:    [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]])
2965; CHECK-NEXT:    [[TMP61:%.*]] = fneg fast float [[TMP60]]
2966; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]])
2967; CHECK-NEXT:    [[TMP63:%.*]] = fptosi float [[TMP60]] to i32
2968; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]])
2969; CHECK-NEXT:    [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]])
2970; CHECK-NEXT:    [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]]
2971; CHECK-NEXT:    [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0
2972; CHECK-NEXT:    [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]]
2973; CHECK-NEXT:    [[TMP69:%.*]] = shl i32 [[TMP68]], 16
2974; CHECK-NEXT:    [[TMP70:%.*]] = ashr i32 [[TMP69]], 16
2975; CHECK-NEXT:    [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16
2976; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <3 x i16> [[TMP48]], i16 [[TMP71]], i64 2
2977; CHECK-NEXT:    store <3 x i16> [[TMP72]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8
2978; CHECK-NEXT:    ret void
2979;
2980; GCN-LABEL: sdiv_v3i16:
2981; GCN:       ; %bb.0:
2982; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2983; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
2984; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
2985; GCN-NEXT:    s_mov_b32 s7, 0xf000
2986; GCN-NEXT:    s_mov_b32 s6, -1
2987; GCN-NEXT:    s_waitcnt lgkmcnt(0)
2988; GCN-NEXT:    s_sext_i32_i16 s9, s2
2989; GCN-NEXT:    s_sext_i32_i16 s8, s0
2990; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s8
2991; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s9
2992; GCN-NEXT:    s_xor_b32 s8, s9, s8
2993; GCN-NEXT:    s_ashr_i32 s0, s0, 16
2994; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
2995; GCN-NEXT:    s_ashr_i32 s8, s8, 30
2996; GCN-NEXT:    s_or_b32 s8, s8, 1
2997; GCN-NEXT:    v_mov_b32_e32 v3, s8
2998; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
2999; GCN-NEXT:    v_trunc_f32_e32 v2, v2
3000; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
3001; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
3002; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
3003; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s0
3004; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
3005; GCN-NEXT:    s_ashr_i32 s2, s2, 16
3006; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
3007; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s2
3008; GCN-NEXT:    v_rcp_iflag_f32_e32 v3, v1
3009; GCN-NEXT:    s_xor_b32 s0, s2, s0
3010; GCN-NEXT:    s_ashr_i32 s0, s0, 30
3011; GCN-NEXT:    s_or_b32 s0, s0, 1
3012; GCN-NEXT:    v_mul_f32_e32 v3, v2, v3
3013; GCN-NEXT:    v_trunc_f32_e32 v3, v3
3014; GCN-NEXT:    v_mad_f32 v2, -v3, v1, v2
3015; GCN-NEXT:    v_mov_b32_e32 v4, s0
3016; GCN-NEXT:    s_sext_i32_i16 s0, s1
3017; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v1|
3018; GCN-NEXT:    v_cvt_i32_f32_e32 v3, v3
3019; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s0
3020; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
3021; GCN-NEXT:    s_sext_i32_i16 s1, s3
3022; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
3023; GCN-NEXT:    v_cvt_f32_i32_e32 v3, s1
3024; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
3025; GCN-NEXT:    s_xor_b32 s0, s1, s0
3026; GCN-NEXT:    s_ashr_i32 s0, s0, 30
3027; GCN-NEXT:    s_or_b32 s0, s0, 1
3028; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
3029; GCN-NEXT:    v_trunc_f32_e32 v4, v4
3030; GCN-NEXT:    v_mad_f32 v3, -v4, v2, v3
3031; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v4
3032; GCN-NEXT:    v_mov_b32_e32 v5, s0
3033; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
3034; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
3035; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
3036; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
3037; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
3038; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
3039; GCN-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
3040; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3041; GCN-NEXT:    s_endpgm
; Input IR: a single vector sdiv whose result is stored to the output pointer.
3042  %r = sdiv <3 x i16> %x, %y
3043  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
3044  ret void
3045}
3046
; Kernel under test: element-wise signed remainder of two <3 x i16> vectors,
; with the remainder vector stored to %out.
;
; The opt-side assertions below expect each of the three lanes (index 0, 1, 2)
; to be expanded on its own:
;   * the quotient is built as in the signed-division expansion: sign-extend
;     to i32, convert to float, multiply by llvm.amdgcn.rcp.f32 of the divisor,
;     truncate, and add a correction of ((x ^ y) ashr 30) | 1 (-1 when the
;     operand signs differ, +1 otherwise) when the magnitude of the
;     fmad.ftz result (-q * y + x) is >= |y|;
;   * the remainder is then x - q * y in i32 (mul followed by sub);
;   * the remainder is sign-extended from its low 16 bits (shl 16 / ashr 16)
;     and truncated to i16 before being inserted into the result vector.
; The llc-side assertions pin the final instruction sequence for the same
; kernel. Both assertion groups are autogenerated (see the NOTE lines at the
; top of the file); regenerate them rather than editing them by hand.
3047define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) {
3048; CHECK-LABEL: @srem_v3i16(
3049; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0
3050; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0
3051; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP1]] to i32
3052; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP2]] to i32
3053; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
3054; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
3055; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
3056; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
3057; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
3058; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
3059; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
3060; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
3061; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
3062; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
3063; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
3064; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
3065; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
3066; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
3067; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
3068; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
3069; CHECK-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]]
3070; CHECK-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]]
3071; CHECK-NEXT:    [[TMP23:%.*]] = shl i32 [[TMP22]], 16
3072; CHECK-NEXT:    [[TMP24:%.*]] = ashr i32 [[TMP23]], 16
3073; CHECK-NEXT:    [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16
3074; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <3 x i16> undef, i16 [[TMP25]], i64 0
3075; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <3 x i16> [[X]], i64 1
3076; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <3 x i16> [[Y]], i64 1
3077; CHECK-NEXT:    [[TMP29:%.*]] = sext i16 [[TMP27]] to i32
3078; CHECK-NEXT:    [[TMP30:%.*]] = sext i16 [[TMP28]] to i32
3079; CHECK-NEXT:    [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]]
3080; CHECK-NEXT:    [[TMP32:%.*]] = ashr i32 [[TMP31]], 30
3081; CHECK-NEXT:    [[TMP33:%.*]] = or i32 [[TMP32]], 1
3082; CHECK-NEXT:    [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float
3083; CHECK-NEXT:    [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float
3084; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
3085; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]]
3086; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]])
3087; CHECK-NEXT:    [[TMP39:%.*]] = fneg fast float [[TMP38]]
3088; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]])
3089; CHECK-NEXT:    [[TMP41:%.*]] = fptosi float [[TMP38]] to i32
3090; CHECK-NEXT:    [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]])
3091; CHECK-NEXT:    [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]])
3092; CHECK-NEXT:    [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]]
3093; CHECK-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0
3094; CHECK-NEXT:    [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]]
3095; CHECK-NEXT:    [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]]
3096; CHECK-NEXT:    [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]]
3097; CHECK-NEXT:    [[TMP49:%.*]] = shl i32 [[TMP48]], 16
3098; CHECK-NEXT:    [[TMP50:%.*]] = ashr i32 [[TMP49]], 16
3099; CHECK-NEXT:    [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16
3100; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <3 x i16> [[TMP26]], i16 [[TMP51]], i64 1
3101; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <3 x i16> [[X]], i64 2
3102; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <3 x i16> [[Y]], i64 2
3103; CHECK-NEXT:    [[TMP55:%.*]] = sext i16 [[TMP53]] to i32
3104; CHECK-NEXT:    [[TMP56:%.*]] = sext i16 [[TMP54]] to i32
3105; CHECK-NEXT:    [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]]
3106; CHECK-NEXT:    [[TMP58:%.*]] = ashr i32 [[TMP57]], 30
3107; CHECK-NEXT:    [[TMP59:%.*]] = or i32 [[TMP58]], 1
3108; CHECK-NEXT:    [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float
3109; CHECK-NEXT:    [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float
3110; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]])
3111; CHECK-NEXT:    [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]]
3112; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]])
3113; CHECK-NEXT:    [[TMP65:%.*]] = fneg fast float [[TMP64]]
3114; CHECK-NEXT:    [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]])
3115; CHECK-NEXT:    [[TMP67:%.*]] = fptosi float [[TMP64]] to i32
3116; CHECK-NEXT:    [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
3117; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]])
3118; CHECK-NEXT:    [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]]
3119; CHECK-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0
3120; CHECK-NEXT:    [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]]
3121; CHECK-NEXT:    [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]]
3122; CHECK-NEXT:    [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]]
3123; CHECK-NEXT:    [[TMP75:%.*]] = shl i32 [[TMP74]], 16
3124; CHECK-NEXT:    [[TMP76:%.*]] = ashr i32 [[TMP75]], 16
3125; CHECK-NEXT:    [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16
3126; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <3 x i16> [[TMP52]], i16 [[TMP77]], i64 2
3127; CHECK-NEXT:    store <3 x i16> [[TMP78]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8
3128; CHECK-NEXT:    ret void
3129;
3130; GCN-LABEL: srem_v3i16:
3131; GCN:       ; %bb.0:
3132; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3133; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
3134; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
3135; GCN-NEXT:    s_mov_b32 s7, 0xf000
3136; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3137; GCN-NEXT:    s_sext_i32_i16 s8, s2
3138; GCN-NEXT:    s_sext_i32_i16 s6, s0
3139; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s6
3140; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s8
3141; GCN-NEXT:    s_xor_b32 s6, s8, s6
3142; GCN-NEXT:    s_ashr_i32 s6, s6, 30
3143; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
3144; GCN-NEXT:    s_or_b32 s6, s6, 1
3145; GCN-NEXT:    v_mov_b32_e32 v3, s6
3146; GCN-NEXT:    s_mov_b32 s6, -1
3147; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
3148; GCN-NEXT:    v_trunc_f32_e32 v2, v2
3149; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
3150; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
3151; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
3152; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
3153; GCN-NEXT:    v_mov_b32_e32 v1, s2
3154; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
3155; GCN-NEXT:    v_mov_b32_e32 v2, s0
3156; GCN-NEXT:    v_alignbit_b32 v2, s1, v2, 16
3157; GCN-NEXT:    v_bfe_i32 v3, v2, 0, 16
3158; GCN-NEXT:    v_cvt_f32_i32_e32 v4, v3
3159; GCN-NEXT:    v_alignbit_b32 v1, s3, v1, 16
3160; GCN-NEXT:    v_bfe_i32 v5, v1, 0, 16
3161; GCN-NEXT:    v_cvt_f32_i32_e32 v6, v5
3162; GCN-NEXT:    v_rcp_iflag_f32_e32 v7, v4
3163; GCN-NEXT:    v_mul_lo_u32 v0, v0, s0
3164; GCN-NEXT:    v_xor_b32_e32 v3, v5, v3
3165; GCN-NEXT:    s_sext_i32_i16 s0, s1
3166; GCN-NEXT:    v_mul_f32_e32 v5, v6, v7
3167; GCN-NEXT:    v_trunc_f32_e32 v5, v5
3168; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
3169; GCN-NEXT:    v_mad_f32 v6, -v5, v4, v6
3170; GCN-NEXT:    v_cvt_i32_f32_e32 v5, v5
3171; GCN-NEXT:    v_ashrrev_i32_e32 v3, 30, v3
3172; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v6|, |v4|
3173; GCN-NEXT:    v_cvt_f32_i32_e32 v4, s0
3174; GCN-NEXT:    v_or_b32_e32 v3, 1, v3
3175; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
3176; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
3177; GCN-NEXT:    s_sext_i32_i16 s2, s3
3178; GCN-NEXT:    v_mul_lo_u32 v2, v3, v2
3179; GCN-NEXT:    v_cvt_f32_i32_e32 v3, s2
3180; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v4
3181; GCN-NEXT:    s_xor_b32 s0, s2, s0
3182; GCN-NEXT:    s_ashr_i32 s0, s0, 30
3183; GCN-NEXT:    s_or_b32 s0, s0, 1
3184; GCN-NEXT:    v_mul_f32_e32 v5, v3, v5
3185; GCN-NEXT:    v_trunc_f32_e32 v5, v5
3186; GCN-NEXT:    v_mad_f32 v3, -v5, v4, v3
3187; GCN-NEXT:    v_cvt_i32_f32_e32 v5, v5
3188; GCN-NEXT:    v_mov_b32_e32 v6, s0
3189; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v4|
3190; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
3191; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
3192; GCN-NEXT:    v_mul_lo_u32 v3, v3, s1
3193; GCN-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
3194; GCN-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
3195; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
3196; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s3, v3
3197; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
3198; GCN-NEXT:    buffer_store_short v2, off, s[4:7], 0 offset:4
3199; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3200; GCN-NEXT:    s_endpgm
; Input IR: a single vector srem whose result is stored to the output pointer.
3201  %r = srem <3 x i16> %x, %y
3202  store <3 x i16> %r, <3 x i16> addrspace(1)* %out
3203  ret void
3204}
3205
; Kernel under test: element-wise unsigned division of two <3 x i15> vectors
; (a non-power-of-two element width), with the quotient vector stored to %out.
;
; The opt-side assertions below expect each of the three lanes (index 0, 1, 2)
; to be expanded on its own into the same sequence:
;   * both operands are zero-extended to i32 and converted to float (uitofp);
;   * a float quotient is formed with llvm.amdgcn.rcp.f32 + fmul and then
;     truncated with llvm.trunc.f32;
;   * llvm.amdgcn.fmad.ftz.f32 computes (-q * y + x), and when its magnitude
;     is >= |y| the integer quotient is incremented by 1;
;   * the result is masked with 32767 (the low 15 bits) and truncated to i15
;     before being inserted into the result vector.
; The llc-side assertions pin the final instruction sequence for the same
; kernel. Both assertion groups are autogenerated (see the NOTE lines at the
; top of the file); regenerate them rather than editing them by hand.
3206define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
3207; CHECK-LABEL: @udiv_v3i15(
3208; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
3209; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
3210; CHECK-NEXT:    [[TMP3:%.*]] = zext i15 [[TMP1]] to i32
3211; CHECK-NEXT:    [[TMP4:%.*]] = zext i15 [[TMP2]] to i32
3212; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
3213; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
3214; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
3215; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
3216; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
3217; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
3218; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
3219; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
3220; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
3221; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
3222; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
3223; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
3224; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
3225; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 32767
3226; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i15
3227; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <3 x i15> undef, i15 [[TMP19]], i64 0
3228; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <3 x i15> [[X]], i64 1
3229; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <3 x i15> [[Y]], i64 1
3230; CHECK-NEXT:    [[TMP23:%.*]] = zext i15 [[TMP21]] to i32
3231; CHECK-NEXT:    [[TMP24:%.*]] = zext i15 [[TMP22]] to i32
3232; CHECK-NEXT:    [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float
3233; CHECK-NEXT:    [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float
3234; CHECK-NEXT:    [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]])
3235; CHECK-NEXT:    [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]]
3236; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]])
3237; CHECK-NEXT:    [[TMP30:%.*]] = fneg fast float [[TMP29]]
3238; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]])
3239; CHECK-NEXT:    [[TMP32:%.*]] = fptoui float [[TMP29]] to i32
3240; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]])
3241; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]])
3242; CHECK-NEXT:    [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]]
3243; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0
3244; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]]
3245; CHECK-NEXT:    [[TMP38:%.*]] = and i32 [[TMP37]], 32767
3246; CHECK-NEXT:    [[TMP39:%.*]] = trunc i32 [[TMP38]] to i15
3247; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <3 x i15> [[TMP20]], i15 [[TMP39]], i64 1
3248; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <3 x i15> [[X]], i64 2
3249; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <3 x i15> [[Y]], i64 2
3250; CHECK-NEXT:    [[TMP43:%.*]] = zext i15 [[TMP41]] to i32
3251; CHECK-NEXT:    [[TMP44:%.*]] = zext i15 [[TMP42]] to i32
3252; CHECK-NEXT:    [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float
3253; CHECK-NEXT:    [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float
3254; CHECK-NEXT:    [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]])
3255; CHECK-NEXT:    [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]]
3256; CHECK-NEXT:    [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]])
3257; CHECK-NEXT:    [[TMP50:%.*]] = fneg fast float [[TMP49]]
3258; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]])
3259; CHECK-NEXT:    [[TMP52:%.*]] = fptoui float [[TMP49]] to i32
3260; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]])
3261; CHECK-NEXT:    [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]])
3262; CHECK-NEXT:    [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]]
3263; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0
3264; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]]
3265; CHECK-NEXT:    [[TMP58:%.*]] = and i32 [[TMP57]], 32767
3266; CHECK-NEXT:    [[TMP59:%.*]] = trunc i32 [[TMP58]] to i15
3267; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <3 x i15> [[TMP40]], i15 [[TMP59]], i64 2
3268; CHECK-NEXT:    store <3 x i15> [[TMP60]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8
3269; CHECK-NEXT:    ret void
3270;
3271; GCN-LABEL: udiv_v3i15:
3272; GCN:       ; %bb.0:
3273; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3274; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
3275; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
3276; GCN-NEXT:    s_mov_b32 s7, 0xf000
3277; GCN-NEXT:    s_mov_b32 s6, -1
3278; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3279; GCN-NEXT:    v_mov_b32_e32 v0, s2
3280; GCN-NEXT:    v_alignbit_b32 v0, s3, v0, 30
3281; GCN-NEXT:    s_movk_i32 s3, 0x7fff
3282; GCN-NEXT:    s_and_b32 s9, s0, s3
3283; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s9
3284; GCN-NEXT:    v_mov_b32_e32 v2, s0
3285; GCN-NEXT:    s_and_b32 s8, s2, s3
3286; GCN-NEXT:    s_bfe_u32 s0, s0, 0xf000f
3287; GCN-NEXT:    v_cvt_f32_u32_e32 v5, s0
3288; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s8
3289; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v1
3290; GCN-NEXT:    s_bfe_u32 s2, s2, 0xf000f
3291; GCN-NEXT:    v_alignbit_b32 v2, s1, v2, 30
3292; GCN-NEXT:    v_cvt_f32_u32_e32 v6, s2
3293; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
3294; GCN-NEXT:    v_rcp_iflag_f32_e32 v7, v5
3295; GCN-NEXT:    v_and_b32_e32 v2, s3, v2
3296; GCN-NEXT:    v_trunc_f32_e32 v4, v4
3297; GCN-NEXT:    v_mad_f32 v3, -v4, v1, v3
3298; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v4
3299; GCN-NEXT:    v_cvt_f32_u32_e32 v2, v2
3300; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
3301; GCN-NEXT:    v_mul_f32_e32 v1, v6, v7
3302; GCN-NEXT:    v_and_b32_e32 v0, s3, v0
3303; GCN-NEXT:    v_trunc_f32_e32 v1, v1
3304; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v4, vcc
3305; GCN-NEXT:    v_mad_f32 v4, -v1, v5, v6
3306; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
3307; GCN-NEXT:    v_cvt_f32_u32_e32 v0, v0
3308; GCN-NEXT:    v_rcp_iflag_f32_e32 v6, v2
3309; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v5
3310; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
3311; GCN-NEXT:    v_mul_f32_e32 v1, v0, v6
3312; GCN-NEXT:    v_trunc_f32_e32 v1, v1
3313; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v1
3314; GCN-NEXT:    v_mad_f32 v0, -v1, v2, v0
3315; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, v2
3316; GCN-NEXT:    v_and_b32_e32 v2, s3, v3
3317; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v5, vcc
3318; GCN-NEXT:    v_and_b32_e32 v3, s3, v4
3319; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
3320; GCN-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
3321; GCN-NEXT:    v_or_b32_e32 v2, v2, v3
3322; GCN-NEXT:    v_or_b32_e32 v0, v2, v0
3323; GCN-NEXT:    v_and_b32_e32 v1, 0x1fff, v1
3324; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3325; GCN-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
3326; GCN-NEXT:    s_endpgm
; Input IR: a single vector udiv whose result is stored to the output pointer.
3327  %r = udiv <3 x i15> %x, %y
3328  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
3329  ret void
3330}
3331
3332define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
3333; CHECK-LABEL: @urem_v3i15(
3334; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
3335; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
3336; CHECK-NEXT:    [[TMP3:%.*]] = zext i15 [[TMP1]] to i32
3337; CHECK-NEXT:    [[TMP4:%.*]] = zext i15 [[TMP2]] to i32
3338; CHECK-NEXT:    [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
3339; CHECK-NEXT:    [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
3340; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
3341; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
3342; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
3343; CHECK-NEXT:    [[TMP10:%.*]] = fneg fast float [[TMP9]]
3344; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
3345; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
3346; CHECK-NEXT:    [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
3347; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
3348; CHECK-NEXT:    [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
3349; CHECK-NEXT:    [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
3350; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
3351; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]]
3352; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]]
3353; CHECK-NEXT:    [[TMP20:%.*]] = and i32 [[TMP19]], 32767
3354; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i15
3355; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <3 x i15> undef, i15 [[TMP21]], i64 0
3356; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <3 x i15> [[X]], i64 1
3357; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <3 x i15> [[Y]], i64 1
3358; CHECK-NEXT:    [[TMP25:%.*]] = zext i15 [[TMP23]] to i32
3359; CHECK-NEXT:    [[TMP26:%.*]] = zext i15 [[TMP24]] to i32
3360; CHECK-NEXT:    [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float
3361; CHECK-NEXT:    [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float
3362; CHECK-NEXT:    [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]])
3363; CHECK-NEXT:    [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]]
3364; CHECK-NEXT:    [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]])
3365; CHECK-NEXT:    [[TMP32:%.*]] = fneg fast float [[TMP31]]
3366; CHECK-NEXT:    [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]])
3367; CHECK-NEXT:    [[TMP34:%.*]] = fptoui float [[TMP31]] to i32
3368; CHECK-NEXT:    [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
3369; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]])
3370; CHECK-NEXT:    [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]]
3371; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0
3372; CHECK-NEXT:    [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]]
3373; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]]
3374; CHECK-NEXT:    [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]]
3375; CHECK-NEXT:    [[TMP42:%.*]] = and i32 [[TMP41]], 32767
3376; CHECK-NEXT:    [[TMP43:%.*]] = trunc i32 [[TMP42]] to i15
3377; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <3 x i15> [[TMP22]], i15 [[TMP43]], i64 1
3378; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <3 x i15> [[X]], i64 2
3379; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <3 x i15> [[Y]], i64 2
3380; CHECK-NEXT:    [[TMP47:%.*]] = zext i15 [[TMP45]] to i32
3381; CHECK-NEXT:    [[TMP48:%.*]] = zext i15 [[TMP46]] to i32
3382; CHECK-NEXT:    [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float
3383; CHECK-NEXT:    [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float
3384; CHECK-NEXT:    [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]])
3385; CHECK-NEXT:    [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]]
3386; CHECK-NEXT:    [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]])
3387; CHECK-NEXT:    [[TMP54:%.*]] = fneg fast float [[TMP53]]
3388; CHECK-NEXT:    [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]])
3389; CHECK-NEXT:    [[TMP56:%.*]] = fptoui float [[TMP53]] to i32
3390; CHECK-NEXT:    [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]])
3391; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]])
3392; CHECK-NEXT:    [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]]
3393; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0
3394; CHECK-NEXT:    [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]]
3395; CHECK-NEXT:    [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]]
3396; CHECK-NEXT:    [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]]
3397; CHECK-NEXT:    [[TMP64:%.*]] = and i32 [[TMP63]], 32767
3398; CHECK-NEXT:    [[TMP65:%.*]] = trunc i32 [[TMP64]] to i15
3399; CHECK-NEXT:    [[TMP66:%.*]] = insertelement <3 x i15> [[TMP44]], i15 [[TMP65]], i64 2
3400; CHECK-NEXT:    store <3 x i15> [[TMP66]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8
3401; CHECK-NEXT:    ret void
3402;
3403; GCN-LABEL: urem_v3i15:
3404; GCN:       ; %bb.0:
3405; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3406; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
3407; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
3408; GCN-NEXT:    s_mov_b32 s7, 0xf000
3409; GCN-NEXT:    s_mov_b32 s6, -1
3410; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3411; GCN-NEXT:    v_mov_b32_e32 v0, s2
3412; GCN-NEXT:    v_alignbit_b32 v0, s3, v0, 30
3413; GCN-NEXT:    s_movk_i32 s3, 0x7fff
3414; GCN-NEXT:    s_and_b32 s10, s0, s3
3415; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s10
3416; GCN-NEXT:    s_and_b32 s9, s2, s3
3417; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s9
3418; GCN-NEXT:    v_mov_b32_e32 v2, s0
3419; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v1
3420; GCN-NEXT:    v_alignbit_b32 v2, s1, v2, 30
3421; GCN-NEXT:    s_bfe_u32 s1, s0, 0xf000f
3422; GCN-NEXT:    v_cvt_f32_u32_e32 v5, s1
3423; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
3424; GCN-NEXT:    v_trunc_f32_e32 v4, v4
3425; GCN-NEXT:    v_mad_f32 v3, -v4, v1, v3
3426; GCN-NEXT:    v_cvt_u32_f32_e32 v4, v4
3427; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
3428; GCN-NEXT:    s_bfe_u32 s10, s2, 0xf000f
3429; GCN-NEXT:    v_cvt_f32_u32_e32 v3, s10
3430; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v4, vcc
3431; GCN-NEXT:    v_mul_lo_u32 v1, v1, s0
3432; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v5
3433; GCN-NEXT:    v_and_b32_e32 v2, s3, v2
3434; GCN-NEXT:    v_and_b32_e32 v0, s3, v0
3435; GCN-NEXT:    v_sub_i32_e32 v6, vcc, s2, v1
3436; GCN-NEXT:    v_mul_f32_e32 v1, v3, v4
3437; GCN-NEXT:    v_cvt_f32_u32_e32 v4, v2
3438; GCN-NEXT:    v_cvt_f32_u32_e32 v7, v0
3439; GCN-NEXT:    v_trunc_f32_e32 v1, v1
3440; GCN-NEXT:    v_mad_f32 v3, -v1, v5, v3
3441; GCN-NEXT:    v_rcp_iflag_f32_e32 v8, v4
3442; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
3443; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
3444; GCN-NEXT:    s_lshr_b32 s0, s0, 15
3445; GCN-NEXT:    v_mul_f32_e32 v3, v7, v8
3446; GCN-NEXT:    v_trunc_f32_e32 v3, v3
3447; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v3
3448; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
3449; GCN-NEXT:    v_mad_f32 v3, -v3, v4, v7
3450; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
3451; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
3452; GCN-NEXT:    v_mul_lo_u32 v1, v1, s0
3453; GCN-NEXT:    v_mul_lo_u32 v2, v3, v2
3454; GCN-NEXT:    s_lshr_b32 s8, s2, 15
3455; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s8, v1
3456; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v2, v0
3457; GCN-NEXT:    v_and_b32_e32 v3, s3, v3
3458; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
3459; GCN-NEXT:    v_and_b32_e32 v2, s3, v6
3460; GCN-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
3461; GCN-NEXT:    v_or_b32_e32 v2, v2, v3
3462; GCN-NEXT:    v_or_b32_e32 v0, v2, v0
3463; GCN-NEXT:    v_and_b32_e32 v1, 0x1fff, v1
3464; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3465; GCN-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
3466; GCN-NEXT:    s_endpgm
3467  %r = urem <3 x i15> %x, %y
3468  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
3469  ret void
3470}
3471
; Signed division of two <3 x i15> kernel arguments; the quotient vector is
; stored to %out.
;
; The IR assertions below expect the vector operation to be scalarized: each
; lane is extracted, sign-extended to i32 and divided through a float sequence
; (sitofp, llvm.amdgcn.rcp.f32, fmul, llvm.trunc.f32, llvm.amdgcn.fmad.ftz.f32).
; A correction term, or(ashr(x ^ y, 30), 1), is added to the truncated quotient
; when |fmad result| >= |divisor|. The i32 result is then narrowed back to i15
; with shl/ashr by 17 followed by trunc.
;
; The ISA assertions expect the three 15-bit results to be packed (shifts by 15
; and 30, mask 0x1fff on the high part) and written with one dword store plus
; one short store at offset 4.
3472define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
3473; CHECK-LABEL: @sdiv_v3i15(
3474; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
3475; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
3476; CHECK-NEXT:    [[TMP3:%.*]] = sext i15 [[TMP1]] to i32
3477; CHECK-NEXT:    [[TMP4:%.*]] = sext i15 [[TMP2]] to i32
3478; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
3479; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
3480; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
3481; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
3482; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
3483; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
3484; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
3485; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
3486; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
3487; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
3488; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
3489; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
3490; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
3491; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
3492; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
3493; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
3494; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 17
3495; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 17
3496; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i15
3497; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <3 x i15> undef, i15 [[TMP23]], i64 0
3498; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <3 x i15> [[X]], i64 1
3499; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <3 x i15> [[Y]], i64 1
3500; CHECK-NEXT:    [[TMP27:%.*]] = sext i15 [[TMP25]] to i32
3501; CHECK-NEXT:    [[TMP28:%.*]] = sext i15 [[TMP26]] to i32
3502; CHECK-NEXT:    [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]]
3503; CHECK-NEXT:    [[TMP30:%.*]] = ashr i32 [[TMP29]], 30
3504; CHECK-NEXT:    [[TMP31:%.*]] = or i32 [[TMP30]], 1
3505; CHECK-NEXT:    [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float
3506; CHECK-NEXT:    [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float
3507; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
3508; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]]
3509; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]])
3510; CHECK-NEXT:    [[TMP37:%.*]] = fneg fast float [[TMP36]]
3511; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]])
3512; CHECK-NEXT:    [[TMP39:%.*]] = fptosi float [[TMP36]] to i32
3513; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]])
3514; CHECK-NEXT:    [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]])
3515; CHECK-NEXT:    [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]]
3516; CHECK-NEXT:    [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0
3517; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]]
3518; CHECK-NEXT:    [[TMP45:%.*]] = shl i32 [[TMP44]], 17
3519; CHECK-NEXT:    [[TMP46:%.*]] = ashr i32 [[TMP45]], 17
3520; CHECK-NEXT:    [[TMP47:%.*]] = trunc i32 [[TMP46]] to i15
3521; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <3 x i15> [[TMP24]], i15 [[TMP47]], i64 1
3522; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <3 x i15> [[X]], i64 2
3523; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <3 x i15> [[Y]], i64 2
3524; CHECK-NEXT:    [[TMP51:%.*]] = sext i15 [[TMP49]] to i32
3525; CHECK-NEXT:    [[TMP52:%.*]] = sext i15 [[TMP50]] to i32
3526; CHECK-NEXT:    [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]]
3527; CHECK-NEXT:    [[TMP54:%.*]] = ashr i32 [[TMP53]], 30
3528; CHECK-NEXT:    [[TMP55:%.*]] = or i32 [[TMP54]], 1
3529; CHECK-NEXT:    [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float
3530; CHECK-NEXT:    [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float
3531; CHECK-NEXT:    [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]])
3532; CHECK-NEXT:    [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]]
3533; CHECK-NEXT:    [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]])
3534; CHECK-NEXT:    [[TMP61:%.*]] = fneg fast float [[TMP60]]
3535; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]])
3536; CHECK-NEXT:    [[TMP63:%.*]] = fptosi float [[TMP60]] to i32
3537; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]])
3538; CHECK-NEXT:    [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]])
3539; CHECK-NEXT:    [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]]
3540; CHECK-NEXT:    [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0
3541; CHECK-NEXT:    [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]]
3542; CHECK-NEXT:    [[TMP69:%.*]] = shl i32 [[TMP68]], 17
3543; CHECK-NEXT:    [[TMP70:%.*]] = ashr i32 [[TMP69]], 17
3544; CHECK-NEXT:    [[TMP71:%.*]] = trunc i32 [[TMP70]] to i15
3545; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <3 x i15> [[TMP48]], i15 [[TMP71]], i64 2
3546; CHECK-NEXT:    store <3 x i15> [[TMP72]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8
3547; CHECK-NEXT:    ret void
3548;
3549; GCN-LABEL: sdiv_v3i15:
3550; GCN:       ; %bb.0:
3551; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3552; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
3553; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
3554; GCN-NEXT:    s_mov_b32 s7, 0xf000
3555; GCN-NEXT:    s_mov_b32 s6, -1
3556; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3557; GCN-NEXT:    v_mov_b32_e32 v0, s2
3558; GCN-NEXT:    v_alignbit_b32 v0, s3, v0, 30
3559; GCN-NEXT:    s_bfe_i32 s3, s0, 0xf0000
3560; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s3
3561; GCN-NEXT:    v_mov_b32_e32 v1, s0
3562; GCN-NEXT:    v_alignbit_b32 v1, s1, v1, 30
3563; GCN-NEXT:    s_bfe_i32 s1, s2, 0xf0000
3564; GCN-NEXT:    v_cvt_f32_i32_e32 v3, s1
3565; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
3566; GCN-NEXT:    s_xor_b32 s1, s1, s3
3567; GCN-NEXT:    s_bfe_i32 s0, s0, 0xf000f
3568; GCN-NEXT:    s_ashr_i32 s1, s1, 30
3569; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
3570; GCN-NEXT:    v_trunc_f32_e32 v4, v4
3571; GCN-NEXT:    v_mad_f32 v3, -v4, v2, v3
3572; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
3573; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v4
3574; GCN-NEXT:    v_cvt_f32_i32_e32 v3, s0
3575; GCN-NEXT:    s_or_b32 s1, s1, 1
3576; GCN-NEXT:    v_mov_b32_e32 v5, s1
3577; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
3578; GCN-NEXT:    s_bfe_i32 s1, s2, 0xf000f
3579; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
3580; GCN-NEXT:    v_cvt_f32_i32_e32 v4, s1
3581; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v3
3582; GCN-NEXT:    s_xor_b32 s0, s1, s0
3583; GCN-NEXT:    v_bfe_i32 v1, v1, 0, 15
3584; GCN-NEXT:    s_ashr_i32 s0, s0, 30
3585; GCN-NEXT:    v_mul_f32_e32 v5, v4, v5
3586; GCN-NEXT:    v_trunc_f32_e32 v5, v5
3587; GCN-NEXT:    v_mad_f32 v4, -v5, v3, v4
3588; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v3|
3589; GCN-NEXT:    v_cvt_i32_f32_e32 v5, v5
3590; GCN-NEXT:    v_cvt_f32_i32_e32 v4, v1
3591; GCN-NEXT:    s_or_b32 s0, s0, 1
3592; GCN-NEXT:    v_mov_b32_e32 v6, s0
3593; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
3594; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 15
3595; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
3596; GCN-NEXT:    v_cvt_f32_i32_e32 v5, v0
3597; GCN-NEXT:    v_rcp_iflag_f32_e32 v6, v4
3598; GCN-NEXT:    v_xor_b32_e32 v0, v0, v1
3599; GCN-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
3600; GCN-NEXT:    v_or_b32_e32 v0, 1, v0
3601; GCN-NEXT:    v_mul_f32_e32 v1, v5, v6
3602; GCN-NEXT:    v_trunc_f32_e32 v1, v1
3603; GCN-NEXT:    v_mad_f32 v5, -v1, v4, v5
3604; GCN-NEXT:    v_cvt_i32_f32_e32 v1, v1
3605; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, |v4|
3606; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
3607; GCN-NEXT:    s_movk_i32 s0, 0x7fff
3608; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
3609; GCN-NEXT:    v_and_b32_e32 v3, s0, v3
3610; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
3611; GCN-NEXT:    v_and_b32_e32 v2, s0, v2
3612; GCN-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
3613; GCN-NEXT:    v_or_b32_e32 v2, v2, v3
3614; GCN-NEXT:    v_or_b32_e32 v0, v2, v0
3615; GCN-NEXT:    v_and_b32_e32 v1, 0x1fff, v1
3616; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3617; GCN-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
3618; GCN-NEXT:    s_endpgm
3619  %r = sdiv <3 x i15> %x, %y
3620  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
3621  ret void
3622}
3623
; Signed remainder of two <3 x i15> kernel arguments; the remainder vector is
; stored to %out.
;
; The IR assertions below expect the vector operation to be scalarized. Per
; lane, the operands are sign-extended to i32 and a quotient is formed through
; a float sequence (sitofp, llvm.amdgcn.rcp.f32, fmul, llvm.trunc.f32,
; llvm.amdgcn.fmad.ftz.f32) plus a correction term or(ashr(x ^ y, 30), 1) that
; is selected when |fmad result| >= |divisor|. The remainder is then computed
; as x - quotient * y and narrowed back to i15 with shl/ashr by 17 and trunc.
;
; The ISA assertions expect the three 15-bit results to be packed (shifts by 15
; and 30, mask 0x1fff on the high part) and written with one dword store plus
; one short store at offset 4.
3624define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) {
3625; CHECK-LABEL: @srem_v3i15(
3626; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0
3627; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0
3628; CHECK-NEXT:    [[TMP3:%.*]] = sext i15 [[TMP1]] to i32
3629; CHECK-NEXT:    [[TMP4:%.*]] = sext i15 [[TMP2]] to i32
3630; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
3631; CHECK-NEXT:    [[TMP6:%.*]] = ashr i32 [[TMP5]], 30
3632; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[TMP6]], 1
3633; CHECK-NEXT:    [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float
3634; CHECK-NEXT:    [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float
3635; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
3636; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]
3637; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]])
3638; CHECK-NEXT:    [[TMP13:%.*]] = fneg fast float [[TMP12]]
3639; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]])
3640; CHECK-NEXT:    [[TMP15:%.*]] = fptosi float [[TMP12]] to i32
3641; CHECK-NEXT:    [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]])
3642; CHECK-NEXT:    [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
3643; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
3644; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
3645; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
3646; CHECK-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]]
3647; CHECK-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]]
3648; CHECK-NEXT:    [[TMP23:%.*]] = shl i32 [[TMP22]], 17
3649; CHECK-NEXT:    [[TMP24:%.*]] = ashr i32 [[TMP23]], 17
3650; CHECK-NEXT:    [[TMP25:%.*]] = trunc i32 [[TMP24]] to i15
3651; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <3 x i15> undef, i15 [[TMP25]], i64 0
3652; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <3 x i15> [[X]], i64 1
3653; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <3 x i15> [[Y]], i64 1
3654; CHECK-NEXT:    [[TMP29:%.*]] = sext i15 [[TMP27]] to i32
3655; CHECK-NEXT:    [[TMP30:%.*]] = sext i15 [[TMP28]] to i32
3656; CHECK-NEXT:    [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]]
3657; CHECK-NEXT:    [[TMP32:%.*]] = ashr i32 [[TMP31]], 30
3658; CHECK-NEXT:    [[TMP33:%.*]] = or i32 [[TMP32]], 1
3659; CHECK-NEXT:    [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float
3660; CHECK-NEXT:    [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float
3661; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
3662; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]]
3663; CHECK-NEXT:    [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]])
3664; CHECK-NEXT:    [[TMP39:%.*]] = fneg fast float [[TMP38]]
3665; CHECK-NEXT:    [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]])
3666; CHECK-NEXT:    [[TMP41:%.*]] = fptosi float [[TMP38]] to i32
3667; CHECK-NEXT:    [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]])
3668; CHECK-NEXT:    [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]])
3669; CHECK-NEXT:    [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]]
3670; CHECK-NEXT:    [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0
3671; CHECK-NEXT:    [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]]
3672; CHECK-NEXT:    [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]]
3673; CHECK-NEXT:    [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]]
3674; CHECK-NEXT:    [[TMP49:%.*]] = shl i32 [[TMP48]], 17
3675; CHECK-NEXT:    [[TMP50:%.*]] = ashr i32 [[TMP49]], 17
3676; CHECK-NEXT:    [[TMP51:%.*]] = trunc i32 [[TMP50]] to i15
3677; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <3 x i15> [[TMP26]], i15 [[TMP51]], i64 1
3678; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <3 x i15> [[X]], i64 2
3679; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <3 x i15> [[Y]], i64 2
3680; CHECK-NEXT:    [[TMP55:%.*]] = sext i15 [[TMP53]] to i32
3681; CHECK-NEXT:    [[TMP56:%.*]] = sext i15 [[TMP54]] to i32
3682; CHECK-NEXT:    [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]]
3683; CHECK-NEXT:    [[TMP58:%.*]] = ashr i32 [[TMP57]], 30
3684; CHECK-NEXT:    [[TMP59:%.*]] = or i32 [[TMP58]], 1
3685; CHECK-NEXT:    [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float
3686; CHECK-NEXT:    [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float
3687; CHECK-NEXT:    [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]])
3688; CHECK-NEXT:    [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]]
3689; CHECK-NEXT:    [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]])
3690; CHECK-NEXT:    [[TMP65:%.*]] = fneg fast float [[TMP64]]
3691; CHECK-NEXT:    [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]])
3692; CHECK-NEXT:    [[TMP67:%.*]] = fptosi float [[TMP64]] to i32
3693; CHECK-NEXT:    [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
3694; CHECK-NEXT:    [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]])
3695; CHECK-NEXT:    [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]]
3696; CHECK-NEXT:    [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0
3697; CHECK-NEXT:    [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]]
3698; CHECK-NEXT:    [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]]
3699; CHECK-NEXT:    [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]]
3700; CHECK-NEXT:    [[TMP75:%.*]] = shl i32 [[TMP74]], 17
3701; CHECK-NEXT:    [[TMP76:%.*]] = ashr i32 [[TMP75]], 17
3702; CHECK-NEXT:    [[TMP77:%.*]] = trunc i32 [[TMP76]] to i15
3703; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <3 x i15> [[TMP52]], i15 [[TMP77]], i64 2
3704; CHECK-NEXT:    store <3 x i15> [[TMP78]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8
3705; CHECK-NEXT:    ret void
3706;
3707; GCN-LABEL: srem_v3i15:
3708; GCN:       ; %bb.0:
3709; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3710; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
3711; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
3712; GCN-NEXT:    s_mov_b32 s7, 0xf000
3713; GCN-NEXT:    s_mov_b32 s6, -1
3714; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3715; GCN-NEXT:    v_mov_b32_e32 v0, s2
3716; GCN-NEXT:    v_alignbit_b32 v0, s3, v0, 30
3717; GCN-NEXT:    s_movk_i32 s3, 0x7fff
3718; GCN-NEXT:    s_and_b32 s11, s0, s3
3719; GCN-NEXT:    s_bfe_i32 s11, s11, 0xf0000
3720; GCN-NEXT:    v_cvt_f32_i32_e32 v2, s11
3721; GCN-NEXT:    s_and_b32 s9, s2, s3
3722; GCN-NEXT:    s_bfe_i32 s9, s9, 0xf0000
3723; GCN-NEXT:    v_cvt_f32_i32_e32 v3, s9
3724; GCN-NEXT:    v_rcp_iflag_f32_e32 v4, v2
3725; GCN-NEXT:    s_xor_b32 s9, s9, s11
3726; GCN-NEXT:    s_ashr_i32 s9, s9, 30
3727; GCN-NEXT:    s_or_b32 s9, s9, 1
3728; GCN-NEXT:    v_mul_f32_e32 v4, v3, v4
3729; GCN-NEXT:    v_trunc_f32_e32 v4, v4
3730; GCN-NEXT:    v_mad_f32 v3, -v4, v2, v3
3731; GCN-NEXT:    v_cvt_i32_f32_e32 v4, v4
3732; GCN-NEXT:    v_mov_b32_e32 v5, s9
3733; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
3734; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
3735; GCN-NEXT:    v_mov_b32_e32 v1, s0
3736; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
3737; GCN-NEXT:    s_bfe_u32 s12, s0, 0xf000f
3738; GCN-NEXT:    v_alignbit_b32 v1, s1, v1, 30
3739; GCN-NEXT:    v_mul_lo_u32 v2, v2, s0
3740; GCN-NEXT:    s_lshr_b32 s1, s0, 15
3741; GCN-NEXT:    s_bfe_i32 s0, s12, 0xf0000
3742; GCN-NEXT:    v_cvt_f32_i32_e32 v3, s0
3743; GCN-NEXT:    s_bfe_u32 s10, s2, 0xf000f
3744; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s2, v2
3745; GCN-NEXT:    s_lshr_b32 s8, s2, 15
3746; GCN-NEXT:    s_bfe_i32 s2, s10, 0xf0000
3747; GCN-NEXT:    v_cvt_f32_i32_e32 v4, s2
3748; GCN-NEXT:    v_rcp_iflag_f32_e32 v5, v3
3749; GCN-NEXT:    s_xor_b32 s0, s2, s0
3750; GCN-NEXT:    s_ashr_i32 s0, s0, 30
3751; GCN-NEXT:    s_or_b32 s0, s0, 1
3752; GCN-NEXT:    v_mul_f32_e32 v5, v4, v5
3753; GCN-NEXT:    v_trunc_f32_e32 v5, v5
3754; GCN-NEXT:    v_mad_f32 v4, -v5, v3, v4
3755; GCN-NEXT:    v_cvt_i32_f32_e32 v5, v5
3756; GCN-NEXT:    v_and_b32_e32 v1, s3, v1
3757; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v3|
3758; GCN-NEXT:    v_mov_b32_e32 v6, s0
3759; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v6, vcc
3760; GCN-NEXT:    v_bfe_i32 v4, v1, 0, 15
3761; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
3762; GCN-NEXT:    v_cvt_f32_i32_e32 v5, v4
3763; GCN-NEXT:    v_and_b32_e32 v0, s3, v0
3764; GCN-NEXT:    v_bfe_i32 v6, v0, 0, 15
3765; GCN-NEXT:    v_cvt_f32_i32_e32 v7, v6
3766; GCN-NEXT:    v_rcp_iflag_f32_e32 v8, v5
3767; GCN-NEXT:    v_xor_b32_e32 v4, v6, v4
3768; GCN-NEXT:    v_ashrrev_i32_e32 v4, 30, v4
3769; GCN-NEXT:    v_or_b32_e32 v4, 1, v4
3770; GCN-NEXT:    v_mul_f32_e32 v6, v7, v8
3771; GCN-NEXT:    v_trunc_f32_e32 v6, v6
3772; GCN-NEXT:    v_mad_f32 v7, -v6, v5, v7
3773; GCN-NEXT:    v_cvt_i32_f32_e32 v6, v6
3774; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v7|, |v5|
3775; GCN-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
3776; GCN-NEXT:    v_mul_lo_u32 v3, v3, s1
3777; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
3778; GCN-NEXT:    v_mul_lo_u32 v1, v4, v1
3779; GCN-NEXT:    v_and_b32_e32 v2, s3, v2
3780; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s8, v3
3781; GCN-NEXT:    v_and_b32_e32 v3, s3, v3
3782; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, v1, v0
3783; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], 30
3784; GCN-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
3785; GCN-NEXT:    v_or_b32_e32 v2, v2, v3
3786; GCN-NEXT:    v_or_b32_e32 v0, v2, v0
3787; GCN-NEXT:    v_and_b32_e32 v1, 0x1fff, v1
3788; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3789; GCN-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
3790; GCN-NEXT:    s_endpgm
3791  %r = srem <3 x i15> %x, %y
3792  store <3 x i15> %r, <3 x i15> addrspace(1)* %out
3793  ret void
3794}
3795
; Unsigned i32 division by the constant 1235195; the quotient is stored to %out.
;
; The IR assertions expect the udiv to pass through the opt run unchanged.
; The ISA assertions expect no division sequence: the quotient is formed from a
; v_mul_hi_u32 by the constant 0xb2a50881, followed by subtract, shift right
; by 1, add, and a final shift right by 20.
3796define amdgpu_kernel void @udiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
3797; CHECK-LABEL: @udiv_i32_oddk_denom(
3798; CHECK-NEXT:    [[R:%.*]] = udiv i32 [[X:%.*]], 1235195
3799; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
3800; CHECK-NEXT:    ret void
3801;
3802; GCN-LABEL: udiv_i32_oddk_denom:
3803; GCN:       ; %bb.0:
3804; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3805; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
3806; GCN-NEXT:    v_mov_b32_e32 v0, 0xb2a50881
3807; GCN-NEXT:    s_mov_b32 s7, 0xf000
3808; GCN-NEXT:    s_mov_b32 s6, -1
3809; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3810; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
3811; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s0, v0
3812; GCN-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
3813; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
3814; GCN-NEXT:    v_lshrrev_b32_e32 v0, 20, v0
3815; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3816; GCN-NEXT:    s_endpgm
3817  %r = udiv i32 %x, 1235195
3818  store i32 %r, i32 addrspace(1)* %out
3819  ret void
3820}
3821
; Unsigned i32 division by the constant 4096; the quotient is stored to %out.
;
; The IR assertions expect the udiv to pass through the opt run unchanged.
; The ISA assertions expect the division to become a single scalar logical
; shift right by 12.
3822define amdgpu_kernel void @udiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
3823; CHECK-LABEL: @udiv_i32_pow2k_denom(
3824; CHECK-NEXT:    [[R:%.*]] = udiv i32 [[X:%.*]], 4096
3825; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
3826; CHECK-NEXT:    ret void
3827;
3828; GCN-LABEL: udiv_i32_pow2k_denom:
3829; GCN:       ; %bb.0:
3830; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3831; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
3832; GCN-NEXT:    s_mov_b32 s7, 0xf000
3833; GCN-NEXT:    s_mov_b32 s6, -1
3834; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3835; GCN-NEXT:    s_lshr_b32 s0, s0, 12
3836; GCN-NEXT:    v_mov_b32_e32 v0, s0
3837; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3838; GCN-NEXT:    s_endpgm
3839  %r = udiv i32 %x, 4096
3840  store i32 %r, i32 addrspace(1)* %out
3841  ret void
3842}
3843
; Unsigned i32 division where the divisor is 4096 shifted left by the runtime
; value %y; the quotient is stored to %out.
;
; The IR assertions expect both the shl and the udiv to pass through the opt
; run unchanged.
; The ISA assertions expect the division to become a shift: 12 is added to %y
; and %x is logically shifted right by that sum.
3844define amdgpu_kernel void @udiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
3845; CHECK-LABEL: @udiv_i32_pow2_shl_denom(
3846; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
3847; CHECK-NEXT:    [[R:%.*]] = udiv i32 [[X:%.*]], [[SHL_Y]]
3848; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
3849; CHECK-NEXT:    ret void
3850;
3851; GCN-LABEL: udiv_i32_pow2_shl_denom:
3852; GCN:       ; %bb.0:
3853; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3854; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3855; GCN-NEXT:    s_mov_b32 s7, 0xf000
3856; GCN-NEXT:    s_mov_b32 s6, -1
3857; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3858; GCN-NEXT:    s_add_i32 s1, s1, 12
3859; GCN-NEXT:    s_lshr_b32 s0, s0, s1
3860; GCN-NEXT:    v_mov_b32_e32 v0, s0
3861; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
3862; GCN-NEXT:    s_endpgm
3863  %shl.y = shl i32 4096, %y
3864  %r = udiv i32 %x, %shl.y
3865  store i32 %r, i32 addrspace(1)* %out
3866  ret void
3867}
3868
; Unsigned division of a <2 x i32> argument by the splat constant
; <4096, 4096>; the quotient vector is stored to %out.
;
; The IR assertions expect the vector udiv to be scalarized into one
; "udiv i32 ..., 4096" per lane, with the results re-inserted into a vector.
; The ISA assertions expect each lane to become a scalar logical shift right
; by 12, with both results written by a single dwordx2 store.
3869define amdgpu_kernel void @udiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
3870; CHECK-LABEL: @udiv_v2i32_pow2k_denom(
3871; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
3872; CHECK-NEXT:    [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096
3873; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
3874; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
3875; CHECK-NEXT:    [[TMP5:%.*]] = udiv i32 [[TMP4]], 4096
3876; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
3877; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
3878; CHECK-NEXT:    ret void
3879;
3880; GCN-LABEL: udiv_v2i32_pow2k_denom:
3881; GCN:       ; %bb.0:
3882; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3883; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3884; GCN-NEXT:    s_mov_b32 s7, 0xf000
3885; GCN-NEXT:    s_mov_b32 s6, -1
3886; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3887; GCN-NEXT:    s_lshr_b32 s0, s0, 12
3888; GCN-NEXT:    s_lshr_b32 s1, s1, 12
3889; GCN-NEXT:    v_mov_b32_e32 v0, s0
3890; GCN-NEXT:    v_mov_b32_e32 v1, s1
3891; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
3892; GCN-NEXT:    s_endpgm
3893  %r = udiv <2 x i32> %x, <i32 4096, i32 4096>
3894  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
3895  ret void
3896}
3897
; Unsigned division of a <2 x i32> argument by the non-uniform constant
; <4096, 4095>; the quotient vector is stored to %out.
;
; The IR assertions expect the vector udiv to be scalarized into one scalar
; udiv per lane (by 4096 for lane 0 and by 4095 for lane 1).
; The ISA assertions expect lane 0 to become a scalar logical shift right by 12
; and lane 1 to use a v_mul_hi_u32 by the constant 0x100101 followed by
; subtract, shift right by 1, add, and shift right by 11. Both results are
; written by a single dwordx2 store.
3898define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
3899; CHECK-LABEL: @udiv_v2i32_mixed_pow2k_denom(
3900; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
3901; CHECK-NEXT:    [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096
3902; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
3903; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
3904; CHECK-NEXT:    [[TMP5:%.*]] = udiv i32 [[TMP4]], 4095
3905; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
3906; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
3907; CHECK-NEXT:    ret void
3908;
3909; GCN-LABEL: udiv_v2i32_mixed_pow2k_denom:
3910; GCN:       ; %bb.0:
3911; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
3912; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
3913; GCN-NEXT:    v_mov_b32_e32 v0, 0x100101
3914; GCN-NEXT:    s_mov_b32 s7, 0xf000
3915; GCN-NEXT:    s_mov_b32 s6, -1
3916; GCN-NEXT:    s_waitcnt lgkmcnt(0)
3917; GCN-NEXT:    v_mul_hi_u32 v0, s1, v0
3918; GCN-NEXT:    s_lshr_b32 s0, s0, 12
3919; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s1, v0
3920; GCN-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
3921; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
3922; GCN-NEXT:    v_lshrrev_b32_e32 v1, 11, v0
3923; GCN-NEXT:    v_mov_b32_e32 v0, s0
3924; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
3925; GCN-NEXT:    s_endpgm
3926  %r = udiv <2 x i32> %x, <i32 4096, i32 4095>
3927  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
3928  ret void
3929}
3930
3931define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
3932; CHECK-LABEL: @udiv_v2i32_pow2_shl_denom(
3933; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
3934; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
3935; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
3936; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
3937; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
3938; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
3939; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
3940; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[TMP2]]
3941; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
3942; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
3943; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
3944; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
3945; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
3946; CHECK-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
3947; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
3948; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
3949; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
3950; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
3951; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
3952; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
3953; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
3954; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
3955; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
3956; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
3957; CHECK-NEXT:    [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
3958; CHECK-NEXT:    [[TMP25:%.*]] = add i32 [[TMP21]], 1
3959; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]]
3960; CHECK-NEXT:    [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]]
3961; CHECK-NEXT:    [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]]
3962; CHECK-NEXT:    [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]]
3963; CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[TMP26]], 1
3964; CHECK-NEXT:    [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]]
3965; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <2 x i32> undef, i32 [[TMP31]], i64 0
3966; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <2 x i32> [[X]], i64 1
3967; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
3968; CHECK-NEXT:    [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float
3969; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
3970; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000
3971; CHECK-NEXT:    [[TMP38:%.*]] = fptoui float [[TMP37]] to i32
3972; CHECK-NEXT:    [[TMP39:%.*]] = sub i32 0, [[TMP34]]
3973; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]]
3974; CHECK-NEXT:    [[TMP41:%.*]] = zext i32 [[TMP38]] to i64
3975; CHECK-NEXT:    [[TMP42:%.*]] = zext i32 [[TMP40]] to i64
3976; CHECK-NEXT:    [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]]
3977; CHECK-NEXT:    [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
3978; CHECK-NEXT:    [[TMP45:%.*]] = lshr i64 [[TMP43]], 32
3979; CHECK-NEXT:    [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32
3980; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]]
3981; CHECK-NEXT:    [[TMP48:%.*]] = zext i32 [[TMP33]] to i64
3982; CHECK-NEXT:    [[TMP49:%.*]] = zext i32 [[TMP47]] to i64
3983; CHECK-NEXT:    [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]]
3984; CHECK-NEXT:    [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
3985; CHECK-NEXT:    [[TMP52:%.*]] = lshr i64 [[TMP50]], 32
3986; CHECK-NEXT:    [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32
3987; CHECK-NEXT:    [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]]
3988; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]]
3989; CHECK-NEXT:    [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]]
3990; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP53]], 1
3991; CHECK-NEXT:    [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]]
3992; CHECK-NEXT:    [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]]
3993; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]]
3994; CHECK-NEXT:    [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]]
3995; CHECK-NEXT:    [[TMP62:%.*]] = add i32 [[TMP58]], 1
3996; CHECK-NEXT:    [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]]
3997; CHECK-NEXT:    [[TMP64:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP63]], i64 1
3998; CHECK-NEXT:    store <2 x i32> [[TMP64]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
3999; CHECK-NEXT:    ret void
4000;
4001; GCN-LABEL: udiv_v2i32_pow2_shl_denom:
4002; GCN:       ; %bb.0:
4003; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
4004; GCN-NEXT:    s_movk_i32 s4, 0x1000
4005; GCN-NEXT:    s_mov_b32 s7, 0xf000
4006; GCN-NEXT:    s_mov_b32 s6, -1
4007; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4008; GCN-NEXT:    s_lshl_b32 s8, s4, s2
4009; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s8
4010; GCN-NEXT:    s_lshl_b32 s9, s4, s3
4011; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s9
4012; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4013; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
4014; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
4015; GCN-NEXT:    s_mov_b32 s0, 0x4f7ffffe
4016; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
4017; GCN-NEXT:    v_mul_f32_e32 v0, s0, v0
4018; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
4019; GCN-NEXT:    v_mul_f32_e32 v1, s0, v1
4020; GCN-NEXT:    s_sub_i32 s0, 0, s8
4021; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
4022; GCN-NEXT:    v_mul_lo_u32 v2, s0, v0
4023; GCN-NEXT:    s_sub_i32 s0, 0, s9
4024; GCN-NEXT:    v_mul_lo_u32 v3, s0, v1
4025; GCN-NEXT:    v_mul_hi_u32 v2, v0, v2
4026; GCN-NEXT:    v_mul_hi_u32 v3, v1, v3
4027; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
4028; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4029; GCN-NEXT:    v_mul_hi_u32 v0, s2, v0
4030; GCN-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
4031; GCN-NEXT:    v_mul_hi_u32 v1, s3, v1
4032; GCN-NEXT:    v_mul_lo_u32 v2, v0, s8
4033; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
4034; GCN-NEXT:    v_mul_lo_u32 v4, v1, s9
4035; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s2, v2
4036; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v2
4037; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
4038; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v2
4039; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
4040; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
4041; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
4042; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
4043; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s3, v4
4044; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
4045; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v2
4046; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
4047; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v2
4048; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
4049; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
4050; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
4051; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
4052; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4053; GCN-NEXT:    s_endpgm
4054  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
4055  %r = udiv <2 x i32> %x, %shl.y
4056  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
4057  ret void
4058}
4059
; Unsigned remainder of a scalar i32 by the non-power-of-two constant 1235195.
; The IR checks expect the opt run to leave the urem untouched. The ISA checks
; expect a multiply-high by the magic constant 0xb2a50881 with shift fix-ups,
; then a multiply by 0x12d8fb (1235195) and a subtract from the dividend.
4060define amdgpu_kernel void @urem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
4061; CHECK-LABEL: @urem_i32_oddk_denom(
4062; CHECK-NEXT:    [[R:%.*]] = urem i32 [[X:%.*]], 1235195
4063; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
4064; CHECK-NEXT:    ret void
4065;
4066; GCN-LABEL: urem_i32_oddk_denom:
4067; GCN:       ; %bb.0:
4068; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4069; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
4070; GCN-NEXT:    v_mov_b32_e32 v0, 0xb2a50881
4071; GCN-NEXT:    s_mov_b32 s7, 0xf000
4072; GCN-NEXT:    s_mov_b32 s6, -1
4073; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4074; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
4075; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s0, v0
4076; GCN-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
4077; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
4078; GCN-NEXT:    v_lshrrev_b32_e32 v0, 20, v0
4079; GCN-NEXT:    v_mul_u32_u24_e32 v0, 0x12d8fb, v0
4080; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
4081; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4082; GCN-NEXT:    s_endpgm
4083  %r = urem i32 %x, 1235195
4084  store i32 %r, i32 addrspace(1)* %out
4085  ret void
4086}
4087
; Unsigned remainder of a scalar i32 by the power-of-two constant 4096.
; The IR checks expect the urem to be left as-is by the opt run. The ISA checks
; expect the remainder to become a single scalar AND with 0xfff.
4088define amdgpu_kernel void @urem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
4089; CHECK-LABEL: @urem_i32_pow2k_denom(
4090; CHECK-NEXT:    [[R:%.*]] = urem i32 [[X:%.*]], 4096
4091; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
4092; CHECK-NEXT:    ret void
4093;
4094; GCN-LABEL: urem_i32_pow2k_denom:
4095; GCN:       ; %bb.0:
4096; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4097; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
4098; GCN-NEXT:    s_mov_b32 s7, 0xf000
4099; GCN-NEXT:    s_mov_b32 s6, -1
4100; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4101; GCN-NEXT:    s_and_b32 s0, s0, 0xfff
4102; GCN-NEXT:    v_mov_b32_e32 v0, s0
4103; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4104; GCN-NEXT:    s_endpgm
4105  %r = urem i32 %x, 4096
4106  store i32 %r, i32 addrspace(1)* %out
4107  ret void
4108}
4109
; Unsigned remainder of a scalar i32 by a run-time divisor built as 4096 << %y.
; The IR checks expect both the shl and the urem to survive the opt run. The
; ISA checks expect the remainder to be computed as x & ((4096 << y) - 1):
; a shift, an add of -1, and an AND.
4110define amdgpu_kernel void @urem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
4111; CHECK-LABEL: @urem_i32_pow2_shl_denom(
4112; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
4113; CHECK-NEXT:    [[R:%.*]] = urem i32 [[X:%.*]], [[SHL_Y]]
4114; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
4115; CHECK-NEXT:    ret void
4116;
4117; GCN-LABEL: urem_i32_pow2_shl_denom:
4118; GCN:       ; %bb.0:
4119; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4120; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4121; GCN-NEXT:    s_mov_b32 s7, 0xf000
4122; GCN-NEXT:    s_mov_b32 s6, -1
4123; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4124; GCN-NEXT:    s_lshl_b32 s1, 0x1000, s1
4125; GCN-NEXT:    s_add_i32 s1, s1, -1
4126; GCN-NEXT:    s_and_b32 s0, s0, s1
4127; GCN-NEXT:    v_mov_b32_e32 v0, s0
4128; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4129; GCN-NEXT:    s_endpgm
4130  %shl.y = shl i32 4096, %y
4131  %r = urem i32 %x, %shl.y
4132  store i32 %r, i32 addrspace(1)* %out
4133  ret void
4134}
4135
; Unsigned remainder of a <2 x i32> vector by the splat constant 4096.
; The IR checks expect the opt run to scalarize the vector urem into one
; "urem i32 ..., 4096" per lane, rebuilt with insertelement. The ISA checks
; expect each lane to become a scalar AND with 0xfff.
4136define amdgpu_kernel void @urem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
4137; CHECK-LABEL: @urem_v2i32_pow2k_denom(
4138; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
4139; CHECK-NEXT:    [[TMP2:%.*]] = urem i32 [[TMP1]], 4096
4140; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
4141; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
4142; CHECK-NEXT:    [[TMP5:%.*]] = urem i32 [[TMP4]], 4096
4143; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
4144; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
4145; CHECK-NEXT:    ret void
4146;
4147; GCN-LABEL: urem_v2i32_pow2k_denom:
4148; GCN:       ; %bb.0:
4149; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4150; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4151; GCN-NEXT:    s_movk_i32 s2, 0xfff
4152; GCN-NEXT:    s_mov_b32 s7, 0xf000
4153; GCN-NEXT:    s_mov_b32 s6, -1
4154; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4155; GCN-NEXT:    s_and_b32 s0, s0, s2
4156; GCN-NEXT:    s_and_b32 s1, s1, s2
4157; GCN-NEXT:    v_mov_b32_e32 v0, s0
4158; GCN-NEXT:    v_mov_b32_e32 v1, s1
4159; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4160; GCN-NEXT:    s_endpgm
4161  %r = urem <2 x i32> %x, <i32 4096, i32 4096>
4162  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
4163  ret void
4164}
4165
; Unsigned remainder of a <2 x i32> vector by a run-time divisor vector built
; as <4096, 4096> << %y.
; The IR checks expect the opt run to scalarize the urem and expand each lane
; into the reciprocal-based sequence: uitofp + llvm.amdgcn.rcp.f32 + fmul +
; fptoui to estimate the reciprocal, one refinement via 64-bit multiplies, a
; multiply-high by the dividend, then two conditional subtractions of the
; divisor to correct the remainder. The ISA checks expect the same expansion
; for both lanes (v_rcp_iflag_f32, v_mul_hi_u32, compare/select pairs).
4166define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
4167; CHECK-LABEL: @urem_v2i32_pow2_shl_denom(
4168; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
4169; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
4170; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
4171; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
4172; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
4173; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
4174; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
4175; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[TMP2]]
4176; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
4177; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
4178; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
4179; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
4180; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
4181; CHECK-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
4182; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
4183; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
4184; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
4185; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
4186; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
4187; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
4188; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
4189; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
4190; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
4191; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
4192; CHECK-NEXT:    [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
4193; CHECK-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]]
4194; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]]
4195; CHECK-NEXT:    [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]]
4196; CHECK-NEXT:    [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]]
4197; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]]
4198; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <2 x i32> undef, i32 [[TMP29]], i64 0
4199; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <2 x i32> [[X]], i64 1
4200; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
4201; CHECK-NEXT:    [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float
4202; CHECK-NEXT:    [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]])
4203; CHECK-NEXT:    [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000
4204; CHECK-NEXT:    [[TMP36:%.*]] = fptoui float [[TMP35]] to i32
4205; CHECK-NEXT:    [[TMP37:%.*]] = sub i32 0, [[TMP32]]
4206; CHECK-NEXT:    [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]]
4207; CHECK-NEXT:    [[TMP39:%.*]] = zext i32 [[TMP36]] to i64
4208; CHECK-NEXT:    [[TMP40:%.*]] = zext i32 [[TMP38]] to i64
4209; CHECK-NEXT:    [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]]
4210; CHECK-NEXT:    [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32
4211; CHECK-NEXT:    [[TMP43:%.*]] = lshr i64 [[TMP41]], 32
4212; CHECK-NEXT:    [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
4213; CHECK-NEXT:    [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]]
4214; CHECK-NEXT:    [[TMP46:%.*]] = zext i32 [[TMP31]] to i64
4215; CHECK-NEXT:    [[TMP47:%.*]] = zext i32 [[TMP45]] to i64
4216; CHECK-NEXT:    [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]]
4217; CHECK-NEXT:    [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32
4218; CHECK-NEXT:    [[TMP50:%.*]] = lshr i64 [[TMP48]], 32
4219; CHECK-NEXT:    [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
4220; CHECK-NEXT:    [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]]
4221; CHECK-NEXT:    [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]]
4222; CHECK-NEXT:    [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]]
4223; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]]
4224; CHECK-NEXT:    [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]]
4225; CHECK-NEXT:    [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]]
4226; CHECK-NEXT:    [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]]
4227; CHECK-NEXT:    [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]]
4228; CHECK-NEXT:    [[TMP60:%.*]] = insertelement <2 x i32> [[TMP30]], i32 [[TMP59]], i64 1
4229; CHECK-NEXT:    store <2 x i32> [[TMP60]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
4230; CHECK-NEXT:    ret void
4231;
4232; GCN-LABEL: urem_v2i32_pow2_shl_denom:
4233; GCN:       ; %bb.0:
4234; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
4235; GCN-NEXT:    s_movk_i32 s4, 0x1000
4236; GCN-NEXT:    s_mov_b32 s7, 0xf000
4237; GCN-NEXT:    s_mov_b32 s6, -1
4238; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4239; GCN-NEXT:    s_lshl_b32 s8, s4, s2
4240; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s8
4241; GCN-NEXT:    s_lshl_b32 s3, s4, s3
4242; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s3
4243; GCN-NEXT:    s_mov_b32 s4, 0x4f7ffffe
4244; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
4245; GCN-NEXT:    s_sub_i32 s2, 0, s8
4246; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v1
4247; GCN-NEXT:    v_mul_f32_e32 v0, s4, v0
4248; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
4249; GCN-NEXT:    v_mul_f32_e32 v1, s4, v1
4250; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
4251; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4252; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4253; GCN-NEXT:    v_mul_lo_u32 v2, s2, v0
4254; GCN-NEXT:    s_sub_i32 s2, 0, s3
4255; GCN-NEXT:    v_mul_lo_u32 v3, s2, v1
4256; GCN-NEXT:    v_mul_hi_u32 v2, v0, v2
4257; GCN-NEXT:    v_mul_hi_u32 v3, v1, v3
4258; GCN-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
4259; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4260; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
4261; GCN-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
4262; GCN-NEXT:    v_mul_hi_u32 v1, s1, v1
4263; GCN-NEXT:    v_mul_lo_u32 v0, v0, s8
4264; GCN-NEXT:    v_mul_lo_u32 v1, v1, s3
4265; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
4266; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s8, v0
4267; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
4268; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4269; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s8, v0
4270; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
4271; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4272; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
4273; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
4274; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
4275; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4276; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
4277; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
4278; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4279; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4280; GCN-NEXT:    s_endpgm
4281  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
4282  %r = urem <2 x i32> %x, %shl.y
4283  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
4284  ret void
4285}
4286
; Signed division of a scalar i32 by the non-power-of-two constant 1235195.
; The IR checks expect the opt run to leave the sdiv untouched. The ISA checks
; expect a signed multiply-high by the magic constant 0xd9528441, an add of
; the dividend, an arithmetic shift right by 20, and an add of the sign bit
; (extracted with a logical shift right by 31).
4287define amdgpu_kernel void @sdiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
4288; CHECK-LABEL: @sdiv_i32_oddk_denom(
4289; CHECK-NEXT:    [[R:%.*]] = sdiv i32 [[X:%.*]], 1235195
4290; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
4291; CHECK-NEXT:    ret void
4292;
4293; GCN-LABEL: sdiv_i32_oddk_denom:
4294; GCN:       ; %bb.0:
4295; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4296; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
4297; GCN-NEXT:    v_mov_b32_e32 v0, 0xd9528441
4298; GCN-NEXT:    s_mov_b32 s7, 0xf000
4299; GCN-NEXT:    s_mov_b32 s6, -1
4300; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4301; GCN-NEXT:    v_mul_hi_i32 v0, s0, v0
4302; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
4303; GCN-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
4304; GCN-NEXT:    v_ashrrev_i32_e32 v0, 20, v0
4305; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
4306; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4307; GCN-NEXT:    s_endpgm
4308  %r = sdiv i32 %x, 1235195
4309  store i32 %r, i32 addrspace(1)* %out
4310  ret void
4311}
4312
; Signed division of a scalar i32 by the power-of-two constant 4096.
; The IR checks expect the opt run to leave the sdiv untouched. The ISA checks
; expect a shift-only sequence: the sign mask (x >> 31, arithmetic) is shifted
; right logically by 20 to form a bias that is added to x, and the sum is then
; shifted right arithmetically by 12.
4313define amdgpu_kernel void @sdiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
4314; CHECK-LABEL: @sdiv_i32_pow2k_denom(
4315; CHECK-NEXT:    [[R:%.*]] = sdiv i32 [[X:%.*]], 4096
4316; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
4317; CHECK-NEXT:    ret void
4318;
4319; GCN-LABEL: sdiv_i32_pow2k_denom:
4320; GCN:       ; %bb.0:
4321; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4322; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
4323; GCN-NEXT:    s_mov_b32 s7, 0xf000
4324; GCN-NEXT:    s_mov_b32 s6, -1
4325; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4326; GCN-NEXT:    s_ashr_i32 s1, s0, 31
4327; GCN-NEXT:    s_lshr_b32 s1, s1, 20
4328; GCN-NEXT:    s_add_i32 s0, s0, s1
4329; GCN-NEXT:    s_ashr_i32 s0, s0, 12
4330; GCN-NEXT:    v_mov_b32_e32 v0, s0
4331; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4332; GCN-NEXT:    s_endpgm
4333  %r = sdiv i32 %x, 4096
4334  store i32 %r, i32 addrspace(1)* %out
4335  ret void
4336}
4337
; Signed division of a scalar i32 by a run-time divisor built as 4096 << %y.
; The IR checks expect both the shl and the sdiv to survive the opt run. The
; ISA checks expect the full signed expansion rather than a shift sequence:
; both operands are made non-negative via sign mask add/xor, the quotient is
; computed with the v_rcp_iflag_f32 reciprocal estimate, one refinement step
; and two compare/select corrections, and the result sign is restored with
; xor/subtract using the xor of the two sign masks.
4338define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
4339; CHECK-LABEL: @sdiv_i32_pow2_shl_denom(
4340; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
4341; CHECK-NEXT:    [[R:%.*]] = sdiv i32 [[X:%.*]], [[SHL_Y]]
4342; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
4343; CHECK-NEXT:    ret void
4344;
4345; GCN-LABEL: sdiv_i32_pow2_shl_denom:
4346; GCN:       ; %bb.0:
4347; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
4348; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4349; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4350; GCN-NEXT:    s_lshl_b32 s3, 0x1000, s3
4351; GCN-NEXT:    s_ashr_i32 s4, s3, 31
4352; GCN-NEXT:    s_add_i32 s3, s3, s4
4353; GCN-NEXT:    s_xor_b32 s7, s3, s4
4354; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s7
4355; GCN-NEXT:    s_sub_i32 s3, 0, s7
4356; GCN-NEXT:    s_ashr_i32 s5, s2, 31
4357; GCN-NEXT:    s_add_i32 s2, s2, s5
4358; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
4359; GCN-NEXT:    s_xor_b32 s6, s2, s5
4360; GCN-NEXT:    s_xor_b32 s4, s5, s4
4361; GCN-NEXT:    s_mov_b32 s2, -1
4362; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
4363; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
4364; GCN-NEXT:    v_mul_lo_u32 v1, s3, v0
4365; GCN-NEXT:    s_mov_b32 s3, 0xf000
4366; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
4367; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
4368; GCN-NEXT:    v_mul_hi_u32 v0, s6, v0
4369; GCN-NEXT:    v_mul_lo_u32 v1, v0, s7
4370; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
4371; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s6, v1
4372; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s7, v1
4373; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
4374; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4375; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
4376; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
4377; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
4378; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
4379; GCN-NEXT:    v_xor_b32_e32 v0, s4, v0
4380; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s4, v0
4381; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4382; GCN-NEXT:    s_endpgm
4383  %shl.y = shl i32 4096, %y
4384  %r = sdiv i32 %x, %shl.y
4385  store i32 %r, i32 addrspace(1)* %out
4386  ret void
4387}
4388
; Signed division of a <2 x i32> vector by the splat constant 4096.
; The IR checks expect the opt run to scalarize the vector sdiv into one
; "sdiv i32 ..., 4096" per lane, rebuilt with insertelement. The ISA checks
; expect each lane to use the shift-only sequence (sign mask shifted right by
; 20 as a bias, add, arithmetic shift right by 12).
4389define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
4390; CHECK-LABEL: @sdiv_v2i32_pow2k_denom(
4391; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
4392; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096
4393; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
4394; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
4395; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4096
4396; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
4397; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
4398; CHECK-NEXT:    ret void
4399;
4400; GCN-LABEL: sdiv_v2i32_pow2k_denom:
4401; GCN:       ; %bb.0:
4402; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4403; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4404; GCN-NEXT:    s_mov_b32 s7, 0xf000
4405; GCN-NEXT:    s_mov_b32 s6, -1
4406; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4407; GCN-NEXT:    s_ashr_i32 s2, s0, 31
4408; GCN-NEXT:    s_lshr_b32 s2, s2, 20
4409; GCN-NEXT:    s_ashr_i32 s3, s1, 31
4410; GCN-NEXT:    s_add_i32 s0, s0, s2
4411; GCN-NEXT:    s_lshr_b32 s2, s3, 20
4412; GCN-NEXT:    s_add_i32 s1, s1, s2
4413; GCN-NEXT:    s_ashr_i32 s0, s0, 12
4414; GCN-NEXT:    s_ashr_i32 s1, s1, 12
4415; GCN-NEXT:    v_mov_b32_e32 v0, s0
4416; GCN-NEXT:    v_mov_b32_e32 v1, s1
4417; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4418; GCN-NEXT:    s_endpgm
4419  %r = sdiv <2 x i32> %x, <i32 4096, i32 4096>
4420  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
4421  ret void
4422}
4423
; Signed division of a <2 x i32> vector by the mixed constant <4096, 4095>:
; lane 0 divides by a power of two, lane 1 by a non-power-of-two.
; The IR checks expect the opt run to scalarize into "sdiv i32 ..., 4096" and
; "sdiv i32 ..., 4095". The ISA checks expect lane 0 to use the shift-only
; sequence and lane 1 to use a signed multiply-high by the magic constant
; 0x80080081 followed by add/shift fix-ups.
; NOTE(review): the "ssdiv" spelling in the kernel name looks like a typo for
; "sdiv"; it is left unchanged here because both label checks depend on it.
4424define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
4425; CHECK-LABEL: @ssdiv_v2i32_mixed_pow2k_denom(
4426; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
4427; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096
4428; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
4429; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
4430; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4095
4431; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
4432; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
4433; CHECK-NEXT:    ret void
4434;
4435; GCN-LABEL: ssdiv_v2i32_mixed_pow2k_denom:
4436; GCN:       ; %bb.0:
4437; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4438; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4439; GCN-NEXT:    v_mov_b32_e32 v0, 0x80080081
4440; GCN-NEXT:    s_mov_b32 s7, 0xf000
4441; GCN-NEXT:    s_mov_b32 s6, -1
4442; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4443; GCN-NEXT:    v_mul_hi_i32 v0, s1, v0
4444; GCN-NEXT:    s_ashr_i32 s2, s0, 31
4445; GCN-NEXT:    s_lshr_b32 s2, s2, 20
4446; GCN-NEXT:    s_add_i32 s0, s0, s2
4447; GCN-NEXT:    v_add_i32_e32 v0, vcc, s1, v0
4448; GCN-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
4449; GCN-NEXT:    v_ashrrev_i32_e32 v0, 11, v0
4450; GCN-NEXT:    s_ashr_i32 s0, s0, 12
4451; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
4452; GCN-NEXT:    v_mov_b32_e32 v0, s0
4453; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4454; GCN-NEXT:    s_endpgm
4455  %r = sdiv <2 x i32> %x, <i32 4096, i32 4095>
4456  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
4457  ret void
4458}
4459
; Signed division of a <2 x i32> vector by a run-time divisor vector built as
; <4096, 4096> << %y.
; The IR checks expect the opt run to scalarize the sdiv and expand each lane:
; sign masks are taken with "ashr ..., 31", both operands are made
; non-negative with add/xor, the unsigned quotient is computed with the
; llvm.amdgcn.rcp.f32 reciprocal estimate, one refinement step and two
; compare/select corrections, and the sign is restored with xor/sub using the
; xor of the two sign masks. The ISA checks expect the same per-lane expansion.
4460define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
4461; CHECK-LABEL: @sdiv_v2i32_pow2_shl_denom(
4462; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
4463; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
4464; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
4465; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
4466; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
4467; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]]
4468; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]]
4469; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]]
4470; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]]
4471; CHECK-NEXT:    [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]]
4472; CHECK-NEXT:    [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float
4473; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]])
4474; CHECK-NEXT:    [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000
4475; CHECK-NEXT:    [[TMP13:%.*]] = fptoui float [[TMP12]] to i32
4476; CHECK-NEXT:    [[TMP14:%.*]] = sub i32 0, [[TMP9]]
4477; CHECK-NEXT:    [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]]
4478; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP13]] to i64
4479; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
4480; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
4481; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
4482; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
4483; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
4484; CHECK-NEXT:    [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]]
4485; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP8]] to i64
4486; CHECK-NEXT:    [[TMP24:%.*]] = zext i32 [[TMP22]] to i64
4487; CHECK-NEXT:    [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]]
4488; CHECK-NEXT:    [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
4489; CHECK-NEXT:    [[TMP27:%.*]] = lshr i64 [[TMP25]], 32
4490; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32
4491; CHECK-NEXT:    [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]]
4492; CHECK-NEXT:    [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]]
4493; CHECK-NEXT:    [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]]
4494; CHECK-NEXT:    [[TMP32:%.*]] = add i32 [[TMP28]], 1
4495; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]]
4496; CHECK-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]]
4497; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]]
4498; CHECK-NEXT:    [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]]
4499; CHECK-NEXT:    [[TMP37:%.*]] = add i32 [[TMP33]], 1
4500; CHECK-NEXT:    [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]]
4501; CHECK-NEXT:    [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]]
4502; CHECK-NEXT:    [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]]
4503; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <2 x i32> undef, i32 [[TMP40]], i64 0
4504; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i32> [[X]], i64 1
4505; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
4506; CHECK-NEXT:    [[TMP44:%.*]] = ashr i32 [[TMP42]], 31
4507; CHECK-NEXT:    [[TMP45:%.*]] = ashr i32 [[TMP43]], 31
4508; CHECK-NEXT:    [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]]
4509; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]]
4510; CHECK-NEXT:    [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]]
4511; CHECK-NEXT:    [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]]
4512; CHECK-NEXT:    [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]]
4513; CHECK-NEXT:    [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float
4514; CHECK-NEXT:    [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]])
4515; CHECK-NEXT:    [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000
4516; CHECK-NEXT:    [[TMP54:%.*]] = fptoui float [[TMP53]] to i32
4517; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 0, [[TMP50]]
4518; CHECK-NEXT:    [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]]
4519; CHECK-NEXT:    [[TMP57:%.*]] = zext i32 [[TMP54]] to i64
4520; CHECK-NEXT:    [[TMP58:%.*]] = zext i32 [[TMP56]] to i64
4521; CHECK-NEXT:    [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]]
4522; CHECK-NEXT:    [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32
4523; CHECK-NEXT:    [[TMP61:%.*]] = lshr i64 [[TMP59]], 32
4524; CHECK-NEXT:    [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32
4525; CHECK-NEXT:    [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]]
4526; CHECK-NEXT:    [[TMP64:%.*]] = zext i32 [[TMP49]] to i64
4527; CHECK-NEXT:    [[TMP65:%.*]] = zext i32 [[TMP63]] to i64
4528; CHECK-NEXT:    [[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]]
4529; CHECK-NEXT:    [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32
4530; CHECK-NEXT:    [[TMP68:%.*]] = lshr i64 [[TMP66]], 32
4531; CHECK-NEXT:    [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32
4532; CHECK-NEXT:    [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]]
4533; CHECK-NEXT:    [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]]
4534; CHECK-NEXT:    [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]]
4535; CHECK-NEXT:    [[TMP73:%.*]] = add i32 [[TMP69]], 1
4536; CHECK-NEXT:    [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]]
4537; CHECK-NEXT:    [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]]
4538; CHECK-NEXT:    [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]]
4539; CHECK-NEXT:    [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]]
4540; CHECK-NEXT:    [[TMP78:%.*]] = add i32 [[TMP74]], 1
4541; CHECK-NEXT:    [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]]
4542; CHECK-NEXT:    [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]]
4543; CHECK-NEXT:    [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]]
4544; CHECK-NEXT:    [[TMP82:%.*]] = insertelement <2 x i32> [[TMP41]], i32 [[TMP81]], i64 1
4545; CHECK-NEXT:    store <2 x i32> [[TMP82]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
4546; CHECK-NEXT:    ret void
4547;
4548; GCN-LABEL: sdiv_v2i32_pow2_shl_denom:
4549; GCN:       ; %bb.0:
4550; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
4551; GCN-NEXT:    s_movk_i32 s10, 0x1000
4552; GCN-NEXT:    s_mov_b32 s13, 0x4f7ffffe
4553; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4554; GCN-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xb
4555; GCN-NEXT:    s_mov_b32 s7, 0xf000
4556; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4557; GCN-NEXT:    s_lshl_b32 s2, s10, s2
4558; GCN-NEXT:    s_ashr_i32 s11, s2, 31
4559; GCN-NEXT:    s_add_i32 s2, s2, s11
4560; GCN-NEXT:    s_xor_b32 s12, s2, s11
4561; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s12
4562; GCN-NEXT:    s_lshl_b32 s0, s10, s3
4563; GCN-NEXT:    s_sub_i32 s3, 0, s12
4564; GCN-NEXT:    s_ashr_i32 s2, s0, 31
4565; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
4566; GCN-NEXT:    s_add_i32 s0, s0, s2
4567; GCN-NEXT:    s_xor_b32 s10, s0, s2
4568; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s10
4569; GCN-NEXT:    v_mul_f32_e32 v0, s13, v0
4570; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
4571; GCN-NEXT:    s_ashr_i32 s1, s8, 31
4572; GCN-NEXT:    s_add_i32 s0, s8, s1
4573; GCN-NEXT:    s_xor_b32 s0, s0, s1
4574; GCN-NEXT:    v_mul_lo_u32 v1, s3, v0
4575; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v2
4576; GCN-NEXT:    s_xor_b32 s3, s1, s11
4577; GCN-NEXT:    s_mov_b32 s6, -1
4578; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
4579; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
4580; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
4581; GCN-NEXT:    v_mul_f32_e32 v1, s13, v2
4582; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
4583; GCN-NEXT:    v_mul_lo_u32 v2, v0, s12
4584; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
4585; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s0, v2
4586; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v2
4587; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
4588; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s12, v2
4589; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
4590; GCN-NEXT:    s_sub_i32 s0, 0, s10
4591; GCN-NEXT:    v_mul_lo_u32 v3, s0, v1
4592; GCN-NEXT:    s_ashr_i32 s0, s9, 31
4593; GCN-NEXT:    s_add_i32 s1, s9, s0
4594; GCN-NEXT:    s_xor_b32 s1, s1, s0
4595; GCN-NEXT:    v_mul_hi_u32 v3, v1, v3
4596; GCN-NEXT:    v_add_i32_e32 v4, vcc, 1, v0
4597; GCN-NEXT:    s_xor_b32 s2, s0, s2
4598; GCN-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
4599; GCN-NEXT:    v_mul_hi_u32 v1, s1, v1
4600; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v2
4601; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
4602; GCN-NEXT:    v_xor_b32_e32 v0, s3, v0
4603; GCN-NEXT:    v_mul_lo_u32 v2, v1, s10
4604; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
4605; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s3, v0
4606; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s1, v2
4607; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v2
4608; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
4609; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s10, v2
4610; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
4611; GCN-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
4612; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s10, v2
4613; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
4614; GCN-NEXT:    v_xor_b32_e32 v1, s2, v1
4615; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s2, v1
4616; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4617; GCN-NEXT:    s_endpgm
4618  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
4619  %r = sdiv <2 x i32> %x, %shl.y
4620  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
4621  ret void
4622}
4623
; Signed 32-bit remainder by the odd constant 1235195 (0x12d8fb).
; The opt run (CHECK prefix) is expected to leave the srem untouched. The llc
; run (GCN prefix) is expected to form the quotient with a signed multiply-high
; by the constant 0xd9528441 followed by shift / sign-bit fixups, and then to
; compute x - q * 0x12d8fb.
4624define amdgpu_kernel void @srem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
4625; CHECK-LABEL: @srem_i32_oddk_denom(
4626; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], 1235195
4627; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
4628; CHECK-NEXT:    ret void
4629;
4630; GCN-LABEL: srem_i32_oddk_denom:
4631; GCN:       ; %bb.0:
4632; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4633; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
4634; GCN-NEXT:    v_mov_b32_e32 v0, 0xd9528441
4635; GCN-NEXT:    s_mov_b32 s7, 0xf000
4636; GCN-NEXT:    s_mov_b32 s6, -1
4637; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4638; GCN-NEXT:    v_mul_hi_i32 v0, s0, v0
4639; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
4640; GCN-NEXT:    v_lshrrev_b32_e32 v1, 31, v0
4641; GCN-NEXT:    v_ashrrev_i32_e32 v0, 20, v0
4642; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
4643; GCN-NEXT:    v_mul_i32_i24_e32 v0, 0x12d8fb, v0
4644; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
4645; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4646; GCN-NEXT:    s_endpgm
; Test input: a single srem by the constant, stored to %out.
4647  %r = srem i32 %x, 1235195
4648  store i32 %r, i32 addrspace(1)* %out
4649  ret void
4650}
4651
; Signed 32-bit remainder by the power-of-two constant 4096.
; The opt run (CHECK prefix) is expected to leave the srem untouched. The llc
; run (GCN prefix) is expected to use only scalar shift / add / and / sub: a
; bias derived from the sign of x is added, the low 12 bits are cleared with
; the mask 0xfffff000, and that value is subtracted from x.
4652define amdgpu_kernel void @srem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
4653; CHECK-LABEL: @srem_i32_pow2k_denom(
4654; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], 4096
4655; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
4656; CHECK-NEXT:    ret void
4657;
4658; GCN-LABEL: srem_i32_pow2k_denom:
4659; GCN:       ; %bb.0:
4660; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4661; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
4662; GCN-NEXT:    s_mov_b32 s7, 0xf000
4663; GCN-NEXT:    s_mov_b32 s6, -1
4664; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4665; GCN-NEXT:    s_ashr_i32 s1, s0, 31
4666; GCN-NEXT:    s_lshr_b32 s1, s1, 20
4667; GCN-NEXT:    s_add_i32 s1, s0, s1
4668; GCN-NEXT:    s_and_b32 s1, s1, 0xfffff000
4669; GCN-NEXT:    s_sub_i32 s0, s0, s1
4670; GCN-NEXT:    v_mov_b32_e32 v0, s0
4671; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
4672; GCN-NEXT:    s_endpgm
; Test input: a single srem by 4096, stored to %out.
4673  %r = srem i32 %x, 4096
4674  store i32 %r, i32 addrspace(1)* %out
4675  ret void
4676}
4677
; Signed 32-bit remainder where the denominator is the non-constant value
; (4096 << %y).
; The opt run (CHECK prefix) is expected to keep the shl and the srem as-is.
; The llc run (GCN prefix) is expected to emit a reciprocal-based expansion:
; absolute values via ashr/add/xor, v_rcp_iflag_f32 scaled by 0x4f7ffffe,
; two compare-and-conditionally-subtract correction steps, and a final
; xor/sub with the sign of the numerator.
4678define amdgpu_kernel void @srem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
4679; CHECK-LABEL: @srem_i32_pow2_shl_denom(
4680; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
4681; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], [[SHL_Y]]
4682; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
4683; CHECK-NEXT:    ret void
4684;
4685; GCN-LABEL: srem_i32_pow2_shl_denom:
4686; GCN:       ; %bb.0:
4687; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
4688; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4689; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4690; GCN-NEXT:    s_lshl_b32 s3, 0x1000, s3
4691; GCN-NEXT:    s_ashr_i32 s4, s3, 31
4692; GCN-NEXT:    s_add_i32 s3, s3, s4
4693; GCN-NEXT:    s_xor_b32 s6, s3, s4
4694; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s6
4695; GCN-NEXT:    s_sub_i32 s3, 0, s6
4696; GCN-NEXT:    s_ashr_i32 s4, s2, 31
4697; GCN-NEXT:    s_add_i32 s2, s2, s4
4698; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
4699; GCN-NEXT:    s_xor_b32 s5, s2, s4
4700; GCN-NEXT:    s_mov_b32 s2, -1
4701; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
4702; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
4703; GCN-NEXT:    v_mul_lo_u32 v1, s3, v0
4704; GCN-NEXT:    s_mov_b32 s3, 0xf000
4705; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
4706; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
4707; GCN-NEXT:    v_mul_hi_u32 v0, s5, v0
4708; GCN-NEXT:    v_mul_lo_u32 v0, v0, s6
4709; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s5, v0
4710; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s6, v0
4711; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
4712; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
4713; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s6, v0
4714; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
4715; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
4716; GCN-NEXT:    v_xor_b32_e32 v0, s4, v0
4717; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s4, v0
4718; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4719; GCN-NEXT:    s_endpgm
; Test input: the denominator is built by shifting 4096 left by %y.
4720  %shl.y = shl i32 4096, %y
4721  %r = srem i32 %x, %shl.y
4722  store i32 %r, i32 addrspace(1)* %out
4723  ret void
4724}
4725
; Signed <2 x i32> remainder by the splat constant <4096, 4096>.
; The opt run (CHECK prefix) is expected to scalarize the vector srem into one
; "srem i32 ..., 4096" per element (extractelement / insertelement). The llc
; run (GCN prefix) is expected to lower each element with the scalar
; shift / add / and / sub sequence, sharing the mask register loaded with
; 0xf000 via s_movk_i32.
4726define amdgpu_kernel void @srem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
4727; CHECK-LABEL: @srem_v2i32_pow2k_denom(
4728; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
4729; CHECK-NEXT:    [[TMP2:%.*]] = srem i32 [[TMP1]], 4096
4730; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
4731; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
4732; CHECK-NEXT:    [[TMP5:%.*]] = srem i32 [[TMP4]], 4096
4733; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
4734; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
4735; CHECK-NEXT:    ret void
4736;
4737; GCN-LABEL: srem_v2i32_pow2k_denom:
4738; GCN:       ; %bb.0:
4739; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4740; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4741; GCN-NEXT:    s_movk_i32 s2, 0xf000
4742; GCN-NEXT:    s_mov_b32 s7, 0xf000
4743; GCN-NEXT:    s_mov_b32 s6, -1
4744; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4745; GCN-NEXT:    s_ashr_i32 s3, s0, 31
4746; GCN-NEXT:    s_lshr_b32 s3, s3, 20
4747; GCN-NEXT:    s_add_i32 s3, s0, s3
4748; GCN-NEXT:    s_and_b32 s3, s3, s2
4749; GCN-NEXT:    s_sub_i32 s0, s0, s3
4750; GCN-NEXT:    s_ashr_i32 s3, s1, 31
4751; GCN-NEXT:    s_lshr_b32 s3, s3, 20
4752; GCN-NEXT:    s_add_i32 s3, s1, s3
4753; GCN-NEXT:    s_and_b32 s2, s3, s2
4754; GCN-NEXT:    s_sub_i32 s1, s1, s2
4755; GCN-NEXT:    v_mov_b32_e32 v0, s0
4756; GCN-NEXT:    v_mov_b32_e32 v1, s1
4757; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4758; GCN-NEXT:    s_endpgm
; Test input: a single vector srem by the splat of 4096, stored to %out.
4759  %r = srem <2 x i32> %x, <i32 4096, i32 4096>
4760  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
4761  ret void
4762}
4763
; Signed <2 x i32> remainder where the denominator is the non-constant vector
; (<4096, 4096> << %y).
; The opt run (CHECK prefix) is expected to scalarize the srem and expand each
; element in IR: absolute values via ashr/add/xor, a reciprocal estimate via
; uitofp + llvm.amdgcn.rcp.f32 + fmul + fptoui, a refinement built from
; 64-bit multiplies, two compare/select correction steps on the remainder,
; and a final xor/sub with the numerator's sign mask.
; The llc run (GCN prefix) is expected to show the matching machine sequence
; for both elements, interleaved.
4764define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
4765; CHECK-LABEL: @srem_v2i32_pow2_shl_denom(
4766; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
4767; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
4768; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
4769; CHECK-NEXT:    [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
4770; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP2]], 31
4771; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]]
4772; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]]
4773; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]]
4774; CHECK-NEXT:    [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]]
4775; CHECK-NEXT:    [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float
4776; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]])
4777; CHECK-NEXT:    [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000
4778; CHECK-NEXT:    [[TMP12:%.*]] = fptoui float [[TMP11]] to i32
4779; CHECK-NEXT:    [[TMP13:%.*]] = sub i32 0, [[TMP8]]
4780; CHECK-NEXT:    [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]]
4781; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP12]] to i64
4782; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP14]] to i64
4783; CHECK-NEXT:    [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]]
4784; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
4785; CHECK-NEXT:    [[TMP19:%.*]] = lshr i64 [[TMP17]], 32
4786; CHECK-NEXT:    [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32
4787; CHECK-NEXT:    [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]]
4788; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP7]] to i64
4789; CHECK-NEXT:    [[TMP23:%.*]] = zext i32 [[TMP21]] to i64
4790; CHECK-NEXT:    [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]]
4791; CHECK-NEXT:    [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
4792; CHECK-NEXT:    [[TMP26:%.*]] = lshr i64 [[TMP24]], 32
4793; CHECK-NEXT:    [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32
4794; CHECK-NEXT:    [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]]
4795; CHECK-NEXT:    [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]]
4796; CHECK-NEXT:    [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]]
4797; CHECK-NEXT:    [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]]
4798; CHECK-NEXT:    [[TMP32:%.*]] = select i1 [[TMP30]], i32 [[TMP31]], i32 [[TMP29]]
4799; CHECK-NEXT:    [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]]
4800; CHECK-NEXT:    [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]]
4801; CHECK-NEXT:    [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]]
4802; CHECK-NEXT:    [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]]
4803; CHECK-NEXT:    [[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]]
4804; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <2 x i32> undef, i32 [[TMP37]], i64 0
4805; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <2 x i32> [[X]], i64 1
4806; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
4807; CHECK-NEXT:    [[TMP41:%.*]] = ashr i32 [[TMP39]], 31
4808; CHECK-NEXT:    [[TMP42:%.*]] = ashr i32 [[TMP40]], 31
4809; CHECK-NEXT:    [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]]
4810; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]]
4811; CHECK-NEXT:    [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]]
4812; CHECK-NEXT:    [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]]
4813; CHECK-NEXT:    [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float
4814; CHECK-NEXT:    [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]])
4815; CHECK-NEXT:    [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000
4816; CHECK-NEXT:    [[TMP50:%.*]] = fptoui float [[TMP49]] to i32
4817; CHECK-NEXT:    [[TMP51:%.*]] = sub i32 0, [[TMP46]]
4818; CHECK-NEXT:    [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]]
4819; CHECK-NEXT:    [[TMP53:%.*]] = zext i32 [[TMP50]] to i64
4820; CHECK-NEXT:    [[TMP54:%.*]] = zext i32 [[TMP52]] to i64
4821; CHECK-NEXT:    [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]]
4822; CHECK-NEXT:    [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32
4823; CHECK-NEXT:    [[TMP57:%.*]] = lshr i64 [[TMP55]], 32
4824; CHECK-NEXT:    [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32
4825; CHECK-NEXT:    [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]]
4826; CHECK-NEXT:    [[TMP60:%.*]] = zext i32 [[TMP45]] to i64
4827; CHECK-NEXT:    [[TMP61:%.*]] = zext i32 [[TMP59]] to i64
4828; CHECK-NEXT:    [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]]
4829; CHECK-NEXT:    [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32
4830; CHECK-NEXT:    [[TMP64:%.*]] = lshr i64 [[TMP62]], 32
4831; CHECK-NEXT:    [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32
4832; CHECK-NEXT:    [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]]
4833; CHECK-NEXT:    [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]]
4834; CHECK-NEXT:    [[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]]
4835; CHECK-NEXT:    [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]]
4836; CHECK-NEXT:    [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]]
4837; CHECK-NEXT:    [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]]
4838; CHECK-NEXT:    [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]]
4839; CHECK-NEXT:    [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]]
4840; CHECK-NEXT:    [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]]
4841; CHECK-NEXT:    [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]]
4842; CHECK-NEXT:    [[TMP76:%.*]] = insertelement <2 x i32> [[TMP38]], i32 [[TMP75]], i64 1
4843; CHECK-NEXT:    store <2 x i32> [[TMP76]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
4844; CHECK-NEXT:    ret void
4845;
4846; GCN-LABEL: srem_v2i32_pow2_shl_denom:
4847; GCN:       ; %bb.0:
4848; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
4849; GCN-NEXT:    s_movk_i32 s6, 0x1000
4850; GCN-NEXT:    s_mov_b32 s10, 0x4f7ffffe
4851; GCN-NEXT:    s_mov_b32 s7, 0xf000
4852; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4853; GCN-NEXT:    s_lshl_b32 s2, s6, s2
4854; GCN-NEXT:    s_ashr_i32 s4, s2, 31
4855; GCN-NEXT:    s_add_i32 s2, s2, s4
4856; GCN-NEXT:    s_xor_b32 s9, s2, s4
4857; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s9
4858; GCN-NEXT:    s_lshl_b32 s2, s6, s3
4859; GCN-NEXT:    s_ashr_i32 s6, s2, 31
4860; GCN-NEXT:    s_add_i32 s2, s2, s6
4861; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
4862; GCN-NEXT:    s_sub_i32 s8, 0, s9
4863; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
4864; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
4865; GCN-NEXT:    v_mul_f32_e32 v0, s10, v0
4866; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
4867; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4868; GCN-NEXT:    s_ashr_i32 s3, s0, 31
4869; GCN-NEXT:    s_add_i32 s0, s0, s3
4870; GCN-NEXT:    v_mul_lo_u32 v1, s8, v0
4871; GCN-NEXT:    s_xor_b32 s8, s2, s6
4872; GCN-NEXT:    v_cvt_f32_u32_e32 v2, s8
4873; GCN-NEXT:    s_xor_b32 s0, s0, s3
4874; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
4875; GCN-NEXT:    s_sub_i32 s2, 0, s8
4876; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v2
4877; GCN-NEXT:    s_mov_b32 s6, -1
4878; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
4879; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
4880; GCN-NEXT:    v_mul_f32_e32 v1, s10, v2
4881; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
4882; GCN-NEXT:    v_mul_lo_u32 v0, v0, s9
4883; GCN-NEXT:    v_mul_lo_u32 v2, s2, v1
4884; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
4885; GCN-NEXT:    s_ashr_i32 s0, s1, 31
4886; GCN-NEXT:    v_mul_hi_u32 v2, v1, v2
4887; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v0
4888; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v0
4889; GCN-NEXT:    s_add_i32 s1, s1, s0
4890; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
4891; GCN-NEXT:    s_xor_b32 s1, s1, s0
4892; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
4893; GCN-NEXT:    v_mul_hi_u32 v1, s1, v1
4894; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v0
4895; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v0
4896; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
4897; GCN-NEXT:    v_mul_lo_u32 v1, v1, s8
4898; GCN-NEXT:    v_xor_b32_e32 v0, s3, v0
4899; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s3, v0
4900; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
4901; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s8, v1
4902; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v1
4903; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4904; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s8, v1
4905; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v1
4906; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
4907; GCN-NEXT:    v_xor_b32_e32 v1, s0, v1
4908; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s0, v1
4909; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4910; GCN-NEXT:    s_endpgm
; Test input: the denominator vector is built by shifting the splat of 4096
; left by %y, element-wise.
4911  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
4912  %r = srem <2 x i32> %x, %shl.y
4913  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
4914  ret void
4915}
4916
; Unsigned 64-bit division by the odd constant 1235195949943
; (high half 0x11f, low half 0x976a7377, both visible as immediates below).
; The opt run (CHECK prefix) is expected to leave the udiv untouched. The llc
; run (GCN prefix) is expected to emit the full 64-bit expansion: a
; floating-point reciprocal seed (v_rcp_f32), refinement through chains of
; 32-bit mul_lo / mul_hi with carry adds, a trial remainder, and a final
; select between q, q + 1 and q + 2.
4917define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
4918; CHECK-LABEL: @udiv_i64_oddk_denom(
4919; CHECK-NEXT:    [[R:%.*]] = udiv i64 [[X:%.*]], 1235195949943
4920; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
4921; CHECK-NEXT:    ret void
4922;
4923; GCN-LABEL: udiv_i64_oddk_denom:
4924; GCN:       ; %bb.0:
4925; GCN-NEXT:    v_mov_b32_e32 v0, 0x4f176a73
4926; GCN-NEXT:    v_mov_b32_e32 v1, 0x4f800000
4927; GCN-NEXT:    v_madmk_f32 v0, v1, 0x438f8000, v0
4928; GCN-NEXT:    v_rcp_f32_e32 v0, v0
4929; GCN-NEXT:    s_movk_i32 s2, 0xfee0
4930; GCN-NEXT:    s_mov_b32 s3, 0x68958c89
4931; GCN-NEXT:    v_mov_b32_e32 v8, 0
4932; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
4933; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
4934; GCN-NEXT:    v_trunc_f32_e32 v1, v1
4935; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
4936; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
4937; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
4938; GCN-NEXT:    v_mov_b32_e32 v7, 0
4939; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
4940; GCN-NEXT:    v_mul_lo_u32 v2, v0, s2
4941; GCN-NEXT:    v_mul_hi_u32 v3, v0, s3
4942; GCN-NEXT:    v_mul_lo_u32 v4, v1, s3
4943; GCN-NEXT:    s_mov_b32 s11, 0xf000
4944; GCN-NEXT:    s_waitcnt lgkmcnt(0)
4945; GCN-NEXT:    s_mov_b32 s8, s4
4946; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
4947; GCN-NEXT:    v_mul_lo_u32 v3, v0, s3
4948; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
4949; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
4950; GCN-NEXT:    v_mul_hi_u32 v4, v0, v2
4951; GCN-NEXT:    v_mul_hi_u32 v6, v0, v3
4952; GCN-NEXT:    v_mul_hi_u32 v9, v1, v2
4953; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
4954; GCN-NEXT:    s_movk_i32 s4, 0x11e
4955; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
4956; GCN-NEXT:    v_mul_lo_u32 v6, v1, v3
4957; GCN-NEXT:    v_mul_hi_u32 v3, v1, v3
4958; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
4959; GCN-NEXT:    s_mov_b32 s10, -1
4960; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
4961; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v4, v3, vcc
4962; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
4963; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
4964; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
4965; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
4966; GCN-NEXT:    v_mul_lo_u32 v4, v0, s2
4967; GCN-NEXT:    v_mul_hi_u32 v5, v0, s3
4968; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
4969; GCN-NEXT:    v_mul_lo_u32 v6, v2, s3
4970; GCN-NEXT:    s_mov_b32 s2, 0x976a7377
4971; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
4972; GCN-NEXT:    v_mul_lo_u32 v5, v0, s3
4973; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
4974; GCN-NEXT:    v_mul_lo_u32 v6, v0, v4
4975; GCN-NEXT:    v_mul_hi_u32 v10, v0, v4
4976; GCN-NEXT:    v_mul_hi_u32 v9, v0, v5
4977; GCN-NEXT:    v_mul_hi_u32 v11, v2, v4
4978; GCN-NEXT:    s_movk_i32 s3, 0x11f
4979; GCN-NEXT:    s_mov_b32 s9, s5
4980; GCN-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
4981; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v8, v10, vcc
4982; GCN-NEXT:    v_mul_lo_u32 v10, v2, v5
4983; GCN-NEXT:    v_mul_hi_u32 v5, v2, v5
4984; GCN-NEXT:    v_mul_lo_u32 v2, v2, v4
4985; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
4986; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v5, vcc
4987; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v11, v7, vcc
4988; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
4989; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
4990; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
4991; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[0:1]
4992; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
4993; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
4994; GCN-NEXT:    v_mul_lo_u32 v2, s6, v1
4995; GCN-NEXT:    v_mul_hi_u32 v3, s6, v0
4996; GCN-NEXT:    v_mul_hi_u32 v4, s6, v1
4997; GCN-NEXT:    v_mul_hi_u32 v5, s7, v1
4998; GCN-NEXT:    v_mul_lo_u32 v1, s7, v1
4999; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5000; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
5001; GCN-NEXT:    v_mul_lo_u32 v4, s7, v0
5002; GCN-NEXT:    v_mul_hi_u32 v0, s7, v0
5003; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
5004; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
5005; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
5006; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
5007; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
5008; GCN-NEXT:    v_mul_lo_u32 v2, v0, s3
5009; GCN-NEXT:    v_mul_hi_u32 v3, v0, s2
5010; GCN-NEXT:    v_mul_lo_u32 v4, v1, s2
5011; GCN-NEXT:    v_mov_b32_e32 v5, s3
5012; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5013; GCN-NEXT:    v_mul_lo_u32 v3, v0, s2
5014; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
5015; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s7, v2
5016; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s6, v3
5017; GCN-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
5018; GCN-NEXT:    v_subrev_i32_e64 v5, s[0:1], s2, v3
5019; GCN-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
5020; GCN-NEXT:    v_cmp_lt_u32_e64 s[0:1], s4, v4
5021; GCN-NEXT:    s_mov_b32 s2, 0x976a7376
5022; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
5023; GCN-NEXT:    v_cmp_lt_u32_e64 s[0:1], s2, v5
5024; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
5025; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v4
5026; GCN-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
5027; GCN-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
5028; GCN-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
5029; GCN-NEXT:    v_add_i32_e64 v7, s[0:1], 1, v0
5030; GCN-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
5031; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
5032; GCN-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[0:1]
5033; GCN-NEXT:    v_mov_b32_e32 v6, s7
5034; GCN-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
5035; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v2
5036; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
5037; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s2, v3
5038; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
5039; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s3, v2
5040; GCN-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
5041; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
5042; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
5043; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
5044; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5045; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
5046; GCN-NEXT:    s_endpgm
; Test input: a single 64-bit udiv by the constant, stored to %out.
5047  %r = udiv i64 %x, 1235195949943
5048  store i64 %r, i64 addrspace(1)* %out
5049  ret void
5050}
5051
; Unsigned 64-bit division by the power-of-two constant 4096.
; The opt run (CHECK prefix) is expected to leave the udiv untouched. The llc
; run (GCN prefix) is expected to lower it to a single 64-bit scalar logical
; shift right by 12 (s_lshr_b64), with no division expansion.
5052define amdgpu_kernel void @udiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
5053; CHECK-LABEL: @udiv_i64_pow2k_denom(
5054; CHECK-NEXT:    [[R:%.*]] = udiv i64 [[X:%.*]], 4096
5055; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
5056; CHECK-NEXT:    ret void
5057;
5058; GCN-LABEL: udiv_i64_pow2k_denom:
5059; GCN:       ; %bb.0:
5060; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
5061; GCN-NEXT:    s_mov_b32 s7, 0xf000
5062; GCN-NEXT:    s_mov_b32 s6, -1
5063; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5064; GCN-NEXT:    s_mov_b32 s4, s0
5065; GCN-NEXT:    s_mov_b32 s5, s1
5066; GCN-NEXT:    s_lshr_b64 s[0:1], s[2:3], 12
5067; GCN-NEXT:    v_mov_b32_e32 v0, s0
5068; GCN-NEXT:    v_mov_b32_e32 v1, s1
5069; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5070; GCN-NEXT:    s_endpgm
; Test input: a single 64-bit udiv by 4096, stored to %out.
5071  %r = udiv i64 %x, 4096
5072  store i64 %r, i64 addrspace(1)* %out
5073  ret void
5074}
5075
; Unsigned 64-bit division where the denominator is the non-constant value
; (4096 << %y).
; The opt run (CHECK prefix) is expected to keep the shl and the udiv as-is.
; The llc run (GCN prefix) is expected to fold the division into a shift:
; 12 is added to the loaded shift amount and the numerator is shifted right
; by the sum with s_lshr_b64.
5076define amdgpu_kernel void @udiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
5077; CHECK-LABEL: @udiv_i64_pow2_shl_denom(
5078; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
5079; CHECK-NEXT:    [[R:%.*]] = udiv i64 [[X:%.*]], [[SHL_Y]]
5080; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
5081; CHECK-NEXT:    ret void
5082;
5083; GCN-LABEL: udiv_i64_pow2_shl_denom:
5084; GCN:       ; %bb.0:
5085; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
5086; GCN-NEXT:    s_load_dword s8, s[0:1], 0xd
5087; GCN-NEXT:    s_mov_b32 s3, 0xf000
5088; GCN-NEXT:    s_mov_b32 s2, -1
5089; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5090; GCN-NEXT:    s_mov_b32 s0, s4
5091; GCN-NEXT:    s_add_i32 s8, s8, 12
5092; GCN-NEXT:    s_mov_b32 s1, s5
5093; GCN-NEXT:    s_lshr_b64 s[4:5], s[6:7], s8
5094; GCN-NEXT:    v_mov_b32_e32 v0, s4
5095; GCN-NEXT:    v_mov_b32_e32 v1, s5
5096; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5097; GCN-NEXT:    s_endpgm
; Test input: the denominator is built by shifting 4096 left by %y.
5098  %shl.y = shl i64 4096, %y
5099  %r = udiv i64 %x, %shl.y
5100  store i64 %r, i64 addrspace(1)* %out
5101  ret void
5102}
5103
; Unsigned <2 x i64> division by the splat constant <4096, 4096>.
; The opt run (CHECK prefix) is expected to scalarize the vector udiv into one
; "udiv i64 ..., 4096" per element (extractelement / insertelement). The llc
; run (GCN prefix) is expected to lower each element to a 64-bit scalar shift
; right by 12 (s_lshr_b64).
5104define amdgpu_kernel void @udiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
5105; CHECK-LABEL: @udiv_v2i64_pow2k_denom(
5106; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
5107; CHECK-NEXT:    [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096
5108; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
5109; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
5110; CHECK-NEXT:    [[TMP5:%.*]] = udiv i64 [[TMP4]], 4096
5111; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
5112; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
5113; CHECK-NEXT:    ret void
5114;
5115; GCN-LABEL: udiv_v2i64_pow2k_denom:
5116; GCN:       ; %bb.0:
5117; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5118; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
5119; GCN-NEXT:    s_mov_b32 s7, 0xf000
5120; GCN-NEXT:    s_mov_b32 s6, -1
5121; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5122; GCN-NEXT:    s_lshr_b64 s[0:1], s[0:1], 12
5123; GCN-NEXT:    s_lshr_b64 s[2:3], s[2:3], 12
5124; GCN-NEXT:    v_mov_b32_e32 v0, s0
5125; GCN-NEXT:    v_mov_b32_e32 v1, s1
5126; GCN-NEXT:    v_mov_b32_e32 v2, s2
5127; GCN-NEXT:    v_mov_b32_e32 v3, s3
5128; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
5129; GCN-NEXT:    s_endpgm
; Test input: a single vector udiv by the splat of 4096, stored to %out.
5130  %r = udiv <2 x i64> %x, <i64 4096, i64 4096>
5131  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
5132  ret void
5133}
5134
; Unsigned <2 x i64> division by the mixed constant vector <4096, 4095>:
; element 0 divides by a power of two, element 1 does not.
; The opt run (CHECK prefix) is expected to scalarize into "udiv i64 ..., 4096"
; and "udiv i64 ..., 4095". The llc run (GCN prefix) is expected to lower
; element 0 to a single s_lshr_b64 by 12, while element 1 gets the full
; 64-bit expansion (v_rcp_f32 seed, mul_lo / mul_hi refinement with carry
; adds, trial remainder against 0xfff, and a select between q, q + 1, q + 2).
5135define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
5136; CHECK-LABEL: @udiv_v2i64_mixed_pow2k_denom(
5137; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
5138; CHECK-NEXT:    [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096
5139; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
5140; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
5141; CHECK-NEXT:    [[TMP5:%.*]] = udiv i64 [[TMP4]], 4095
5142; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
5143; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
5144; CHECK-NEXT:    ret void
5145;
5146; GCN-LABEL: udiv_v2i64_mixed_pow2k_denom:
5147; GCN:       ; %bb.0:
5148; GCN-NEXT:    v_mov_b32_e32 v0, 0x4f800000
5149; GCN-NEXT:    v_madak_f32 v0, 0, v0, 0x457ff000
5150; GCN-NEXT:    v_rcp_f32_e32 v0, v0
5151; GCN-NEXT:    s_movk_i32 s6, 0xf001
5152; GCN-NEXT:    v_mov_b32_e32 v7, 0
5153; GCN-NEXT:    v_mov_b32_e32 v2, 0
5154; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
5155; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
5156; GCN-NEXT:    v_trunc_f32_e32 v1, v1
5157; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
5158; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
5159; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
5160; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5161; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
5162; GCN-NEXT:    s_movk_i32 s0, 0xfff
5163; GCN-NEXT:    v_mul_hi_u32 v3, v0, s6
5164; GCN-NEXT:    v_mul_lo_u32 v5, v1, s6
5165; GCN-NEXT:    v_mul_lo_u32 v4, v0, s6
5166; GCN-NEXT:    s_mov_b32 s7, 0xf000
5167; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, v0, v3
5168; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
5169; GCN-NEXT:    v_mul_hi_u32 v6, v0, v4
5170; GCN-NEXT:    v_mul_lo_u32 v5, v0, v3
5171; GCN-NEXT:    v_mul_hi_u32 v8, v0, v3
5172; GCN-NEXT:    v_mul_hi_u32 v9, v1, v3
5173; GCN-NEXT:    v_mul_lo_u32 v3, v1, v3
5174; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
5175; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v7, v8, vcc
5176; GCN-NEXT:    v_mul_lo_u32 v8, v1, v4
5177; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
5178; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
5179; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v6, v4, vcc
5180; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v2, vcc
5181; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
5182; GCN-NEXT:    v_add_i32_e64 v0, s[2:3], v0, v3
5183; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v7, v5, vcc
5184; GCN-NEXT:    v_mul_hi_u32 v5, v0, s6
5185; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v1, v4, s[2:3]
5186; GCN-NEXT:    v_mul_lo_u32 v6, v3, s6
5187; GCN-NEXT:    v_mul_lo_u32 v8, v0, s6
5188; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, v0, v5
5189; GCN-NEXT:    s_mov_b32 s6, -1
5190; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
5191; GCN-NEXT:    v_mul_lo_u32 v6, v0, v5
5192; GCN-NEXT:    v_mul_hi_u32 v9, v0, v8
5193; GCN-NEXT:    v_mul_hi_u32 v10, v0, v5
5194; GCN-NEXT:    v_mul_hi_u32 v11, v3, v5
5195; GCN-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
5196; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v7, v10, vcc
5197; GCN-NEXT:    v_mul_lo_u32 v10, v3, v8
5198; GCN-NEXT:    v_mul_hi_u32 v8, v3, v8
5199; GCN-NEXT:    v_mul_lo_u32 v3, v3, v5
5200; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
5201; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v9, v8, vcc
5202; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v11, v2, vcc
5203; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
5204; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
5205; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
5206; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[2:3]
5207; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
5208; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
5209; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5210; GCN-NEXT:    v_mul_lo_u32 v3, s10, v1
5211; GCN-NEXT:    v_mul_hi_u32 v4, s10, v0
5212; GCN-NEXT:    v_mul_hi_u32 v5, s10, v1
5213; GCN-NEXT:    v_mul_hi_u32 v6, s11, v1
5214; GCN-NEXT:    v_mul_lo_u32 v1, s11, v1
5215; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
5216; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v7, v5, vcc
5217; GCN-NEXT:    v_mul_lo_u32 v5, s11, v0
5218; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
5219; GCN-NEXT:    s_lshr_b64 s[2:3], s[8:9], 12
5220; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
5221; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v4, v0, vcc
5222; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v6, v2, vcc
5223; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
5224; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v7, v2, vcc
5225; GCN-NEXT:    v_mul_lo_u32 v2, v1, s0
5226; GCN-NEXT:    v_mul_hi_u32 v3, v0, s0
5227; GCN-NEXT:    v_mul_lo_u32 v4, v0, s0
5228; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5229; GCN-NEXT:    v_mov_b32_e32 v3, s11
5230; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s10, v4
5231; GCN-NEXT:    v_subb_u32_e32 v2, vcc, v3, v2, vcc
5232; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s0, v4
5233; GCN-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v2, vcc
5234; GCN-NEXT:    s_movk_i32 s0, 0xffe
5235; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v3
5236; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
5237; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
5238; GCN-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
5239; GCN-NEXT:    v_add_i32_e32 v5, vcc, 2, v0
5240; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
5241; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v0
5242; GCN-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v4
5243; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
5244; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[0:1]
5245; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
5246; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
5247; GCN-NEXT:    v_cndmask_b32_e64 v2, -1, v4, s[0:1]
5248; GCN-NEXT:    v_cndmask_b32_e32 v3, v8, v6, vcc
5249; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v2
5250; GCN-NEXT:    v_cndmask_b32_e64 v3, v1, v3, s[0:1]
5251; GCN-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
5252; GCN-NEXT:    v_cndmask_b32_e64 v2, v0, v1, s[0:1]
5253; GCN-NEXT:    v_mov_b32_e32 v0, s2
5254; GCN-NEXT:    v_mov_b32_e32 v1, s3
5255; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
5256; GCN-NEXT:    s_endpgm
; Test input: a single vector udiv whose two lanes use different constants.
5257  %r = udiv <2 x i64> %x, <i64 4096, i64 4095>
5258  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
5259  ret void
5260}
5261
; Unsigned <2 x i64> division where the denominator is the non-constant
; vector (<4096, 4096> << %y).
; The opt run (CHECK prefix) is expected to scalarize the udiv into one
; 64-bit udiv per element while keeping the vector shl. The llc run (GCN
; prefix) is expected to fold each element into a shift: 12 is added to the
; loaded shift amount and the numerator element is shifted right with
; s_lshr_b64.
5262define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
5263; CHECK-LABEL: @udiv_v2i64_pow2_shl_denom(
5264; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
5265; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
5266; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
5267; CHECK-NEXT:    [[TMP3:%.*]] = udiv i64 [[TMP1]], [[TMP2]]
5268; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0
5269; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
5270; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
5271; CHECK-NEXT:    [[TMP7:%.*]] = udiv i64 [[TMP5]], [[TMP6]]
5272; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
5273; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
5274; CHECK-NEXT:    ret void
5275;
5276; GCN-LABEL: udiv_v2i64_pow2_shl_denom:
5277; GCN:       ; %bb.0:
5278; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5279; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
5280; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x11
5281; GCN-NEXT:    s_mov_b32 s7, 0xf000
5282; GCN-NEXT:    s_mov_b32 s6, -1
5283; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5284; GCN-NEXT:    s_add_i32 s0, s0, 12
5285; GCN-NEXT:    s_add_i32 s2, s2, 12
5286; GCN-NEXT:    s_lshr_b64 s[0:1], s[8:9], s0
5287; GCN-NEXT:    s_lshr_b64 s[2:3], s[10:11], s2
5288; GCN-NEXT:    v_mov_b32_e32 v0, s0
5289; GCN-NEXT:    v_mov_b32_e32 v1, s1
5290; GCN-NEXT:    v_mov_b32_e32 v2, s2
5291; GCN-NEXT:    v_mov_b32_e32 v3, s3
5292; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
5293; GCN-NEXT:    s_endpgm
; Test input: the denominator vector is built by shifting the splat of 4096
; left by %y, element-wise.
5294  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
5295  %r = udiv <2 x i64> %x, %shl.y
5296  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
5297  ret void
5298}
5299
; i64 urem by the odd, non-power-of-two constant 1235195393993
; (0x11f_9761f7c9; llc materializes the halves in s12 = 0x11f, s13 = 0x9761f7c9).
; opt (CHECK): the urem is left unchanged.
; llc (GCN): pins the full inline expansion, which starts from v_rcp_f32 and
; continues with 32-bit v_mul_lo/v_mul_hi and add-with-carry chains, followed by
; compare/v_cndmask selects that pick the final remainder.
5300define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
5301; CHECK-LABEL: @urem_i64_oddk_denom(
5302; CHECK-NEXT:    [[R:%.*]] = urem i64 [[X:%.*]], 1235195393993
5303; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
5304; CHECK-NEXT:    ret void
5305;
5306; GCN-LABEL: urem_i64_oddk_denom:
5307; GCN:       ; %bb.0:
5308; GCN-NEXT:    v_mov_b32_e32 v0, 0x4f1761f8
5309; GCN-NEXT:    v_mov_b32_e32 v1, 0x4f800000
5310; GCN-NEXT:    v_madmk_f32 v0, v1, 0x438f8000, v0
5311; GCN-NEXT:    v_rcp_f32_e32 v0, v0
5312; GCN-NEXT:    s_movk_i32 s2, 0xfee0
5313; GCN-NEXT:    s_mov_b32 s3, 0x689e0837
5314; GCN-NEXT:    v_mov_b32_e32 v8, 0
5315; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
5316; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
5317; GCN-NEXT:    v_trunc_f32_e32 v1, v1
5318; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
5319; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
5320; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
5321; GCN-NEXT:    v_mov_b32_e32 v7, 0
5322; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
5323; GCN-NEXT:    v_mul_lo_u32 v2, v0, s2
5324; GCN-NEXT:    v_mul_hi_u32 v3, v0, s3
5325; GCN-NEXT:    v_mul_lo_u32 v4, v1, s3
5326; GCN-NEXT:    s_movk_i32 s12, 0x11f
5327; GCN-NEXT:    s_mov_b32 s13, 0x9761f7c9
5328; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5329; GCN-NEXT:    v_mul_lo_u32 v3, v0, s3
5330; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
5331; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
5332; GCN-NEXT:    v_mul_hi_u32 v4, v0, v2
5333; GCN-NEXT:    v_mul_hi_u32 v6, v0, v3
5334; GCN-NEXT:    v_mul_hi_u32 v9, v1, v2
5335; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
5336; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5337; GCN-NEXT:    s_mov_b32 s9, s5
5338; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
5339; GCN-NEXT:    v_mul_lo_u32 v6, v1, v3
5340; GCN-NEXT:    v_mul_hi_u32 v3, v1, v3
5341; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
5342; GCN-NEXT:    s_movk_i32 s5, 0x11e
5343; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
5344; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v4, v3, vcc
5345; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
5346; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5347; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
5348; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
5349; GCN-NEXT:    v_mul_lo_u32 v4, v0, s2
5350; GCN-NEXT:    v_mul_hi_u32 v5, v0, s3
5351; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
5352; GCN-NEXT:    v_mul_lo_u32 v6, v2, s3
5353; GCN-NEXT:    s_mov_b32 s8, s4
5354; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
5355; GCN-NEXT:    v_mul_lo_u32 v5, v0, s3
5356; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
5357; GCN-NEXT:    v_mul_lo_u32 v6, v0, v4
5358; GCN-NEXT:    v_mul_hi_u32 v10, v0, v4
5359; GCN-NEXT:    v_mul_hi_u32 v9, v0, v5
5360; GCN-NEXT:    v_mul_hi_u32 v11, v2, v4
5361; GCN-NEXT:    s_mov_b32 s4, 0x9761f7c8
5362; GCN-NEXT:    s_mov_b32 s11, 0xf000
5363; GCN-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
5364; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v8, v10, vcc
5365; GCN-NEXT:    v_mul_lo_u32 v10, v2, v5
5366; GCN-NEXT:    v_mul_hi_u32 v5, v2, v5
5367; GCN-NEXT:    v_mul_lo_u32 v2, v2, v4
5368; GCN-NEXT:    s_mov_b32 s10, -1
5369; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
5370; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v5, vcc
5371; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v11, v7, vcc
5372; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
5373; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
5374; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
5375; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[0:1]
5376; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
5377; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
5378; GCN-NEXT:    v_mul_lo_u32 v2, s6, v1
5379; GCN-NEXT:    v_mul_hi_u32 v3, s6, v0
5380; GCN-NEXT:    v_mul_hi_u32 v4, s6, v1
5381; GCN-NEXT:    v_mul_hi_u32 v5, s7, v1
5382; GCN-NEXT:    v_mul_lo_u32 v1, s7, v1
5383; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5384; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
5385; GCN-NEXT:    v_mul_lo_u32 v4, s7, v0
5386; GCN-NEXT:    v_mul_hi_u32 v0, s7, v0
5387; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
5388; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
5389; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
5390; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
5391; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
5392; GCN-NEXT:    v_mul_lo_u32 v2, v0, s12
5393; GCN-NEXT:    v_mul_hi_u32 v3, v0, s13
5394; GCN-NEXT:    v_mul_lo_u32 v1, v1, s13
5395; GCN-NEXT:    v_mul_lo_u32 v0, v0, s13
5396; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5397; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
5398; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s7, v1
5399; GCN-NEXT:    v_mov_b32_e32 v3, s12
5400; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
5401; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
5402; GCN-NEXT:    v_subrev_i32_e64 v4, s[0:1], s13, v0
5403; GCN-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
5404; GCN-NEXT:    v_cmp_lt_u32_e64 s[2:3], s5, v5
5405; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
5406; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
5407; GCN-NEXT:    v_cmp_lt_u32_e64 s[2:3], s4, v4
5408; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s13, v4
5409; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
5410; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s12, v5
5411; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
5412; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
5413; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
5414; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
5415; GCN-NEXT:    v_mov_b32_e32 v5, s7
5416; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
5417; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s5, v1
5418; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
5419; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v0
5420; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
5421; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s12, v1
5422; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
5423; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
5424; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
5425; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
5426; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5427; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
5428; GCN-NEXT:    s_endpgm
5429  %r = urem i64 %x, 1235195393993
5430  store i64 %r, i64 addrspace(1)* %out
5431  ret void
5432}
5433
; i64 urem by the power-of-two constant 4096.
; opt (CHECK): the urem is left unchanged.
; llc (GCN): the low dword of x is ANDed with 0xfff and the high dword of the
; stored result is the constant 0 (v1).
5434define amdgpu_kernel void @urem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
5435; CHECK-LABEL: @urem_i64_pow2k_denom(
5436; CHECK-NEXT:    [[R:%.*]] = urem i64 [[X:%.*]], 4096
5437; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
5438; CHECK-NEXT:    ret void
5439;
5440; GCN-LABEL: urem_i64_pow2k_denom:
5441; GCN:       ; %bb.0:
5442; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
5443; GCN-NEXT:    s_mov_b32 s3, 0xf000
5444; GCN-NEXT:    s_mov_b32 s2, -1
5445; GCN-NEXT:    v_mov_b32_e32 v1, 0
5446; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5447; GCN-NEXT:    s_mov_b32 s0, s4
5448; GCN-NEXT:    s_and_b32 s4, s6, 0xfff
5449; GCN-NEXT:    s_mov_b32 s1, s5
5450; GCN-NEXT:    v_mov_b32_e32 v0, s4
5451; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5452; GCN-NEXT:    s_endpgm
5453  %r = urem i64 %x, 4096
5454  store i64 %r, i64 addrspace(1)* %out
5455  ret void
5456}
5457
; i64 urem whose divisor is the constant 4096 shifted left by the variable %y.
; opt (CHECK): the shl and urem are left unchanged.
; llc (GCN): the divisor is built with s_lshl_b64, decremented by one via
; s_add_u32/s_addc_u32 with -1, and the result is x AND that mask (s_and_b64);
; no division expansion is emitted.
5458define amdgpu_kernel void @urem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
5459; CHECK-LABEL: @urem_i64_pow2_shl_denom(
5460; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
5461; CHECK-NEXT:    [[R:%.*]] = urem i64 [[X:%.*]], [[SHL_Y]]
5462; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
5463; CHECK-NEXT:    ret void
5464;
5465; GCN-LABEL: urem_i64_pow2_shl_denom:
5466; GCN:       ; %bb.0:
5467; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
5468; GCN-NEXT:    s_load_dword s8, s[0:1], 0xd
5469; GCN-NEXT:    s_mov_b32 s3, 0xf000
5470; GCN-NEXT:    s_mov_b32 s2, -1
5471; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5472; GCN-NEXT:    s_mov_b32 s0, s4
5473; GCN-NEXT:    s_mov_b32 s1, s5
5474; GCN-NEXT:    s_mov_b32 s5, 0
5475; GCN-NEXT:    s_movk_i32 s4, 0x1000
5476; GCN-NEXT:    s_lshl_b64 s[4:5], s[4:5], s8
5477; GCN-NEXT:    s_add_u32 s4, s4, -1
5478; GCN-NEXT:    s_addc_u32 s5, s5, -1
5479; GCN-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
5480; GCN-NEXT:    v_mov_b32_e32 v0, s4
5481; GCN-NEXT:    v_mov_b32_e32 v1, s5
5482; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5483; GCN-NEXT:    s_endpgm
5484  %shl.y = shl i64 4096, %y
5485  %r = urem i64 %x, %shl.y
5486  store i64 %r, i64 addrspace(1)* %out
5487  ret void
5488}
5489
; <2 x i64> urem by the splat power-of-two constant 4096.
; opt (CHECK): the vector urem is split into one scalar i64 urem per element.
; llc (GCN): the low dword of each element is ANDed with 0xfff (held in s8) and
; both high dwords of the stored result are zero (v1 = 0, v3 copied from v1).
5490define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
5491; CHECK-LABEL: @urem_v2i64_pow2k_denom(
5492; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
5493; CHECK-NEXT:    [[TMP2:%.*]] = urem i64 [[TMP1]], 4096
5494; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
5495; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
5496; CHECK-NEXT:    [[TMP5:%.*]] = urem i64 [[TMP4]], 4096
5497; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
5498; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
5499; CHECK-NEXT:    ret void
5500;
5501; GCN-LABEL: urem_v2i64_pow2k_denom:
5502; GCN:       ; %bb.0:
5503; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5504; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
5505; GCN-NEXT:    s_movk_i32 s8, 0xfff
5506; GCN-NEXT:    v_mov_b32_e32 v1, 0
5507; GCN-NEXT:    s_mov_b32 s7, 0xf000
5508; GCN-NEXT:    s_mov_b32 s6, -1
5509; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5510; GCN-NEXT:    s_and_b32 s0, s0, s8
5511; GCN-NEXT:    s_and_b32 s1, s2, s8
5512; GCN-NEXT:    v_mov_b32_e32 v0, s0
5513; GCN-NEXT:    v_mov_b32_e32 v2, s1
5514; GCN-NEXT:    v_mov_b32_e32 v3, v1
5515; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
5516; GCN-NEXT:    s_endpgm
5517  %r = urem <2 x i64> %x, <i64 4096, i64 4096>
5518  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
5519  ret void
5520}
5521
; <2 x i64> urem whose divisor is the splat constant 4096 shifted left by the
; variable vector %y.
; opt (CHECK): the vector urem is split into one scalar i64 urem per element.
; llc (GCN): per element, the divisor is built with s_lshl_b64, decremented by
; one (s_add_u32/s_addc_u32 with -1) and ANDed with x (s_and_b64); no division
; expansion is emitted.
5522define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
5523; CHECK-LABEL: @urem_v2i64_pow2_shl_denom(
5524; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
5525; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
5526; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
5527; CHECK-NEXT:    [[TMP3:%.*]] = urem i64 [[TMP1]], [[TMP2]]
5528; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0
5529; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
5530; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
5531; CHECK-NEXT:    [[TMP7:%.*]] = urem i64 [[TMP5]], [[TMP6]]
5532; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
5533; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
5534; CHECK-NEXT:    ret void
5535;
5536; GCN-LABEL: urem_v2i64_pow2_shl_denom:
5537; GCN:       ; %bb.0:
5538; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5539; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
5540; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x11
5541; GCN-NEXT:    s_mov_b32 s13, 0
5542; GCN-NEXT:    s_movk_i32 s12, 0x1000
5543; GCN-NEXT:    s_mov_b32 s7, 0xf000
5544; GCN-NEXT:    s_mov_b32 s6, -1
5545; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5546; GCN-NEXT:    s_lshl_b64 s[2:3], s[12:13], s2
5547; GCN-NEXT:    s_lshl_b64 s[0:1], s[12:13], s0
5548; GCN-NEXT:    s_add_u32 s0, s0, -1
5549; GCN-NEXT:    s_addc_u32 s1, s1, -1
5550; GCN-NEXT:    s_and_b64 s[0:1], s[8:9], s[0:1]
5551; GCN-NEXT:    s_add_u32 s2, s2, -1
5552; GCN-NEXT:    s_addc_u32 s3, s3, -1
5553; GCN-NEXT:    s_and_b64 s[2:3], s[10:11], s[2:3]
5554; GCN-NEXT:    v_mov_b32_e32 v0, s0
5555; GCN-NEXT:    v_mov_b32_e32 v1, s1
5556; GCN-NEXT:    v_mov_b32_e32 v2, s2
5557; GCN-NEXT:    v_mov_b32_e32 v3, s3
5558; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
5559; GCN-NEXT:    s_endpgm
5560  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
5561  %r = urem <2 x i64> %x, %shl.y
5562  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
5563  ret void
5564}
5565
; i64 sdiv by the odd, non-power-of-two constant 1235195 (0x12d8fb, loaded
; into s3 by llc).
; opt (CHECK): the sdiv is left unchanged.
; llc (GCN): pins the full inline expansion. The sign of x is extracted with
; s_ashr_i32 ..., 31 and applied to x via add + s_xor_b64 before the unsigned
; v_rcp_f32 / v_mul_lo / v_mul_hi sequence; the same sign is applied to the
; quotient at the end with v_xor_b32 followed by a subtract.
5566define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
5567; CHECK-LABEL: @sdiv_i64_oddk_denom(
5568; CHECK-NEXT:    [[R:%.*]] = sdiv i64 [[X:%.*]], 1235195
5569; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
5570; CHECK-NEXT:    ret void
5571;
5572; GCN-LABEL: sdiv_i64_oddk_denom:
5573; GCN:       ; %bb.0:
5574; GCN-NEXT:    v_mov_b32_e32 v0, 0x4f800000
5575; GCN-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
5576; GCN-NEXT:    v_rcp_f32_e32 v0, v0
5577; GCN-NEXT:    s_mov_b32 s2, 0xffed2705
5578; GCN-NEXT:    v_mov_b32_e32 v8, 0
5579; GCN-NEXT:    v_mov_b32_e32 v7, 0
5580; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
5581; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
5582; GCN-NEXT:    v_trunc_f32_e32 v1, v1
5583; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
5584; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
5585; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
5586; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
5587; GCN-NEXT:    s_mov_b32 s7, 0xf000
5588; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
5589; GCN-NEXT:    v_mul_lo_u32 v2, v1, s2
5590; GCN-NEXT:    v_mul_lo_u32 v4, v0, s2
5591; GCN-NEXT:    s_mov_b32 s6, -1
5592; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5593; GCN-NEXT:    s_mov_b32 s4, s8
5594; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5595; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
5596; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
5597; GCN-NEXT:    v_mul_hi_u32 v6, v0, v4
5598; GCN-NEXT:    v_mul_hi_u32 v3, v0, v2
5599; GCN-NEXT:    v_mul_hi_u32 v9, v1, v2
5600; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
5601; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
5602; GCN-NEXT:    v_mul_lo_u32 v6, v1, v4
5603; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
5604; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v3, vcc
5605; GCN-NEXT:    s_mov_b32 s5, s9
5606; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
5607; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
5608; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
5609; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5610; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
5611; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
5612; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
5613; GCN-NEXT:    v_mul_lo_u32 v4, v2, s2
5614; GCN-NEXT:    v_mul_hi_u32 v5, s2, v0
5615; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
5616; GCN-NEXT:    v_mul_lo_u32 v5, v0, s2
5617; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, v0, v4
5618; GCN-NEXT:    v_mul_lo_u32 v10, v0, v4
5619; GCN-NEXT:    v_mul_hi_u32 v12, v0, v4
5620; GCN-NEXT:    v_mul_hi_u32 v11, v0, v5
5621; GCN-NEXT:    v_mul_hi_u32 v9, v2, v5
5622; GCN-NEXT:    v_mul_lo_u32 v5, v2, v5
5623; GCN-NEXT:    v_mul_hi_u32 v6, v2, v4
5624; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
5625; GCN-NEXT:    v_addc_u32_e32 v11, vcc, v8, v12, vcc
5626; GCN-NEXT:    v_mul_lo_u32 v2, v2, v4
5627; GCN-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
5628; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v11, v9, vcc
5629; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v6, v7, vcc
5630; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
5631; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
5632; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
5633; GCN-NEXT:    s_ashr_i32 s2, s11, 31
5634; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[0:1]
5635; GCN-NEXT:    s_add_u32 s0, s10, s2
5636; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
5637; GCN-NEXT:    s_mov_b32 s3, s2
5638; GCN-NEXT:    s_addc_u32 s1, s11, s2
5639; GCN-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
5640; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
5641; GCN-NEXT:    v_mul_lo_u32 v2, s0, v1
5642; GCN-NEXT:    v_mul_hi_u32 v3, s0, v0
5643; GCN-NEXT:    v_mul_hi_u32 v4, s0, v1
5644; GCN-NEXT:    v_mul_hi_u32 v5, s1, v1
5645; GCN-NEXT:    v_mul_lo_u32 v1, s1, v1
5646; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5647; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
5648; GCN-NEXT:    v_mul_lo_u32 v4, s1, v0
5649; GCN-NEXT:    v_mul_hi_u32 v0, s1, v0
5650; GCN-NEXT:    s_mov_b32 s3, 0x12d8fb
5651; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
5652; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
5653; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
5654; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
5655; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
5656; GCN-NEXT:    v_mul_lo_u32 v2, v1, s3
5657; GCN-NEXT:    v_mul_hi_u32 v3, s3, v0
5658; GCN-NEXT:    v_mul_lo_u32 v4, v0, s3
5659; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5660; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s0, v4
5661; GCN-NEXT:    v_mov_b32_e32 v3, s1
5662; GCN-NEXT:    v_subb_u32_e32 v2, vcc, v3, v2, vcc
5663; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s3, v4
5664; GCN-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v2, vcc
5665; GCN-NEXT:    s_mov_b32 s0, 0x12d8fa
5666; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v3
5667; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
5668; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
5669; GCN-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
5670; GCN-NEXT:    v_add_i32_e32 v5, vcc, 2, v0
5671; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
5672; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v0
5673; GCN-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v4
5674; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
5675; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[0:1]
5676; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
5677; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
5678; GCN-NEXT:    v_cndmask_b32_e64 v2, -1, v4, s[0:1]
5679; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v2
5680; GCN-NEXT:    v_cndmask_b32_e32 v2, v7, v5, vcc
5681; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
5682; GCN-NEXT:    v_cndmask_b32_e32 v3, v8, v6, vcc
5683; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
5684; GCN-NEXT:    v_xor_b32_e32 v0, s2, v0
5685; GCN-NEXT:    v_xor_b32_e32 v1, s2, v1
5686; GCN-NEXT:    v_mov_b32_e32 v2, s2
5687; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
5688; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
5689; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5690; GCN-NEXT:    s_endpgm
5691  %r = sdiv i64 %x, 1235195
5692  store i64 %r, i64 addrspace(1)* %out
5693  ret void
5694}
5695
; i64 sdiv by the power-of-two constant 4096.
; opt (CHECK): the sdiv is left unchanged.
; llc (GCN): the sign word of x (s_ashr_i32 ..., 31) is shifted right logically
; by 20, giving 0xfff for negative x and 0 otherwise; that bias is added to x
; with carry and the sum is arithmetically shifted right by 12 (s_ashr_i64).
5696define amdgpu_kernel void @sdiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
5697; CHECK-LABEL: @sdiv_i64_pow2k_denom(
5698; CHECK-NEXT:    [[R:%.*]] = sdiv i64 [[X:%.*]], 4096
5699; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
5700; CHECK-NEXT:    ret void
5701;
5702; GCN-LABEL: sdiv_i64_pow2k_denom:
5703; GCN:       ; %bb.0:
5704; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
5705; GCN-NEXT:    s_mov_b32 s7, 0xf000
5706; GCN-NEXT:    s_mov_b32 s6, -1
5707; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5708; GCN-NEXT:    s_mov_b32 s4, s0
5709; GCN-NEXT:    s_ashr_i32 s0, s3, 31
5710; GCN-NEXT:    s_lshr_b32 s0, s0, 20
5711; GCN-NEXT:    s_add_u32 s0, s2, s0
5712; GCN-NEXT:    s_mov_b32 s5, s1
5713; GCN-NEXT:    s_addc_u32 s1, s3, 0
5714; GCN-NEXT:    s_ashr_i64 s[0:1], s[0:1], 12
5715; GCN-NEXT:    v_mov_b32_e32 v0, s0
5716; GCN-NEXT:    v_mov_b32_e32 v1, s1
5717; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5718; GCN-NEXT:    s_endpgm
5719  %r = sdiv i64 %x, 4096
5720  store i64 %r, i64 addrspace(1)* %out
5721  ret void
5722}
5723
; i64 sdiv whose divisor is the constant 4096 shifted left by the variable %y.
; opt (CHECK): the shl and sdiv are left unchanged.
; llc (GCN): unlike the unsigned shl-denominator tests, no shift-based shortcut
; is expected here. The checks pin the full inline expansion: the sign words of
; the divisor (s12/s13) and of x (s14/s15) are applied with add + s_xor_b64, the
; unsigned v_rcp_f32 / v_mul_lo / v_mul_hi sequence follows, and the quotient is
; fixed up with the xor of the two signs (s_xor_b64 s[0:1], s[14:15], s[12:13]).
5724define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
5725; CHECK-LABEL: @sdiv_i64_pow2_shl_denom(
5726; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
5727; CHECK-NEXT:    [[R:%.*]] = sdiv i64 [[X:%.*]], [[SHL_Y]]
5728; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
5729; CHECK-NEXT:    ret void
5730;
5731; GCN-LABEL: sdiv_i64_pow2_shl_denom:
5732; GCN:       ; %bb.0:
5733; GCN-NEXT:    s_load_dword s4, s[0:1], 0xd
5734; GCN-NEXT:    s_mov_b32 s3, 0
5735; GCN-NEXT:    s_movk_i32 s2, 0x1000
5736; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
5737; GCN-NEXT:    s_mov_b32 s7, 0xf000
5738; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5739; GCN-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
5740; GCN-NEXT:    s_ashr_i32 s12, s3, 31
5741; GCN-NEXT:    s_add_u32 s2, s2, s12
5742; GCN-NEXT:    s_mov_b32 s13, s12
5743; GCN-NEXT:    s_addc_u32 s3, s3, s12
5744; GCN-NEXT:    s_xor_b64 s[2:3], s[2:3], s[12:13]
5745; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s2
5746; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s3
5747; GCN-NEXT:    s_sub_u32 s4, 0, s2
5748; GCN-NEXT:    s_subb_u32 s5, 0, s3
5749; GCN-NEXT:    s_ashr_i32 s14, s11, 31
5750; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
5751; GCN-NEXT:    v_rcp_f32_e32 v0, v0
5752; GCN-NEXT:    s_mov_b32 s15, s14
5753; GCN-NEXT:    s_mov_b32 s6, -1
5754; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
5755; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
5756; GCN-NEXT:    v_trunc_f32_e32 v1, v1
5757; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
5758; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
5759; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
5760; GCN-NEXT:    v_mul_hi_u32 v3, s4, v0
5761; GCN-NEXT:    v_mul_lo_u32 v2, s4, v1
5762; GCN-NEXT:    v_mul_lo_u32 v5, s5, v0
5763; GCN-NEXT:    v_mul_lo_u32 v4, s4, v0
5764; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5765; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
5766; GCN-NEXT:    v_mul_hi_u32 v3, v0, v4
5767; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
5768; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
5769; GCN-NEXT:    v_mul_hi_u32 v7, v1, v2
5770; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
5771; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
5772; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
5773; GCN-NEXT:    v_mul_lo_u32 v6, v1, v4
5774; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
5775; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
5776; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
5777; GCN-NEXT:    v_mov_b32_e32 v4, 0
5778; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
5779; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5780; GCN-NEXT:    v_mov_b32_e32 v6, 0
5781; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
5782; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
5783; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
5784; GCN-NEXT:    v_mul_lo_u32 v5, s4, v2
5785; GCN-NEXT:    v_mul_hi_u32 v7, s4, v0
5786; GCN-NEXT:    v_mul_lo_u32 v8, s5, v0
5787; GCN-NEXT:    s_mov_b32 s5, s9
5788; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
5789; GCN-NEXT:    v_mul_lo_u32 v7, s4, v0
5790; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
5791; GCN-NEXT:    v_mul_lo_u32 v10, v0, v5
5792; GCN-NEXT:    v_mul_hi_u32 v12, v0, v5
5793; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
5794; GCN-NEXT:    v_mul_hi_u32 v9, v2, v7
5795; GCN-NEXT:    v_mul_lo_u32 v7, v2, v7
5796; GCN-NEXT:    v_mul_hi_u32 v8, v2, v5
5797; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
5798; GCN-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
5799; GCN-NEXT:    v_mul_lo_u32 v2, v2, v5
5800; GCN-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
5801; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
5802; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
5803; GCN-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
5804; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
5805; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
5806; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[0:1]
5807; GCN-NEXT:    s_add_u32 s0, s10, s14
5808; GCN-NEXT:    s_addc_u32 s1, s11, s14
5809; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
5810; GCN-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
5811; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
5812; GCN-NEXT:    v_mul_lo_u32 v2, s10, v1
5813; GCN-NEXT:    v_mul_hi_u32 v3, s10, v0
5814; GCN-NEXT:    v_mul_hi_u32 v5, s10, v1
5815; GCN-NEXT:    v_mul_hi_u32 v7, s11, v1
5816; GCN-NEXT:    v_mul_lo_u32 v1, s11, v1
5817; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5818; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
5819; GCN-NEXT:    v_mul_lo_u32 v5, s11, v0
5820; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
5821; GCN-NEXT:    s_mov_b32 s4, s8
5822; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
5823; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
5824; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
5825; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
5826; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
5827; GCN-NEXT:    v_mul_lo_u32 v2, s2, v1
5828; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
5829; GCN-NEXT:    v_mul_lo_u32 v4, s3, v0
5830; GCN-NEXT:    v_mov_b32_e32 v5, s3
5831; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5832; GCN-NEXT:    v_mul_lo_u32 v3, s2, v0
5833; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
5834; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s11, v2
5835; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s10, v3
5836; GCN-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
5837; GCN-NEXT:    v_subrev_i32_e64 v5, s[0:1], s2, v3
5838; GCN-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
5839; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v4
5840; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
5841; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v5
5842; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
5843; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v4
5844; GCN-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
5845; GCN-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
5846; GCN-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
5847; GCN-NEXT:    v_add_i32_e64 v7, s[0:1], 1, v0
5848; GCN-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
5849; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
5850; GCN-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[0:1]
5851; GCN-NEXT:    v_mov_b32_e32 v6, s11
5852; GCN-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
5853; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
5854; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
5855; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s2, v3
5856; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
5857; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s3, v2
5858; GCN-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
5859; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
5860; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
5861; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
5862; GCN-NEXT:    s_xor_b64 s[0:1], s[14:15], s[12:13]
5863; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
5864; GCN-NEXT:    v_xor_b32_e32 v0, s0, v0
5865; GCN-NEXT:    v_xor_b32_e32 v1, s1, v1
5866; GCN-NEXT:    v_mov_b32_e32 v2, s1
5867; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s0, v0
5868; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
5869; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5870; GCN-NEXT:    s_endpgm
5871  %shl.y = shl i64 4096, %y
5872  %r = sdiv i64 %x, %shl.y
5873  store i64 %r, i64 addrspace(1)* %out
5874  ret void
5875}
5876
; <2 x i64> sdiv by the splat power-of-two constant 4096.
; opt (CHECK): the vector sdiv is split into one scalar i64 sdiv per element.
; llc (GCN): per element, the sign word (s_ashr_i32 ..., 31) is shifted right
; logically by 20 to form a 0xfff-or-0 bias, the bias is added to the element
; with carry, and the sum is arithmetically shifted right by 12 (s_ashr_i64).
5877define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
5878; CHECK-LABEL: @sdiv_v2i64_pow2k_denom(
5879; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
5880; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096
5881; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
5882; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
5883; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4096
5884; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
5885; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
5886; CHECK-NEXT:    ret void
5887;
5888; GCN-LABEL: sdiv_v2i64_pow2k_denom:
5889; GCN:       ; %bb.0:
5890; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5891; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
5892; GCN-NEXT:    s_mov_b32 s7, 0xf000
5893; GCN-NEXT:    s_mov_b32 s6, -1
5894; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5895; GCN-NEXT:    s_ashr_i32 s8, s1, 31
5896; GCN-NEXT:    s_lshr_b32 s8, s8, 20
5897; GCN-NEXT:    s_add_u32 s0, s0, s8
5898; GCN-NEXT:    s_addc_u32 s1, s1, 0
5899; GCN-NEXT:    s_ashr_i32 s8, s3, 31
5900; GCN-NEXT:    s_ashr_i64 s[0:1], s[0:1], 12
5901; GCN-NEXT:    s_lshr_b32 s8, s8, 20
5902; GCN-NEXT:    s_add_u32 s2, s2, s8
5903; GCN-NEXT:    s_addc_u32 s3, s3, 0
5904; GCN-NEXT:    s_ashr_i64 s[2:3], s[2:3], 12
5905; GCN-NEXT:    v_mov_b32_e32 v0, s0
5906; GCN-NEXT:    v_mov_b32_e32 v1, s1
5907; GCN-NEXT:    v_mov_b32_e32 v2, s2
5908; GCN-NEXT:    v_mov_b32_e32 v3, s3
5909; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
5910; GCN-NEXT:    s_endpgm
5911  %r = sdiv <2 x i64> %x, <i64 4096, i64 4096>
5912  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
5913  ret void
5914}
5915
; Signed <2 x i64> division by a constant vector whose lanes differ in kind:
; lane 0 divides by 4096 (a power of two), lane 1 by 4095 (not a power of two).
; The opt assertions expect the vector sdiv to be split into one scalar
; `sdiv i64` per lane. The llc assertions expect lane 0 to be a short scalar
; shift/add sequence ending in an arithmetic shift right by 12, while lane 1
; goes through the long v_rcp_f32-based multiply/compare/select expansion.
; The assertion lines are autogenerated (see the NOTE lines at the top of the
; file); regenerate them with the update scripts instead of editing by hand.
; NOTE(review): "ssdiv" in the name looks like a typo for "sdiv" - confirm. It
; is left unchanged here because the label assertions match on the name.
5916define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
5917; CHECK-LABEL: @ssdiv_v2i64_mixed_pow2k_denom(
5918; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
5919; CHECK-NEXT:    [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096
5920; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
5921; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
5922; CHECK-NEXT:    [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4095
5923; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
5924; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
5925; CHECK-NEXT:    ret void
5926;
5927; GCN-LABEL: ssdiv_v2i64_mixed_pow2k_denom:
5928; GCN:       ; %bb.0:
5929; GCN-NEXT:    v_mov_b32_e32 v0, 0x457ff000
5930; GCN-NEXT:    v_mov_b32_e32 v1, 0x4f800000
5931; GCN-NEXT:    v_mac_f32_e32 v0, 0, v1
5932; GCN-NEXT:    v_rcp_f32_e32 v0, v0
5933; GCN-NEXT:    s_movk_i32 s6, 0xf001
5934; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
5935; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
5936; GCN-NEXT:    s_mov_b32 s7, 0xf000
5937; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
5938; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
5939; GCN-NEXT:    v_trunc_f32_e32 v1, v1
5940; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
5941; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
5942; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
5943; GCN-NEXT:    s_waitcnt lgkmcnt(0)
5944; GCN-NEXT:    s_ashr_i32 s0, s9, 31
5945; GCN-NEXT:    s_lshr_b32 s0, s0, 20
5946; GCN-NEXT:    v_mul_hi_u32 v2, s6, v0
5947; GCN-NEXT:    v_mul_lo_u32 v3, v1, s6
5948; GCN-NEXT:    s_add_u32 s2, s8, s0
5949; GCN-NEXT:    s_addc_u32 s3, s9, 0
5950; GCN-NEXT:    s_ashr_i32 s8, s11, 31
5951; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
5952; GCN-NEXT:    v_mul_lo_u32 v3, v0, s6
5953; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
5954; GCN-NEXT:    v_mul_lo_u32 v4, v0, v2
5955; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
5956; GCN-NEXT:    v_mul_hi_u32 v5, v0, v3
5957; GCN-NEXT:    v_mul_hi_u32 v7, v1, v2
5958; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
5959; GCN-NEXT:    s_ashr_i64 s[2:3], s[2:3], 12
5960; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
5961; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
5962; GCN-NEXT:    v_mul_lo_u32 v6, v1, v3
5963; GCN-NEXT:    v_mul_hi_u32 v3, v1, v3
5964; GCN-NEXT:    s_mov_b32 s9, s8
5965; GCN-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
5966; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
5967; GCN-NEXT:    v_mov_b32_e32 v4, 0
5968; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
5969; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
5970; GCN-NEXT:    v_mov_b32_e32 v6, 0
5971; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
5972; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
5973; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
5974; GCN-NEXT:    v_mul_lo_u32 v5, v2, s6
5975; GCN-NEXT:    v_mul_hi_u32 v7, s6, v0
5976; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
5977; GCN-NEXT:    v_mul_lo_u32 v7, v0, s6
5978; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, v0, v5
5979; GCN-NEXT:    v_mul_lo_u32 v10, v0, v5
5980; GCN-NEXT:    v_mul_hi_u32 v12, v0, v5
5981; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
5982; GCN-NEXT:    v_mul_hi_u32 v9, v2, v7
5983; GCN-NEXT:    v_mul_lo_u32 v7, v2, v7
5984; GCN-NEXT:    v_mul_hi_u32 v8, v2, v5
5985; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
5986; GCN-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
5987; GCN-NEXT:    v_mul_lo_u32 v2, v2, v5
5988; GCN-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
5989; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
5990; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
5991; GCN-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
5992; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
5993; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
5994; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[0:1]
5995; GCN-NEXT:    s_add_u32 s0, s10, s8
5996; GCN-NEXT:    s_addc_u32 s1, s11, s8
5997; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
5998; GCN-NEXT:    s_xor_b64 s[0:1], s[0:1], s[8:9]
5999; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
6000; GCN-NEXT:    v_mul_lo_u32 v2, s0, v1
6001; GCN-NEXT:    v_mul_hi_u32 v3, s0, v0
6002; GCN-NEXT:    v_mul_hi_u32 v5, s0, v1
6003; GCN-NEXT:    v_mul_hi_u32 v7, s1, v1
6004; GCN-NEXT:    v_mul_lo_u32 v1, s1, v1
6005; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6006; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
6007; GCN-NEXT:    v_mul_lo_u32 v5, s1, v0
6008; GCN-NEXT:    v_mul_hi_u32 v0, s1, v0
6009; GCN-NEXT:    s_movk_i32 s9, 0xfff
6010; GCN-NEXT:    s_mov_b32 s6, -1
6011; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
6012; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
6013; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
6014; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
6015; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
6016; GCN-NEXT:    v_mul_lo_u32 v2, v1, s9
6017; GCN-NEXT:    v_mul_hi_u32 v3, s9, v0
6018; GCN-NEXT:    v_mul_lo_u32 v4, v0, s9
6019; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6020; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s0, v4
6021; GCN-NEXT:    v_mov_b32_e32 v3, s1
6022; GCN-NEXT:    v_subb_u32_e32 v2, vcc, v3, v2, vcc
6023; GCN-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v4
6024; GCN-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v2, vcc
6025; GCN-NEXT:    s_movk_i32 s0, 0xffe
6026; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v3
6027; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
6028; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
6029; GCN-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
6030; GCN-NEXT:    v_add_i32_e32 v5, vcc, 2, v0
6031; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
6032; GCN-NEXT:    v_add_i32_e32 v7, vcc, 1, v0
6033; GCN-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v4
6034; GCN-NEXT:    v_addc_u32_e32 v8, vcc, 0, v1, vcc
6035; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[0:1]
6036; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
6037; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
6038; GCN-NEXT:    v_cndmask_b32_e64 v2, -1, v4, s[0:1]
6039; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v2
6040; GCN-NEXT:    v_cndmask_b32_e32 v2, v7, v5, vcc
6041; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
6042; GCN-NEXT:    v_cndmask_b32_e32 v3, v8, v6, vcc
6043; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
6044; GCN-NEXT:    v_xor_b32_e32 v0, s8, v0
6045; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s8, v0
6046; GCN-NEXT:    v_xor_b32_e32 v1, s8, v1
6047; GCN-NEXT:    v_mov_b32_e32 v3, s8
6048; GCN-NEXT:    v_subb_u32_e32 v3, vcc, v1, v3, vcc
6049; GCN-NEXT:    v_mov_b32_e32 v0, s2
6050; GCN-NEXT:    v_mov_b32_e32 v1, s3
6051; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
6052; GCN-NEXT:    s_endpgm
6053  %r = sdiv <2 x i64> %x, <i64 4096, i64 4095>
6054  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
6055  ret void
6056}
6057
; Signed <2 x i64> division where the divisor is <4096, 4096> shifted left by
; the runtime vector %y, so the divisor is not a compile-time constant.
; The opt assertions expect the shl to stay a vector operation and the sdiv to
; be split into one scalar `sdiv i64` per lane. The llc assertions expect a
; v_rcp_f32-based multiply/compare/select expansion to appear once per lane,
; with both results written by a single buffer_store_dwordx4.
; The assertion lines are autogenerated (see the NOTE lines at the top of the
; file); regenerate them with the update scripts instead of editing by hand.
6058define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
6059; CHECK-LABEL: @sdiv_v2i64_pow2_shl_denom(
6060; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
6061; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
6062; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
6063; CHECK-NEXT:    [[TMP3:%.*]] = sdiv i64 [[TMP1]], [[TMP2]]
6064; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0
6065; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
6066; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
6067; CHECK-NEXT:    [[TMP7:%.*]] = sdiv i64 [[TMP5]], [[TMP6]]
6068; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
6069; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
6070; CHECK-NEXT:    ret void
6071;
6072; GCN-LABEL: sdiv_v2i64_pow2_shl_denom:
6073; GCN:       ; %bb.0:
6074; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x11
6075; GCN-NEXT:    s_mov_b32 s3, 0
6076; GCN-NEXT:    s_movk_i32 s2, 0x1000
6077; GCN-NEXT:    s_mov_b32 s18, 0x4f800000
6078; GCN-NEXT:    s_mov_b32 s19, 0x5f7ffffc
6079; GCN-NEXT:    s_waitcnt lgkmcnt(0)
6080; GCN-NEXT:    s_lshl_b64 s[12:13], s[2:3], s6
6081; GCN-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
6082; GCN-NEXT:    s_ashr_i32 s16, s3, 31
6083; GCN-NEXT:    s_add_u32 s2, s2, s16
6084; GCN-NEXT:    s_mov_b32 s17, s16
6085; GCN-NEXT:    s_addc_u32 s3, s3, s16
6086; GCN-NEXT:    s_xor_b64 s[14:15], s[2:3], s[16:17]
6087; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s14
6088; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s15
6089; GCN-NEXT:    s_mov_b32 s20, 0x2f800000
6090; GCN-NEXT:    s_mov_b32 s21, 0xcf800000
6091; GCN-NEXT:    s_sub_u32 s6, 0, s14
6092; GCN-NEXT:    v_mac_f32_e32 v0, s18, v1
6093; GCN-NEXT:    v_rcp_f32_e32 v0, v0
6094; GCN-NEXT:    s_subb_u32 s7, 0, s15
6095; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6096; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
6097; GCN-NEXT:    v_mul_f32_e32 v0, s19, v0
6098; GCN-NEXT:    v_mul_f32_e32 v1, s20, v0
6099; GCN-NEXT:    v_trunc_f32_e32 v1, v1
6100; GCN-NEXT:    v_mac_f32_e32 v0, s21, v1
6101; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
6102; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
6103; GCN-NEXT:    v_mul_hi_u32 v3, s6, v0
6104; GCN-NEXT:    v_mul_lo_u32 v2, s6, v1
6105; GCN-NEXT:    v_mul_lo_u32 v4, s7, v0
6106; GCN-NEXT:    v_mul_lo_u32 v5, s6, v0
6107; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6108; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
6109; GCN-NEXT:    v_mul_lo_u32 v3, v0, v2
6110; GCN-NEXT:    v_mul_hi_u32 v4, v0, v5
6111; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
6112; GCN-NEXT:    v_mul_hi_u32 v7, v1, v2
6113; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
6114; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
6115; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v6, vcc
6116; GCN-NEXT:    v_mul_lo_u32 v6, v1, v5
6117; GCN-NEXT:    v_mul_hi_u32 v5, v1, v5
6118; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
6119; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
6120; GCN-NEXT:    v_mov_b32_e32 v4, 0
6121; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
6122; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6123; GCN-NEXT:    v_mov_b32_e32 v6, 0
6124; GCN-NEXT:    v_add_i32_e64 v0, s[2:3], v0, v2
6125; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
6126; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[2:3]
6127; GCN-NEXT:    v_mul_lo_u32 v5, s6, v2
6128; GCN-NEXT:    v_mul_hi_u32 v7, s6, v0
6129; GCN-NEXT:    v_mul_lo_u32 v8, s7, v0
6130; GCN-NEXT:    s_mov_b32 s7, 0xf000
6131; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
6132; GCN-NEXT:    v_mul_lo_u32 v7, s6, v0
6133; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
6134; GCN-NEXT:    v_mul_lo_u32 v10, v0, v5
6135; GCN-NEXT:    v_mul_hi_u32 v12, v0, v5
6136; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
6137; GCN-NEXT:    v_mul_hi_u32 v9, v2, v7
6138; GCN-NEXT:    v_mul_lo_u32 v7, v2, v7
6139; GCN-NEXT:    v_mul_hi_u32 v8, v2, v5
6140; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
6141; GCN-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
6142; GCN-NEXT:    v_mul_lo_u32 v2, v2, v5
6143; GCN-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
6144; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
6145; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
6146; GCN-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
6147; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
6148; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
6149; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[2:3]
6150; GCN-NEXT:    s_waitcnt lgkmcnt(0)
6151; GCN-NEXT:    s_ashr_i32 s2, s9, 31
6152; GCN-NEXT:    s_add_u32 s0, s8, s2
6153; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
6154; GCN-NEXT:    s_mov_b32 s3, s2
6155; GCN-NEXT:    s_addc_u32 s1, s9, s2
6156; GCN-NEXT:    s_xor_b64 s[8:9], s[0:1], s[2:3]
6157; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
6158; GCN-NEXT:    v_mul_lo_u32 v2, s8, v1
6159; GCN-NEXT:    v_mul_hi_u32 v3, s8, v0
6160; GCN-NEXT:    v_mul_hi_u32 v5, s8, v1
6161; GCN-NEXT:    v_mul_hi_u32 v7, s9, v1
6162; GCN-NEXT:    v_mul_lo_u32 v1, s9, v1
6163; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6164; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
6165; GCN-NEXT:    v_mul_lo_u32 v5, s9, v0
6166; GCN-NEXT:    v_mul_hi_u32 v0, s9, v0
6167; GCN-NEXT:    s_xor_b64 s[2:3], s[2:3], s[16:17]
6168; GCN-NEXT:    s_mov_b32 s6, -1
6169; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
6170; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
6171; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
6172; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
6173; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
6174; GCN-NEXT:    v_mul_lo_u32 v2, s14, v1
6175; GCN-NEXT:    v_mul_hi_u32 v3, s14, v0
6176; GCN-NEXT:    v_mul_lo_u32 v5, s15, v0
6177; GCN-NEXT:    v_mov_b32_e32 v7, s15
6178; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6179; GCN-NEXT:    v_mul_lo_u32 v3, s14, v0
6180; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
6181; GCN-NEXT:    v_sub_i32_e32 v5, vcc, s9, v2
6182; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s8, v3
6183; GCN-NEXT:    v_subb_u32_e64 v5, s[0:1], v5, v7, vcc
6184; GCN-NEXT:    v_subrev_i32_e64 v7, s[0:1], s14, v3
6185; GCN-NEXT:    v_subbrev_u32_e64 v5, s[0:1], 0, v5, s[0:1]
6186; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v5
6187; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
6188; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v7
6189; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
6190; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s15, v5
6191; GCN-NEXT:    v_cndmask_b32_e64 v5, v8, v7, s[0:1]
6192; GCN-NEXT:    v_add_i32_e64 v7, s[0:1], 2, v0
6193; GCN-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
6194; GCN-NEXT:    v_add_i32_e64 v9, s[0:1], 1, v0
6195; GCN-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v1, s[0:1]
6196; GCN-NEXT:    s_ashr_i32 s8, s13, 31
6197; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
6198; GCN-NEXT:    s_add_u32 s12, s12, s8
6199; GCN-NEXT:    v_cndmask_b32_e64 v5, v10, v8, s[0:1]
6200; GCN-NEXT:    v_mov_b32_e32 v8, s9
6201; GCN-NEXT:    s_mov_b32 s9, s8
6202; GCN-NEXT:    s_addc_u32 s13, s13, s8
6203; GCN-NEXT:    s_xor_b64 s[12:13], s[12:13], s[8:9]
6204; GCN-NEXT:    v_cvt_f32_u32_e32 v10, s12
6205; GCN-NEXT:    v_cvt_f32_u32_e32 v11, s13
6206; GCN-NEXT:    v_subb_u32_e32 v2, vcc, v8, v2, vcc
6207; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s15, v2
6208; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
6209; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s14, v3
6210; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
6211; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s15, v2
6212; GCN-NEXT:    v_mac_f32_e32 v10, s18, v11
6213; GCN-NEXT:    v_cndmask_b32_e32 v2, v8, v3, vcc
6214; GCN-NEXT:    v_rcp_f32_e32 v3, v10
6215; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
6216; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
6217; GCN-NEXT:    s_sub_u32 s14, 0, s12
6218; GCN-NEXT:    v_mul_f32_e32 v3, s19, v3
6219; GCN-NEXT:    v_mul_f32_e32 v5, s20, v3
6220; GCN-NEXT:    v_trunc_f32_e32 v5, v5
6221; GCN-NEXT:    v_mac_f32_e32 v3, s21, v5
6222; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
6223; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v5
6224; GCN-NEXT:    v_cndmask_b32_e64 v2, v9, v7, s[0:1]
6225; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6226; GCN-NEXT:    v_mul_hi_u32 v2, s14, v3
6227; GCN-NEXT:    v_mul_lo_u32 v7, s14, v5
6228; GCN-NEXT:    s_subb_u32 s15, 0, s13
6229; GCN-NEXT:    v_mul_lo_u32 v8, s15, v3
6230; GCN-NEXT:    v_xor_b32_e32 v0, s2, v0
6231; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
6232; GCN-NEXT:    v_mul_lo_u32 v7, s14, v3
6233; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
6234; GCN-NEXT:    v_mul_lo_u32 v8, v3, v2
6235; GCN-NEXT:    v_mul_hi_u32 v10, v3, v2
6236; GCN-NEXT:    v_mul_hi_u32 v9, v3, v7
6237; GCN-NEXT:    v_mul_hi_u32 v11, v5, v2
6238; GCN-NEXT:    v_mul_lo_u32 v2, v5, v2
6239; GCN-NEXT:    v_xor_b32_e32 v1, s3, v1
6240; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
6241; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
6242; GCN-NEXT:    v_mul_lo_u32 v10, v5, v7
6243; GCN-NEXT:    v_mul_hi_u32 v7, v5, v7
6244; GCN-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
6245; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v9, v7, vcc
6246; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v11, v4, vcc
6247; GCN-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
6248; GCN-NEXT:    v_add_i32_e64 v2, s[0:1], v3, v2
6249; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v6, v8, vcc
6250; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v5, v7, s[0:1]
6251; GCN-NEXT:    v_mul_lo_u32 v8, s14, v3
6252; GCN-NEXT:    v_mul_hi_u32 v9, s14, v2
6253; GCN-NEXT:    v_mul_lo_u32 v10, s15, v2
6254; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
6255; GCN-NEXT:    v_mul_lo_u32 v9, s14, v2
6256; GCN-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
6257; GCN-NEXT:    v_mul_lo_u32 v12, v2, v8
6258; GCN-NEXT:    v_mul_hi_u32 v14, v2, v8
6259; GCN-NEXT:    v_mul_hi_u32 v13, v2, v9
6260; GCN-NEXT:    v_mul_hi_u32 v11, v3, v9
6261; GCN-NEXT:    v_mul_lo_u32 v9, v3, v9
6262; GCN-NEXT:    v_mul_hi_u32 v10, v3, v8
6263; GCN-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
6264; GCN-NEXT:    v_addc_u32_e32 v13, vcc, 0, v14, vcc
6265; GCN-NEXT:    v_mul_lo_u32 v3, v3, v8
6266; GCN-NEXT:    v_add_i32_e32 v9, vcc, v12, v9
6267; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v13, v11, vcc
6268; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v10, v4, vcc
6269; GCN-NEXT:    v_add_i32_e32 v3, vcc, v9, v3
6270; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v6, v8, vcc
6271; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
6272; GCN-NEXT:    s_ashr_i32 s14, s11, 31
6273; GCN-NEXT:    v_addc_u32_e64 v5, vcc, v5, v8, s[0:1]
6274; GCN-NEXT:    s_add_u32 s0, s10, s14
6275; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
6276; GCN-NEXT:    s_mov_b32 s15, s14
6277; GCN-NEXT:    s_addc_u32 s1, s11, s14
6278; GCN-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
6279; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
6280; GCN-NEXT:    v_mul_lo_u32 v5, s10, v3
6281; GCN-NEXT:    v_mul_hi_u32 v7, s10, v2
6282; GCN-NEXT:    v_mul_hi_u32 v9, s10, v3
6283; GCN-NEXT:    v_mul_hi_u32 v10, s11, v3
6284; GCN-NEXT:    v_mul_lo_u32 v3, s11, v3
6285; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
6286; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v9, vcc
6287; GCN-NEXT:    v_mul_lo_u32 v9, s11, v2
6288; GCN-NEXT:    v_mul_hi_u32 v2, s11, v2
6289; GCN-NEXT:    v_mov_b32_e32 v8, s3
6290; GCN-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
6291; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v2, vcc
6292; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v10, v4, vcc
6293; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
6294; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v4, vcc
6295; GCN-NEXT:    v_mul_lo_u32 v4, s12, v3
6296; GCN-NEXT:    v_mul_hi_u32 v5, s12, v2
6297; GCN-NEXT:    v_mul_lo_u32 v6, s13, v2
6298; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
6299; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v8, vcc
6300; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
6301; GCN-NEXT:    v_mul_lo_u32 v5, s12, v2
6302; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
6303; GCN-NEXT:    v_sub_i32_e32 v6, vcc, s11, v4
6304; GCN-NEXT:    v_mov_b32_e32 v7, s13
6305; GCN-NEXT:    v_sub_i32_e32 v5, vcc, s10, v5
6306; GCN-NEXT:    v_subb_u32_e64 v6, s[0:1], v6, v7, vcc
6307; GCN-NEXT:    v_subrev_i32_e64 v7, s[0:1], s12, v5
6308; GCN-NEXT:    v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1]
6309; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v6
6310; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
6311; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v7
6312; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
6313; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v6
6314; GCN-NEXT:    v_cndmask_b32_e64 v6, v8, v7, s[0:1]
6315; GCN-NEXT:    v_add_i32_e64 v7, s[0:1], 2, v2
6316; GCN-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1]
6317; GCN-NEXT:    v_add_i32_e64 v9, s[0:1], 1, v2
6318; GCN-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
6319; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
6320; GCN-NEXT:    v_cndmask_b32_e64 v6, v10, v8, s[0:1]
6321; GCN-NEXT:    v_mov_b32_e32 v8, s11
6322; GCN-NEXT:    v_subb_u32_e32 v4, vcc, v8, v4, vcc
6323; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s13, v4
6324; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
6325; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v5
6326; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
6327; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v4
6328; GCN-NEXT:    v_cndmask_b32_e32 v4, v8, v5, vcc
6329; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
6330; GCN-NEXT:    v_cndmask_b32_e64 v4, v9, v7, s[0:1]
6331; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
6332; GCN-NEXT:    s_xor_b64 s[0:1], s[14:15], s[8:9]
6333; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
6334; GCN-NEXT:    v_xor_b32_e32 v2, s0, v2
6335; GCN-NEXT:    v_xor_b32_e32 v3, s1, v3
6336; GCN-NEXT:    v_mov_b32_e32 v4, s1
6337; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s0, v2
6338; GCN-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
6339; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
6340; GCN-NEXT:    s_endpgm
6341  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
6342  %r = sdiv <2 x i64> %x, %shl.y
6343  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
6344  ret void
6345}
6346
; Signed i64 remainder by the odd constant 1235195 (0x12d8fb).
; The opt assertions expect the srem to pass through the IR pass unchanged.
; The llc assertions expect a long v_rcp_f32-based multiply/subtract/select
; expansion, with the divisor materialized as 0x12d8fb and the sign of %x
; (s_ashr_i32 by 31) re-applied to the result via xor/subtract at the end.
; The assertion lines are autogenerated (see the NOTE lines at the top of the
; file); regenerate them with the update scripts instead of editing by hand.
6347define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
6348; CHECK-LABEL: @srem_i64_oddk_denom(
6349; CHECK-NEXT:    [[R:%.*]] = srem i64 [[X:%.*]], 1235195
6350; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
6351; CHECK-NEXT:    ret void
6352;
6353; GCN-LABEL: srem_i64_oddk_denom:
6354; GCN:       ; %bb.0:
6355; GCN-NEXT:    v_mov_b32_e32 v0, 0x4f800000
6356; GCN-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
6357; GCN-NEXT:    v_rcp_f32_e32 v0, v0
6358; GCN-NEXT:    s_mov_b32 s2, 0xffed2705
6359; GCN-NEXT:    v_mov_b32_e32 v8, 0
6360; GCN-NEXT:    v_mov_b32_e32 v7, 0
6361; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
6362; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
6363; GCN-NEXT:    v_trunc_f32_e32 v1, v1
6364; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
6365; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
6366; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
6367; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
6368; GCN-NEXT:    s_mov_b32 s7, 0xf000
6369; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
6370; GCN-NEXT:    v_mul_lo_u32 v2, v1, s2
6371; GCN-NEXT:    v_mul_lo_u32 v4, v0, s2
6372; GCN-NEXT:    s_mov_b32 s6, -1
6373; GCN-NEXT:    s_waitcnt lgkmcnt(0)
6374; GCN-NEXT:    s_mov_b32 s4, s8
6375; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6376; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
6377; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
6378; GCN-NEXT:    v_mul_hi_u32 v6, v0, v4
6379; GCN-NEXT:    v_mul_hi_u32 v3, v0, v2
6380; GCN-NEXT:    v_mul_hi_u32 v9, v1, v2
6381; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
6382; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
6383; GCN-NEXT:    v_mul_lo_u32 v6, v1, v4
6384; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
6385; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v3, vcc
6386; GCN-NEXT:    s_mov_b32 s5, s9
6387; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
6388; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
6389; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
6390; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6391; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
6392; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
6393; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
6394; GCN-NEXT:    v_mul_lo_u32 v4, v2, s2
6395; GCN-NEXT:    v_mul_hi_u32 v5, s2, v0
6396; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
6397; GCN-NEXT:    v_mul_lo_u32 v5, v0, s2
6398; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, v0, v4
6399; GCN-NEXT:    v_mul_lo_u32 v10, v0, v4
6400; GCN-NEXT:    v_mul_hi_u32 v12, v0, v4
6401; GCN-NEXT:    v_mul_hi_u32 v11, v0, v5
6402; GCN-NEXT:    v_mul_hi_u32 v9, v2, v5
6403; GCN-NEXT:    v_mul_lo_u32 v5, v2, v5
6404; GCN-NEXT:    v_mul_hi_u32 v6, v2, v4
6405; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
6406; GCN-NEXT:    v_addc_u32_e32 v11, vcc, v8, v12, vcc
6407; GCN-NEXT:    v_mul_lo_u32 v2, v2, v4
6408; GCN-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
6409; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v11, v9, vcc
6410; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v6, v7, vcc
6411; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
6412; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
6413; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
6414; GCN-NEXT:    s_ashr_i32 s2, s11, 31
6415; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[0:1]
6416; GCN-NEXT:    s_add_u32 s0, s10, s2
6417; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
6418; GCN-NEXT:    s_mov_b32 s3, s2
6419; GCN-NEXT:    s_addc_u32 s1, s11, s2
6420; GCN-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
6421; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
6422; GCN-NEXT:    v_mul_lo_u32 v2, s0, v1
6423; GCN-NEXT:    v_mul_hi_u32 v3, s0, v0
6424; GCN-NEXT:    v_mul_hi_u32 v4, s0, v1
6425; GCN-NEXT:    v_mul_hi_u32 v5, s1, v1
6426; GCN-NEXT:    v_mul_lo_u32 v1, s1, v1
6427; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6428; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
6429; GCN-NEXT:    v_mul_lo_u32 v4, s1, v0
6430; GCN-NEXT:    v_mul_hi_u32 v0, s1, v0
6431; GCN-NEXT:    s_mov_b32 s3, 0x12d8fb
6432; GCN-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
6433; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
6434; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
6435; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
6436; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
6437; GCN-NEXT:    v_mul_hi_u32 v2, s3, v0
6438; GCN-NEXT:    v_mul_lo_u32 v1, v1, s3
6439; GCN-NEXT:    v_mul_lo_u32 v0, v0, s3
6440; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
6441; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
6442; GCN-NEXT:    v_mov_b32_e32 v2, s1
6443; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
6444; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v0
6445; GCN-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v1, vcc
6446; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, s3, v2
6447; GCN-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v3, vcc
6448; GCN-NEXT:    s_mov_b32 s0, 0x12d8fa
6449; GCN-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v2
6450; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
6451; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
6452; GCN-NEXT:    v_cndmask_b32_e32 v6, -1, v6, vcc
6453; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
6454; GCN-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v0
6455; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
6456; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
6457; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v1
6458; GCN-NEXT:    v_cndmask_b32_e64 v5, -1, v5, s[0:1]
6459; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
6460; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
6461; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
6462; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
6463; GCN-NEXT:    v_xor_b32_e32 v0, s2, v0
6464; GCN-NEXT:    v_xor_b32_e32 v1, s2, v1
6465; GCN-NEXT:    v_mov_b32_e32 v2, s2
6466; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
6467; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
6468; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
6469; GCN-NEXT:    s_endpgm
6470  %r = srem i64 %x, 1235195
6471  store i64 %r, i64 addrspace(1)* %out
6472  ret void
6473}
6474
; Signed i64 remainder by the power-of-two constant 4096.
; The opt assertions expect the srem to pass through the IR pass unchanged.
; The llc assertions expect a short scalar-only sequence with no reciprocal:
; a bias derived from the sign of %x (s_ashr_i32 by 31, then s_lshr_b32 by 20)
; is added to %x, the low 12 bits of that sum are masked off with 0xfffff000,
; and the masked value is subtracted from the original %x.
; The assertion lines are autogenerated (see the NOTE lines at the top of the
; file); regenerate them with the update scripts instead of editing by hand.
6475define amdgpu_kernel void @srem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) {
6476; CHECK-LABEL: @srem_i64_pow2k_denom(
6477; CHECK-NEXT:    [[R:%.*]] = srem i64 [[X:%.*]], 4096
6478; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
6479; CHECK-NEXT:    ret void
6480;
6481; GCN-LABEL: srem_i64_pow2k_denom:
6482; GCN:       ; %bb.0:
6483; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
6484; GCN-NEXT:    s_mov_b32 s3, 0xf000
6485; GCN-NEXT:    s_mov_b32 s2, -1
6486; GCN-NEXT:    s_waitcnt lgkmcnt(0)
6487; GCN-NEXT:    s_mov_b32 s0, s4
6488; GCN-NEXT:    s_ashr_i32 s4, s7, 31
6489; GCN-NEXT:    s_lshr_b32 s4, s4, 20
6490; GCN-NEXT:    s_add_u32 s4, s6, s4
6491; GCN-NEXT:    s_mov_b32 s1, s5
6492; GCN-NEXT:    s_addc_u32 s5, s7, 0
6493; GCN-NEXT:    s_and_b32 s4, s4, 0xfffff000
6494; GCN-NEXT:    s_sub_u32 s4, s6, s4
6495; GCN-NEXT:    s_subb_u32 s5, s7, s5
6496; GCN-NEXT:    v_mov_b32_e32 v0, s4
6497; GCN-NEXT:    v_mov_b32_e32 v1, s5
6498; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
6499; GCN-NEXT:    s_endpgm
6500  %r = srem i64 %x, 4096
6501  store i64 %r, i64 addrspace(1)* %out
6502  ret void
6503}
6504
; Signed i64 remainder where the divisor is 4096 shifted left by the runtime
; value %y, so the divisor is not a compile-time constant.
; The opt assertions expect both the shl and the srem to pass through the IR
; pass unchanged. The llc assertions expect the divisor to be built with
; s_lshl_b64 from 0x1000, followed by a long v_rcp_f32-based
; multiply/subtract/select expansion, with the sign of %x (s_ashr_i32 by 31)
; re-applied to the result via xor/subtract at the end.
; The assertion lines are autogenerated (see the NOTE lines at the top of the
; file); regenerate them with the update scripts instead of editing by hand.
6505define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) {
6506; CHECK-LABEL: @srem_i64_pow2_shl_denom(
6507; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]]
6508; CHECK-NEXT:    [[R:%.*]] = srem i64 [[X:%.*]], [[SHL_Y]]
6509; CHECK-NEXT:    store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4
6510; CHECK-NEXT:    ret void
6511;
6512; GCN-LABEL: srem_i64_pow2_shl_denom:
6513; GCN:       ; %bb.0:
6514; GCN-NEXT:    s_load_dword s4, s[0:1], 0xd
6515; GCN-NEXT:    s_mov_b32 s3, 0
6516; GCN-NEXT:    s_movk_i32 s2, 0x1000
6517; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
6518; GCN-NEXT:    s_mov_b32 s7, 0xf000
6519; GCN-NEXT:    s_waitcnt lgkmcnt(0)
6520; GCN-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
6521; GCN-NEXT:    s_ashr_i32 s4, s3, 31
6522; GCN-NEXT:    s_add_u32 s2, s2, s4
6523; GCN-NEXT:    s_mov_b32 s5, s4
6524; GCN-NEXT:    s_addc_u32 s3, s3, s4
6525; GCN-NEXT:    s_xor_b64 s[12:13], s[2:3], s[4:5]
6526; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s12
6527; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s13
6528; GCN-NEXT:    s_sub_u32 s2, 0, s12
6529; GCN-NEXT:    s_subb_u32 s3, 0, s13
6530; GCN-NEXT:    s_ashr_i32 s14, s11, 31
6531; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
6532; GCN-NEXT:    v_rcp_f32_e32 v0, v0
6533; GCN-NEXT:    s_mov_b32 s15, s14
6534; GCN-NEXT:    s_mov_b32 s6, -1
6535; GCN-NEXT:    s_mov_b32 s4, s8
6536; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
6537; GCN-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
6538; GCN-NEXT:    v_trunc_f32_e32 v1, v1
6539; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
6540; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
6541; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
6542; GCN-NEXT:    s_mov_b32 s5, s9
6543; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
6544; GCN-NEXT:    v_mul_lo_u32 v2, s2, v1
6545; GCN-NEXT:    v_mul_lo_u32 v5, s3, v0
6546; GCN-NEXT:    v_mul_lo_u32 v4, s2, v0
6547; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6548; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
6549; GCN-NEXT:    v_mul_hi_u32 v3, v0, v4
6550; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
6551; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
6552; GCN-NEXT:    v_mul_hi_u32 v7, v1, v2
6553; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
6554; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
6555; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
6556; GCN-NEXT:    v_mul_lo_u32 v6, v1, v4
6557; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
6558; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
6559; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
6560; GCN-NEXT:    v_mov_b32_e32 v4, 0
6561; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
6562; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6563; GCN-NEXT:    v_mov_b32_e32 v6, 0
6564; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
6565; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
6566; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
6567; GCN-NEXT:    v_mul_lo_u32 v5, s2, v2
6568; GCN-NEXT:    v_mul_hi_u32 v7, s2, v0
6569; GCN-NEXT:    v_mul_lo_u32 v8, s3, v0
6570; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
6571; GCN-NEXT:    v_mul_lo_u32 v7, s2, v0
6572; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
6573; GCN-NEXT:    v_mul_lo_u32 v10, v0, v5
6574; GCN-NEXT:    v_mul_hi_u32 v12, v0, v5
6575; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
6576; GCN-NEXT:    v_mul_hi_u32 v9, v2, v7
6577; GCN-NEXT:    v_mul_lo_u32 v7, v2, v7
6578; GCN-NEXT:    v_mul_hi_u32 v8, v2, v5
6579; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
6580; GCN-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
6581; GCN-NEXT:    v_mul_lo_u32 v2, v2, v5
6582; GCN-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
6583; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
6584; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
6585; GCN-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
6586; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
6587; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
6588; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[0:1]
6589; GCN-NEXT:    s_add_u32 s0, s10, s14
6590; GCN-NEXT:    s_addc_u32 s1, s11, s14
6591; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
6592; GCN-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
6593; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
6594; GCN-NEXT:    v_mul_lo_u32 v2, s10, v1
6595; GCN-NEXT:    v_mul_hi_u32 v3, s10, v0
6596; GCN-NEXT:    v_mul_hi_u32 v5, s10, v1
6597; GCN-NEXT:    v_mul_hi_u32 v7, s11, v1
6598; GCN-NEXT:    v_mul_lo_u32 v1, s11, v1
6599; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6600; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
6601; GCN-NEXT:    v_mul_lo_u32 v5, s11, v0
6602; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
6603; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
6604; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
6605; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
6606; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
6607; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
6608; GCN-NEXT:    v_mul_lo_u32 v1, s12, v1
6609; GCN-NEXT:    v_mul_hi_u32 v2, s12, v0
6610; GCN-NEXT:    v_mul_lo_u32 v3, s13, v0
6611; GCN-NEXT:    v_mul_lo_u32 v0, s12, v0
6612; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
6613; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
6614; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s11, v1
6615; GCN-NEXT:    v_mov_b32_e32 v3, s13
6616; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s10, v0
6617; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
6618; GCN-NEXT:    v_subrev_i32_e64 v4, s[0:1], s12, v0
6619; GCN-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
6620; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s13, v5
6621; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
6622; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
6623; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v4
6624; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s12, v4
6625; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
6626; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s13, v5
6627; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
6628; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
6629; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
6630; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
6631; GCN-NEXT:    v_mov_b32_e32 v5, s11
6632; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
6633; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s13, v1
6634; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
6635; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
6636; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
6637; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v1
6638; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
6639; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
6640; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6641; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
6642; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6643; GCN-NEXT:    v_xor_b32_e32 v0, s14, v0
6644; GCN-NEXT:    v_xor_b32_e32 v1, s14, v1
6645; GCN-NEXT:    v_mov_b32_e32 v2, s14
6646; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s14, v0
6647; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
6648; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
6649; GCN-NEXT:    s_endpgm
6650  %shl.y = shl i64 4096, %y
6651  %r = srem i64 %x, %shl.y
6652  store i64 %r, i64 addrspace(1)* %out
6653  ret void
6654}
6655
; Test: signed remainder of a <2 x i64> value by the splat constant
; <4096, 4096>, stored to %out.
;
; The opt-side assertions expect the vector srem to be split into one scalar
; "srem i64 ..., 4096" per element (extractelement / srem / insertelement).
; The llc-side assertions expect, for each element, a short scalar sequence
; (s_ashr_i32 by 31, s_lshr_b32 by 20, add with carry, s_and_b32 with the
; 0xf000 s_movk_i32 constant, subtract with borrow) and no v_rcp_f32 or
; v_mul_* instructions.
6656define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
6657; CHECK-LABEL: @srem_v2i64_pow2k_denom(
6658; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
6659; CHECK-NEXT:    [[TMP2:%.*]] = srem i64 [[TMP1]], 4096
6660; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
6661; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
6662; CHECK-NEXT:    [[TMP5:%.*]] = srem i64 [[TMP4]], 4096
6663; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
6664; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
6665; CHECK-NEXT:    ret void
6666;
6667; GCN-LABEL: srem_v2i64_pow2k_denom:
6668; GCN:       ; %bb.0:
6669; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6670; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
6671; GCN-NEXT:    s_movk_i32 s8, 0xf000
6672; GCN-NEXT:    s_mov_b32 s7, 0xf000
6673; GCN-NEXT:    s_mov_b32 s6, -1
6674; GCN-NEXT:    s_waitcnt lgkmcnt(0)
6675; GCN-NEXT:    s_ashr_i32 s9, s1, 31
6676; GCN-NEXT:    s_lshr_b32 s9, s9, 20
6677; GCN-NEXT:    s_add_u32 s9, s0, s9
6678; GCN-NEXT:    s_addc_u32 s10, s1, 0
6679; GCN-NEXT:    s_and_b32 s9, s9, s8
6680; GCN-NEXT:    s_sub_u32 s0, s0, s9
6681; GCN-NEXT:    s_subb_u32 s1, s1, s10
6682; GCN-NEXT:    s_ashr_i32 s9, s3, 31
6683; GCN-NEXT:    s_lshr_b32 s9, s9, 20
6684; GCN-NEXT:    s_add_u32 s9, s2, s9
6685; GCN-NEXT:    s_addc_u32 s10, s3, 0
6686; GCN-NEXT:    s_and_b32 s8, s9, s8
6687; GCN-NEXT:    s_sub_u32 s2, s2, s8
6688; GCN-NEXT:    s_subb_u32 s3, s3, s10
6689; GCN-NEXT:    v_mov_b32_e32 v0, s0
6690; GCN-NEXT:    v_mov_b32_e32 v1, s1
6691; GCN-NEXT:    v_mov_b32_e32 v2, s2
6692; GCN-NEXT:    v_mov_b32_e32 v3, s3
6693; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
6694; GCN-NEXT:    s_endpgm
; Both lanes divide by the same compile-time constant, 4096.
6695  %r = srem <2 x i64> %x, <i64 4096, i64 4096>
6696  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
6697  ret void
6698}
6699
; Test: signed remainder of a <2 x i64> value by <4096, 4096> shifted left by
; the runtime vector %y, stored to %out. Unlike the constant-divisor variant,
; the divisor here is only known at run time.
;
; The opt-side assertions expect the shl to remain a vector operation and the
; srem to be split into one scalar srem per element, each taking its divisor
; from an extractelement of the shifted vector.
; The llc-side assertions expect a long v_rcp_f32 / v_mul_hi_u32 /
; v_mul_lo_u32 based sequence to appear once for each of the two elements,
; followed by a single buffer_store_dwordx4 of the combined result.
6700define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
6701; CHECK-LABEL: @srem_v2i64_pow2_shl_denom(
6702; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
6703; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
6704; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
6705; CHECK-NEXT:    [[TMP3:%.*]] = srem i64 [[TMP1]], [[TMP2]]
6706; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0
6707; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
6708; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
6709; CHECK-NEXT:    [[TMP7:%.*]] = srem i64 [[TMP5]], [[TMP6]]
6710; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
6711; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
6712; CHECK-NEXT:    ret void
6713;
6714; GCN-LABEL: srem_v2i64_pow2_shl_denom:
6715; GCN:       ; %bb.0:
6716; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x11
6717; GCN-NEXT:    s_mov_b32 s3, 0
6718; GCN-NEXT:    s_movk_i32 s2, 0x1000
6719; GCN-NEXT:    s_mov_b32 s18, 0x4f800000
6720; GCN-NEXT:    s_mov_b32 s19, 0x5f7ffffc
6721; GCN-NEXT:    s_waitcnt lgkmcnt(0)
6722; GCN-NEXT:    s_lshl_b64 s[14:15], s[2:3], s6
6723; GCN-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
6724; GCN-NEXT:    s_ashr_i32 s4, s3, 31
6725; GCN-NEXT:    s_add_u32 s2, s2, s4
6726; GCN-NEXT:    s_mov_b32 s5, s4
6727; GCN-NEXT:    s_addc_u32 s3, s3, s4
6728; GCN-NEXT:    s_xor_b64 s[16:17], s[2:3], s[4:5]
6729; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s16
6730; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s17
6731; GCN-NEXT:    s_mov_b32 s20, 0x2f800000
6732; GCN-NEXT:    s_mov_b32 s21, 0xcf800000
6733; GCN-NEXT:    s_sub_u32 s6, 0, s16
6734; GCN-NEXT:    v_mac_f32_e32 v0, s18, v1
6735; GCN-NEXT:    v_rcp_f32_e32 v0, v0
6736; GCN-NEXT:    s_subb_u32 s7, 0, s17
6737; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
6738; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
6739; GCN-NEXT:    v_mul_f32_e32 v0, s19, v0
6740; GCN-NEXT:    v_mul_f32_e32 v1, s20, v0
6741; GCN-NEXT:    v_trunc_f32_e32 v1, v1
6742; GCN-NEXT:    v_mac_f32_e32 v0, s21, v1
6743; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
6744; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
6745; GCN-NEXT:    s_waitcnt lgkmcnt(0)
6746; GCN-NEXT:    s_ashr_i32 s12, s9, 31
6747; GCN-NEXT:    s_add_u32 s0, s8, s12
6748; GCN-NEXT:    v_mul_hi_u32 v3, s6, v0
6749; GCN-NEXT:    v_mul_lo_u32 v2, s6, v1
6750; GCN-NEXT:    v_mul_lo_u32 v4, s7, v0
6751; GCN-NEXT:    v_mul_lo_u32 v5, s6, v0
6752; GCN-NEXT:    s_mov_b32 s13, s12
6753; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6754; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
6755; GCN-NEXT:    v_mul_lo_u32 v3, v0, v2
6756; GCN-NEXT:    v_mul_hi_u32 v4, v0, v5
6757; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
6758; GCN-NEXT:    v_mul_hi_u32 v7, v1, v2
6759; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
6760; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
6761; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v6, vcc
6762; GCN-NEXT:    v_mul_lo_u32 v6, v1, v5
6763; GCN-NEXT:    v_mul_hi_u32 v5, v1, v5
6764; GCN-NEXT:    s_addc_u32 s1, s9, s12
6765; GCN-NEXT:    s_xor_b64 s[8:9], s[0:1], s[12:13]
6766; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
6767; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
6768; GCN-NEXT:    v_mov_b32_e32 v4, 0
6769; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
6770; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6771; GCN-NEXT:    v_mov_b32_e32 v6, 0
6772; GCN-NEXT:    v_add_i32_e64 v0, s[2:3], v0, v2
6773; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
6774; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[2:3]
6775; GCN-NEXT:    v_mul_lo_u32 v5, s6, v2
6776; GCN-NEXT:    v_mul_hi_u32 v7, s6, v0
6777; GCN-NEXT:    v_mul_lo_u32 v8, s7, v0
6778; GCN-NEXT:    s_mov_b32 s7, 0xf000
6779; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
6780; GCN-NEXT:    v_mul_lo_u32 v7, s6, v0
6781; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
6782; GCN-NEXT:    v_mul_lo_u32 v10, v0, v5
6783; GCN-NEXT:    v_mul_hi_u32 v12, v0, v5
6784; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
6785; GCN-NEXT:    v_mul_hi_u32 v9, v2, v7
6786; GCN-NEXT:    v_mul_lo_u32 v7, v2, v7
6787; GCN-NEXT:    v_mul_hi_u32 v8, v2, v5
6788; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
6789; GCN-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
6790; GCN-NEXT:    v_mul_lo_u32 v2, v2, v5
6791; GCN-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
6792; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
6793; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
6794; GCN-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
6795; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
6796; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
6797; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[2:3]
6798; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
6799; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
6800; GCN-NEXT:    v_mul_lo_u32 v2, s8, v1
6801; GCN-NEXT:    v_mul_hi_u32 v3, s8, v0
6802; GCN-NEXT:    v_mul_hi_u32 v5, s8, v1
6803; GCN-NEXT:    v_mul_hi_u32 v7, s9, v1
6804; GCN-NEXT:    v_mul_lo_u32 v1, s9, v1
6805; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
6806; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
6807; GCN-NEXT:    v_mul_lo_u32 v5, s9, v0
6808; GCN-NEXT:    v_mul_hi_u32 v0, s9, v0
6809; GCN-NEXT:    s_mov_b32 s6, -1
6810; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
6811; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
6812; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
6813; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
6814; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
6815; GCN-NEXT:    v_mul_lo_u32 v1, s16, v1
6816; GCN-NEXT:    v_mul_hi_u32 v2, s16, v0
6817; GCN-NEXT:    v_mul_lo_u32 v3, s17, v0
6818; GCN-NEXT:    v_mul_lo_u32 v0, s16, v0
6819; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
6820; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
6821; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s9, v1
6822; GCN-NEXT:    v_mov_b32_e32 v3, s17
6823; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s8, v0
6824; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
6825; GCN-NEXT:    v_subrev_i32_e64 v5, s[0:1], s16, v0
6826; GCN-NEXT:    v_subbrev_u32_e64 v7, s[2:3], 0, v2, s[0:1]
6827; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s17, v7
6828; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
6829; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
6830; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s16, v5
6831; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s16, v5
6832; GCN-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
6833; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s17, v7
6834; GCN-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
6835; GCN-NEXT:    s_ashr_i32 s2, s15, 31
6836; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
6837; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
6838; GCN-NEXT:    s_add_u32 s8, s14, s2
6839; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v2, s[0:1]
6840; GCN-NEXT:    v_mov_b32_e32 v7, s9
6841; GCN-NEXT:    s_mov_b32 s3, s2
6842; GCN-NEXT:    s_addc_u32 s9, s15, s2
6843; GCN-NEXT:    s_xor_b64 s[8:9], s[8:9], s[2:3]
6844; GCN-NEXT:    v_cvt_f32_u32_e32 v8, s8
6845; GCN-NEXT:    v_cvt_f32_u32_e32 v9, s9
6846; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v7, v1, vcc
6847; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s17, v1
6848; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
6849; GCN-NEXT:    v_mac_f32_e32 v8, s18, v9
6850; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s16, v0
6851; GCN-NEXT:    v_rcp_f32_e32 v8, v8
6852; GCN-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
6853; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s17, v1
6854; GCN-NEXT:    v_cndmask_b32_e32 v7, v7, v10, vcc
6855; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
6856; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
6857; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v3, s[0:1]
6858; GCN-NEXT:    v_mul_f32_e32 v3, s19, v8
6859; GCN-NEXT:    v_mul_f32_e32 v5, s20, v3
6860; GCN-NEXT:    v_trunc_f32_e32 v5, v5
6861; GCN-NEXT:    v_mac_f32_e32 v3, s21, v5
6862; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
6863; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v5
6864; GCN-NEXT:    s_sub_u32 s2, 0, s8
6865; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
6866; GCN-NEXT:    v_mul_hi_u32 v2, s2, v3
6867; GCN-NEXT:    v_mul_lo_u32 v7, s2, v5
6868; GCN-NEXT:    s_subb_u32 s3, 0, s9
6869; GCN-NEXT:    v_mul_lo_u32 v8, s3, v3
6870; GCN-NEXT:    s_ashr_i32 s14, s11, 31
6871; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
6872; GCN-NEXT:    v_mul_lo_u32 v7, s2, v3
6873; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
6874; GCN-NEXT:    v_mul_lo_u32 v8, v3, v2
6875; GCN-NEXT:    v_mul_hi_u32 v10, v3, v2
6876; GCN-NEXT:    v_mul_hi_u32 v9, v3, v7
6877; GCN-NEXT:    v_mul_hi_u32 v11, v5, v2
6878; GCN-NEXT:    v_mul_lo_u32 v2, v5, v2
6879; GCN-NEXT:    s_mov_b32 s15, s14
6880; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
6881; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
6882; GCN-NEXT:    v_mul_lo_u32 v10, v5, v7
6883; GCN-NEXT:    v_mul_hi_u32 v7, v5, v7
6884; GCN-NEXT:    v_xor_b32_e32 v0, s12, v0
6885; GCN-NEXT:    v_xor_b32_e32 v1, s12, v1
6886; GCN-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
6887; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v9, v7, vcc
6888; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v11, v4, vcc
6889; GCN-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
6890; GCN-NEXT:    v_add_i32_e64 v2, s[0:1], v3, v2
6891; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v6, v8, vcc
6892; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v5, v7, s[0:1]
6893; GCN-NEXT:    v_mul_lo_u32 v8, s2, v3
6894; GCN-NEXT:    v_mul_hi_u32 v9, s2, v2
6895; GCN-NEXT:    v_mul_lo_u32 v10, s3, v2
6896; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
6897; GCN-NEXT:    v_mul_lo_u32 v9, s2, v2
6898; GCN-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
6899; GCN-NEXT:    v_mul_lo_u32 v12, v2, v8
6900; GCN-NEXT:    v_mul_hi_u32 v14, v2, v8
6901; GCN-NEXT:    v_mul_hi_u32 v13, v2, v9
6902; GCN-NEXT:    v_mul_hi_u32 v11, v3, v9
6903; GCN-NEXT:    v_mul_lo_u32 v9, v3, v9
6904; GCN-NEXT:    v_mul_hi_u32 v10, v3, v8
6905; GCN-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
6906; GCN-NEXT:    v_addc_u32_e32 v13, vcc, 0, v14, vcc
6907; GCN-NEXT:    v_mul_lo_u32 v3, v3, v8
6908; GCN-NEXT:    v_add_i32_e32 v9, vcc, v12, v9
6909; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v13, v11, vcc
6910; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v10, v4, vcc
6911; GCN-NEXT:    v_add_i32_e32 v3, vcc, v9, v3
6912; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v6, v8, vcc
6913; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
6914; GCN-NEXT:    v_addc_u32_e64 v5, vcc, v5, v8, s[0:1]
6915; GCN-NEXT:    s_add_u32 s0, s10, s14
6916; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
6917; GCN-NEXT:    s_addc_u32 s1, s11, s14
6918; GCN-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
6919; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
6920; GCN-NEXT:    v_mul_lo_u32 v5, s10, v3
6921; GCN-NEXT:    v_mul_hi_u32 v7, s10, v2
6922; GCN-NEXT:    v_mul_hi_u32 v9, s10, v3
6923; GCN-NEXT:    v_mul_hi_u32 v10, s11, v3
6924; GCN-NEXT:    v_mul_lo_u32 v3, s11, v3
6925; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
6926; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v9, vcc
6927; GCN-NEXT:    v_mul_lo_u32 v9, s11, v2
6928; GCN-NEXT:    v_mul_hi_u32 v2, s11, v2
6929; GCN-NEXT:    v_mov_b32_e32 v8, s12
6930; GCN-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
6931; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v2, vcc
6932; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v10, v4, vcc
6933; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
6934; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v4, vcc
6935; GCN-NEXT:    v_mul_lo_u32 v3, s8, v3
6936; GCN-NEXT:    v_mul_hi_u32 v4, s8, v2
6937; GCN-NEXT:    v_mul_lo_u32 v5, s9, v2
6938; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s12, v0
6939; GCN-NEXT:    v_mul_lo_u32 v2, s8, v2
6940; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v8, vcc
6941; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
6942; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
6943; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s11, v3
6944; GCN-NEXT:    v_mov_b32_e32 v5, s9
6945; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s10, v2
6946; GCN-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
6947; GCN-NEXT:    v_subrev_i32_e64 v6, s[0:1], s8, v2
6948; GCN-NEXT:    v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1]
6949; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s9, v7
6950; GCN-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, s[0:1]
6951; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
6952; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s8, v6
6953; GCN-NEXT:    v_subrev_i32_e64 v5, s[0:1], s8, v6
6954; GCN-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
6955; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s9, v7
6956; GCN-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
6957; GCN-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
6958; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
6959; GCN-NEXT:    v_cndmask_b32_e64 v4, v7, v4, s[0:1]
6960; GCN-NEXT:    v_mov_b32_e32 v7, s11
6961; GCN-NEXT:    v_subb_u32_e32 v3, vcc, v7, v3, vcc
6962; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
6963; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
6964; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
6965; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
6966; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v3
6967; GCN-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
6968; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
6969; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
6970; GCN-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
6971; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
6972; GCN-NEXT:    v_xor_b32_e32 v2, s14, v2
6973; GCN-NEXT:    v_xor_b32_e32 v3, s14, v3
6974; GCN-NEXT:    v_mov_b32_e32 v4, s14
6975; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s14, v2
6976; GCN-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
6977; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
6978; GCN-NEXT:    s_endpgm
; Each lane's divisor is 4096 << %y[lane], computed at run time.
6979  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
6980  %r = srem <2 x i64> %x, %shl.y
6981  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
6982  ret void
6983}
6984