; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare -amdgpu-bypass-slow-div=0 %s | FileCheck %s
; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GCN %s

; Integer division and remainder coverage for amdgcn (tahiti).
; Every kernel performs a single udiv/urem/sdiv/srem on its two value
; arguments and stores the result through %out. The opt invocation verifies
; the IR produced by -amdgpu-codegenprepare (default FileCheck prefix); the
; llc invocation verifies the emitted ISA (GCN prefix). Both invocations pass
; -amdgpu-bypass-slow-div=0. The assertion lines below are autogenerated:
; regenerate them with the scripts named at the top instead of editing by hand.

; Unsigned i32 quotient. The expected IR builds a reciprocal estimate of %y
; with llvm.amdgcn.rcp.f32, refines it with a multiply-high step, and then
; applies two icmp-uge/select corrections to the quotient.
define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
; CHECK-LABEL: @udiv_i32(
; CHECK-NEXT:    [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float
; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000
; CHECK-NEXT:    [[TMP4:%.*]] = fptoui float [[TMP3]] to i32
; CHECK-NEXT:    [[TMP5:%.*]] = sub i32 0, [[Y]]
; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]]
; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP4]] to i64
; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]]
; CHECK-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
; CHECK-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP9]], 32
; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]]
; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64
; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
; CHECK-NEXT:    [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]]
; CHECK-NEXT:    [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]]
; CHECK-NEXT:    [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]]
; CHECK-NEXT:    [[TMP23:%.*]] = add i32 [[TMP19]], 1
; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP19]]
; CHECK-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP21]], [[Y]]
; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP22]], i32 [[TMP25]], i32 [[TMP21]]
; CHECK-NEXT:    [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[Y]]
; CHECK-NEXT:    [[TMP28:%.*]] = add i32 [[TMP24]], 1
; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP24]]
; CHECK-NEXT:    store i32 [[TMP29]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
; GCN-LABEL: udiv_i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s3
; GCN-NEXT:    s_sub_i32 s4, 0, s3
; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT:    v_mul_lo_u32 v1, s4, v0
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
; GCN-NEXT:    v_mul_hi_u32 v0, s2, v0
; GCN-NEXT:    v_mul_lo_u32 v1, v0, s3
; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s2, v1
; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT:    s_endpgm
  %r = udiv i32 %x, %y
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; Unsigned i32 remainder. Same reciprocal-based quotient estimate as the
; udiv case; the two icmp-uge/select corrections are applied to the remainder.
define amdgpu_kernel void @urem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
; CHECK-LABEL: @urem_i32(
; CHECK-NEXT:    [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float
; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000
; CHECK-NEXT:    [[TMP4:%.*]] = fptoui float [[TMP3]] to i32
; CHECK-NEXT:    [[TMP5:%.*]] = sub i32 0, [[Y]]
; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]]
; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP4]] to i64
; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]]
; CHECK-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
; CHECK-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP9]], 32
; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]]
; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64
; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
; CHECK-NEXT:    [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]]
; CHECK-NEXT:    [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]]
; CHECK-NEXT:    [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]]
; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP21]], [[Y]]
; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP21]]
; CHECK-NEXT:    [[TMP25:%.*]] = icmp uge i32 [[TMP24]], [[Y]]
; CHECK-NEXT:    [[TMP26:%.*]] = sub i32 [[TMP24]], [[Y]]
; CHECK-NEXT:    [[TMP27:%.*]] = select i1 [[TMP25]], i32 [[TMP26]], i32 [[TMP24]]
; CHECK-NEXT:    store i32 [[TMP27]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
; GCN-LABEL: urem_i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s5
; GCN-NEXT:    s_sub_i32 s2, 0, s5
; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT:    v_mul_lo_u32 v1, s2, v0
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
; GCN-NEXT:    v_mul_hi_u32 v0, s4, v0
; GCN-NEXT:    v_mul_lo_u32 v0, v0, s5
; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s5, v0
; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s5, v0
; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s5, v0
; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s5, v0
; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
  %r = urem i32 %x, %y
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; Signed i32 quotient. The expected IR strips the signs of both operands
; (ashr 31 / add / xor), runs the unsigned expansion, and restores the sign
; with an xor/sub against the xor of the two operand sign masks.
define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
; CHECK-LABEL: @sdiv_i32(
; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[X]], [[TMP1]]
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[Y]], [[TMP2]]
; CHECK-NEXT:    [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP1]]
; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP2]]
; CHECK-NEXT:    [[TMP8:%.*]] = uitofp i32 [[TMP7]] to float
; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP8]])
; CHECK-NEXT:    [[TMP10:%.*]] = fmul fast float [[TMP9]], 0x41EFFFFFC0000000
; CHECK-NEXT:    [[TMP11:%.*]] = fptoui float [[TMP10]] to i32
; CHECK-NEXT:    [[TMP12:%.*]] = sub i32 0, [[TMP7]]
; CHECK-NEXT:    [[TMP13:%.*]] = mul i32 [[TMP12]], [[TMP11]]
; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP11]] to i64
; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP11]], [[TMP19]]
; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP6]] to i64
; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP20]] to i64
; CHECK-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP21]], [[TMP22]]
; CHECK-NEXT:    [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
; CHECK-NEXT:    [[TMP25:%.*]] = lshr i64 [[TMP23]], 32
; CHECK-NEXT:    [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
; CHECK-NEXT:    [[TMP27:%.*]] = mul i32 [[TMP26]], [[TMP7]]
; CHECK-NEXT:    [[TMP28:%.*]] = sub i32 [[TMP6]], [[TMP27]]
; CHECK-NEXT:    [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP7]]
; CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[TMP26]], 1
; CHECK-NEXT:    [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]]
; CHECK-NEXT:    [[TMP32:%.*]] = sub i32 [[TMP28]], [[TMP7]]
; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP29]], i32 [[TMP32]], i32 [[TMP28]]
; CHECK-NEXT:    [[TMP34:%.*]] = icmp uge i32 [[TMP33]], [[TMP7]]
; CHECK-NEXT:    [[TMP35:%.*]] = add i32 [[TMP31]], 1
; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP34]], i32 [[TMP35]], i32 [[TMP31]]
; CHECK-NEXT:    [[TMP37:%.*]] = xor i32 [[TMP36]], [[TMP3]]
; CHECK-NEXT:    [[TMP38:%.*]] = sub i32 [[TMP37]], [[TMP3]]
; CHECK-NEXT:    store i32 [[TMP38]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
; GCN-LABEL: sdiv_i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_ashr_i32 s8, s3, 31
; GCN-NEXT:    s_add_i32 s3, s3, s8
; GCN-NEXT:    s_xor_b32 s9, s3, s8
; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s9
; GCN-NEXT:    s_sub_i32 s3, 0, s9
; GCN-NEXT:    s_ashr_i32 s0, s2, 31
; GCN-NEXT:    s_add_i32 s1, s2, s0
; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GCN-NEXT:    s_xor_b32 s1, s1, s0
; GCN-NEXT:    s_xor_b32 s2, s0, s8
; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT:    v_mul_lo_u32 v1, s3, v0
; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
; GCN-NEXT:    v_mul_hi_u32 v0, s1, v0
; GCN-NEXT:    v_mul_lo_u32 v1, v0, s9
; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
; GCN-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v1
; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s9, v1
; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
; GCN-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GCN-NEXT:    v_xor_b32_e32 v0, s2, v0
; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT:    s_endpgm
  %r = sdiv i32 %x, %y
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; Signed i32 remainder. Signs are stripped as in the sdiv case; the final
; xor/sub uses the sign mask of %x only.
define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
; CHECK-LABEL: @srem_i32(
; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[X]], [[TMP1]]
; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[Y]], [[TMP2]]
; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP1]]
; CHECK-NEXT:    [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP2]]
; CHECK-NEXT:    [[TMP7:%.*]] = uitofp i32 [[TMP6]] to float
; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP8]], 0x41EFFFFFC0000000
; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP9]] to i32
; CHECK-NEXT:    [[TMP11:%.*]] = sub i32 0, [[TMP6]]
; CHECK-NEXT:    [[TMP12:%.*]] = mul i32 [[TMP11]], [[TMP10]]
; CHECK-NEXT:    [[TMP13:%.*]] = zext i32 [[TMP10]] to i64
; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP12]] to i64
; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP13]], [[TMP14]]
; CHECK-NEXT:    [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
; CHECK-NEXT:    [[TMP17:%.*]] = lshr i64 [[TMP15]], 32
; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
; CHECK-NEXT:    [[TMP19:%.*]] = add i32 [[TMP10]], [[TMP18]]
; CHECK-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP19]] to i64
; CHECK-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP20]], [[TMP21]]
; CHECK-NEXT:    [[TMP23:%.*]] = trunc i64 [[TMP22]] to i32
; CHECK-NEXT:    [[TMP24:%.*]] = lshr i64 [[TMP22]], 32
; CHECK-NEXT:    [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
; CHECK-NEXT:    [[TMP26:%.*]] = mul i32 [[TMP25]], [[TMP6]]
; CHECK-NEXT:    [[TMP27:%.*]] = sub i32 [[TMP5]], [[TMP26]]
; CHECK-NEXT:    [[TMP28:%.*]] = icmp uge i32 [[TMP27]], [[TMP6]]
; CHECK-NEXT:    [[TMP29:%.*]] = sub i32 [[TMP27]], [[TMP6]]
; CHECK-NEXT:    [[TMP30:%.*]] = select i1 [[TMP28]], i32 [[TMP29]], i32 [[TMP27]]
; CHECK-NEXT:    [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP6]]
; CHECK-NEXT:    [[TMP32:%.*]] = sub i32 [[TMP30]], [[TMP6]]
; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP30]]
; CHECK-NEXT:    [[TMP34:%.*]] = xor i32 [[TMP33]], [[TMP1]]
; CHECK-NEXT:    [[TMP35:%.*]] = sub i32 [[TMP34]], [[TMP1]]
; CHECK-NEXT:    store i32 [[TMP35]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
; GCN-LABEL: srem_i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_ashr_i32 s4, s3, 31
; GCN-NEXT:    s_add_i32 s3, s3, s4
; GCN-NEXT:    s_xor_b32 s6, s3, s4
; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s6
; GCN-NEXT:    s_sub_i32 s3, 0, s6
; GCN-NEXT:    s_ashr_i32 s4, s2, 31
; GCN-NEXT:    s_add_i32 s2, s2, s4
; GCN-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GCN-NEXT:    s_xor_b32 s5, s2, s4
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT:    v_mul_lo_u32 v1, s3, v0
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    v_mul_hi_u32 v1, v0, v1
; GCN-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
; GCN-NEXT:    v_mul_hi_u32 v0, s5, v0
; GCN-NEXT:    v_mul_lo_u32 v0, v0, s6
; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s5, v0
; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s6, v0
; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, s6, v0
; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GCN-NEXT:    v_xor_b32_e32 v0, s4, v0
; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s4, v0
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
  %r = srem i32 %x, %y
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; Unsigned i16 quotient. The expected IR zero-extends both operands to i32,
; converts them to float, takes trunc(x * rcp(y)) as the quotient, and adds 1
; when |fmad(-q, y, x)| >= |y|. The result is masked with 65535 before the
; truncation back to i16.
define amdgpu_kernel void @udiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
; CHECK-LABEL: @udiv_i16(
; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 65535
; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
; CHECK-NEXT:    store i16 [[TMP17]], i16 addrspace(1)* [[OUT:%.*]], align 2
; CHECK-NEXT:    ret void
;
; GCN-LABEL: udiv_i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dword s2, s[0:1], 0xb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_lshr_b32 s3, s2, 16
; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s3
; GCN-NEXT:    s_and_b32 s2, s2, 0xffff
; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s2
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
; GCN-NEXT:    v_trunc_f32_e32 v2, v2
; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v2
; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
; GCN-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
  %r = udiv i16 %x, %y
  store i16 %r, i16 addrspace(1)* %out
  ret void
}

; Unsigned i16 remainder. The quotient is computed as in the udiv_i16 case,
; then the expected IR forms x - q * y and masks it with 65535.
define amdgpu_kernel void @urem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
; CHECK-LABEL: @urem_i16(
; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 65535
; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
; CHECK-NEXT:    store i16 [[TMP19]], i16 addrspace(1)* [[OUT:%.*]], align 2
; CHECK-NEXT:    ret void
;
; GCN-LABEL: urem_i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dword s4, s[0:1], 0xb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_lshr_b32 s2, s4, 16
; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s2
; GCN-NEXT:    s_and_b32 s3, s4, 0xffff
; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s3
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
; GCN-NEXT:    v_trunc_f32_e32 v2, v2
; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v2
; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
; GCN-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
  %r = urem i16 %x, %y
  store i16 %r, i16 addrspace(1)* %out
  ret void
}

; Signed i16 quotient. Operands are sign-extended to i32 and converted with
; sitofp. The quotient adjustment is ((x ^ y) ashr 30) | 1 instead of a
; constant 1, and the result is re-sign-extended from 16 bits (shl/ashr 16).
define amdgpu_kernel void @sdiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
; CHECK-LABEL: @sdiv_i16(
; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 16
; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 16
; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
; CHECK-NEXT:    store i16 [[TMP21]], i16 addrspace(1)* [[OUT:%.*]], align 2
; CHECK-NEXT:    ret void
;
; GCN-LABEL: sdiv_i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_ashr_i32 s1, s0, 16
; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s1
; GCN-NEXT:    s_sext_i32_i16 s0, s0
; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s0
; GCN-NEXT:    s_xor_b32 s0, s0, s1
; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
; GCN-NEXT:    s_ashr_i32 s0, s0, 30
; GCN-NEXT:    s_or_b32 s0, s0, 1
; GCN-NEXT:    v_mov_b32_e32 v3, s0
; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
; GCN-NEXT:    v_trunc_f32_e32 v2, v2
; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
; GCN-NEXT:    buffer_store_short v0, off, s[4:7], 0
; GCN-NEXT:    s_endpgm
  %r = sdiv i16 %x, %y
  store i16 %r, i16 addrspace(1)* %out
  ret void
}

; Signed i16 remainder. The quotient is computed as in the sdiv_i16 case,
; then the expected IR forms x - q * y and re-sign-extends it from 16 bits.
define amdgpu_kernel void @srem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
; CHECK-LABEL: @srem_i16(
; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 16
; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
; CHECK-NEXT:    store i16 [[TMP23]], i16 addrspace(1)* [[OUT:%.*]], align 2
; CHECK-NEXT:    ret void
;
; GCN-LABEL: srem_i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dword s4, s[0:1], 0xb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_ashr_i32 s2, s4, 16
; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s2
; GCN-NEXT:    s_sext_i32_i16 s3, s4
; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s3
; GCN-NEXT:    s_xor_b32 s3, s3, s2
; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
; GCN-NEXT:    s_ashr_i32 s3, s3, 30
; GCN-NEXT:    s_or_b32 s3, s3, 1
; GCN-NEXT:    v_mov_b32_e32 v3, s3
; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
; GCN-NEXT:    v_trunc_f32_e32 v2, v2
; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
; GCN-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
  %r = srem i16 %x, %y
  store i16 %r, i16 addrspace(1)* %out
  ret void
}

; Unsigned i8 quotient. Same float-based expansion as udiv_i16; the result
; is masked with 255 before the truncation back to i8.
define amdgpu_kernel void @udiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
; CHECK-LABEL: @udiv_i8(
; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 255
; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
; CHECK-NEXT:    store i8 [[TMP17]], i8 addrspace(1)* [[OUT:%.*]], align 1
; CHECK-NEXT:    ret void
;
; GCN-LABEL: udiv_i8:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_cvt_f32_ubyte1_e32 v0, s0
; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v0
; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v2, s0
; GCN-NEXT:    v_mul_f32_e32 v1, v2, v1
; GCN-NEXT:    v_trunc_f32_e32 v1, v1
; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v1
; GCN-NEXT:    v_mad_f32 v1, -v1, v0, v2
; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; GCN-NEXT:    s_endpgm
  %r = udiv i8 %x, %y
  store i8 %r, i8 addrspace(1)* %out
  ret void
}

; Unsigned i8 remainder. The quotient is computed as in the udiv_i8 case,
; then the expected IR forms x - q * y and masks it with 255.
define amdgpu_kernel void @urem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
; CHECK-LABEL: @urem_i8(
; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 255
; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i8
; CHECK-NEXT:    store i8 [[TMP19]], i8 addrspace(1)* [[OUT:%.*]], align 1
; CHECK-NEXT:    ret void
;
; GCN-LABEL: urem_i8:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dword s4, s[0:1], 0xb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_cvt_f32_ubyte1_e32 v0, s4
; GCN-NEXT:    v_rcp_iflag_f32_e32 v1, v0
; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v2, s4
; GCN-NEXT:    s_lshr_b32 s2, s4, 8
; GCN-NEXT:    v_mul_f32_e32 v1, v2, v1
; GCN-NEXT:    v_trunc_f32_e32 v1, v1
; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v1
; GCN-NEXT:    v_mad_f32 v1, -v1, v0, v2
; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
; GCN-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
; GCN-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; GCN-NEXT:    s_endpgm
  %r = urem i8 %x, %y
  store i8 %r, i8 addrspace(1)* %out
  ret void
}

; Signed i8 quotient. Same float-based expansion as sdiv_i16; the result is
; re-sign-extended from 8 bits (shl/ashr 24) before the truncation to i8.
define amdgpu_kernel void @sdiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
; CHECK-LABEL: @sdiv_i8(
; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32
; CHECK-NEXT:    [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32
; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 24
; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 24
; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i8
; CHECK-NEXT:    store i8 [[TMP21]], i8 addrspace(1)* [[OUT:%.*]], align 1
; CHECK-NEXT:    ret void
;
; GCN-LABEL: sdiv_i8:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_bfe_i32 s1, s0, 0x80008
; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s1
; GCN-NEXT:    s_sext_i32_i8 s0, s0
; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s0
; GCN-NEXT:    s_xor_b32 s0, s0, s1
; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
; GCN-NEXT:    s_ashr_i32 s0, s0, 30
; GCN-NEXT:    s_or_b32 s0, s0, 1
; GCN-NEXT:    v_mov_b32_e32 v3, s0
; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
; GCN-NEXT:    v_trunc_f32_e32 v2, v2
; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; GCN-NEXT:    s_endpgm
  %r = sdiv i8 %x, %y
  store i8 %r, i8 addrspace(1)* %out
  ret void
}

; Signed i8 remainder. The quotient is computed as in the sdiv_i8 case, then
; the expected IR forms x - q * y and re-sign-extends it from 8 bits.
define amdgpu_kernel void @srem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
; CHECK-LABEL: @srem_i8(
; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32
; CHECK-NEXT:    [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32
; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 24
; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 24
; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i8
; CHECK-NEXT:    store i8 [[TMP23]], i8 addrspace(1)* [[OUT:%.*]], align 1
; CHECK-NEXT:    ret void
;
; GCN-LABEL: srem_i8:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_bfe_i32 s1, s0, 0x80008
; GCN-NEXT:    v_cvt_f32_i32_e32 v0, s1
; GCN-NEXT:    s_sext_i32_i8 s3, s0
; GCN-NEXT:    v_cvt_f32_i32_e32 v1, s3
; GCN-NEXT:    s_xor_b32 s1, s3, s1
; GCN-NEXT:    v_rcp_iflag_f32_e32 v2, v0
; GCN-NEXT:    s_ashr_i32 s1, s1, 30
; GCN-NEXT:    s_or_b32 s1, s1, 1
; GCN-NEXT:    v_mov_b32_e32 v3, s1
; GCN-NEXT:    v_mul_f32_e32 v2, v1, v2
; GCN-NEXT:    v_trunc_f32_e32 v2, v2
; GCN-NEXT:    v_mad_f32 v1, -v2, v0, v1
; GCN-NEXT:    v_cvt_i32_f32_e32 v2, v2
; GCN-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
; GCN-NEXT:    s_lshr_b32 s2, s0, 8
; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
; GCN-NEXT:    v_mul_lo_u32 v0, v0, s2
; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
; GCN-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; GCN-NEXT:    s_endpgm
  %r = srem i8 %x, %y
  store i8 %r, i8 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: @udiv_v4i32(
; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0
; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0
; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[TMP2]]
; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to
i64 738; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 739; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 740; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 741; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 742; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 743; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 744; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 745; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 746; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 747; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 748; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 749; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 750; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 751; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 752; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 753; CHECK-NEXT: [[TMP25:%.*]] = add i32 [[TMP21]], 1 754; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]] 755; CHECK-NEXT: [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]] 756; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]] 757; CHECK-NEXT: [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]] 758; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP26]], 1 759; CHECK-NEXT: [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]] 760; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> undef, i32 [[TMP31]], i64 0 761; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i32> [[X]], i64 1 762; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[Y]], i64 1 763; CHECK-NEXT: [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float 764; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 765; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000 766; CHECK-NEXT: [[TMP38:%.*]] = fptoui float [[TMP37]] to i32 767; CHECK-NEXT: [[TMP39:%.*]] = sub i32 0, [[TMP34]] 768; CHECK-NEXT: [[TMP40:%.*]] = mul 
i32 [[TMP39]], [[TMP38]] 769; CHECK-NEXT: [[TMP41:%.*]] = zext i32 [[TMP38]] to i64 770; CHECK-NEXT: [[TMP42:%.*]] = zext i32 [[TMP40]] to i64 771; CHECK-NEXT: [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]] 772; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 773; CHECK-NEXT: [[TMP45:%.*]] = lshr i64 [[TMP43]], 32 774; CHECK-NEXT: [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32 775; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]] 776; CHECK-NEXT: [[TMP48:%.*]] = zext i32 [[TMP33]] to i64 777; CHECK-NEXT: [[TMP49:%.*]] = zext i32 [[TMP47]] to i64 778; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]] 779; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 780; CHECK-NEXT: [[TMP52:%.*]] = lshr i64 [[TMP50]], 32 781; CHECK-NEXT: [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32 782; CHECK-NEXT: [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]] 783; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]] 784; CHECK-NEXT: [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]] 785; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP53]], 1 786; CHECK-NEXT: [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]] 787; CHECK-NEXT: [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]] 788; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]] 789; CHECK-NEXT: [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]] 790; CHECK-NEXT: [[TMP62:%.*]] = add i32 [[TMP58]], 1 791; CHECK-NEXT: [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]] 792; CHECK-NEXT: [[TMP64:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP63]], i64 1 793; CHECK-NEXT: [[TMP65:%.*]] = extractelement <4 x i32> [[X]], i64 2 794; CHECK-NEXT: [[TMP66:%.*]] = extractelement <4 x i32> [[Y]], i64 2 795; CHECK-NEXT: [[TMP67:%.*]] = uitofp i32 [[TMP66]] to float 796; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP67]]) 797; CHECK-NEXT: [[TMP69:%.*]] = fmul fast float [[TMP68]], 0x41EFFFFFC0000000 798; CHECK-NEXT: [[TMP70:%.*]] = fptoui float [[TMP69]] to i32 
799; CHECK-NEXT: [[TMP71:%.*]] = sub i32 0, [[TMP66]] 800; CHECK-NEXT: [[TMP72:%.*]] = mul i32 [[TMP71]], [[TMP70]] 801; CHECK-NEXT: [[TMP73:%.*]] = zext i32 [[TMP70]] to i64 802; CHECK-NEXT: [[TMP74:%.*]] = zext i32 [[TMP72]] to i64 803; CHECK-NEXT: [[TMP75:%.*]] = mul i64 [[TMP73]], [[TMP74]] 804; CHECK-NEXT: [[TMP76:%.*]] = trunc i64 [[TMP75]] to i32 805; CHECK-NEXT: [[TMP77:%.*]] = lshr i64 [[TMP75]], 32 806; CHECK-NEXT: [[TMP78:%.*]] = trunc i64 [[TMP77]] to i32 807; CHECK-NEXT: [[TMP79:%.*]] = add i32 [[TMP70]], [[TMP78]] 808; CHECK-NEXT: [[TMP80:%.*]] = zext i32 [[TMP65]] to i64 809; CHECK-NEXT: [[TMP81:%.*]] = zext i32 [[TMP79]] to i64 810; CHECK-NEXT: [[TMP82:%.*]] = mul i64 [[TMP80]], [[TMP81]] 811; CHECK-NEXT: [[TMP83:%.*]] = trunc i64 [[TMP82]] to i32 812; CHECK-NEXT: [[TMP84:%.*]] = lshr i64 [[TMP82]], 32 813; CHECK-NEXT: [[TMP85:%.*]] = trunc i64 [[TMP84]] to i32 814; CHECK-NEXT: [[TMP86:%.*]] = mul i32 [[TMP85]], [[TMP66]] 815; CHECK-NEXT: [[TMP87:%.*]] = sub i32 [[TMP65]], [[TMP86]] 816; CHECK-NEXT: [[TMP88:%.*]] = icmp uge i32 [[TMP87]], [[TMP66]] 817; CHECK-NEXT: [[TMP89:%.*]] = add i32 [[TMP85]], 1 818; CHECK-NEXT: [[TMP90:%.*]] = select i1 [[TMP88]], i32 [[TMP89]], i32 [[TMP85]] 819; CHECK-NEXT: [[TMP91:%.*]] = sub i32 [[TMP87]], [[TMP66]] 820; CHECK-NEXT: [[TMP92:%.*]] = select i1 [[TMP88]], i32 [[TMP91]], i32 [[TMP87]] 821; CHECK-NEXT: [[TMP93:%.*]] = icmp uge i32 [[TMP92]], [[TMP66]] 822; CHECK-NEXT: [[TMP94:%.*]] = add i32 [[TMP90]], 1 823; CHECK-NEXT: [[TMP95:%.*]] = select i1 [[TMP93]], i32 [[TMP94]], i32 [[TMP90]] 824; CHECK-NEXT: [[TMP96:%.*]] = insertelement <4 x i32> [[TMP64]], i32 [[TMP95]], i64 2 825; CHECK-NEXT: [[TMP97:%.*]] = extractelement <4 x i32> [[X]], i64 3 826; CHECK-NEXT: [[TMP98:%.*]] = extractelement <4 x i32> [[Y]], i64 3 827; CHECK-NEXT: [[TMP99:%.*]] = uitofp i32 [[TMP98]] to float 828; CHECK-NEXT: [[TMP100:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP99]]) 829; CHECK-NEXT: [[TMP101:%.*]] = fmul fast float 
[[TMP100]], 0x41EFFFFFC0000000 830; CHECK-NEXT: [[TMP102:%.*]] = fptoui float [[TMP101]] to i32 831; CHECK-NEXT: [[TMP103:%.*]] = sub i32 0, [[TMP98]] 832; CHECK-NEXT: [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP102]] 833; CHECK-NEXT: [[TMP105:%.*]] = zext i32 [[TMP102]] to i64 834; CHECK-NEXT: [[TMP106:%.*]] = zext i32 [[TMP104]] to i64 835; CHECK-NEXT: [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]] 836; CHECK-NEXT: [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32 837; CHECK-NEXT: [[TMP109:%.*]] = lshr i64 [[TMP107]], 32 838; CHECK-NEXT: [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32 839; CHECK-NEXT: [[TMP111:%.*]] = add i32 [[TMP102]], [[TMP110]] 840; CHECK-NEXT: [[TMP112:%.*]] = zext i32 [[TMP97]] to i64 841; CHECK-NEXT: [[TMP113:%.*]] = zext i32 [[TMP111]] to i64 842; CHECK-NEXT: [[TMP114:%.*]] = mul i64 [[TMP112]], [[TMP113]] 843; CHECK-NEXT: [[TMP115:%.*]] = trunc i64 [[TMP114]] to i32 844; CHECK-NEXT: [[TMP116:%.*]] = lshr i64 [[TMP114]], 32 845; CHECK-NEXT: [[TMP117:%.*]] = trunc i64 [[TMP116]] to i32 846; CHECK-NEXT: [[TMP118:%.*]] = mul i32 [[TMP117]], [[TMP98]] 847; CHECK-NEXT: [[TMP119:%.*]] = sub i32 [[TMP97]], [[TMP118]] 848; CHECK-NEXT: [[TMP120:%.*]] = icmp uge i32 [[TMP119]], [[TMP98]] 849; CHECK-NEXT: [[TMP121:%.*]] = add i32 [[TMP117]], 1 850; CHECK-NEXT: [[TMP122:%.*]] = select i1 [[TMP120]], i32 [[TMP121]], i32 [[TMP117]] 851; CHECK-NEXT: [[TMP123:%.*]] = sub i32 [[TMP119]], [[TMP98]] 852; CHECK-NEXT: [[TMP124:%.*]] = select i1 [[TMP120]], i32 [[TMP123]], i32 [[TMP119]] 853; CHECK-NEXT: [[TMP125:%.*]] = icmp uge i32 [[TMP124]], [[TMP98]] 854; CHECK-NEXT: [[TMP126:%.*]] = add i32 [[TMP122]], 1 855; CHECK-NEXT: [[TMP127:%.*]] = select i1 [[TMP125]], i32 [[TMP126]], i32 [[TMP122]] 856; CHECK-NEXT: [[TMP128:%.*]] = insertelement <4 x i32> [[TMP96]], i32 [[TMP127]], i64 3 857; CHECK-NEXT: store <4 x i32> [[TMP128]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 858; CHECK-NEXT: ret void 859; 860; GCN-LABEL: udiv_v4i32: 861; GCN: ; %bb.0: 862; GCN-NEXT: 
s_load_dwordx8 s[4:11], s[0:1], 0xd 863; GCN-NEXT: s_mov_b32 s3, 0x4f7ffffe 864; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 865; GCN-NEXT: s_mov_b32 s15, 0xf000 866; GCN-NEXT: s_mov_b32 s14, -1 867; GCN-NEXT: s_waitcnt lgkmcnt(0) 868; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 869; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 870; GCN-NEXT: s_sub_i32 s2, 0, s8 871; GCN-NEXT: v_cvt_f32_u32_e32 v4, s10 872; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 873; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 874; GCN-NEXT: v_cvt_f32_u32_e32 v6, s11 875; GCN-NEXT: v_mul_f32_e32 v0, s3, v0 876; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 877; GCN-NEXT: v_mul_f32_e32 v1, s3, v1 878; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 879; GCN-NEXT: v_mul_lo_u32 v2, s2, v0 880; GCN-NEXT: s_sub_i32 s2, 0, s9 881; GCN-NEXT: v_mul_lo_u32 v3, s2, v1 882; GCN-NEXT: s_sub_i32 s2, 0, s10 883; GCN-NEXT: v_mul_hi_u32 v2, v0, v2 884; GCN-NEXT: v_mul_hi_u32 v3, v1, v3 885; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 886; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 887; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 888; GCN-NEXT: v_mul_hi_u32 v1, s5, v1 889; GCN-NEXT: v_mul_lo_u32 v2, v0, s8 890; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v0 891; GCN-NEXT: v_mul_lo_u32 v5, v1, s9 892; GCN-NEXT: v_sub_i32_e32 v2, vcc, s4, v2 893; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v2 894; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] 895; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s8, v2 896; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 897; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v0 898; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 899; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v4 900; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 901; GCN-NEXT: v_sub_i32_e32 v3, vcc, s5, v5 902; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v1 903; GCN-NEXT: v_mul_f32_e32 v2, s3, v2 904; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 905; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v3 906; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] 907; GCN-NEXT: v_subrev_i32_e32 v5, vcc, s9, v3 908; GCN-NEXT: v_mul_lo_u32 v4, s2, v2 909; GCN-NEXT: 
v_cndmask_b32_e64 v3, v3, v5, s[0:1] 910; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v1 911; GCN-NEXT: s_sub_i32 s0, 0, s11 912; GCN-NEXT: v_mul_hi_u32 v4, v2, v4 913; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 914; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v6 915; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 916; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 917; GCN-NEXT: v_mul_hi_u32 v2, s6, v2 918; GCN-NEXT: v_mul_f32_e32 v4, s3, v4 919; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 920; GCN-NEXT: v_mul_lo_u32 v3, v2, s10 921; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v2 922; GCN-NEXT: v_mul_lo_u32 v5, s0, v4 923; GCN-NEXT: v_sub_i32_e32 v3, vcc, s6, v3 924; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v3 925; GCN-NEXT: v_mul_hi_u32 v5, v4, v5 926; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] 927; GCN-NEXT: v_subrev_i32_e32 v6, vcc, s10, v3 928; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] 929; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 930; GCN-NEXT: v_mul_hi_u32 v4, s7, v4 931; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v2 932; GCN-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 933; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 934; GCN-NEXT: v_mul_lo_u32 v6, v4, s11 935; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v4 936; GCN-NEXT: v_sub_i32_e32 v3, vcc, s7, v6 937; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v3 938; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] 939; GCN-NEXT: v_subrev_i32_e32 v5, vcc, s11, v3 940; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 941; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v4 942; GCN-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 943; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 944; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 945; GCN-NEXT: s_endpgm 946 %r = udiv <4 x i32> %x, %y 947 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 948 ret void 949} 950 951define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 952; CHECK-LABEL: @urem_v4i32( 953; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 954; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 
x i32> [[Y:%.*]], i64 0 955; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 956; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 957; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 958; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 959; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 960; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 961; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 962; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 963; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 964; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 965; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 966; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 967; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 968; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 969; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 970; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 971; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 972; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 973; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 974; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 975; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 976; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 977; CHECK-NEXT: [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]] 978; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]] 979; CHECK-NEXT: [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]] 980; CHECK-NEXT: [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]] 981; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]] 982; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x i32> undef, i32 [[TMP29]], i64 0 983; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[X]], i64 1 984; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[Y]], i64 1 985; CHECK-NEXT: [[TMP33:%.*]] = uitofp i32 
[[TMP32]] to float 986; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 987; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000 988; CHECK-NEXT: [[TMP36:%.*]] = fptoui float [[TMP35]] to i32 989; CHECK-NEXT: [[TMP37:%.*]] = sub i32 0, [[TMP32]] 990; CHECK-NEXT: [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]] 991; CHECK-NEXT: [[TMP39:%.*]] = zext i32 [[TMP36]] to i64 992; CHECK-NEXT: [[TMP40:%.*]] = zext i32 [[TMP38]] to i64 993; CHECK-NEXT: [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]] 994; CHECK-NEXT: [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32 995; CHECK-NEXT: [[TMP43:%.*]] = lshr i64 [[TMP41]], 32 996; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 997; CHECK-NEXT: [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]] 998; CHECK-NEXT: [[TMP46:%.*]] = zext i32 [[TMP31]] to i64 999; CHECK-NEXT: [[TMP47:%.*]] = zext i32 [[TMP45]] to i64 1000; CHECK-NEXT: [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]] 1001; CHECK-NEXT: [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32 1002; CHECK-NEXT: [[TMP50:%.*]] = lshr i64 [[TMP48]], 32 1003; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 1004; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]] 1005; CHECK-NEXT: [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]] 1006; CHECK-NEXT: [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]] 1007; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]] 1008; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]] 1009; CHECK-NEXT: [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]] 1010; CHECK-NEXT: [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]] 1011; CHECK-NEXT: [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]] 1012; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP59]], i64 1 1013; CHECK-NEXT: [[TMP61:%.*]] = extractelement <4 x i32> [[X]], i64 2 1014; CHECK-NEXT: [[TMP62:%.*]] = extractelement <4 x i32> [[Y]], i64 2 1015; CHECK-NEXT: [[TMP63:%.*]] = uitofp i32 [[TMP62]] to float 
1016; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP63]]) 1017; CHECK-NEXT: [[TMP65:%.*]] = fmul fast float [[TMP64]], 0x41EFFFFFC0000000 1018; CHECK-NEXT: [[TMP66:%.*]] = fptoui float [[TMP65]] to i32 1019; CHECK-NEXT: [[TMP67:%.*]] = sub i32 0, [[TMP62]] 1020; CHECK-NEXT: [[TMP68:%.*]] = mul i32 [[TMP67]], [[TMP66]] 1021; CHECK-NEXT: [[TMP69:%.*]] = zext i32 [[TMP66]] to i64 1022; CHECK-NEXT: [[TMP70:%.*]] = zext i32 [[TMP68]] to i64 1023; CHECK-NEXT: [[TMP71:%.*]] = mul i64 [[TMP69]], [[TMP70]] 1024; CHECK-NEXT: [[TMP72:%.*]] = trunc i64 [[TMP71]] to i32 1025; CHECK-NEXT: [[TMP73:%.*]] = lshr i64 [[TMP71]], 32 1026; CHECK-NEXT: [[TMP74:%.*]] = trunc i64 [[TMP73]] to i32 1027; CHECK-NEXT: [[TMP75:%.*]] = add i32 [[TMP66]], [[TMP74]] 1028; CHECK-NEXT: [[TMP76:%.*]] = zext i32 [[TMP61]] to i64 1029; CHECK-NEXT: [[TMP77:%.*]] = zext i32 [[TMP75]] to i64 1030; CHECK-NEXT: [[TMP78:%.*]] = mul i64 [[TMP76]], [[TMP77]] 1031; CHECK-NEXT: [[TMP79:%.*]] = trunc i64 [[TMP78]] to i32 1032; CHECK-NEXT: [[TMP80:%.*]] = lshr i64 [[TMP78]], 32 1033; CHECK-NEXT: [[TMP81:%.*]] = trunc i64 [[TMP80]] to i32 1034; CHECK-NEXT: [[TMP82:%.*]] = mul i32 [[TMP81]], [[TMP62]] 1035; CHECK-NEXT: [[TMP83:%.*]] = sub i32 [[TMP61]], [[TMP82]] 1036; CHECK-NEXT: [[TMP84:%.*]] = icmp uge i32 [[TMP83]], [[TMP62]] 1037; CHECK-NEXT: [[TMP85:%.*]] = sub i32 [[TMP83]], [[TMP62]] 1038; CHECK-NEXT: [[TMP86:%.*]] = select i1 [[TMP84]], i32 [[TMP85]], i32 [[TMP83]] 1039; CHECK-NEXT: [[TMP87:%.*]] = icmp uge i32 [[TMP86]], [[TMP62]] 1040; CHECK-NEXT: [[TMP88:%.*]] = sub i32 [[TMP86]], [[TMP62]] 1041; CHECK-NEXT: [[TMP89:%.*]] = select i1 [[TMP87]], i32 [[TMP88]], i32 [[TMP86]] 1042; CHECK-NEXT: [[TMP90:%.*]] = insertelement <4 x i32> [[TMP60]], i32 [[TMP89]], i64 2 1043; CHECK-NEXT: [[TMP91:%.*]] = extractelement <4 x i32> [[X]], i64 3 1044; CHECK-NEXT: [[TMP92:%.*]] = extractelement <4 x i32> [[Y]], i64 3 1045; CHECK-NEXT: [[TMP93:%.*]] = uitofp i32 [[TMP92]] to float 1046; 
CHECK-NEXT: [[TMP94:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP93]]) 1047; CHECK-NEXT: [[TMP95:%.*]] = fmul fast float [[TMP94]], 0x41EFFFFFC0000000 1048; CHECK-NEXT: [[TMP96:%.*]] = fptoui float [[TMP95]] to i32 1049; CHECK-NEXT: [[TMP97:%.*]] = sub i32 0, [[TMP92]] 1050; CHECK-NEXT: [[TMP98:%.*]] = mul i32 [[TMP97]], [[TMP96]] 1051; CHECK-NEXT: [[TMP99:%.*]] = zext i32 [[TMP96]] to i64 1052; CHECK-NEXT: [[TMP100:%.*]] = zext i32 [[TMP98]] to i64 1053; CHECK-NEXT: [[TMP101:%.*]] = mul i64 [[TMP99]], [[TMP100]] 1054; CHECK-NEXT: [[TMP102:%.*]] = trunc i64 [[TMP101]] to i32 1055; CHECK-NEXT: [[TMP103:%.*]] = lshr i64 [[TMP101]], 32 1056; CHECK-NEXT: [[TMP104:%.*]] = trunc i64 [[TMP103]] to i32 1057; CHECK-NEXT: [[TMP105:%.*]] = add i32 [[TMP96]], [[TMP104]] 1058; CHECK-NEXT: [[TMP106:%.*]] = zext i32 [[TMP91]] to i64 1059; CHECK-NEXT: [[TMP107:%.*]] = zext i32 [[TMP105]] to i64 1060; CHECK-NEXT: [[TMP108:%.*]] = mul i64 [[TMP106]], [[TMP107]] 1061; CHECK-NEXT: [[TMP109:%.*]] = trunc i64 [[TMP108]] to i32 1062; CHECK-NEXT: [[TMP110:%.*]] = lshr i64 [[TMP108]], 32 1063; CHECK-NEXT: [[TMP111:%.*]] = trunc i64 [[TMP110]] to i32 1064; CHECK-NEXT: [[TMP112:%.*]] = mul i32 [[TMP111]], [[TMP92]] 1065; CHECK-NEXT: [[TMP113:%.*]] = sub i32 [[TMP91]], [[TMP112]] 1066; CHECK-NEXT: [[TMP114:%.*]] = icmp uge i32 [[TMP113]], [[TMP92]] 1067; CHECK-NEXT: [[TMP115:%.*]] = sub i32 [[TMP113]], [[TMP92]] 1068; CHECK-NEXT: [[TMP116:%.*]] = select i1 [[TMP114]], i32 [[TMP115]], i32 [[TMP113]] 1069; CHECK-NEXT: [[TMP117:%.*]] = icmp uge i32 [[TMP116]], [[TMP92]] 1070; CHECK-NEXT: [[TMP118:%.*]] = sub i32 [[TMP116]], [[TMP92]] 1071; CHECK-NEXT: [[TMP119:%.*]] = select i1 [[TMP117]], i32 [[TMP118]], i32 [[TMP116]] 1072; CHECK-NEXT: [[TMP120:%.*]] = insertelement <4 x i32> [[TMP90]], i32 [[TMP119]], i64 3 1073; CHECK-NEXT: store <4 x i32> [[TMP120]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 1074; CHECK-NEXT: ret void 1075; 1076; GCN-LABEL: urem_v4i32: 1077; GCN: ; %bb.0: 
1078; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 1079; GCN-NEXT: s_mov_b32 s13, 0x4f7ffffe 1080; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1081; GCN-NEXT: s_mov_b32 s3, 0xf000 1082; GCN-NEXT: s_waitcnt lgkmcnt(0) 1083; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 1084; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 1085; GCN-NEXT: s_sub_i32 s2, 0, s8 1086; GCN-NEXT: s_sub_i32 s12, 0, s9 1087; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 1088; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 1089; GCN-NEXT: v_cvt_f32_u32_e32 v3, s10 1090; GCN-NEXT: v_cvt_f32_u32_e32 v5, s11 1091; GCN-NEXT: v_mul_f32_e32 v0, s13, v0 1092; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 1093; GCN-NEXT: v_mul_f32_e32 v1, s13, v1 1094; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 1095; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3 1096; GCN-NEXT: v_mul_lo_u32 v2, s2, v0 1097; GCN-NEXT: s_mov_b32 s2, -1 1098; GCN-NEXT: v_mul_lo_u32 v4, s12, v1 1099; GCN-NEXT: v_mul_hi_u32 v2, v0, v2 1100; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 1101; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 1102; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 1103; GCN-NEXT: v_add_i32_e32 v1, vcc, v4, v1 1104; GCN-NEXT: v_mul_hi_u32 v1, s5, v1 1105; GCN-NEXT: v_mul_f32_e32 v2, s13, v3 1106; GCN-NEXT: v_mul_lo_u32 v0, v0, s8 1107; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 1108; GCN-NEXT: v_mul_lo_u32 v1, v1, s9 1109; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 1110; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 1111; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 1112; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1113; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 1114; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 1115; GCN-NEXT: s_sub_i32 s4, 0, s10 1116; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1117; GCN-NEXT: v_mul_lo_u32 v3, s4, v2 1118; GCN-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 1119; GCN-NEXT: v_subrev_i32_e32 v4, vcc, s9, v1 1120; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 1121; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 1122; GCN-NEXT: v_mul_hi_u32 v3, v2, v3 1123; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v5 1124; GCN-NEXT: s_sub_i32 s4, 0, 
s11 1125; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 1126; GCN-NEXT: v_mul_f32_e32 v3, s13, v4 1127; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 1128; GCN-NEXT: v_subrev_i32_e32 v4, vcc, s9, v1 1129; GCN-NEXT: v_mul_hi_u32 v2, s6, v2 1130; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 1131; GCN-NEXT: v_mul_lo_u32 v5, s4, v3 1132; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 1133; GCN-NEXT: v_mul_lo_u32 v2, v2, s10 1134; GCN-NEXT: v_mul_hi_u32 v4, v3, v5 1135; GCN-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 1136; GCN-NEXT: v_subrev_i32_e32 v5, vcc, s10, v2 1137; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 1138; GCN-NEXT: v_mul_hi_u32 v3, s7, v3 1139; GCN-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 1140; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 1141; GCN-NEXT: v_subrev_i32_e32 v4, vcc, s10, v2 1142; GCN-NEXT: v_mul_lo_u32 v3, v3, s11 1143; GCN-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 1144; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 1145; GCN-NEXT: v_sub_i32_e32 v3, vcc, s7, v3 1146; GCN-NEXT: v_subrev_i32_e32 v4, vcc, s11, v3 1147; GCN-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1148; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 1149; GCN-NEXT: v_subrev_i32_e32 v4, vcc, s11, v3 1150; GCN-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1151; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 1152; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1153; GCN-NEXT: s_endpgm 1154 %r = urem <4 x i32> %x, %y 1155 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 1156 ret void 1157} 1158 1159define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 1160; CHECK-LABEL: @sdiv_v4i32( 1161; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 1162; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 1163; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 1164; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 1165; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 1166; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]] 1167; CHECK-NEXT: [[TMP7:%.*]] = add i32 
[[TMP2]], [[TMP4]] 1168; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]] 1169; CHECK-NEXT: [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]] 1170; CHECK-NEXT: [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float 1171; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]]) 1172; CHECK-NEXT: [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000 1173; CHECK-NEXT: [[TMP13:%.*]] = fptoui float [[TMP12]] to i32 1174; CHECK-NEXT: [[TMP14:%.*]] = sub i32 0, [[TMP9]] 1175; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]] 1176; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP13]] to i64 1177; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 1178; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 1179; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 1180; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 1181; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 1182; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]] 1183; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP8]] to i64 1184; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[TMP22]] to i64 1185; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]] 1186; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32 1187; CHECK-NEXT: [[TMP27:%.*]] = lshr i64 [[TMP25]], 32 1188; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 1189; CHECK-NEXT: [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]] 1190; CHECK-NEXT: [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]] 1191; CHECK-NEXT: [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]] 1192; CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP28]], 1 1193; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]] 1194; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]] 1195; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]] 1196; CHECK-NEXT: [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]] 1197; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP33]], 1 1198; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP36]], i32 
[[TMP37]], i32 [[TMP33]] 1199; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]] 1200; CHECK-NEXT: [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]] 1201; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x i32> undef, i32 [[TMP40]], i64 0 1202; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i32> [[X]], i64 1 1203; CHECK-NEXT: [[TMP43:%.*]] = extractelement <4 x i32> [[Y]], i64 1 1204; CHECK-NEXT: [[TMP44:%.*]] = ashr i32 [[TMP42]], 31 1205; CHECK-NEXT: [[TMP45:%.*]] = ashr i32 [[TMP43]], 31 1206; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]] 1207; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]] 1208; CHECK-NEXT: [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]] 1209; CHECK-NEXT: [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]] 1210; CHECK-NEXT: [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]] 1211; CHECK-NEXT: [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float 1212; CHECK-NEXT: [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]]) 1213; CHECK-NEXT: [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000 1214; CHECK-NEXT: [[TMP54:%.*]] = fptoui float [[TMP53]] to i32 1215; CHECK-NEXT: [[TMP55:%.*]] = sub i32 0, [[TMP50]] 1216; CHECK-NEXT: [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]] 1217; CHECK-NEXT: [[TMP57:%.*]] = zext i32 [[TMP54]] to i64 1218; CHECK-NEXT: [[TMP58:%.*]] = zext i32 [[TMP56]] to i64 1219; CHECK-NEXT: [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]] 1220; CHECK-NEXT: [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32 1221; CHECK-NEXT: [[TMP61:%.*]] = lshr i64 [[TMP59]], 32 1222; CHECK-NEXT: [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32 1223; CHECK-NEXT: [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]] 1224; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[TMP49]] to i64 1225; CHECK-NEXT: [[TMP65:%.*]] = zext i32 [[TMP63]] to i64 1226; CHECK-NEXT: [[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]] 1227; CHECK-NEXT: [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32 1228; CHECK-NEXT: [[TMP68:%.*]] = lshr i64 [[TMP66]], 32 1229; CHECK-NEXT: [[TMP69:%.*]] = trunc i64 
[[TMP68]] to i32 1230; CHECK-NEXT: [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]] 1231; CHECK-NEXT: [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]] 1232; CHECK-NEXT: [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]] 1233; CHECK-NEXT: [[TMP73:%.*]] = add i32 [[TMP69]], 1 1234; CHECK-NEXT: [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]] 1235; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]] 1236; CHECK-NEXT: [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]] 1237; CHECK-NEXT: [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]] 1238; CHECK-NEXT: [[TMP78:%.*]] = add i32 [[TMP74]], 1 1239; CHECK-NEXT: [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]] 1240; CHECK-NEXT: [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]] 1241; CHECK-NEXT: [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]] 1242; CHECK-NEXT: [[TMP82:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[TMP81]], i64 1 1243; CHECK-NEXT: [[TMP83:%.*]] = extractelement <4 x i32> [[X]], i64 2 1244; CHECK-NEXT: [[TMP84:%.*]] = extractelement <4 x i32> [[Y]], i64 2 1245; CHECK-NEXT: [[TMP85:%.*]] = ashr i32 [[TMP83]], 31 1246; CHECK-NEXT: [[TMP86:%.*]] = ashr i32 [[TMP84]], 31 1247; CHECK-NEXT: [[TMP87:%.*]] = xor i32 [[TMP85]], [[TMP86]] 1248; CHECK-NEXT: [[TMP88:%.*]] = add i32 [[TMP83]], [[TMP85]] 1249; CHECK-NEXT: [[TMP89:%.*]] = add i32 [[TMP84]], [[TMP86]] 1250; CHECK-NEXT: [[TMP90:%.*]] = xor i32 [[TMP88]], [[TMP85]] 1251; CHECK-NEXT: [[TMP91:%.*]] = xor i32 [[TMP89]], [[TMP86]] 1252; CHECK-NEXT: [[TMP92:%.*]] = uitofp i32 [[TMP91]] to float 1253; CHECK-NEXT: [[TMP93:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP92]]) 1254; CHECK-NEXT: [[TMP94:%.*]] = fmul fast float [[TMP93]], 0x41EFFFFFC0000000 1255; CHECK-NEXT: [[TMP95:%.*]] = fptoui float [[TMP94]] to i32 1256; CHECK-NEXT: [[TMP96:%.*]] = sub i32 0, [[TMP91]] 1257; CHECK-NEXT: [[TMP97:%.*]] = mul i32 [[TMP96]], [[TMP95]] 1258; CHECK-NEXT: [[TMP98:%.*]] = zext i32 [[TMP95]] to i64 1259; CHECK-NEXT: [[TMP99:%.*]] 
= zext i32 [[TMP97]] to i64 1260; CHECK-NEXT: [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]] 1261; CHECK-NEXT: [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32 1262; CHECK-NEXT: [[TMP102:%.*]] = lshr i64 [[TMP100]], 32 1263; CHECK-NEXT: [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32 1264; CHECK-NEXT: [[TMP104:%.*]] = add i32 [[TMP95]], [[TMP103]] 1265; CHECK-NEXT: [[TMP105:%.*]] = zext i32 [[TMP90]] to i64 1266; CHECK-NEXT: [[TMP106:%.*]] = zext i32 [[TMP104]] to i64 1267; CHECK-NEXT: [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]] 1268; CHECK-NEXT: [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32 1269; CHECK-NEXT: [[TMP109:%.*]] = lshr i64 [[TMP107]], 32 1270; CHECK-NEXT: [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32 1271; CHECK-NEXT: [[TMP111:%.*]] = mul i32 [[TMP110]], [[TMP91]] 1272; CHECK-NEXT: [[TMP112:%.*]] = sub i32 [[TMP90]], [[TMP111]] 1273; CHECK-NEXT: [[TMP113:%.*]] = icmp uge i32 [[TMP112]], [[TMP91]] 1274; CHECK-NEXT: [[TMP114:%.*]] = add i32 [[TMP110]], 1 1275; CHECK-NEXT: [[TMP115:%.*]] = select i1 [[TMP113]], i32 [[TMP114]], i32 [[TMP110]] 1276; CHECK-NEXT: [[TMP116:%.*]] = sub i32 [[TMP112]], [[TMP91]] 1277; CHECK-NEXT: [[TMP117:%.*]] = select i1 [[TMP113]], i32 [[TMP116]], i32 [[TMP112]] 1278; CHECK-NEXT: [[TMP118:%.*]] = icmp uge i32 [[TMP117]], [[TMP91]] 1279; CHECK-NEXT: [[TMP119:%.*]] = add i32 [[TMP115]], 1 1280; CHECK-NEXT: [[TMP120:%.*]] = select i1 [[TMP118]], i32 [[TMP119]], i32 [[TMP115]] 1281; CHECK-NEXT: [[TMP121:%.*]] = xor i32 [[TMP120]], [[TMP87]] 1282; CHECK-NEXT: [[TMP122:%.*]] = sub i32 [[TMP121]], [[TMP87]] 1283; CHECK-NEXT: [[TMP123:%.*]] = insertelement <4 x i32> [[TMP82]], i32 [[TMP122]], i64 2 1284; CHECK-NEXT: [[TMP124:%.*]] = extractelement <4 x i32> [[X]], i64 3 1285; CHECK-NEXT: [[TMP125:%.*]] = extractelement <4 x i32> [[Y]], i64 3 1286; CHECK-NEXT: [[TMP126:%.*]] = ashr i32 [[TMP124]], 31 1287; CHECK-NEXT: [[TMP127:%.*]] = ashr i32 [[TMP125]], 31 1288; CHECK-NEXT: [[TMP128:%.*]] = xor i32 [[TMP126]], [[TMP127]] 1289; CHECK-NEXT: 
[[TMP129:%.*]] = add i32 [[TMP124]], [[TMP126]] 1290; CHECK-NEXT: [[TMP130:%.*]] = add i32 [[TMP125]], [[TMP127]] 1291; CHECK-NEXT: [[TMP131:%.*]] = xor i32 [[TMP129]], [[TMP126]] 1292; CHECK-NEXT: [[TMP132:%.*]] = xor i32 [[TMP130]], [[TMP127]] 1293; CHECK-NEXT: [[TMP133:%.*]] = uitofp i32 [[TMP132]] to float 1294; CHECK-NEXT: [[TMP134:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP133]]) 1295; CHECK-NEXT: [[TMP135:%.*]] = fmul fast float [[TMP134]], 0x41EFFFFFC0000000 1296; CHECK-NEXT: [[TMP136:%.*]] = fptoui float [[TMP135]] to i32 1297; CHECK-NEXT: [[TMP137:%.*]] = sub i32 0, [[TMP132]] 1298; CHECK-NEXT: [[TMP138:%.*]] = mul i32 [[TMP137]], [[TMP136]] 1299; CHECK-NEXT: [[TMP139:%.*]] = zext i32 [[TMP136]] to i64 1300; CHECK-NEXT: [[TMP140:%.*]] = zext i32 [[TMP138]] to i64 1301; CHECK-NEXT: [[TMP141:%.*]] = mul i64 [[TMP139]], [[TMP140]] 1302; CHECK-NEXT: [[TMP142:%.*]] = trunc i64 [[TMP141]] to i32 1303; CHECK-NEXT: [[TMP143:%.*]] = lshr i64 [[TMP141]], 32 1304; CHECK-NEXT: [[TMP144:%.*]] = trunc i64 [[TMP143]] to i32 1305; CHECK-NEXT: [[TMP145:%.*]] = add i32 [[TMP136]], [[TMP144]] 1306; CHECK-NEXT: [[TMP146:%.*]] = zext i32 [[TMP131]] to i64 1307; CHECK-NEXT: [[TMP147:%.*]] = zext i32 [[TMP145]] to i64 1308; CHECK-NEXT: [[TMP148:%.*]] = mul i64 [[TMP146]], [[TMP147]] 1309; CHECK-NEXT: [[TMP149:%.*]] = trunc i64 [[TMP148]] to i32 1310; CHECK-NEXT: [[TMP150:%.*]] = lshr i64 [[TMP148]], 32 1311; CHECK-NEXT: [[TMP151:%.*]] = trunc i64 [[TMP150]] to i32 1312; CHECK-NEXT: [[TMP152:%.*]] = mul i32 [[TMP151]], [[TMP132]] 1313; CHECK-NEXT: [[TMP153:%.*]] = sub i32 [[TMP131]], [[TMP152]] 1314; CHECK-NEXT: [[TMP154:%.*]] = icmp uge i32 [[TMP153]], [[TMP132]] 1315; CHECK-NEXT: [[TMP155:%.*]] = add i32 [[TMP151]], 1 1316; CHECK-NEXT: [[TMP156:%.*]] = select i1 [[TMP154]], i32 [[TMP155]], i32 [[TMP151]] 1317; CHECK-NEXT: [[TMP157:%.*]] = sub i32 [[TMP153]], [[TMP132]] 1318; CHECK-NEXT: [[TMP158:%.*]] = select i1 [[TMP154]], i32 [[TMP157]], i32 [[TMP153]] 1319; 
CHECK-NEXT: [[TMP159:%.*]] = icmp uge i32 [[TMP158]], [[TMP132]] 1320; CHECK-NEXT: [[TMP160:%.*]] = add i32 [[TMP156]], 1 1321; CHECK-NEXT: [[TMP161:%.*]] = select i1 [[TMP159]], i32 [[TMP160]], i32 [[TMP156]] 1322; CHECK-NEXT: [[TMP162:%.*]] = xor i32 [[TMP161]], [[TMP128]] 1323; CHECK-NEXT: [[TMP163:%.*]] = sub i32 [[TMP162]], [[TMP128]] 1324; CHECK-NEXT: [[TMP164:%.*]] = insertelement <4 x i32> [[TMP123]], i32 [[TMP163]], i64 3 1325; CHECK-NEXT: store <4 x i32> [[TMP164]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 1326; CHECK-NEXT: ret void 1327; 1328; GCN-LABEL: sdiv_v4i32: 1329; GCN: ; %bb.0: 1330; GCN-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0xd 1331; GCN-NEXT: s_mov_b32 s16, 0x4f7ffffe 1332; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1333; GCN-NEXT: s_mov_b32 s7, 0xf000 1334; GCN-NEXT: s_mov_b32 s6, -1 1335; GCN-NEXT: s_waitcnt lgkmcnt(0) 1336; GCN-NEXT: s_ashr_i32 s2, s12, 31 1337; GCN-NEXT: s_add_i32 s3, s12, s2 1338; GCN-NEXT: s_xor_b32 s12, s3, s2 1339; GCN-NEXT: v_cvt_f32_u32_e32 v0, s12 1340; GCN-NEXT: s_ashr_i32 s3, s13, 31 1341; GCN-NEXT: s_add_i32 s0, s13, s3 1342; GCN-NEXT: s_xor_b32 s13, s0, s3 1343; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 1344; GCN-NEXT: v_cvt_f32_u32_e32 v1, s13 1345; GCN-NEXT: s_sub_i32 s1, 0, s12 1346; GCN-NEXT: s_ashr_i32 s0, s8, 31 1347; GCN-NEXT: v_mul_f32_e32 v0, s16, v0 1348; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 1349; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 1350; GCN-NEXT: s_xor_b32 s2, s0, s2 1351; GCN-NEXT: v_mul_lo_u32 v2, s1, v0 1352; GCN-NEXT: s_add_i32 s1, s8, s0 1353; GCN-NEXT: v_mul_f32_e32 v1, s16, v1 1354; GCN-NEXT: s_xor_b32 s1, s1, s0 1355; GCN-NEXT: v_mul_hi_u32 v2, v0, v2 1356; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 1357; GCN-NEXT: s_sub_i32 s0, 0, s13 1358; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 1359; GCN-NEXT: v_mul_hi_u32 v0, s1, v0 1360; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 1361; GCN-NEXT: v_mul_lo_u32 v3, v0, s12 1362; GCN-NEXT: v_mul_hi_u32 v2, v1, v2 1363; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v0 1364; GCN-NEXT: 
v_sub_i32_e32 v3, vcc, s1, v3 1365; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v3 1366; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] 1367; GCN-NEXT: v_subrev_i32_e32 v4, vcc, s12, v3 1368; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] 1369; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v0 1370; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 1371; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 1372; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 1373; GCN-NEXT: v_xor_b32_e32 v0, s2, v0 1374; GCN-NEXT: s_ashr_i32 s0, s9, 31 1375; GCN-NEXT: s_add_i32 s1, s9, s0 1376; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 1377; GCN-NEXT: s_xor_b32 s2, s0, s3 1378; GCN-NEXT: s_ashr_i32 s3, s14, 31 1379; GCN-NEXT: s_xor_b32 s1, s1, s0 1380; GCN-NEXT: s_add_i32 s0, s14, s3 1381; GCN-NEXT: s_xor_b32 s9, s0, s3 1382; GCN-NEXT: v_cvt_f32_u32_e32 v3, s9 1383; GCN-NEXT: v_mul_hi_u32 v1, s1, v1 1384; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3 1385; GCN-NEXT: v_mul_lo_u32 v2, v1, s13 1386; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v1 1387; GCN-NEXT: v_mul_f32_e32 v3, s16, v3 1388; GCN-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 1389; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 1390; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v2 1391; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] 1392; GCN-NEXT: v_subrev_i32_e32 v4, vcc, s13, v2 1393; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] 1394; GCN-NEXT: s_sub_i32 s0, 0, s9 1395; GCN-NEXT: v_mul_lo_u32 v5, s0, v3 1396; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v1 1397; GCN-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 1398; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 1399; GCN-NEXT: v_mul_hi_u32 v2, v3, v5 1400; GCN-NEXT: v_xor_b32_e32 v1, s2, v1 1401; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s2, v1 1402; GCN-NEXT: s_ashr_i32 s2, s15, 31 1403; GCN-NEXT: s_ashr_i32 s0, s10, 31 1404; GCN-NEXT: s_add_i32 s8, s15, s2 1405; GCN-NEXT: s_add_i32 s1, s10, s0 1406; GCN-NEXT: s_xor_b32 s8, s8, s2 1407; GCN-NEXT: v_cvt_f32_u32_e32 v4, s8 1408; GCN-NEXT: s_xor_b32 s1, s1, s0 1409; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 1410; GCN-NEXT: 
v_mul_hi_u32 v2, s1, v2 1411; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v4 1412; GCN-NEXT: s_xor_b32 s3, s0, s3 1413; GCN-NEXT: v_mul_lo_u32 v3, v2, s9 1414; GCN-NEXT: v_mul_f32_e32 v4, s16, v4 1415; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 1416; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v2 1417; GCN-NEXT: v_sub_i32_e32 v3, vcc, s1, v3 1418; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v3 1419; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] 1420; GCN-NEXT: v_subrev_i32_e32 v5, vcc, s9, v3 1421; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 1422; GCN-NEXT: s_sub_i32 s0, 0, s8 1423; GCN-NEXT: v_mul_lo_u32 v5, s0, v4 1424; GCN-NEXT: s_ashr_i32 s0, s11, 31 1425; GCN-NEXT: s_add_i32 s1, s11, s0 1426; GCN-NEXT: s_xor_b32 s1, s1, s0 1427; GCN-NEXT: v_mul_hi_u32 v5, v4, v5 1428; GCN-NEXT: v_add_i32_e32 v6, vcc, 1, v2 1429; GCN-NEXT: s_xor_b32 s2, s0, s2 1430; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 1431; GCN-NEXT: v_mul_hi_u32 v4, s1, v4 1432; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 1433; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 1434; GCN-NEXT: v_xor_b32_e32 v2, s3, v2 1435; GCN-NEXT: v_mul_lo_u32 v3, v4, s8 1436; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v4 1437; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s3, v2 1438; GCN-NEXT: v_sub_i32_e32 v3, vcc, s1, v3 1439; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v3 1440; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] 1441; GCN-NEXT: v_subrev_i32_e32 v5, vcc, s8, v3 1442; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 1443; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v4 1444; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 1445; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 1446; GCN-NEXT: v_xor_b32_e32 v3, s2, v3 1447; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s2, v3 1448; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 1449; GCN-NEXT: s_endpgm 1450 %r = sdiv <4 x i32> %x, %y 1451 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 1452 ret void 1453} 1454 1455define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 1456; CHECK-LABEL: @srem_v4i32( 
1457; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 1458; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 1459; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 1460; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 1461; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]] 1462; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]] 1463; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]] 1464; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]] 1465; CHECK-NEXT: [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float 1466; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 1467; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000 1468; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP11]] to i32 1469; CHECK-NEXT: [[TMP13:%.*]] = sub i32 0, [[TMP8]] 1470; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]] 1471; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP12]] to i64 1472; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 1473; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]] 1474; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 1475; CHECK-NEXT: [[TMP19:%.*]] = lshr i64 [[TMP17]], 32 1476; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 1477; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]] 1478; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP7]] to i64 1479; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP21]] to i64 1480; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]] 1481; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32 1482; CHECK-NEXT: [[TMP26:%.*]] = lshr i64 [[TMP24]], 32 1483; CHECK-NEXT: [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32 1484; CHECK-NEXT: [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]] 1485; CHECK-NEXT: [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]] 1486; CHECK-NEXT: [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]] 1487; CHECK-NEXT: [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]] 1488; CHECK-NEXT: [[TMP32:%.*]] = select i1 
[[TMP30]], i32 [[TMP31]], i32 [[TMP29]] 1489; CHECK-NEXT: [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]] 1490; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]] 1491; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]] 1492; CHECK-NEXT: [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]] 1493; CHECK-NEXT: [[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]] 1494; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i32> undef, i32 [[TMP37]], i64 0 1495; CHECK-NEXT: [[TMP39:%.*]] = extractelement <4 x i32> [[X]], i64 1 1496; CHECK-NEXT: [[TMP40:%.*]] = extractelement <4 x i32> [[Y]], i64 1 1497; CHECK-NEXT: [[TMP41:%.*]] = ashr i32 [[TMP39]], 31 1498; CHECK-NEXT: [[TMP42:%.*]] = ashr i32 [[TMP40]], 31 1499; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]] 1500; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]] 1501; CHECK-NEXT: [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]] 1502; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]] 1503; CHECK-NEXT: [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float 1504; CHECK-NEXT: [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]]) 1505; CHECK-NEXT: [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000 1506; CHECK-NEXT: [[TMP50:%.*]] = fptoui float [[TMP49]] to i32 1507; CHECK-NEXT: [[TMP51:%.*]] = sub i32 0, [[TMP46]] 1508; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]] 1509; CHECK-NEXT: [[TMP53:%.*]] = zext i32 [[TMP50]] to i64 1510; CHECK-NEXT: [[TMP54:%.*]] = zext i32 [[TMP52]] to i64 1511; CHECK-NEXT: [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]] 1512; CHECK-NEXT: [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32 1513; CHECK-NEXT: [[TMP57:%.*]] = lshr i64 [[TMP55]], 32 1514; CHECK-NEXT: [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32 1515; CHECK-NEXT: [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]] 1516; CHECK-NEXT: [[TMP60:%.*]] = zext i32 [[TMP45]] to i64 1517; CHECK-NEXT: [[TMP61:%.*]] = zext i32 [[TMP59]] to i64 1518; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]] 
1519; CHECK-NEXT: [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32 1520; CHECK-NEXT: [[TMP64:%.*]] = lshr i64 [[TMP62]], 32 1521; CHECK-NEXT: [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32 1522; CHECK-NEXT: [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]] 1523; CHECK-NEXT: [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]] 1524; CHECK-NEXT: [[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]] 1525; CHECK-NEXT: [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]] 1526; CHECK-NEXT: [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]] 1527; CHECK-NEXT: [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]] 1528; CHECK-NEXT: [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]] 1529; CHECK-NEXT: [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]] 1530; CHECK-NEXT: [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]] 1531; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]] 1532; CHECK-NEXT: [[TMP76:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP75]], i64 1 1533; CHECK-NEXT: [[TMP77:%.*]] = extractelement <4 x i32> [[X]], i64 2 1534; CHECK-NEXT: [[TMP78:%.*]] = extractelement <4 x i32> [[Y]], i64 2 1535; CHECK-NEXT: [[TMP79:%.*]] = ashr i32 [[TMP77]], 31 1536; CHECK-NEXT: [[TMP80:%.*]] = ashr i32 [[TMP78]], 31 1537; CHECK-NEXT: [[TMP81:%.*]] = add i32 [[TMP77]], [[TMP79]] 1538; CHECK-NEXT: [[TMP82:%.*]] = add i32 [[TMP78]], [[TMP80]] 1539; CHECK-NEXT: [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP79]] 1540; CHECK-NEXT: [[TMP84:%.*]] = xor i32 [[TMP82]], [[TMP80]] 1541; CHECK-NEXT: [[TMP85:%.*]] = uitofp i32 [[TMP84]] to float 1542; CHECK-NEXT: [[TMP86:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP85]]) 1543; CHECK-NEXT: [[TMP87:%.*]] = fmul fast float [[TMP86]], 0x41EFFFFFC0000000 1544; CHECK-NEXT: [[TMP88:%.*]] = fptoui float [[TMP87]] to i32 1545; CHECK-NEXT: [[TMP89:%.*]] = sub i32 0, [[TMP84]] 1546; CHECK-NEXT: [[TMP90:%.*]] = mul i32 [[TMP89]], [[TMP88]] 1547; CHECK-NEXT: [[TMP91:%.*]] = zext i32 [[TMP88]] to i64 1548; CHECK-NEXT: [[TMP92:%.*]] = zext i32 [[TMP90]] to i64 
1549; CHECK-NEXT: [[TMP93:%.*]] = mul i64 [[TMP91]], [[TMP92]] 1550; CHECK-NEXT: [[TMP94:%.*]] = trunc i64 [[TMP93]] to i32 1551; CHECK-NEXT: [[TMP95:%.*]] = lshr i64 [[TMP93]], 32 1552; CHECK-NEXT: [[TMP96:%.*]] = trunc i64 [[TMP95]] to i32 1553; CHECK-NEXT: [[TMP97:%.*]] = add i32 [[TMP88]], [[TMP96]] 1554; CHECK-NEXT: [[TMP98:%.*]] = zext i32 [[TMP83]] to i64 1555; CHECK-NEXT: [[TMP99:%.*]] = zext i32 [[TMP97]] to i64 1556; CHECK-NEXT: [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]] 1557; CHECK-NEXT: [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32 1558; CHECK-NEXT: [[TMP102:%.*]] = lshr i64 [[TMP100]], 32 1559; CHECK-NEXT: [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32 1560; CHECK-NEXT: [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP84]] 1561; CHECK-NEXT: [[TMP105:%.*]] = sub i32 [[TMP83]], [[TMP104]] 1562; CHECK-NEXT: [[TMP106:%.*]] = icmp uge i32 [[TMP105]], [[TMP84]] 1563; CHECK-NEXT: [[TMP107:%.*]] = sub i32 [[TMP105]], [[TMP84]] 1564; CHECK-NEXT: [[TMP108:%.*]] = select i1 [[TMP106]], i32 [[TMP107]], i32 [[TMP105]] 1565; CHECK-NEXT: [[TMP109:%.*]] = icmp uge i32 [[TMP108]], [[TMP84]] 1566; CHECK-NEXT: [[TMP110:%.*]] = sub i32 [[TMP108]], [[TMP84]] 1567; CHECK-NEXT: [[TMP111:%.*]] = select i1 [[TMP109]], i32 [[TMP110]], i32 [[TMP108]] 1568; CHECK-NEXT: [[TMP112:%.*]] = xor i32 [[TMP111]], [[TMP79]] 1569; CHECK-NEXT: [[TMP113:%.*]] = sub i32 [[TMP112]], [[TMP79]] 1570; CHECK-NEXT: [[TMP114:%.*]] = insertelement <4 x i32> [[TMP76]], i32 [[TMP113]], i64 2 1571; CHECK-NEXT: [[TMP115:%.*]] = extractelement <4 x i32> [[X]], i64 3 1572; CHECK-NEXT: [[TMP116:%.*]] = extractelement <4 x i32> [[Y]], i64 3 1573; CHECK-NEXT: [[TMP117:%.*]] = ashr i32 [[TMP115]], 31 1574; CHECK-NEXT: [[TMP118:%.*]] = ashr i32 [[TMP116]], 31 1575; CHECK-NEXT: [[TMP119:%.*]] = add i32 [[TMP115]], [[TMP117]] 1576; CHECK-NEXT: [[TMP120:%.*]] = add i32 [[TMP116]], [[TMP118]] 1577; CHECK-NEXT: [[TMP121:%.*]] = xor i32 [[TMP119]], [[TMP117]] 1578; CHECK-NEXT: [[TMP122:%.*]] = xor i32 [[TMP120]], [[TMP118]] 
1579; CHECK-NEXT: [[TMP123:%.*]] = uitofp i32 [[TMP122]] to float 1580; CHECK-NEXT: [[TMP124:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP123]]) 1581; CHECK-NEXT: [[TMP125:%.*]] = fmul fast float [[TMP124]], 0x41EFFFFFC0000000 1582; CHECK-NEXT: [[TMP126:%.*]] = fptoui float [[TMP125]] to i32 1583; CHECK-NEXT: [[TMP127:%.*]] = sub i32 0, [[TMP122]] 1584; CHECK-NEXT: [[TMP128:%.*]] = mul i32 [[TMP127]], [[TMP126]] 1585; CHECK-NEXT: [[TMP129:%.*]] = zext i32 [[TMP126]] to i64 1586; CHECK-NEXT: [[TMP130:%.*]] = zext i32 [[TMP128]] to i64 1587; CHECK-NEXT: [[TMP131:%.*]] = mul i64 [[TMP129]], [[TMP130]] 1588; CHECK-NEXT: [[TMP132:%.*]] = trunc i64 [[TMP131]] to i32 1589; CHECK-NEXT: [[TMP133:%.*]] = lshr i64 [[TMP131]], 32 1590; CHECK-NEXT: [[TMP134:%.*]] = trunc i64 [[TMP133]] to i32 1591; CHECK-NEXT: [[TMP135:%.*]] = add i32 [[TMP126]], [[TMP134]] 1592; CHECK-NEXT: [[TMP136:%.*]] = zext i32 [[TMP121]] to i64 1593; CHECK-NEXT: [[TMP137:%.*]] = zext i32 [[TMP135]] to i64 1594; CHECK-NEXT: [[TMP138:%.*]] = mul i64 [[TMP136]], [[TMP137]] 1595; CHECK-NEXT: [[TMP139:%.*]] = trunc i64 [[TMP138]] to i32 1596; CHECK-NEXT: [[TMP140:%.*]] = lshr i64 [[TMP138]], 32 1597; CHECK-NEXT: [[TMP141:%.*]] = trunc i64 [[TMP140]] to i32 1598; CHECK-NEXT: [[TMP142:%.*]] = mul i32 [[TMP141]], [[TMP122]] 1599; CHECK-NEXT: [[TMP143:%.*]] = sub i32 [[TMP121]], [[TMP142]] 1600; CHECK-NEXT: [[TMP144:%.*]] = icmp uge i32 [[TMP143]], [[TMP122]] 1601; CHECK-NEXT: [[TMP145:%.*]] = sub i32 [[TMP143]], [[TMP122]] 1602; CHECK-NEXT: [[TMP146:%.*]] = select i1 [[TMP144]], i32 [[TMP145]], i32 [[TMP143]] 1603; CHECK-NEXT: [[TMP147:%.*]] = icmp uge i32 [[TMP146]], [[TMP122]] 1604; CHECK-NEXT: [[TMP148:%.*]] = sub i32 [[TMP146]], [[TMP122]] 1605; CHECK-NEXT: [[TMP149:%.*]] = select i1 [[TMP147]], i32 [[TMP148]], i32 [[TMP146]] 1606; CHECK-NEXT: [[TMP150:%.*]] = xor i32 [[TMP149]], [[TMP117]] 1607; CHECK-NEXT: [[TMP151:%.*]] = sub i32 [[TMP150]], [[TMP117]] 1608; CHECK-NEXT: [[TMP152:%.*]] = 
insertelement <4 x i32> [[TMP114]], i32 [[TMP151]], i64 3 1609; CHECK-NEXT: store <4 x i32> [[TMP152]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 1610; CHECK-NEXT: ret void 1611; 1612; GCN-LABEL: srem_v4i32: 1613; GCN: ; %bb.0: 1614; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 1615; GCN-NEXT: s_mov_b32 s13, 0x4f7ffffe 1616; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1617; GCN-NEXT: s_mov_b32 s3, 0xf000 1618; GCN-NEXT: s_waitcnt lgkmcnt(0) 1619; GCN-NEXT: s_ashr_i32 s2, s8, 31 1620; GCN-NEXT: s_add_i32 s8, s8, s2 1621; GCN-NEXT: s_xor_b32 s12, s8, s2 1622; GCN-NEXT: v_cvt_f32_u32_e32 v0, s12 1623; GCN-NEXT: s_ashr_i32 s8, s9, 31 1624; GCN-NEXT: s_add_i32 s9, s9, s8 1625; GCN-NEXT: s_xor_b32 s14, s9, s8 1626; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 1627; GCN-NEXT: v_cvt_f32_u32_e32 v1, s14 1628; GCN-NEXT: s_sub_i32 s9, 0, s12 1629; GCN-NEXT: s_ashr_i32 s8, s4, 31 1630; GCN-NEXT: v_mul_f32_e32 v0, s13, v0 1631; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 1632; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 1633; GCN-NEXT: s_add_i32 s4, s4, s8 1634; GCN-NEXT: s_xor_b32 s4, s4, s8 1635; GCN-NEXT: v_mul_lo_u32 v2, s9, v0 1636; GCN-NEXT: v_mul_f32_e32 v1, s13, v1 1637; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 1638; GCN-NEXT: s_sub_i32 s9, 0, s14 1639; GCN-NEXT: v_mul_hi_u32 v2, v0, v2 1640; GCN-NEXT: s_mov_b32 s2, -1 1641; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 1642; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 1643; GCN-NEXT: v_mul_lo_u32 v2, s9, v1 1644; GCN-NEXT: s_ashr_i32 s9, s5, 31 1645; GCN-NEXT: s_add_i32 s5, s5, s9 1646; GCN-NEXT: v_mul_lo_u32 v0, v0, s12 1647; GCN-NEXT: v_mul_hi_u32 v2, v1, v2 1648; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 1649; GCN-NEXT: s_xor_b32 s4, s5, s9 1650; GCN-NEXT: s_ashr_i32 s5, s10, 31 1651; GCN-NEXT: s_add_i32 s10, s10, s5 1652; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s12, v0 1653; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 1654; GCN-NEXT: s_xor_b32 s10, s10, s5 1655; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1656; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 1657; 
GCN-NEXT: v_cvt_f32_u32_e32 v2, s10 1658; GCN-NEXT: v_mul_hi_u32 v1, s4, v1 1659; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s12, v0 1660; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 1661; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 1662; GCN-NEXT: v_mul_lo_u32 v1, v1, s14 1663; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1664; GCN-NEXT: v_xor_b32_e32 v0, s8, v0 1665; GCN-NEXT: v_mul_f32_e32 v2, s13, v2 1666; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 1667; GCN-NEXT: v_sub_i32_e32 v1, vcc, s4, v1 1668; GCN-NEXT: s_sub_i32 s4, 0, s10 1669; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 1670; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s14, v1 1671; GCN-NEXT: v_cmp_le_u32_e32 vcc, s14, v1 1672; GCN-NEXT: v_mul_lo_u32 v4, s4, v2 1673; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 1674; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s14, v1 1675; GCN-NEXT: v_cmp_le_u32_e32 vcc, s14, v1 1676; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 1677; GCN-NEXT: v_mul_hi_u32 v3, v2, v4 1678; GCN-NEXT: s_ashr_i32 s4, s6, 31 1679; GCN-NEXT: s_add_i32 s5, s6, s4 1680; GCN-NEXT: s_ashr_i32 s6, s11, 31 1681; GCN-NEXT: s_add_i32 s8, s11, s6 1682; GCN-NEXT: s_xor_b32 s8, s8, s6 1683; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 1684; GCN-NEXT: v_cvt_f32_u32_e32 v3, s8 1685; GCN-NEXT: s_xor_b32 s5, s5, s4 1686; GCN-NEXT: v_mul_hi_u32 v2, s5, v2 1687; GCN-NEXT: v_xor_b32_e32 v1, s9, v1 1688; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3 1689; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s9, v1 1690; GCN-NEXT: v_mul_lo_u32 v2, v2, s10 1691; GCN-NEXT: v_mul_f32_e32 v3, s13, v3 1692; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 1693; GCN-NEXT: v_sub_i32_e32 v2, vcc, s5, v2 1694; GCN-NEXT: v_subrev_i32_e32 v4, vcc, s10, v2 1695; GCN-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 1696; GCN-NEXT: s_sub_i32 s5, 0, s8 1697; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 1698; GCN-NEXT: v_mul_lo_u32 v4, s5, v3 1699; GCN-NEXT: s_ashr_i32 s5, s7, 31 1700; GCN-NEXT: s_add_i32 s6, s7, s5 1701; GCN-NEXT: s_xor_b32 s6, s6, s5 1702; GCN-NEXT: v_mul_hi_u32 v4, v3, v4 1703; GCN-NEXT: v_subrev_i32_e32 
v5, vcc, s10, v2 1704; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 1705; GCN-NEXT: v_mul_hi_u32 v3, s6, v3 1706; GCN-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 1707; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 1708; GCN-NEXT: v_xor_b32_e32 v2, s4, v2 1709; GCN-NEXT: v_mul_lo_u32 v3, v3, s8 1710; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s4, v2 1711; GCN-NEXT: v_sub_i32_e32 v3, vcc, s6, v3 1712; GCN-NEXT: v_subrev_i32_e32 v4, vcc, s8, v3 1713; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 1714; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 1715; GCN-NEXT: v_subrev_i32_e32 v4, vcc, s8, v3 1716; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 1717; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 1718; GCN-NEXT: v_xor_b32_e32 v3, s5, v3 1719; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s5, v3 1720; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1721; GCN-NEXT: s_endpgm 1722 %r = srem <4 x i32> %x, %y 1723 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 1724 ret void 1725} 1726 1727define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 1728; CHECK-LABEL: @udiv_v4i16( 1729; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 1730; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 1731; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 1732; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 1733; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 1734; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 1735; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 1736; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 1737; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 1738; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 1739; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 1740; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 1741; CHECK-NEXT: [[TMP13:%.*]] = call fast 
float @llvm.fabs.f32(float [[TMP11]]) 1742; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 1743; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 1744; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 1745; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 1746; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 65535 1747; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16 1748; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i16> undef, i16 [[TMP19]], i64 0 1749; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i16> [[X]], i64 1 1750; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i16> [[Y]], i64 1 1751; CHECK-NEXT: [[TMP23:%.*]] = zext i16 [[TMP21]] to i32 1752; CHECK-NEXT: [[TMP24:%.*]] = zext i16 [[TMP22]] to i32 1753; CHECK-NEXT: [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float 1754; CHECK-NEXT: [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float 1755; CHECK-NEXT: [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]]) 1756; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]] 1757; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]]) 1758; CHECK-NEXT: [[TMP30:%.*]] = fneg fast float [[TMP29]] 1759; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]]) 1760; CHECK-NEXT: [[TMP32:%.*]] = fptoui float [[TMP29]] to i32 1761; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]]) 1762; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]]) 1763; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]] 1764; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0 1765; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]] 1766; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 65535 1767; CHECK-NEXT: [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16 1768; CHECK-NEXT: [[TMP40:%.*]] = insertelement <4 x i16> [[TMP20]], i16 [[TMP39]], i64 1 
1769; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i16> [[X]], i64 2 1770; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i16> [[Y]], i64 2 1771; CHECK-NEXT: [[TMP43:%.*]] = zext i16 [[TMP41]] to i32 1772; CHECK-NEXT: [[TMP44:%.*]] = zext i16 [[TMP42]] to i32 1773; CHECK-NEXT: [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float 1774; CHECK-NEXT: [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float 1775; CHECK-NEXT: [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]]) 1776; CHECK-NEXT: [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]] 1777; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]]) 1778; CHECK-NEXT: [[TMP50:%.*]] = fneg fast float [[TMP49]] 1779; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]]) 1780; CHECK-NEXT: [[TMP52:%.*]] = fptoui float [[TMP49]] to i32 1781; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]]) 1782; CHECK-NEXT: [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]]) 1783; CHECK-NEXT: [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]] 1784; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0 1785; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]] 1786; CHECK-NEXT: [[TMP58:%.*]] = and i32 [[TMP57]], 65535 1787; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16 1788; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i16> [[TMP40]], i16 [[TMP59]], i64 2 1789; CHECK-NEXT: [[TMP61:%.*]] = extractelement <4 x i16> [[X]], i64 3 1790; CHECK-NEXT: [[TMP62:%.*]] = extractelement <4 x i16> [[Y]], i64 3 1791; CHECK-NEXT: [[TMP63:%.*]] = zext i16 [[TMP61]] to i32 1792; CHECK-NEXT: [[TMP64:%.*]] = zext i16 [[TMP62]] to i32 1793; CHECK-NEXT: [[TMP65:%.*]] = uitofp i32 [[TMP63]] to float 1794; CHECK-NEXT: [[TMP66:%.*]] = uitofp i32 [[TMP64]] to float 1795; CHECK-NEXT: [[TMP67:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP66]]) 1796; CHECK-NEXT: [[TMP68:%.*]] = fmul fast float 
[[TMP65]], [[TMP67]] 1797; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.trunc.f32(float [[TMP68]]) 1798; CHECK-NEXT: [[TMP70:%.*]] = fneg fast float [[TMP69]] 1799; CHECK-NEXT: [[TMP71:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP70]], float [[TMP66]], float [[TMP65]]) 1800; CHECK-NEXT: [[TMP72:%.*]] = fptoui float [[TMP69]] to i32 1801; CHECK-NEXT: [[TMP73:%.*]] = call fast float @llvm.fabs.f32(float [[TMP71]]) 1802; CHECK-NEXT: [[TMP74:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 1803; CHECK-NEXT: [[TMP75:%.*]] = fcmp fast oge float [[TMP73]], [[TMP74]] 1804; CHECK-NEXT: [[TMP76:%.*]] = select i1 [[TMP75]], i32 1, i32 0 1805; CHECK-NEXT: [[TMP77:%.*]] = add i32 [[TMP72]], [[TMP76]] 1806; CHECK-NEXT: [[TMP78:%.*]] = and i32 [[TMP77]], 65535 1807; CHECK-NEXT: [[TMP79:%.*]] = trunc i32 [[TMP78]] to i16 1808; CHECK-NEXT: [[TMP80:%.*]] = insertelement <4 x i16> [[TMP60]], i16 [[TMP79]], i64 3 1809; CHECK-NEXT: store <4 x i16> [[TMP80]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 1810; CHECK-NEXT: ret void 1811; 1812; GCN-LABEL: udiv_v4i16: 1813; GCN: ; %bb.0: 1814; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1815; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 1816; GCN-NEXT: s_mov_b32 s8, 0xffff 1817; GCN-NEXT: s_mov_b32 s7, 0xf000 1818; GCN-NEXT: s_mov_b32 s6, -1 1819; GCN-NEXT: s_waitcnt lgkmcnt(0) 1820; GCN-NEXT: s_and_b32 s9, s2, s8 1821; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 1822; GCN-NEXT: s_lshr_b32 s9, s0, 16 1823; GCN-NEXT: s_and_b32 s0, s0, s8 1824; GCN-NEXT: s_lshr_b32 s2, s2, 16 1825; GCN-NEXT: v_cvt_f32_u32_e32 v3, s2 1826; GCN-NEXT: v_cvt_f32_u32_e32 v1, s0 1827; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 1828; GCN-NEXT: v_cvt_f32_u32_e32 v4, s9 1829; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v3 1830; GCN-NEXT: s_and_b32 s2, s3, s8 1831; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 1832; GCN-NEXT: v_trunc_f32_e32 v2, v2 1833; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 1834; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 1835; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 
1836; GCN-NEXT: v_mul_f32_e32 v1, v4, v5 1837; GCN-NEXT: v_trunc_f32_e32 v1, v1 1838; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc 1839; GCN-NEXT: v_mad_f32 v2, -v1, v3, v4 1840; GCN-NEXT: v_cvt_f32_u32_e32 v4, s2 1841; GCN-NEXT: s_lshr_b32 s0, s1, 16 1842; GCN-NEXT: s_and_b32 s1, s1, s8 1843; GCN-NEXT: s_lshr_b32 s10, s3, 16 1844; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 1845; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 1846; GCN-NEXT: v_cvt_f32_u32_e32 v3, s10 1847; GCN-NEXT: v_cvt_f32_u32_e32 v5, s1 1848; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v4 1849; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v1, vcc 1850; GCN-NEXT: v_rcp_iflag_f32_e32 v7, v3 1851; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 1852; GCN-NEXT: v_mul_f32_e32 v1, v5, v6 1853; GCN-NEXT: v_cvt_f32_u32_e32 v6, s0 1854; GCN-NEXT: v_trunc_f32_e32 v1, v1 1855; GCN-NEXT: v_mad_f32 v5, -v1, v4, v5 1856; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 1857; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 1858; GCN-NEXT: v_mul_f32_e32 v4, v6, v7 1859; GCN-NEXT: v_trunc_f32_e32 v4, v4 1860; GCN-NEXT: v_cvt_u32_f32_e32 v5, v4 1861; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1862; GCN-NEXT: v_mad_f32 v4, -v4, v3, v6 1863; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v3 1864; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 1865; GCN-NEXT: v_and_b32_e32 v0, s8, v0 1866; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 1867; GCN-NEXT: v_and_b32_e32 v1, s8, v1 1868; GCN-NEXT: v_or_b32_e32 v1, v1, v3 1869; GCN-NEXT: v_or_b32_e32 v0, v0, v2 1870; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1871; GCN-NEXT: s_endpgm 1872 %r = udiv <4 x i16> %x, %y 1873 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 1874 ret void 1875} 1876 1877define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 1878; CHECK-LABEL: @urem_v4i16( 1879; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 1880; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 1881; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 1882; 
CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 1883; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 1884; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 1885; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 1886; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 1887; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 1888; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 1889; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 1890; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 1891; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 1892; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 1893; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 1894; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 1895; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 1896; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]] 1897; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]] 1898; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 65535 1899; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16 1900; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i16> undef, i16 [[TMP21]], i64 0 1901; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i16> [[X]], i64 1 1902; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i16> [[Y]], i64 1 1903; CHECK-NEXT: [[TMP25:%.*]] = zext i16 [[TMP23]] to i32 1904; CHECK-NEXT: [[TMP26:%.*]] = zext i16 [[TMP24]] to i32 1905; CHECK-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float 1906; CHECK-NEXT: [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float 1907; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]]) 1908; CHECK-NEXT: [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]] 1909; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]]) 1910; 
CHECK-NEXT: [[TMP32:%.*]] = fneg fast float [[TMP31]] 1911; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]]) 1912; CHECK-NEXT: [[TMP34:%.*]] = fptoui float [[TMP31]] to i32 1913; CHECK-NEXT: [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 1914; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]]) 1915; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]] 1916; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0 1917; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]] 1918; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]] 1919; CHECK-NEXT: [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]] 1920; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], 65535 1921; CHECK-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16 1922; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i16> [[TMP22]], i16 [[TMP43]], i64 1 1923; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i16> [[X]], i64 2 1924; CHECK-NEXT: [[TMP46:%.*]] = extractelement <4 x i16> [[Y]], i64 2 1925; CHECK-NEXT: [[TMP47:%.*]] = zext i16 [[TMP45]] to i32 1926; CHECK-NEXT: [[TMP48:%.*]] = zext i16 [[TMP46]] to i32 1927; CHECK-NEXT: [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float 1928; CHECK-NEXT: [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float 1929; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]]) 1930; CHECK-NEXT: [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]] 1931; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]]) 1932; CHECK-NEXT: [[TMP54:%.*]] = fneg fast float [[TMP53]] 1933; CHECK-NEXT: [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]]) 1934; CHECK-NEXT: [[TMP56:%.*]] = fptoui float [[TMP53]] to i32 1935; CHECK-NEXT: [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]]) 1936; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]]) 
1937; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]] 1938; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0 1939; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]] 1940; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]] 1941; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]] 1942; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 65535 1943; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16 1944; CHECK-NEXT: [[TMP66:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[TMP65]], i64 2 1945; CHECK-NEXT: [[TMP67:%.*]] = extractelement <4 x i16> [[X]], i64 3 1946; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x i16> [[Y]], i64 3 1947; CHECK-NEXT: [[TMP69:%.*]] = zext i16 [[TMP67]] to i32 1948; CHECK-NEXT: [[TMP70:%.*]] = zext i16 [[TMP68]] to i32 1949; CHECK-NEXT: [[TMP71:%.*]] = uitofp i32 [[TMP69]] to float 1950; CHECK-NEXT: [[TMP72:%.*]] = uitofp i32 [[TMP70]] to float 1951; CHECK-NEXT: [[TMP73:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP72]]) 1952; CHECK-NEXT: [[TMP74:%.*]] = fmul fast float [[TMP71]], [[TMP73]] 1953; CHECK-NEXT: [[TMP75:%.*]] = call fast float @llvm.trunc.f32(float [[TMP74]]) 1954; CHECK-NEXT: [[TMP76:%.*]] = fneg fast float [[TMP75]] 1955; CHECK-NEXT: [[TMP77:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP76]], float [[TMP72]], float [[TMP71]]) 1956; CHECK-NEXT: [[TMP78:%.*]] = fptoui float [[TMP75]] to i32 1957; CHECK-NEXT: [[TMP79:%.*]] = call fast float @llvm.fabs.f32(float [[TMP77]]) 1958; CHECK-NEXT: [[TMP80:%.*]] = call fast float @llvm.fabs.f32(float [[TMP72]]) 1959; CHECK-NEXT: [[TMP81:%.*]] = fcmp fast oge float [[TMP79]], [[TMP80]] 1960; CHECK-NEXT: [[TMP82:%.*]] = select i1 [[TMP81]], i32 1, i32 0 1961; CHECK-NEXT: [[TMP83:%.*]] = add i32 [[TMP78]], [[TMP82]] 1962; CHECK-NEXT: [[TMP84:%.*]] = mul i32 [[TMP83]], [[TMP70]] 1963; CHECK-NEXT: [[TMP85:%.*]] = sub i32 [[TMP69]], [[TMP84]] 1964; CHECK-NEXT: [[TMP86:%.*]] = and i32 [[TMP85]], 65535 1965; CHECK-NEXT: 
[[TMP87:%.*]] = trunc i32 [[TMP86]] to i16 1966; CHECK-NEXT: [[TMP88:%.*]] = insertelement <4 x i16> [[TMP66]], i16 [[TMP87]], i64 3 1967; CHECK-NEXT: store <4 x i16> [[TMP88]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 1968; CHECK-NEXT: ret void 1969; 1970; GCN-LABEL: urem_v4i16: 1971; GCN: ; %bb.0: 1972; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1973; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 1974; GCN-NEXT: s_mov_b32 s8, 0xffff 1975; GCN-NEXT: s_mov_b32 s7, 0xf000 1976; GCN-NEXT: s_mov_b32 s6, -1 1977; GCN-NEXT: s_waitcnt lgkmcnt(0) 1978; GCN-NEXT: s_and_b32 s9, s2, s8 1979; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 1980; GCN-NEXT: s_and_b32 s10, s0, s8 1981; GCN-NEXT: s_lshr_b32 s11, s2, 16 1982; GCN-NEXT: v_cvt_f32_u32_e32 v1, s10 1983; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 1984; GCN-NEXT: v_cvt_f32_u32_e32 v3, s11 1985; GCN-NEXT: s_lshr_b32 s9, s0, 16 1986; GCN-NEXT: v_cvt_f32_u32_e32 v4, s9 1987; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 1988; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v3 1989; GCN-NEXT: v_trunc_f32_e32 v2, v2 1990; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 1991; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 1992; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 1993; GCN-NEXT: v_mul_f32_e32 v1, v4, v5 1994; GCN-NEXT: v_trunc_f32_e32 v1, v1 1995; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc 1996; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1 1997; GCN-NEXT: v_mad_f32 v1, -v1, v3, v4 1998; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v3 1999; GCN-NEXT: v_mul_lo_u32 v0, v0, s2 2000; GCN-NEXT: s_and_b32 s2, s3, s8 2001; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 2002; GCN-NEXT: v_cvt_f32_u32_e32 v2, s2 2003; GCN-NEXT: s_and_b32 s2, s1, s8 2004; GCN-NEXT: v_mul_lo_u32 v1, v1, s11 2005; GCN-NEXT: v_cvt_f32_u32_e32 v3, s2 2006; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 2007; GCN-NEXT: s_lshr_b32 s12, s3, 16 2008; GCN-NEXT: v_sub_i32_e32 v5, vcc, s9, v1 2009; GCN-NEXT: s_lshr_b32 s10, s1, 16 2010; GCN-NEXT: v_mul_f32_e32 v1, v3, v4 2011; GCN-NEXT: v_cvt_f32_u32_e32 v4, s12 2012; GCN-NEXT: v_cvt_f32_u32_e32 
v6, s10 2013; GCN-NEXT: v_trunc_f32_e32 v1, v1 2014; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 2015; GCN-NEXT: v_rcp_iflag_f32_e32 v7, v4 2016; GCN-NEXT: v_mad_f32 v3, -v1, v2, v3 2017; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 2018; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 2019; GCN-NEXT: v_mul_f32_e32 v2, v6, v7 2020; GCN-NEXT: v_trunc_f32_e32 v2, v2 2021; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 2022; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2023; GCN-NEXT: v_mad_f32 v2, -v2, v4, v6 2024; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 2025; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc 2026; GCN-NEXT: v_mul_lo_u32 v1, v1, s3 2027; GCN-NEXT: v_mul_lo_u32 v2, v2, s12 2028; GCN-NEXT: v_and_b32_e32 v0, s8, v0 2029; GCN-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 2030; GCN-NEXT: v_sub_i32_e32 v2, vcc, s10, v2 2031; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 2032; GCN-NEXT: v_and_b32_e32 v1, s8, v1 2033; GCN-NEXT: v_or_b32_e32 v1, v1, v2 2034; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 2035; GCN-NEXT: v_or_b32_e32 v0, v0, v2 2036; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2037; GCN-NEXT: s_endpgm 2038 %r = urem <4 x i16> %x, %y 2039 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 2040 ret void 2041} 2042 2043define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 2044; CHECK-LABEL: @sdiv_v4i16( 2045; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 2046; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 2047; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 2048; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 2049; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 2050; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 2051; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 2052; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 2053; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 2054; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 2055; CHECK-NEXT: 
[[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 2056; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 2057; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 2058; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 2059; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 2060; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 2061; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 2062; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 2063; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 2064; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 2065; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 16 2066; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 16 2067; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16 2068; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x i16> undef, i16 [[TMP23]], i64 0 2069; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[X]], i64 1 2070; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[Y]], i64 1 2071; CHECK-NEXT: [[TMP27:%.*]] = sext i16 [[TMP25]] to i32 2072; CHECK-NEXT: [[TMP28:%.*]] = sext i16 [[TMP26]] to i32 2073; CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]] 2074; CHECK-NEXT: [[TMP30:%.*]] = ashr i32 [[TMP29]], 30 2075; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 1 2076; CHECK-NEXT: [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float 2077; CHECK-NEXT: [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float 2078; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 2079; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]] 2080; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]]) 2081; CHECK-NEXT: [[TMP37:%.*]] = fneg fast float [[TMP36]] 2082; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], 
float [[TMP32]]) 2083; CHECK-NEXT: [[TMP39:%.*]] = fptosi float [[TMP36]] to i32 2084; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]]) 2085; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 2086; CHECK-NEXT: [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]] 2087; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0 2088; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]] 2089; CHECK-NEXT: [[TMP45:%.*]] = shl i32 [[TMP44]], 16 2090; CHECK-NEXT: [[TMP46:%.*]] = ashr i32 [[TMP45]], 16 2091; CHECK-NEXT: [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16 2092; CHECK-NEXT: [[TMP48:%.*]] = insertelement <4 x i16> [[TMP24]], i16 [[TMP47]], i64 1 2093; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i16> [[X]], i64 2 2094; CHECK-NEXT: [[TMP50:%.*]] = extractelement <4 x i16> [[Y]], i64 2 2095; CHECK-NEXT: [[TMP51:%.*]] = sext i16 [[TMP49]] to i32 2096; CHECK-NEXT: [[TMP52:%.*]] = sext i16 [[TMP50]] to i32 2097; CHECK-NEXT: [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]] 2098; CHECK-NEXT: [[TMP54:%.*]] = ashr i32 [[TMP53]], 30 2099; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 1 2100; CHECK-NEXT: [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float 2101; CHECK-NEXT: [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float 2102; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]]) 2103; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]] 2104; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]]) 2105; CHECK-NEXT: [[TMP61:%.*]] = fneg fast float [[TMP60]] 2106; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]]) 2107; CHECK-NEXT: [[TMP63:%.*]] = fptosi float [[TMP60]] to i32 2108; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]]) 2109; CHECK-NEXT: [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]]) 2110; CHECK-NEXT: [[TMP66:%.*]] = fcmp fast oge float 
[[TMP64]], [[TMP65]] 2111; CHECK-NEXT: [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0 2112; CHECK-NEXT: [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]] 2113; CHECK-NEXT: [[TMP69:%.*]] = shl i32 [[TMP68]], 16 2114; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 16 2115; CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16 2116; CHECK-NEXT: [[TMP72:%.*]] = insertelement <4 x i16> [[TMP48]], i16 [[TMP71]], i64 2 2117; CHECK-NEXT: [[TMP73:%.*]] = extractelement <4 x i16> [[X]], i64 3 2118; CHECK-NEXT: [[TMP74:%.*]] = extractelement <4 x i16> [[Y]], i64 3 2119; CHECK-NEXT: [[TMP75:%.*]] = sext i16 [[TMP73]] to i32 2120; CHECK-NEXT: [[TMP76:%.*]] = sext i16 [[TMP74]] to i32 2121; CHECK-NEXT: [[TMP77:%.*]] = xor i32 [[TMP75]], [[TMP76]] 2122; CHECK-NEXT: [[TMP78:%.*]] = ashr i32 [[TMP77]], 30 2123; CHECK-NEXT: [[TMP79:%.*]] = or i32 [[TMP78]], 1 2124; CHECK-NEXT: [[TMP80:%.*]] = sitofp i32 [[TMP75]] to float 2125; CHECK-NEXT: [[TMP81:%.*]] = sitofp i32 [[TMP76]] to float 2126; CHECK-NEXT: [[TMP82:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP81]]) 2127; CHECK-NEXT: [[TMP83:%.*]] = fmul fast float [[TMP80]], [[TMP82]] 2128; CHECK-NEXT: [[TMP84:%.*]] = call fast float @llvm.trunc.f32(float [[TMP83]]) 2129; CHECK-NEXT: [[TMP85:%.*]] = fneg fast float [[TMP84]] 2130; CHECK-NEXT: [[TMP86:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP85]], float [[TMP81]], float [[TMP80]]) 2131; CHECK-NEXT: [[TMP87:%.*]] = fptosi float [[TMP84]] to i32 2132; CHECK-NEXT: [[TMP88:%.*]] = call fast float @llvm.fabs.f32(float [[TMP86]]) 2133; CHECK-NEXT: [[TMP89:%.*]] = call fast float @llvm.fabs.f32(float [[TMP81]]) 2134; CHECK-NEXT: [[TMP90:%.*]] = fcmp fast oge float [[TMP88]], [[TMP89]] 2135; CHECK-NEXT: [[TMP91:%.*]] = select i1 [[TMP90]], i32 [[TMP79]], i32 0 2136; CHECK-NEXT: [[TMP92:%.*]] = add i32 [[TMP87]], [[TMP91]] 2137; CHECK-NEXT: [[TMP93:%.*]] = shl i32 [[TMP92]], 16 2138; CHECK-NEXT: [[TMP94:%.*]] = ashr i32 [[TMP93]], 16 2139; CHECK-NEXT: 
[[TMP95:%.*]] = trunc i32 [[TMP94]] to i16 2140; CHECK-NEXT: [[TMP96:%.*]] = insertelement <4 x i16> [[TMP72]], i16 [[TMP95]], i64 3 2141; CHECK-NEXT: store <4 x i16> [[TMP96]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 2142; CHECK-NEXT: ret void 2143; 2144; GCN-LABEL: sdiv_v4i16: 2145; GCN: ; %bb.0: 2146; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2147; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 2148; GCN-NEXT: s_mov_b32 s7, 0xf000 2149; GCN-NEXT: s_mov_b32 s6, -1 2150; GCN-NEXT: s_waitcnt lgkmcnt(0) 2151; GCN-NEXT: s_sext_i32_i16 s8, s2 2152; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 2153; GCN-NEXT: s_sext_i32_i16 s9, s0 2154; GCN-NEXT: v_cvt_f32_i32_e32 v1, s9 2155; GCN-NEXT: s_xor_b32 s8, s9, s8 2156; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 2157; GCN-NEXT: s_ashr_i32 s2, s2, 16 2158; GCN-NEXT: s_ashr_i32 s8, s8, 30 2159; GCN-NEXT: s_or_b32 s8, s8, 1 2160; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 2161; GCN-NEXT: v_trunc_f32_e32 v2, v2 2162; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 2163; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 2164; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 2165; GCN-NEXT: v_cvt_f32_i32_e32 v1, s2 2166; GCN-NEXT: v_mov_b32_e32 v3, s8 2167; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 2168; GCN-NEXT: s_ashr_i32 s0, s0, 16 2169; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 2170; GCN-NEXT: v_cvt_f32_i32_e32 v2, s0 2171; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v1 2172; GCN-NEXT: s_xor_b32 s0, s0, s2 2173; GCN-NEXT: s_ashr_i32 s0, s0, 30 2174; GCN-NEXT: s_or_b32 s0, s0, 1 2175; GCN-NEXT: v_mul_f32_e32 v3, v2, v3 2176; GCN-NEXT: v_trunc_f32_e32 v3, v3 2177; GCN-NEXT: v_mad_f32 v2, -v3, v1, v2 2178; GCN-NEXT: v_mov_b32_e32 v4, s0 2179; GCN-NEXT: s_sext_i32_i16 s0, s3 2180; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| 2181; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 2182; GCN-NEXT: v_cvt_f32_i32_e32 v2, s0 2183; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc 2184; GCN-NEXT: s_sext_i32_i16 s2, s1 2185; GCN-NEXT: v_add_i32_e32 v3, vcc, v1, v3 2186; GCN-NEXT: v_cvt_f32_i32_e32 v1, s2 2187; GCN-NEXT: 
v_rcp_iflag_f32_e32 v4, v2 2188; GCN-NEXT: s_xor_b32 s0, s2, s0 2189; GCN-NEXT: s_ashr_i32 s0, s0, 30 2190; GCN-NEXT: s_or_b32 s0, s0, 1 2191; GCN-NEXT: v_mul_f32_e32 v4, v1, v4 2192; GCN-NEXT: v_trunc_f32_e32 v4, v4 2193; GCN-NEXT: v_mad_f32 v1, -v4, v2, v1 2194; GCN-NEXT: v_mov_b32_e32 v5, s0 2195; GCN-NEXT: s_ashr_i32 s0, s3, 16 2196; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2| 2197; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 2198; GCN-NEXT: v_cvt_f32_i32_e32 v2, s0 2199; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc 2200; GCN-NEXT: s_ashr_i32 s1, s1, 16 2201; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v4 2202; GCN-NEXT: v_cvt_f32_i32_e32 v4, s1 2203; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v2 2204; GCN-NEXT: s_xor_b32 s0, s1, s0 2205; GCN-NEXT: s_ashr_i32 s0, s0, 30 2206; GCN-NEXT: s_or_b32 s0, s0, 1 2207; GCN-NEXT: v_mul_f32_e32 v5, v4, v5 2208; GCN-NEXT: v_trunc_f32_e32 v5, v5 2209; GCN-NEXT: v_mad_f32 v4, -v5, v2, v4 2210; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 2211; GCN-NEXT: v_mov_b32_e32 v6, s0 2212; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| 2213; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc 2214; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 2215; GCN-NEXT: s_mov_b32 s0, 0xffff 2216; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 2217; GCN-NEXT: v_and_b32_e32 v1, s0, v1 2218; GCN-NEXT: v_or_b32_e32 v1, v1, v2 2219; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3 2220; GCN-NEXT: v_and_b32_e32 v0, s0, v0 2221; GCN-NEXT: v_or_b32_e32 v0, v0, v2 2222; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2223; GCN-NEXT: s_endpgm 2224 %r = sdiv <4 x i16> %x, %y 2225 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 2226 ret void 2227} 2228 2229define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 2230; CHECK-LABEL: @srem_v4i16( 2231; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 2232; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 2233; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 2234; CHECK-NEXT: [[TMP4:%.*]] = 
sext i16 [[TMP2]] to i32 2235; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 2236; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 2237; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 2238; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 2239; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 2240; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 2241; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 2242; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 2243; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 2244; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 2245; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 2246; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 2247; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 2248; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 2249; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 2250; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 2251; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]] 2252; CHECK-NEXT: [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]] 2253; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 16 2254; CHECK-NEXT: [[TMP24:%.*]] = ashr i32 [[TMP23]], 16 2255; CHECK-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16 2256; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i16> undef, i16 [[TMP25]], i64 0 2257; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[X]], i64 1 2258; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i16> [[Y]], i64 1 2259; CHECK-NEXT: [[TMP29:%.*]] = sext i16 [[TMP27]] to i32 2260; CHECK-NEXT: [[TMP30:%.*]] = sext i16 [[TMP28]] to i32 2261; CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]] 2262; CHECK-NEXT: [[TMP32:%.*]] = ashr i32 [[TMP31]], 30 2263; CHECK-NEXT: [[TMP33:%.*]] = or i32 
[[TMP32]], 1 2264; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float 2265; CHECK-NEXT: [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float 2266; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 2267; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]] 2268; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]]) 2269; CHECK-NEXT: [[TMP39:%.*]] = fneg fast float [[TMP38]] 2270; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]]) 2271; CHECK-NEXT: [[TMP41:%.*]] = fptosi float [[TMP38]] to i32 2272; CHECK-NEXT: [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]]) 2273; CHECK-NEXT: [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]]) 2274; CHECK-NEXT: [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]] 2275; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0 2276; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]] 2277; CHECK-NEXT: [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]] 2278; CHECK-NEXT: [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]] 2279; CHECK-NEXT: [[TMP49:%.*]] = shl i32 [[TMP48]], 16 2280; CHECK-NEXT: [[TMP50:%.*]] = ashr i32 [[TMP49]], 16 2281; CHECK-NEXT: [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16 2282; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i16> [[TMP26]], i16 [[TMP51]], i64 1 2283; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i16> [[X]], i64 2 2284; CHECK-NEXT: [[TMP54:%.*]] = extractelement <4 x i16> [[Y]], i64 2 2285; CHECK-NEXT: [[TMP55:%.*]] = sext i16 [[TMP53]] to i32 2286; CHECK-NEXT: [[TMP56:%.*]] = sext i16 [[TMP54]] to i32 2287; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]] 2288; CHECK-NEXT: [[TMP58:%.*]] = ashr i32 [[TMP57]], 30 2289; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 1 2290; CHECK-NEXT: [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float 2291; CHECK-NEXT: [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float 2292; CHECK-NEXT: [[TMP62:%.*]] 
= call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]]) 2293; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]] 2294; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]]) 2295; CHECK-NEXT: [[TMP65:%.*]] = fneg fast float [[TMP64]] 2296; CHECK-NEXT: [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]]) 2297; CHECK-NEXT: [[TMP67:%.*]] = fptosi float [[TMP64]] to i32 2298; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 2299; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]]) 2300; CHECK-NEXT: [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]] 2301; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0 2302; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]] 2303; CHECK-NEXT: [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]] 2304; CHECK-NEXT: [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]] 2305; CHECK-NEXT: [[TMP75:%.*]] = shl i32 [[TMP74]], 16 2306; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 [[TMP75]], 16 2307; CHECK-NEXT: [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16 2308; CHECK-NEXT: [[TMP78:%.*]] = insertelement <4 x i16> [[TMP52]], i16 [[TMP77]], i64 2 2309; CHECK-NEXT: [[TMP79:%.*]] = extractelement <4 x i16> [[X]], i64 3 2310; CHECK-NEXT: [[TMP80:%.*]] = extractelement <4 x i16> [[Y]], i64 3 2311; CHECK-NEXT: [[TMP81:%.*]] = sext i16 [[TMP79]] to i32 2312; CHECK-NEXT: [[TMP82:%.*]] = sext i16 [[TMP80]] to i32 2313; CHECK-NEXT: [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP82]] 2314; CHECK-NEXT: [[TMP84:%.*]] = ashr i32 [[TMP83]], 30 2315; CHECK-NEXT: [[TMP85:%.*]] = or i32 [[TMP84]], 1 2316; CHECK-NEXT: [[TMP86:%.*]] = sitofp i32 [[TMP81]] to float 2317; CHECK-NEXT: [[TMP87:%.*]] = sitofp i32 [[TMP82]] to float 2318; CHECK-NEXT: [[TMP88:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP87]]) 2319; CHECK-NEXT: [[TMP89:%.*]] = fmul fast float [[TMP86]], [[TMP88]] 2320; CHECK-NEXT: [[TMP90:%.*]] = call fast 
float @llvm.trunc.f32(float [[TMP89]]) 2321; CHECK-NEXT: [[TMP91:%.*]] = fneg fast float [[TMP90]] 2322; CHECK-NEXT: [[TMP92:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP91]], float [[TMP87]], float [[TMP86]]) 2323; CHECK-NEXT: [[TMP93:%.*]] = fptosi float [[TMP90]] to i32 2324; CHECK-NEXT: [[TMP94:%.*]] = call fast float @llvm.fabs.f32(float [[TMP92]]) 2325; CHECK-NEXT: [[TMP95:%.*]] = call fast float @llvm.fabs.f32(float [[TMP87]]) 2326; CHECK-NEXT: [[TMP96:%.*]] = fcmp fast oge float [[TMP94]], [[TMP95]] 2327; CHECK-NEXT: [[TMP97:%.*]] = select i1 [[TMP96]], i32 [[TMP85]], i32 0 2328; CHECK-NEXT: [[TMP98:%.*]] = add i32 [[TMP93]], [[TMP97]] 2329; CHECK-NEXT: [[TMP99:%.*]] = mul i32 [[TMP98]], [[TMP82]] 2330; CHECK-NEXT: [[TMP100:%.*]] = sub i32 [[TMP81]], [[TMP99]] 2331; CHECK-NEXT: [[TMP101:%.*]] = shl i32 [[TMP100]], 16 2332; CHECK-NEXT: [[TMP102:%.*]] = ashr i32 [[TMP101]], 16 2333; CHECK-NEXT: [[TMP103:%.*]] = trunc i32 [[TMP102]] to i16 2334; CHECK-NEXT: [[TMP104:%.*]] = insertelement <4 x i16> [[TMP78]], i16 [[TMP103]], i64 3 2335; CHECK-NEXT: store <4 x i16> [[TMP104]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 2336; CHECK-NEXT: ret void 2337; 2338; GCN-LABEL: srem_v4i16: 2339; GCN: ; %bb.0: 2340; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2341; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 2342; GCN-NEXT: s_mov_b32 s7, 0xf000 2343; GCN-NEXT: s_mov_b32 s6, -1 2344; GCN-NEXT: s_waitcnt lgkmcnt(0) 2345; GCN-NEXT: s_sext_i32_i16 s8, s2 2346; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 2347; GCN-NEXT: s_sext_i32_i16 s9, s0 2348; GCN-NEXT: v_cvt_f32_i32_e32 v1, s9 2349; GCN-NEXT: s_xor_b32 s8, s9, s8 2350; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 2351; GCN-NEXT: s_ashr_i32 s8, s8, 30 2352; GCN-NEXT: s_or_b32 s8, s8, 1 2353; GCN-NEXT: v_mov_b32_e32 v3, s8 2354; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 2355; GCN-NEXT: v_trunc_f32_e32 v2, v2 2356; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 2357; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 2358; GCN-NEXT: v_cmp_ge_f32_e64 vcc, 
|v1|, |v0| 2359; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 2360; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 2361; GCN-NEXT: v_mul_lo_u32 v0, v0, s2 2362; GCN-NEXT: s_ashr_i32 s2, s2, 16 2363; GCN-NEXT: v_cvt_f32_i32_e32 v1, s2 2364; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 2365; GCN-NEXT: s_ashr_i32 s0, s0, 16 2366; GCN-NEXT: v_cvt_f32_i32_e32 v2, s0 2367; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v1 2368; GCN-NEXT: s_xor_b32 s8, s0, s2 2369; GCN-NEXT: s_ashr_i32 s8, s8, 30 2370; GCN-NEXT: s_or_b32 s8, s8, 1 2371; GCN-NEXT: v_mul_f32_e32 v3, v2, v3 2372; GCN-NEXT: v_trunc_f32_e32 v3, v3 2373; GCN-NEXT: v_mad_f32 v2, -v3, v1, v2 2374; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 2375; GCN-NEXT: v_mov_b32_e32 v4, s8 2376; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| 2377; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc 2378; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 2379; GCN-NEXT: v_mul_lo_u32 v1, v1, s2 2380; GCN-NEXT: s_sext_i32_i16 s2, s3 2381; GCN-NEXT: v_cvt_f32_i32_e32 v2, s2 2382; GCN-NEXT: v_sub_i32_e32 v3, vcc, s0, v1 2383; GCN-NEXT: s_sext_i32_i16 s0, s1 2384; GCN-NEXT: v_cvt_f32_i32_e32 v1, s0 2385; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 2386; GCN-NEXT: s_xor_b32 s0, s0, s2 2387; GCN-NEXT: s_ashr_i32 s0, s0, 30 2388; GCN-NEXT: s_or_b32 s0, s0, 1 2389; GCN-NEXT: v_mul_f32_e32 v4, v1, v4 2390; GCN-NEXT: v_trunc_f32_e32 v4, v4 2391; GCN-NEXT: v_mad_f32 v1, -v4, v2, v1 2392; GCN-NEXT: v_mov_b32_e32 v5, s0 2393; GCN-NEXT: s_ashr_i32 s0, s3, 16 2394; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2| 2395; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 2396; GCN-NEXT: v_cvt_f32_i32_e32 v2, s0 2397; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc 2398; GCN-NEXT: s_ashr_i32 s2, s1, 16 2399; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v4 2400; GCN-NEXT: v_cvt_f32_i32_e32 v4, s2 2401; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v2 2402; GCN-NEXT: v_mul_lo_u32 v1, v1, s3 2403; GCN-NEXT: s_xor_b32 s3, s2, s0 2404; GCN-NEXT: s_ashr_i32 s3, s3, 30 2405; GCN-NEXT: v_mul_f32_e32 v5, v4, v5 2406; GCN-NEXT: v_trunc_f32_e32 v5, v5 2407; GCN-NEXT: 
v_mad_f32 v4, -v5, v2, v4 2408; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 2409; GCN-NEXT: s_or_b32 s3, s3, 1 2410; GCN-NEXT: v_mov_b32_e32 v6, s3 2411; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| 2412; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc 2413; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 2414; GCN-NEXT: v_mul_lo_u32 v2, v2, s0 2415; GCN-NEXT: s_mov_b32 s0, 0xffff 2416; GCN-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 2417; GCN-NEXT: v_and_b32_e32 v1, s0, v1 2418; GCN-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 2419; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 2420; GCN-NEXT: v_or_b32_e32 v1, v1, v2 2421; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3 2422; GCN-NEXT: v_and_b32_e32 v0, s0, v0 2423; GCN-NEXT: v_or_b32_e32 v0, v0, v2 2424; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2425; GCN-NEXT: s_endpgm 2426 %r = srem <4 x i16> %x, %y 2427 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 2428 ret void 2429} 2430 2431define amdgpu_kernel void @udiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 2432; CHECK-LABEL: @udiv_i3( 2433; CHECK-NEXT: [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32 2434; CHECK-NEXT: [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32 2435; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 2436; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 2437; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 2438; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 2439; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 2440; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 2441; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 2442; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 2443; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 2444; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 2445; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 2446; CHECK-NEXT: [[TMP14:%.*]] = 
select i1 [[TMP13]], i32 1, i32 0 2447; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 2448; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 7 2449; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i3 2450; CHECK-NEXT: store i3 [[TMP17]], i3 addrspace(1)* [[OUT:%.*]], align 1 2451; CHECK-NEXT: ret void 2452; 2453; GCN-LABEL: udiv_i3: 2454; GCN: ; %bb.0: 2455; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2456; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 2457; GCN-NEXT: s_mov_b32 s7, 0xf000 2458; GCN-NEXT: s_mov_b32 s6, -1 2459; GCN-NEXT: s_waitcnt lgkmcnt(0) 2460; GCN-NEXT: s_bfe_u32 s1, s0, 0x30008 2461; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, s1 2462; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 2463; GCN-NEXT: s_and_b32 s0, s0, 7 2464; GCN-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 2465; GCN-NEXT: v_mul_f32_e32 v1, v2, v1 2466; GCN-NEXT: v_trunc_f32_e32 v1, v1 2467; GCN-NEXT: v_cvt_u32_f32_e32 v3, v1 2468; GCN-NEXT: v_mad_f32 v1, -v1, v0, v2 2469; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 2470; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 2471; GCN-NEXT: v_and_b32_e32 v0, 7, v0 2472; GCN-NEXT: buffer_store_byte v0, off, s[4:7], 0 2473; GCN-NEXT: s_endpgm 2474 %r = udiv i3 %x, %y 2475 store i3 %r, i3 addrspace(1)* %out 2476 ret void 2477} 2478 2479define amdgpu_kernel void @urem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 2480; CHECK-LABEL: @urem_i3( 2481; CHECK-NEXT: [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32 2482; CHECK-NEXT: [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32 2483; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 2484; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 2485; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 2486; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 2487; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 2488; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 2489; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], 
float [[TMP3]]) 2490; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 2491; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 2492; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 2493; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 2494; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 2495; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 2496; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]] 2497; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]] 2498; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 7 2499; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i3 2500; CHECK-NEXT: store i3 [[TMP19]], i3 addrspace(1)* [[OUT:%.*]], align 1 2501; CHECK-NEXT: ret void 2502; 2503; GCN-LABEL: urem_i3: 2504; GCN: ; %bb.0: 2505; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2506; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 2507; GCN-NEXT: s_mov_b32 s7, 0xf000 2508; GCN-NEXT: s_mov_b32 s6, -1 2509; GCN-NEXT: s_waitcnt lgkmcnt(0) 2510; GCN-NEXT: s_bfe_u32 s1, s0, 0x30008 2511; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, s1 2512; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 2513; GCN-NEXT: s_and_b32 s2, s0, 7 2514; GCN-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 2515; GCN-NEXT: s_lshr_b32 s1, s0, 8 2516; GCN-NEXT: v_mul_f32_e32 v1, v2, v1 2517; GCN-NEXT: v_trunc_f32_e32 v1, v1 2518; GCN-NEXT: v_cvt_u32_f32_e32 v3, v1 2519; GCN-NEXT: v_mad_f32 v1, -v1, v0, v2 2520; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 2521; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 2522; GCN-NEXT: v_mul_lo_u32 v0, v0, s1 2523; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 2524; GCN-NEXT: v_and_b32_e32 v0, 7, v0 2525; GCN-NEXT: buffer_store_byte v0, off, s[4:7], 0 2526; GCN-NEXT: s_endpgm 2527 %r = urem i3 %x, %y 2528 store i3 %r, i3 addrspace(1)* %out 2529 ret void 2530} 2531 2532define amdgpu_kernel void @sdiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 2533; CHECK-LABEL: @sdiv_i3( 2534; CHECK-NEXT: [[TMP1:%.*]] = sext i3 
[[X:%.*]] to i32 2535; CHECK-NEXT: [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32 2536; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 2537; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 2538; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 2539; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 2540; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 2541; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 2542; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 2543; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 2544; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 2545; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 2546; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 2547; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 2548; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 2549; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 2550; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 2551; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 2552; CHECK-NEXT: [[TMP19:%.*]] = shl i32 [[TMP18]], 29 2553; CHECK-NEXT: [[TMP20:%.*]] = ashr i32 [[TMP19]], 29 2554; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i3 2555; CHECK-NEXT: store i3 [[TMP21]], i3 addrspace(1)* [[OUT:%.*]], align 1 2556; CHECK-NEXT: ret void 2557; 2558; GCN-LABEL: sdiv_i3: 2559; GCN: ; %bb.0: 2560; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2561; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 2562; GCN-NEXT: s_mov_b32 s7, 0xf000 2563; GCN-NEXT: s_mov_b32 s6, -1 2564; GCN-NEXT: s_waitcnt lgkmcnt(0) 2565; GCN-NEXT: s_bfe_i32 s1, s0, 0x30008 2566; GCN-NEXT: v_cvt_f32_i32_e32 v0, s1 2567; GCN-NEXT: s_bfe_i32 s0, s0, 0x30000 2568; GCN-NEXT: v_cvt_f32_i32_e32 v1, s0 2569; GCN-NEXT: s_xor_b32 s0, s0, s1 2570; GCN-NEXT: v_rcp_iflag_f32_e32 
v2, v0 2571; GCN-NEXT: s_ashr_i32 s0, s0, 30 2572; GCN-NEXT: s_or_b32 s0, s0, 1 2573; GCN-NEXT: v_mov_b32_e32 v3, s0 2574; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 2575; GCN-NEXT: v_trunc_f32_e32 v2, v2 2576; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 2577; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 2578; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 2579; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 2580; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 2581; GCN-NEXT: v_and_b32_e32 v0, 7, v0 2582; GCN-NEXT: buffer_store_byte v0, off, s[4:7], 0 2583; GCN-NEXT: s_endpgm 2584 %r = sdiv i3 %x, %y 2585 store i3 %r, i3 addrspace(1)* %out 2586 ret void 2587} 2588 2589define amdgpu_kernel void @srem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 2590; CHECK-LABEL: @srem_i3( 2591; CHECK-NEXT: [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32 2592; CHECK-NEXT: [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32 2593; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 2594; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 2595; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 2596; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 2597; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 2598; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 2599; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 2600; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 2601; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 2602; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 2603; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 2604; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 2605; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 2606; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 2607; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 2608; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], 
[[TMP17]] 2609; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]] 2610; CHECK-NEXT: [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]] 2611; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 29 2612; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 29 2613; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i3 2614; CHECK-NEXT: store i3 [[TMP23]], i3 addrspace(1)* [[OUT:%.*]], align 1 2615; CHECK-NEXT: ret void 2616; 2617; GCN-LABEL: srem_i3: 2618; GCN: ; %bb.0: 2619; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2620; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 2621; GCN-NEXT: s_mov_b32 s7, 0xf000 2622; GCN-NEXT: s_mov_b32 s6, -1 2623; GCN-NEXT: s_waitcnt lgkmcnt(0) 2624; GCN-NEXT: s_bfe_i32 s1, s0, 0x30008 2625; GCN-NEXT: v_cvt_f32_i32_e32 v0, s1 2626; GCN-NEXT: s_bfe_i32 s3, s0, 0x30000 2627; GCN-NEXT: v_cvt_f32_i32_e32 v1, s3 2628; GCN-NEXT: s_xor_b32 s1, s3, s1 2629; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 2630; GCN-NEXT: s_ashr_i32 s1, s1, 30 2631; GCN-NEXT: s_or_b32 s1, s1, 1 2632; GCN-NEXT: v_mov_b32_e32 v3, s1 2633; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 2634; GCN-NEXT: v_trunc_f32_e32 v2, v2 2635; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 2636; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 2637; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 2638; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 2639; GCN-NEXT: s_lshr_b32 s2, s0, 8 2640; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 2641; GCN-NEXT: v_mul_lo_u32 v0, v0, s2 2642; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 2643; GCN-NEXT: v_and_b32_e32 v0, 7, v0 2644; GCN-NEXT: buffer_store_byte v0, off, s[4:7], 0 2645; GCN-NEXT: s_endpgm 2646 %r = srem i3 %x, %y 2647 store i3 %r, i3 addrspace(1)* %out 2648 ret void 2649} 2650 2651define amdgpu_kernel void @udiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 2652; CHECK-LABEL: @udiv_v3i16( 2653; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 2654; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 2655; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to 
i32 2656; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 2657; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 2658; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 2659; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 2660; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 2661; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 2662; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 2663; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 2664; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 2665; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 2666; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 2667; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 2668; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 2669; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 2670; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 65535 2671; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16 2672; CHECK-NEXT: [[TMP20:%.*]] = insertelement <3 x i16> undef, i16 [[TMP19]], i64 0 2673; CHECK-NEXT: [[TMP21:%.*]] = extractelement <3 x i16> [[X]], i64 1 2674; CHECK-NEXT: [[TMP22:%.*]] = extractelement <3 x i16> [[Y]], i64 1 2675; CHECK-NEXT: [[TMP23:%.*]] = zext i16 [[TMP21]] to i32 2676; CHECK-NEXT: [[TMP24:%.*]] = zext i16 [[TMP22]] to i32 2677; CHECK-NEXT: [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float 2678; CHECK-NEXT: [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float 2679; CHECK-NEXT: [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]]) 2680; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]] 2681; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]]) 2682; CHECK-NEXT: [[TMP30:%.*]] = fneg fast float [[TMP29]] 2683; CHECK-NEXT: [[TMP31:%.*]] = call fast float 
@llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]]) 2684; CHECK-NEXT: [[TMP32:%.*]] = fptoui float [[TMP29]] to i32 2685; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]]) 2686; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]]) 2687; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]] 2688; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0 2689; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]] 2690; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 65535 2691; CHECK-NEXT: [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16 2692; CHECK-NEXT: [[TMP40:%.*]] = insertelement <3 x i16> [[TMP20]], i16 [[TMP39]], i64 1 2693; CHECK-NEXT: [[TMP41:%.*]] = extractelement <3 x i16> [[X]], i64 2 2694; CHECK-NEXT: [[TMP42:%.*]] = extractelement <3 x i16> [[Y]], i64 2 2695; CHECK-NEXT: [[TMP43:%.*]] = zext i16 [[TMP41]] to i32 2696; CHECK-NEXT: [[TMP44:%.*]] = zext i16 [[TMP42]] to i32 2697; CHECK-NEXT: [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float 2698; CHECK-NEXT: [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float 2699; CHECK-NEXT: [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]]) 2700; CHECK-NEXT: [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]] 2701; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]]) 2702; CHECK-NEXT: [[TMP50:%.*]] = fneg fast float [[TMP49]] 2703; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]]) 2704; CHECK-NEXT: [[TMP52:%.*]] = fptoui float [[TMP49]] to i32 2705; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]]) 2706; CHECK-NEXT: [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]]) 2707; CHECK-NEXT: [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]] 2708; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0 2709; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]] 2710; CHECK-NEXT: 
[[TMP58:%.*]] = and i32 [[TMP57]], 65535 2711; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16 2712; CHECK-NEXT: [[TMP60:%.*]] = insertelement <3 x i16> [[TMP40]], i16 [[TMP59]], i64 2 2713; CHECK-NEXT: store <3 x i16> [[TMP60]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 2714; CHECK-NEXT: ret void 2715; 2716; GCN-LABEL: udiv_v3i16: 2717; GCN: ; %bb.0: 2718; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2719; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 2720; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 2721; GCN-NEXT: s_mov_b32 s8, 0xffff 2722; GCN-NEXT: s_mov_b32 s7, 0xf000 2723; GCN-NEXT: s_waitcnt lgkmcnt(0) 2724; GCN-NEXT: s_and_b32 s6, s0, s8 2725; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 2726; GCN-NEXT: s_and_b32 s6, s2, s8 2727; GCN-NEXT: s_lshr_b32 s0, s0, 16 2728; GCN-NEXT: v_cvt_f32_u32_e32 v3, s0 2729; GCN-NEXT: v_cvt_f32_u32_e32 v1, s6 2730; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 2731; GCN-NEXT: s_lshr_b32 s0, s2, 16 2732; GCN-NEXT: v_cvt_f32_u32_e32 v4, s0 2733; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v3 2734; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 2735; GCN-NEXT: v_trunc_f32_e32 v2, v2 2736; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 2737; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 2738; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 2739; GCN-NEXT: v_mul_f32_e32 v1, v4, v5 2740; GCN-NEXT: v_trunc_f32_e32 v1, v1 2741; GCN-NEXT: s_and_b32 s0, s1, s8 2742; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc 2743; GCN-NEXT: v_mad_f32 v2, -v1, v3, v4 2744; GCN-NEXT: v_cvt_f32_u32_e32 v4, s0 2745; GCN-NEXT: s_and_b32 s0, s3, s8 2746; GCN-NEXT: v_cvt_f32_u32_e32 v5, s0 2747; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 2748; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v4 2749; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 2750; GCN-NEXT: s_mov_b32 s6, -1 2751; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2752; GCN-NEXT: v_mul_f32_e32 v2, v5, v6 2753; GCN-NEXT: v_trunc_f32_e32 v2, v2 2754; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 2755; GCN-NEXT: v_mad_f32 v2, -v2, v4, v5 2756; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 2757; 
GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2758; GCN-NEXT: v_and_b32_e32 v0, s8, v0 2759; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc 2760; GCN-NEXT: v_or_b32_e32 v0, v0, v1 2761; GCN-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 2762; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 2763; GCN-NEXT: s_endpgm 2764 %r = udiv <3 x i16> %x, %y 2765 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 2766 ret void 2767} 2768 2769define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 2770; CHECK-LABEL: @urem_v3i16( 2771; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 2772; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 2773; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 2774; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 2775; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 2776; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 2777; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 2778; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 2779; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 2780; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 2781; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 2782; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 2783; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 2784; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 2785; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 2786; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 2787; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 2788; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]] 2789; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]] 2790; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 65535 2791; CHECK-NEXT: 
[[TMP21:%.*]] = trunc i32 [[TMP20]] to i16 2792; CHECK-NEXT: [[TMP22:%.*]] = insertelement <3 x i16> undef, i16 [[TMP21]], i64 0 2793; CHECK-NEXT: [[TMP23:%.*]] = extractelement <3 x i16> [[X]], i64 1 2794; CHECK-NEXT: [[TMP24:%.*]] = extractelement <3 x i16> [[Y]], i64 1 2795; CHECK-NEXT: [[TMP25:%.*]] = zext i16 [[TMP23]] to i32 2796; CHECK-NEXT: [[TMP26:%.*]] = zext i16 [[TMP24]] to i32 2797; CHECK-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float 2798; CHECK-NEXT: [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float 2799; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]]) 2800; CHECK-NEXT: [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]] 2801; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]]) 2802; CHECK-NEXT: [[TMP32:%.*]] = fneg fast float [[TMP31]] 2803; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]]) 2804; CHECK-NEXT: [[TMP34:%.*]] = fptoui float [[TMP31]] to i32 2805; CHECK-NEXT: [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 2806; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]]) 2807; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]] 2808; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0 2809; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]] 2810; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]] 2811; CHECK-NEXT: [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]] 2812; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], 65535 2813; CHECK-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16 2814; CHECK-NEXT: [[TMP44:%.*]] = insertelement <3 x i16> [[TMP22]], i16 [[TMP43]], i64 1 2815; CHECK-NEXT: [[TMP45:%.*]] = extractelement <3 x i16> [[X]], i64 2 2816; CHECK-NEXT: [[TMP46:%.*]] = extractelement <3 x i16> [[Y]], i64 2 2817; CHECK-NEXT: [[TMP47:%.*]] = zext i16 [[TMP45]] to i32 2818; CHECK-NEXT: [[TMP48:%.*]] = zext i16 [[TMP46]] to i32 2819; 
CHECK-NEXT: [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float 2820; CHECK-NEXT: [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float 2821; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]]) 2822; CHECK-NEXT: [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]] 2823; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]]) 2824; CHECK-NEXT: [[TMP54:%.*]] = fneg fast float [[TMP53]] 2825; CHECK-NEXT: [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]]) 2826; CHECK-NEXT: [[TMP56:%.*]] = fptoui float [[TMP53]] to i32 2827; CHECK-NEXT: [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]]) 2828; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]]) 2829; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]] 2830; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0 2831; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]] 2832; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]] 2833; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]] 2834; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 65535 2835; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16 2836; CHECK-NEXT: [[TMP66:%.*]] = insertelement <3 x i16> [[TMP44]], i16 [[TMP65]], i64 2 2837; CHECK-NEXT: store <3 x i16> [[TMP66]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 2838; CHECK-NEXT: ret void 2839; 2840; GCN-LABEL: urem_v3i16: 2841; GCN: ; %bb.0: 2842; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2843; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 2844; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 2845; GCN-NEXT: s_mov_b32 s8, 0xffff 2846; GCN-NEXT: s_mov_b32 s7, 0xf000 2847; GCN-NEXT: s_waitcnt lgkmcnt(0) 2848; GCN-NEXT: v_mov_b32_e32 v1, s2 2849; GCN-NEXT: s_and_b32 s6, s0, s8 2850; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 2851; GCN-NEXT: s_and_b32 s6, s2, s8 2852; GCN-NEXT: v_cvt_f32_u32_e32 v2, s6 2853; GCN-NEXT: v_mov_b32_e32 v4, s0 2854; 
GCN-NEXT: v_rcp_iflag_f32_e32 v3, v0 2855; GCN-NEXT: v_alignbit_b32 v4, s1, v4, 16 2856; GCN-NEXT: v_and_b32_e32 v5, s8, v4 2857; GCN-NEXT: v_alignbit_b32 v1, s3, v1, 16 2858; GCN-NEXT: v_mul_f32_e32 v3, v2, v3 2859; GCN-NEXT: v_trunc_f32_e32 v3, v3 2860; GCN-NEXT: v_mad_f32 v2, -v3, v0, v2 2861; GCN-NEXT: v_cvt_u32_f32_e32 v6, v3 2862; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 2863; GCN-NEXT: v_cvt_f32_u32_e32 v2, v5 2864; GCN-NEXT: v_and_b32_e32 v3, s8, v1 2865; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc 2866; GCN-NEXT: v_mul_lo_u32 v0, v0, s0 2867; GCN-NEXT: s_and_b32 s0, s1, s8 2868; GCN-NEXT: v_cvt_f32_u32_e32 v3, v3 2869; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v2 2870; GCN-NEXT: v_cvt_f32_u32_e32 v6, s0 2871; GCN-NEXT: s_and_b32 s0, s3, s8 2872; GCN-NEXT: v_cvt_f32_u32_e32 v7, s0 2873; GCN-NEXT: v_mul_f32_e32 v5, v3, v5 2874; GCN-NEXT: v_trunc_f32_e32 v5, v5 2875; GCN-NEXT: v_rcp_iflag_f32_e32 v8, v6 2876; GCN-NEXT: v_mad_f32 v3, -v5, v2, v3 2877; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5 2878; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 2879; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 2880; GCN-NEXT: v_mul_f32_e32 v3, v7, v8 2881; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc 2882; GCN-NEXT: v_trunc_f32_e32 v3, v3 2883; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 2884; GCN-NEXT: v_cvt_u32_f32_e32 v4, v3 2885; GCN-NEXT: v_mad_f32 v3, -v3, v6, v7 2886; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v6 2887; GCN-NEXT: s_mov_b32 s6, -1 2888; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 2889; GCN-NEXT: v_mul_lo_u32 v3, v3, s1 2890; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 2891; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2892; GCN-NEXT: v_and_b32_e32 v0, s8, v0 2893; GCN-NEXT: v_sub_i32_e32 v2, vcc, s3, v3 2894; GCN-NEXT: v_or_b32_e32 v0, v0, v1 2895; GCN-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 2896; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 2897; GCN-NEXT: s_endpgm 2898 %r = urem <3 x i16> %x, %y 2899 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 2900 ret void 2901} 2902 2903define 
; Test @sdiv_v3i16: signed division of two <3 x i16> kernel arguments, result stored to %out.
; The opt-side expectations that follow show the division scalarized into three per-lane
; sequences (extractelement at index 0, 1 and 2). Each lane is sign-extended to i32, converted
; with sitofp, multiplied by llvm.amdgcn.rcp.f32 of the divisor and truncated; the residual
; x - q*y is formed with fneg + llvm.amdgcn.fmad.ftz.f32, and ((x ^ y) >> 30) | 1 is added to
; the quotient when |residual| >= |divisor|. The result is re-sign-extended with shl/ashr by 16
; and truncated back to i16 before being inserted into the result vector.
; The llc-side expectations finish by combining two quotients into one dword (shift left by 16,
; mask with 0xffff, or) for buffer_store_dword, and storing the remaining quotient with
; buffer_store_short at offset:4 (presumably lanes 0/1 and lane 2 respectively - confirm
; against the kernel argument layout).
; All expectation lines are produced by the update scripts named in the file header;
; regenerate them with those scripts instead of editing by hand.
amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 2904; CHECK-LABEL: @sdiv_v3i16( 2905; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 2906; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 2907; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 2908; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 2909; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 2910; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 2911; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 2912; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 2913; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 2914; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 2915; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 2916; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 2917; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 2918; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 2919; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 2920; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 2921; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 2922; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 2923; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 2924; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 2925; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 16 2926; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 16 2927; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16 2928; CHECK-NEXT: [[TMP24:%.*]] = insertelement <3 x i16> undef, i16 [[TMP23]], i64 0 2929; CHECK-NEXT: [[TMP25:%.*]] = extractelement <3 x i16> [[X]], i64 1 2930; CHECK-NEXT: [[TMP26:%.*]] = extractelement <3 x i16> [[Y]], i64 1 2931; CHECK-NEXT: [[TMP27:%.*]] = sext i16 
[[TMP25]] to i32 2932; CHECK-NEXT: [[TMP28:%.*]] = sext i16 [[TMP26]] to i32 2933; CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]] 2934; CHECK-NEXT: [[TMP30:%.*]] = ashr i32 [[TMP29]], 30 2935; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 1 2936; CHECK-NEXT: [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float 2937; CHECK-NEXT: [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float 2938; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 2939; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]] 2940; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]]) 2941; CHECK-NEXT: [[TMP37:%.*]] = fneg fast float [[TMP36]] 2942; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]]) 2943; CHECK-NEXT: [[TMP39:%.*]] = fptosi float [[TMP36]] to i32 2944; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]]) 2945; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 2946; CHECK-NEXT: [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]] 2947; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0 2948; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]] 2949; CHECK-NEXT: [[TMP45:%.*]] = shl i32 [[TMP44]], 16 2950; CHECK-NEXT: [[TMP46:%.*]] = ashr i32 [[TMP45]], 16 2951; CHECK-NEXT: [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16 2952; CHECK-NEXT: [[TMP48:%.*]] = insertelement <3 x i16> [[TMP24]], i16 [[TMP47]], i64 1 2953; CHECK-NEXT: [[TMP49:%.*]] = extractelement <3 x i16> [[X]], i64 2 2954; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x i16> [[Y]], i64 2 2955; CHECK-NEXT: [[TMP51:%.*]] = sext i16 [[TMP49]] to i32 2956; CHECK-NEXT: [[TMP52:%.*]] = sext i16 [[TMP50]] to i32 2957; CHECK-NEXT: [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]] 2958; CHECK-NEXT: [[TMP54:%.*]] = ashr i32 [[TMP53]], 30 2959; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 1 2960; CHECK-NEXT: [[TMP56:%.*]] = sitofp i32 
[[TMP51]] to float 2961; CHECK-NEXT: [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float 2962; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]]) 2963; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]] 2964; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]]) 2965; CHECK-NEXT: [[TMP61:%.*]] = fneg fast float [[TMP60]] 2966; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]]) 2967; CHECK-NEXT: [[TMP63:%.*]] = fptosi float [[TMP60]] to i32 2968; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]]) 2969; CHECK-NEXT: [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]]) 2970; CHECK-NEXT: [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]] 2971; CHECK-NEXT: [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0 2972; CHECK-NEXT: [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]] 2973; CHECK-NEXT: [[TMP69:%.*]] = shl i32 [[TMP68]], 16 2974; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 16 2975; CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16 2976; CHECK-NEXT: [[TMP72:%.*]] = insertelement <3 x i16> [[TMP48]], i16 [[TMP71]], i64 2 2977; CHECK-NEXT: store <3 x i16> [[TMP72]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 2978; CHECK-NEXT: ret void 2979; 2980; GCN-LABEL: sdiv_v3i16: 2981; GCN: ; %bb.0: 2982; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2983; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 2984; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 2985; GCN-NEXT: s_mov_b32 s7, 0xf000 2986; GCN-NEXT: s_mov_b32 s6, -1 2987; GCN-NEXT: s_waitcnt lgkmcnt(0) 2988; GCN-NEXT: s_sext_i32_i16 s9, s2 2989; GCN-NEXT: s_sext_i32_i16 s8, s0 2990; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 2991; GCN-NEXT: v_cvt_f32_i32_e32 v1, s9 2992; GCN-NEXT: s_xor_b32 s8, s9, s8 2993; GCN-NEXT: s_ashr_i32 s0, s0, 16 2994; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 2995; GCN-NEXT: s_ashr_i32 s8, s8, 30 2996; GCN-NEXT: s_or_b32 s8, s8, 1 
2997; GCN-NEXT: v_mov_b32_e32 v3, s8 2998; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 2999; GCN-NEXT: v_trunc_f32_e32 v2, v2 3000; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 3001; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 3002; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 3003; GCN-NEXT: v_cvt_f32_i32_e32 v1, s0 3004; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 3005; GCN-NEXT: s_ashr_i32 s2, s2, 16 3006; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 3007; GCN-NEXT: v_cvt_f32_i32_e32 v2, s2 3008; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v1 3009; GCN-NEXT: s_xor_b32 s0, s2, s0 3010; GCN-NEXT: s_ashr_i32 s0, s0, 30 3011; GCN-NEXT: s_or_b32 s0, s0, 1 3012; GCN-NEXT: v_mul_f32_e32 v3, v2, v3 3013; GCN-NEXT: v_trunc_f32_e32 v3, v3 3014; GCN-NEXT: v_mad_f32 v2, -v3, v1, v2 3015; GCN-NEXT: v_mov_b32_e32 v4, s0 3016; GCN-NEXT: s_sext_i32_i16 s0, s1 3017; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| 3018; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 3019; GCN-NEXT: v_cvt_f32_i32_e32 v2, s0 3020; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc 3021; GCN-NEXT: s_sext_i32_i16 s1, s3 3022; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 3023; GCN-NEXT: v_cvt_f32_i32_e32 v3, s1 3024; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 3025; GCN-NEXT: s_xor_b32 s0, s1, s0 3026; GCN-NEXT: s_ashr_i32 s0, s0, 30 3027; GCN-NEXT: s_or_b32 s0, s0, 1 3028; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 3029; GCN-NEXT: v_trunc_f32_e32 v4, v4 3030; GCN-NEXT: v_mad_f32 v3, -v4, v2, v3 3031; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 3032; GCN-NEXT: v_mov_b32_e32 v5, s0 3033; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| 3034; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc 3035; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 3036; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 3037; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 3038; GCN-NEXT: v_or_b32_e32 v0, v0, v1 3039; GCN-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 3040; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 3041; GCN-NEXT: s_endpgm 3042 %r = sdiv <3 x i16> %x, %y 3043 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 3044 ret void 3045} 3046
3047define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 3048; CHECK-LABEL: @srem_v3i16( 3049; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 3050; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 3051; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 3052; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 3053; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 3054; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 3055; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 3056; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 3057; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 3058; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 3059; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 3060; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 3061; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 3062; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 3063; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 3064; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 3065; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 3066; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 3067; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 3068; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 3069; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]] 3070; CHECK-NEXT: [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]] 3071; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 16 3072; CHECK-NEXT: [[TMP24:%.*]] = ashr i32 [[TMP23]], 16 3073; CHECK-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16 3074; CHECK-NEXT: [[TMP26:%.*]] = insertelement <3 x i16> undef, i16 [[TMP25]], i64 0 3075; CHECK-NEXT: [[TMP27:%.*]] = extractelement <3 x i16> 
[[X]], i64 1 3076; CHECK-NEXT: [[TMP28:%.*]] = extractelement <3 x i16> [[Y]], i64 1 3077; CHECK-NEXT: [[TMP29:%.*]] = sext i16 [[TMP27]] to i32 3078; CHECK-NEXT: [[TMP30:%.*]] = sext i16 [[TMP28]] to i32 3079; CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]] 3080; CHECK-NEXT: [[TMP32:%.*]] = ashr i32 [[TMP31]], 30 3081; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP32]], 1 3082; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float 3083; CHECK-NEXT: [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float 3084; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 3085; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]] 3086; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]]) 3087; CHECK-NEXT: [[TMP39:%.*]] = fneg fast float [[TMP38]] 3088; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]]) 3089; CHECK-NEXT: [[TMP41:%.*]] = fptosi float [[TMP38]] to i32 3090; CHECK-NEXT: [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]]) 3091; CHECK-NEXT: [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]]) 3092; CHECK-NEXT: [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]] 3093; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0 3094; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]] 3095; CHECK-NEXT: [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]] 3096; CHECK-NEXT: [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]] 3097; CHECK-NEXT: [[TMP49:%.*]] = shl i32 [[TMP48]], 16 3098; CHECK-NEXT: [[TMP50:%.*]] = ashr i32 [[TMP49]], 16 3099; CHECK-NEXT: [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16 3100; CHECK-NEXT: [[TMP52:%.*]] = insertelement <3 x i16> [[TMP26]], i16 [[TMP51]], i64 1 3101; CHECK-NEXT: [[TMP53:%.*]] = extractelement <3 x i16> [[X]], i64 2 3102; CHECK-NEXT: [[TMP54:%.*]] = extractelement <3 x i16> [[Y]], i64 2 3103; CHECK-NEXT: [[TMP55:%.*]] = sext i16 [[TMP53]] to i32 3104; CHECK-NEXT: 
[[TMP56:%.*]] = sext i16 [[TMP54]] to i32 3105; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]] 3106; CHECK-NEXT: [[TMP58:%.*]] = ashr i32 [[TMP57]], 30 3107; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 1 3108; CHECK-NEXT: [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float 3109; CHECK-NEXT: [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float 3110; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]]) 3111; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]] 3112; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]]) 3113; CHECK-NEXT: [[TMP65:%.*]] = fneg fast float [[TMP64]] 3114; CHECK-NEXT: [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]]) 3115; CHECK-NEXT: [[TMP67:%.*]] = fptosi float [[TMP64]] to i32 3116; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 3117; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]]) 3118; CHECK-NEXT: [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]] 3119; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0 3120; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]] 3121; CHECK-NEXT: [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]] 3122; CHECK-NEXT: [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]] 3123; CHECK-NEXT: [[TMP75:%.*]] = shl i32 [[TMP74]], 16 3124; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 [[TMP75]], 16 3125; CHECK-NEXT: [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16 3126; CHECK-NEXT: [[TMP78:%.*]] = insertelement <3 x i16> [[TMP52]], i16 [[TMP77]], i64 2 3127; CHECK-NEXT: store <3 x i16> [[TMP78]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 3128; CHECK-NEXT: ret void 3129; 3130; GCN-LABEL: srem_v3i16: 3131; GCN: ; %bb.0: 3132; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3133; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 3134; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 3135; GCN-NEXT: s_mov_b32 s7, 0xf000 3136; GCN-NEXT: s_waitcnt 
lgkmcnt(0) 3137; GCN-NEXT: s_sext_i32_i16 s8, s2 3138; GCN-NEXT: s_sext_i32_i16 s6, s0 3139; GCN-NEXT: v_cvt_f32_i32_e32 v0, s6 3140; GCN-NEXT: v_cvt_f32_i32_e32 v1, s8 3141; GCN-NEXT: s_xor_b32 s6, s8, s6 3142; GCN-NEXT: s_ashr_i32 s6, s6, 30 3143; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 3144; GCN-NEXT: s_or_b32 s6, s6, 1 3145; GCN-NEXT: v_mov_b32_e32 v3, s6 3146; GCN-NEXT: s_mov_b32 s6, -1 3147; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 3148; GCN-NEXT: v_trunc_f32_e32 v2, v2 3149; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 3150; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 3151; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 3152; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 3153; GCN-NEXT: v_mov_b32_e32 v1, s2 3154; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 3155; GCN-NEXT: v_mov_b32_e32 v2, s0 3156; GCN-NEXT: v_alignbit_b32 v2, s1, v2, 16 3157; GCN-NEXT: v_bfe_i32 v3, v2, 0, 16 3158; GCN-NEXT: v_cvt_f32_i32_e32 v4, v3 3159; GCN-NEXT: v_alignbit_b32 v1, s3, v1, 16 3160; GCN-NEXT: v_bfe_i32 v5, v1, 0, 16 3161; GCN-NEXT: v_cvt_f32_i32_e32 v6, v5 3162; GCN-NEXT: v_rcp_iflag_f32_e32 v7, v4 3163; GCN-NEXT: v_mul_lo_u32 v0, v0, s0 3164; GCN-NEXT: v_xor_b32_e32 v3, v5, v3 3165; GCN-NEXT: s_sext_i32_i16 s0, s1 3166; GCN-NEXT: v_mul_f32_e32 v5, v6, v7 3167; GCN-NEXT: v_trunc_f32_e32 v5, v5 3168; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 3169; GCN-NEXT: v_mad_f32 v6, -v5, v4, v6 3170; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 3171; GCN-NEXT: v_ashrrev_i32_e32 v3, 30, v3 3172; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, |v4| 3173; GCN-NEXT: v_cvt_f32_i32_e32 v4, s0 3174; GCN-NEXT: v_or_b32_e32 v3, 1, v3 3175; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc 3176; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 3177; GCN-NEXT: s_sext_i32_i16 s2, s3 3178; GCN-NEXT: v_mul_lo_u32 v2, v3, v2 3179; GCN-NEXT: v_cvt_f32_i32_e32 v3, s2 3180; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v4 3181; GCN-NEXT: s_xor_b32 s0, s2, s0 3182; GCN-NEXT: s_ashr_i32 s0, s0, 30 3183; GCN-NEXT: s_or_b32 s0, s0, 1 3184; GCN-NEXT: v_mul_f32_e32 v5, v3, v5 3185; GCN-NEXT: 
v_trunc_f32_e32 v5, v5 3186; GCN-NEXT: v_mad_f32 v3, -v5, v4, v3 3187; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 3188; GCN-NEXT: v_mov_b32_e32 v6, s0 3189; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v4| 3190; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc 3191; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 3192; GCN-NEXT: v_mul_lo_u32 v3, v3, s1 3193; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 3194; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 3195; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 3196; GCN-NEXT: v_sub_i32_e32 v2, vcc, s3, v3 3197; GCN-NEXT: v_or_b32_e32 v0, v0, v1 3198; GCN-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 3199; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 3200; GCN-NEXT: s_endpgm 3201 %r = srem <3 x i16> %x, %y 3202 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 3203 ret void 3204} 3205 3206define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 3207; CHECK-LABEL: @udiv_v3i15( 3208; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 3209; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 3210; CHECK-NEXT: [[TMP3:%.*]] = zext i15 [[TMP1]] to i32 3211; CHECK-NEXT: [[TMP4:%.*]] = zext i15 [[TMP2]] to i32 3212; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 3213; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 3214; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 3215; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 3216; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 3217; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 3218; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 3219; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 3220; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 3221; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 3222; CHECK-NEXT: 
[[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 3223; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 3224; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 3225; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 32767 3226; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i15 3227; CHECK-NEXT: [[TMP20:%.*]] = insertelement <3 x i15> undef, i15 [[TMP19]], i64 0 3228; CHECK-NEXT: [[TMP21:%.*]] = extractelement <3 x i15> [[X]], i64 1 3229; CHECK-NEXT: [[TMP22:%.*]] = extractelement <3 x i15> [[Y]], i64 1 3230; CHECK-NEXT: [[TMP23:%.*]] = zext i15 [[TMP21]] to i32 3231; CHECK-NEXT: [[TMP24:%.*]] = zext i15 [[TMP22]] to i32 3232; CHECK-NEXT: [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float 3233; CHECK-NEXT: [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float 3234; CHECK-NEXT: [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]]) 3235; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]] 3236; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]]) 3237; CHECK-NEXT: [[TMP30:%.*]] = fneg fast float [[TMP29]] 3238; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]]) 3239; CHECK-NEXT: [[TMP32:%.*]] = fptoui float [[TMP29]] to i32 3240; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]]) 3241; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]]) 3242; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]] 3243; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0 3244; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]] 3245; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 32767 3246; CHECK-NEXT: [[TMP39:%.*]] = trunc i32 [[TMP38]] to i15 3247; CHECK-NEXT: [[TMP40:%.*]] = insertelement <3 x i15> [[TMP20]], i15 [[TMP39]], i64 1 3248; CHECK-NEXT: [[TMP41:%.*]] = extractelement <3 x i15> [[X]], i64 2 3249; CHECK-NEXT: [[TMP42:%.*]] = extractelement <3 x i15> [[Y]], 
i64 2 3250; CHECK-NEXT: [[TMP43:%.*]] = zext i15 [[TMP41]] to i32 3251; CHECK-NEXT: [[TMP44:%.*]] = zext i15 [[TMP42]] to i32 3252; CHECK-NEXT: [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float 3253; CHECK-NEXT: [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float 3254; CHECK-NEXT: [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]]) 3255; CHECK-NEXT: [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]] 3256; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]]) 3257; CHECK-NEXT: [[TMP50:%.*]] = fneg fast float [[TMP49]] 3258; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]]) 3259; CHECK-NEXT: [[TMP52:%.*]] = fptoui float [[TMP49]] to i32 3260; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]]) 3261; CHECK-NEXT: [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]]) 3262; CHECK-NEXT: [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]] 3263; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0 3264; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]] 3265; CHECK-NEXT: [[TMP58:%.*]] = and i32 [[TMP57]], 32767 3266; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i15 3267; CHECK-NEXT: [[TMP60:%.*]] = insertelement <3 x i15> [[TMP40]], i15 [[TMP59]], i64 2 3268; CHECK-NEXT: store <3 x i15> [[TMP60]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 3269; CHECK-NEXT: ret void 3270; 3271; GCN-LABEL: udiv_v3i15: 3272; GCN: ; %bb.0: 3273; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3274; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 3275; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 3276; GCN-NEXT: s_mov_b32 s7, 0xf000 3277; GCN-NEXT: s_mov_b32 s6, -1 3278; GCN-NEXT: s_waitcnt lgkmcnt(0) 3279; GCN-NEXT: v_mov_b32_e32 v0, s2 3280; GCN-NEXT: v_alignbit_b32 v0, s3, v0, 30 3281; GCN-NEXT: s_movk_i32 s3, 0x7fff 3282; GCN-NEXT: s_and_b32 s9, s0, s3 3283; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 3284; GCN-NEXT: v_mov_b32_e32 v2, s0 
3285; GCN-NEXT: s_and_b32 s8, s2, s3 3286; GCN-NEXT: s_bfe_u32 s0, s0, 0xf000f 3287; GCN-NEXT: v_cvt_f32_u32_e32 v5, s0 3288; GCN-NEXT: v_cvt_f32_u32_e32 v3, s8 3289; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v1 3290; GCN-NEXT: s_bfe_u32 s2, s2, 0xf000f 3291; GCN-NEXT: v_alignbit_b32 v2, s1, v2, 30 3292; GCN-NEXT: v_cvt_f32_u32_e32 v6, s2 3293; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 3294; GCN-NEXT: v_rcp_iflag_f32_e32 v7, v5 3295; GCN-NEXT: v_and_b32_e32 v2, s3, v2 3296; GCN-NEXT: v_trunc_f32_e32 v4, v4 3297; GCN-NEXT: v_mad_f32 v3, -v4, v1, v3 3298; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 3299; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2 3300; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 3301; GCN-NEXT: v_mul_f32_e32 v1, v6, v7 3302; GCN-NEXT: v_and_b32_e32 v0, s3, v0 3303; GCN-NEXT: v_trunc_f32_e32 v1, v1 3304; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 3305; GCN-NEXT: v_mad_f32 v4, -v1, v5, v6 3306; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 3307; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 3308; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v2 3309; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v5 3310; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 3311; GCN-NEXT: v_mul_f32_e32 v1, v0, v6 3312; GCN-NEXT: v_trunc_f32_e32 v1, v1 3313; GCN-NEXT: v_cvt_u32_f32_e32 v5, v1 3314; GCN-NEXT: v_mad_f32 v0, -v1, v2, v0 3315; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v2 3316; GCN-NEXT: v_and_b32_e32 v2, s3, v3 3317; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc 3318; GCN-NEXT: v_and_b32_e32 v3, s3, v4 3319; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 3320; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3 3321; GCN-NEXT: v_or_b32_e32 v2, v2, v3 3322; GCN-NEXT: v_or_b32_e32 v0, v2, v0 3323; GCN-NEXT: v_and_b32_e32 v1, 0x1fff, v1 3324; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 3325; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 3326; GCN-NEXT: s_endpgm 3327 %r = udiv <3 x i15> %x, %y 3328 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 3329 ret void 3330} 3331 3332define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> 
%x, <3 x i15> %y) { 3333; CHECK-LABEL: @urem_v3i15( 3334; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 3335; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 3336; CHECK-NEXT: [[TMP3:%.*]] = zext i15 [[TMP1]] to i32 3337; CHECK-NEXT: [[TMP4:%.*]] = zext i15 [[TMP2]] to i32 3338; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 3339; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 3340; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 3341; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 3342; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 3343; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 3344; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 3345; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 3346; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 3347; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 3348; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 3349; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 3350; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 3351; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]] 3352; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]] 3353; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 32767 3354; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i15 3355; CHECK-NEXT: [[TMP22:%.*]] = insertelement <3 x i15> undef, i15 [[TMP21]], i64 0 3356; CHECK-NEXT: [[TMP23:%.*]] = extractelement <3 x i15> [[X]], i64 1 3357; CHECK-NEXT: [[TMP24:%.*]] = extractelement <3 x i15> [[Y]], i64 1 3358; CHECK-NEXT: [[TMP25:%.*]] = zext i15 [[TMP23]] to i32 3359; CHECK-NEXT: [[TMP26:%.*]] = zext i15 [[TMP24]] to i32 3360; CHECK-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float 3361; CHECK-NEXT: [[TMP28:%.*]] = uitofp i32 
[[TMP26]] to float 3362; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]]) 3363; CHECK-NEXT: [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]] 3364; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]]) 3365; CHECK-NEXT: [[TMP32:%.*]] = fneg fast float [[TMP31]] 3366; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]]) 3367; CHECK-NEXT: [[TMP34:%.*]] = fptoui float [[TMP31]] to i32 3368; CHECK-NEXT: [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 3369; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]]) 3370; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]] 3371; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0 3372; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]] 3373; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]] 3374; CHECK-NEXT: [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]] 3375; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], 32767 3376; CHECK-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i15 3377; CHECK-NEXT: [[TMP44:%.*]] = insertelement <3 x i15> [[TMP22]], i15 [[TMP43]], i64 1 3378; CHECK-NEXT: [[TMP45:%.*]] = extractelement <3 x i15> [[X]], i64 2 3379; CHECK-NEXT: [[TMP46:%.*]] = extractelement <3 x i15> [[Y]], i64 2 3380; CHECK-NEXT: [[TMP47:%.*]] = zext i15 [[TMP45]] to i32 3381; CHECK-NEXT: [[TMP48:%.*]] = zext i15 [[TMP46]] to i32 3382; CHECK-NEXT: [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float 3383; CHECK-NEXT: [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float 3384; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]]) 3385; CHECK-NEXT: [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]] 3386; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]]) 3387; CHECK-NEXT: [[TMP54:%.*]] = fneg fast float [[TMP53]] 3388; CHECK-NEXT: [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float 
[[TMP54]], float [[TMP50]], float [[TMP49]]) 3389; CHECK-NEXT: [[TMP56:%.*]] = fptoui float [[TMP53]] to i32 3390; CHECK-NEXT: [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]]) 3391; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]]) 3392; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]] 3393; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0 3394; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]] 3395; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]] 3396; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]] 3397; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 32767 3398; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i15 3399; CHECK-NEXT: [[TMP66:%.*]] = insertelement <3 x i15> [[TMP44]], i15 [[TMP65]], i64 2 3400; CHECK-NEXT: store <3 x i15> [[TMP66]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 3401; CHECK-NEXT: ret void 3402; 3403; GCN-LABEL: urem_v3i15: 3404; GCN: ; %bb.0: 3405; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3406; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 3407; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 3408; GCN-NEXT: s_mov_b32 s7, 0xf000 3409; GCN-NEXT: s_mov_b32 s6, -1 3410; GCN-NEXT: s_waitcnt lgkmcnt(0) 3411; GCN-NEXT: v_mov_b32_e32 v0, s2 3412; GCN-NEXT: v_alignbit_b32 v0, s3, v0, 30 3413; GCN-NEXT: s_movk_i32 s3, 0x7fff 3414; GCN-NEXT: s_and_b32 s10, s0, s3 3415; GCN-NEXT: v_cvt_f32_u32_e32 v1, s10 3416; GCN-NEXT: s_and_b32 s9, s2, s3 3417; GCN-NEXT: v_cvt_f32_u32_e32 v3, s9 3418; GCN-NEXT: v_mov_b32_e32 v2, s0 3419; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v1 3420; GCN-NEXT: v_alignbit_b32 v2, s1, v2, 30 3421; GCN-NEXT: s_bfe_u32 s1, s0, 0xf000f 3422; GCN-NEXT: v_cvt_f32_u32_e32 v5, s1 3423; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 3424; GCN-NEXT: v_trunc_f32_e32 v4, v4 3425; GCN-NEXT: v_mad_f32 v3, -v4, v1, v3 3426; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 3427; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 3428; GCN-NEXT: s_bfe_u32 s10, s2, 0xf000f 3429; 
GCN-NEXT: v_cvt_f32_u32_e32 v3, s10 3430; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc 3431; GCN-NEXT: v_mul_lo_u32 v1, v1, s0 3432; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v5 3433; GCN-NEXT: v_and_b32_e32 v2, s3, v2 3434; GCN-NEXT: v_and_b32_e32 v0, s3, v0 3435; GCN-NEXT: v_sub_i32_e32 v6, vcc, s2, v1 3436; GCN-NEXT: v_mul_f32_e32 v1, v3, v4 3437; GCN-NEXT: v_cvt_f32_u32_e32 v4, v2 3438; GCN-NEXT: v_cvt_f32_u32_e32 v7, v0 3439; GCN-NEXT: v_trunc_f32_e32 v1, v1 3440; GCN-NEXT: v_mad_f32 v3, -v1, v5, v3 3441; GCN-NEXT: v_rcp_iflag_f32_e32 v8, v4 3442; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 3443; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 3444; GCN-NEXT: s_lshr_b32 s0, s0, 15 3445; GCN-NEXT: v_mul_f32_e32 v3, v7, v8 3446; GCN-NEXT: v_trunc_f32_e32 v3, v3 3447; GCN-NEXT: v_cvt_u32_f32_e32 v5, v3 3448; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3449; GCN-NEXT: v_mad_f32 v3, -v3, v4, v7 3450; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 3451; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 3452; GCN-NEXT: v_mul_lo_u32 v1, v1, s0 3453; GCN-NEXT: v_mul_lo_u32 v2, v3, v2 3454; GCN-NEXT: s_lshr_b32 s8, s2, 15 3455; GCN-NEXT: v_sub_i32_e32 v3, vcc, s8, v1 3456; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v2, v0 3457; GCN-NEXT: v_and_b32_e32 v3, s3, v3 3458; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 3459; GCN-NEXT: v_and_b32_e32 v2, s3, v6 3460; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3 3461; GCN-NEXT: v_or_b32_e32 v2, v2, v3 3462; GCN-NEXT: v_or_b32_e32 v0, v2, v0 3463; GCN-NEXT: v_and_b32_e32 v1, 0x1fff, v1 3464; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 3465; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 3466; GCN-NEXT: s_endpgm 3467 %r = urem <3 x i15> %x, %y 3468 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 3469 ret void 3470} 3471 3472define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 3473; CHECK-LABEL: @sdiv_v3i15( 3474; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 3475; CHECK-NEXT: [[TMP2:%.*]] = 
extractelement <3 x i15> [[Y:%.*]], i64 0 3476; CHECK-NEXT: [[TMP3:%.*]] = sext i15 [[TMP1]] to i32 3477; CHECK-NEXT: [[TMP4:%.*]] = sext i15 [[TMP2]] to i32 3478; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 3479; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 3480; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 3481; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 3482; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 3483; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 3484; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 3485; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 3486; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 3487; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 3488; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 3489; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 3490; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 3491; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 3492; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 3493; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 3494; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 17 3495; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 17 3496; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i15 3497; CHECK-NEXT: [[TMP24:%.*]] = insertelement <3 x i15> undef, i15 [[TMP23]], i64 0 3498; CHECK-NEXT: [[TMP25:%.*]] = extractelement <3 x i15> [[X]], i64 1 3499; CHECK-NEXT: [[TMP26:%.*]] = extractelement <3 x i15> [[Y]], i64 1 3500; CHECK-NEXT: [[TMP27:%.*]] = sext i15 [[TMP25]] to i32 3501; CHECK-NEXT: [[TMP28:%.*]] = sext i15 [[TMP26]] to i32 3502; CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]] 3503; CHECK-NEXT: [[TMP30:%.*]] = ashr i32 [[TMP29]], 30 3504; CHECK-NEXT: [[TMP31:%.*]] = or i32 
[[TMP30]], 1 3505; CHECK-NEXT: [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float 3506; CHECK-NEXT: [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float 3507; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 3508; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]] 3509; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]]) 3510; CHECK-NEXT: [[TMP37:%.*]] = fneg fast float [[TMP36]] 3511; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]]) 3512; CHECK-NEXT: [[TMP39:%.*]] = fptosi float [[TMP36]] to i32 3513; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]]) 3514; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 3515; CHECK-NEXT: [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]] 3516; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0 3517; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]] 3518; CHECK-NEXT: [[TMP45:%.*]] = shl i32 [[TMP44]], 17 3519; CHECK-NEXT: [[TMP46:%.*]] = ashr i32 [[TMP45]], 17 3520; CHECK-NEXT: [[TMP47:%.*]] = trunc i32 [[TMP46]] to i15 3521; CHECK-NEXT: [[TMP48:%.*]] = insertelement <3 x i15> [[TMP24]], i15 [[TMP47]], i64 1 3522; CHECK-NEXT: [[TMP49:%.*]] = extractelement <3 x i15> [[X]], i64 2 3523; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x i15> [[Y]], i64 2 3524; CHECK-NEXT: [[TMP51:%.*]] = sext i15 [[TMP49]] to i32 3525; CHECK-NEXT: [[TMP52:%.*]] = sext i15 [[TMP50]] to i32 3526; CHECK-NEXT: [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]] 3527; CHECK-NEXT: [[TMP54:%.*]] = ashr i32 [[TMP53]], 30 3528; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 1 3529; CHECK-NEXT: [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float 3530; CHECK-NEXT: [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float 3531; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]]) 3532; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP56]], 
[[TMP58]] 3533; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]]) 3534; CHECK-NEXT: [[TMP61:%.*]] = fneg fast float [[TMP60]] 3535; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]]) 3536; CHECK-NEXT: [[TMP63:%.*]] = fptosi float [[TMP60]] to i32 3537; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]]) 3538; CHECK-NEXT: [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]]) 3539; CHECK-NEXT: [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]] 3540; CHECK-NEXT: [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0 3541; CHECK-NEXT: [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]] 3542; CHECK-NEXT: [[TMP69:%.*]] = shl i32 [[TMP68]], 17 3543; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 17 3544; CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i15 3545; CHECK-NEXT: [[TMP72:%.*]] = insertelement <3 x i15> [[TMP48]], i15 [[TMP71]], i64 2 3546; CHECK-NEXT: store <3 x i15> [[TMP72]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 3547; CHECK-NEXT: ret void 3548; 3549; GCN-LABEL: sdiv_v3i15: 3550; GCN: ; %bb.0: 3551; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3552; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 3553; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 3554; GCN-NEXT: s_mov_b32 s7, 0xf000 3555; GCN-NEXT: s_mov_b32 s6, -1 3556; GCN-NEXT: s_waitcnt lgkmcnt(0) 3557; GCN-NEXT: v_mov_b32_e32 v0, s2 3558; GCN-NEXT: v_alignbit_b32 v0, s3, v0, 30 3559; GCN-NEXT: s_bfe_i32 s3, s0, 0xf0000 3560; GCN-NEXT: v_cvt_f32_i32_e32 v2, s3 3561; GCN-NEXT: v_mov_b32_e32 v1, s0 3562; GCN-NEXT: v_alignbit_b32 v1, s1, v1, 30 3563; GCN-NEXT: s_bfe_i32 s1, s2, 0xf0000 3564; GCN-NEXT: v_cvt_f32_i32_e32 v3, s1 3565; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 3566; GCN-NEXT: s_xor_b32 s1, s1, s3 3567; GCN-NEXT: s_bfe_i32 s0, s0, 0xf000f 3568; GCN-NEXT: s_ashr_i32 s1, s1, 30 3569; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 3570; GCN-NEXT: v_trunc_f32_e32 v4, v4 3571; 
GCN-NEXT: v_mad_f32 v3, -v4, v2, v3 3572; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| 3573; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 3574; GCN-NEXT: v_cvt_f32_i32_e32 v3, s0 3575; GCN-NEXT: s_or_b32 s1, s1, 1 3576; GCN-NEXT: v_mov_b32_e32 v5, s1 3577; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc 3578; GCN-NEXT: s_bfe_i32 s1, s2, 0xf000f 3579; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 3580; GCN-NEXT: v_cvt_f32_i32_e32 v4, s1 3581; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v3 3582; GCN-NEXT: s_xor_b32 s0, s1, s0 3583; GCN-NEXT: v_bfe_i32 v1, v1, 0, 15 3584; GCN-NEXT: s_ashr_i32 s0, s0, 30 3585; GCN-NEXT: v_mul_f32_e32 v5, v4, v5 3586; GCN-NEXT: v_trunc_f32_e32 v5, v5 3587; GCN-NEXT: v_mad_f32 v4, -v5, v3, v4 3588; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v3| 3589; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 3590; GCN-NEXT: v_cvt_f32_i32_e32 v4, v1 3591; GCN-NEXT: s_or_b32 s0, s0, 1 3592; GCN-NEXT: v_mov_b32_e32 v6, s0 3593; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc 3594; GCN-NEXT: v_bfe_i32 v0, v0, 0, 15 3595; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 3596; GCN-NEXT: v_cvt_f32_i32_e32 v5, v0 3597; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v4 3598; GCN-NEXT: v_xor_b32_e32 v0, v0, v1 3599; GCN-NEXT: v_ashrrev_i32_e32 v0, 30, v0 3600; GCN-NEXT: v_or_b32_e32 v0, 1, v0 3601; GCN-NEXT: v_mul_f32_e32 v1, v5, v6 3602; GCN-NEXT: v_trunc_f32_e32 v1, v1 3603; GCN-NEXT: v_mad_f32 v5, -v1, v4, v5 3604; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 3605; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v4| 3606; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 3607; GCN-NEXT: s_movk_i32 s0, 0x7fff 3608; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 3609; GCN-NEXT: v_and_b32_e32 v3, s0, v3 3610; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 3611; GCN-NEXT: v_and_b32_e32 v2, s0, v2 3612; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3 3613; GCN-NEXT: v_or_b32_e32 v2, v2, v3 3614; GCN-NEXT: v_or_b32_e32 v0, v2, v0 3615; GCN-NEXT: v_and_b32_e32 v1, 0x1fff, v1 3616; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 3617; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 
offset:4 3618; GCN-NEXT: s_endpgm 3619 %r = sdiv <3 x i15> %x, %y 3620 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 3621 ret void 3622} 3623 3624define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 3625; CHECK-LABEL: @srem_v3i15( 3626; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 3627; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 3628; CHECK-NEXT: [[TMP3:%.*]] = sext i15 [[TMP1]] to i32 3629; CHECK-NEXT: [[TMP4:%.*]] = sext i15 [[TMP2]] to i32 3630; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 3631; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 3632; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 3633; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 3634; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 3635; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 3636; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 3637; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 3638; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 3639; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 3640; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 3641; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 3642; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 3643; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 3644; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 3645; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 3646; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]] 3647; CHECK-NEXT: [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]] 3648; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 17 3649; CHECK-NEXT: [[TMP24:%.*]] = ashr i32 [[TMP23]], 17 3650; CHECK-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i15 
3651; CHECK-NEXT: [[TMP26:%.*]] = insertelement <3 x i15> undef, i15 [[TMP25]], i64 0 3652; CHECK-NEXT: [[TMP27:%.*]] = extractelement <3 x i15> [[X]], i64 1 3653; CHECK-NEXT: [[TMP28:%.*]] = extractelement <3 x i15> [[Y]], i64 1 3654; CHECK-NEXT: [[TMP29:%.*]] = sext i15 [[TMP27]] to i32 3655; CHECK-NEXT: [[TMP30:%.*]] = sext i15 [[TMP28]] to i32 3656; CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]] 3657; CHECK-NEXT: [[TMP32:%.*]] = ashr i32 [[TMP31]], 30 3658; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP32]], 1 3659; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float 3660; CHECK-NEXT: [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float 3661; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 3662; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]] 3663; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]]) 3664; CHECK-NEXT: [[TMP39:%.*]] = fneg fast float [[TMP38]] 3665; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]]) 3666; CHECK-NEXT: [[TMP41:%.*]] = fptosi float [[TMP38]] to i32 3667; CHECK-NEXT: [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]]) 3668; CHECK-NEXT: [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]]) 3669; CHECK-NEXT: [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]] 3670; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0 3671; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]] 3672; CHECK-NEXT: [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]] 3673; CHECK-NEXT: [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]] 3674; CHECK-NEXT: [[TMP49:%.*]] = shl i32 [[TMP48]], 17 3675; CHECK-NEXT: [[TMP50:%.*]] = ashr i32 [[TMP49]], 17 3676; CHECK-NEXT: [[TMP51:%.*]] = trunc i32 [[TMP50]] to i15 3677; CHECK-NEXT: [[TMP52:%.*]] = insertelement <3 x i15> [[TMP26]], i15 [[TMP51]], i64 1 3678; CHECK-NEXT: [[TMP53:%.*]] = extractelement <3 x i15> [[X]], i64 2 3679; 
CHECK-NEXT: [[TMP54:%.*]] = extractelement <3 x i15> [[Y]], i64 2 3680; CHECK-NEXT: [[TMP55:%.*]] = sext i15 [[TMP53]] to i32 3681; CHECK-NEXT: [[TMP56:%.*]] = sext i15 [[TMP54]] to i32 3682; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]] 3683; CHECK-NEXT: [[TMP58:%.*]] = ashr i32 [[TMP57]], 30 3684; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 1 3685; CHECK-NEXT: [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float 3686; CHECK-NEXT: [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float 3687; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]]) 3688; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]] 3689; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]]) 3690; CHECK-NEXT: [[TMP65:%.*]] = fneg fast float [[TMP64]] 3691; CHECK-NEXT: [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]]) 3692; CHECK-NEXT: [[TMP67:%.*]] = fptosi float [[TMP64]] to i32 3693; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 3694; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]]) 3695; CHECK-NEXT: [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]] 3696; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0 3697; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]] 3698; CHECK-NEXT: [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]] 3699; CHECK-NEXT: [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]] 3700; CHECK-NEXT: [[TMP75:%.*]] = shl i32 [[TMP74]], 17 3701; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 [[TMP75]], 17 3702; CHECK-NEXT: [[TMP77:%.*]] = trunc i32 [[TMP76]] to i15 3703; CHECK-NEXT: [[TMP78:%.*]] = insertelement <3 x i15> [[TMP52]], i15 [[TMP77]], i64 2 3704; CHECK-NEXT: store <3 x i15> [[TMP78]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 3705; CHECK-NEXT: ret void 3706; 3707; GCN-LABEL: srem_v3i15: 3708; GCN: ; %bb.0: 3709; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3710; GCN-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0xb 3711; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 3712; GCN-NEXT: s_mov_b32 s7, 0xf000 3713; GCN-NEXT: s_mov_b32 s6, -1 3714; GCN-NEXT: s_waitcnt lgkmcnt(0) 3715; GCN-NEXT: v_mov_b32_e32 v0, s2 3716; GCN-NEXT: v_alignbit_b32 v0, s3, v0, 30 3717; GCN-NEXT: s_movk_i32 s3, 0x7fff 3718; GCN-NEXT: s_and_b32 s11, s0, s3 3719; GCN-NEXT: s_bfe_i32 s11, s11, 0xf0000 3720; GCN-NEXT: v_cvt_f32_i32_e32 v2, s11 3721; GCN-NEXT: s_and_b32 s9, s2, s3 3722; GCN-NEXT: s_bfe_i32 s9, s9, 0xf0000 3723; GCN-NEXT: v_cvt_f32_i32_e32 v3, s9 3724; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 3725; GCN-NEXT: s_xor_b32 s9, s9, s11 3726; GCN-NEXT: s_ashr_i32 s9, s9, 30 3727; GCN-NEXT: s_or_b32 s9, s9, 1 3728; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 3729; GCN-NEXT: v_trunc_f32_e32 v4, v4 3730; GCN-NEXT: v_mad_f32 v3, -v4, v2, v3 3731; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 3732; GCN-NEXT: v_mov_b32_e32 v5, s9 3733; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| 3734; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc 3735; GCN-NEXT: v_mov_b32_e32 v1, s0 3736; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 3737; GCN-NEXT: s_bfe_u32 s12, s0, 0xf000f 3738; GCN-NEXT: v_alignbit_b32 v1, s1, v1, 30 3739; GCN-NEXT: v_mul_lo_u32 v2, v2, s0 3740; GCN-NEXT: s_lshr_b32 s1, s0, 15 3741; GCN-NEXT: s_bfe_i32 s0, s12, 0xf0000 3742; GCN-NEXT: v_cvt_f32_i32_e32 v3, s0 3743; GCN-NEXT: s_bfe_u32 s10, s2, 0xf000f 3744; GCN-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 3745; GCN-NEXT: s_lshr_b32 s8, s2, 15 3746; GCN-NEXT: s_bfe_i32 s2, s10, 0xf0000 3747; GCN-NEXT: v_cvt_f32_i32_e32 v4, s2 3748; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v3 3749; GCN-NEXT: s_xor_b32 s0, s2, s0 3750; GCN-NEXT: s_ashr_i32 s0, s0, 30 3751; GCN-NEXT: s_or_b32 s0, s0, 1 3752; GCN-NEXT: v_mul_f32_e32 v5, v4, v5 3753; GCN-NEXT: v_trunc_f32_e32 v5, v5 3754; GCN-NEXT: v_mad_f32 v4, -v5, v3, v4 3755; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 3756; GCN-NEXT: v_and_b32_e32 v1, s3, v1 3757; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v3| 3758; GCN-NEXT: v_mov_b32_e32 v6, s0 
3759; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc 3760; GCN-NEXT: v_bfe_i32 v4, v1, 0, 15 3761; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 3762; GCN-NEXT: v_cvt_f32_i32_e32 v5, v4 3763; GCN-NEXT: v_and_b32_e32 v0, s3, v0 3764; GCN-NEXT: v_bfe_i32 v6, v0, 0, 15 3765; GCN-NEXT: v_cvt_f32_i32_e32 v7, v6 3766; GCN-NEXT: v_rcp_iflag_f32_e32 v8, v5 3767; GCN-NEXT: v_xor_b32_e32 v4, v6, v4 3768; GCN-NEXT: v_ashrrev_i32_e32 v4, 30, v4 3769; GCN-NEXT: v_or_b32_e32 v4, 1, v4 3770; GCN-NEXT: v_mul_f32_e32 v6, v7, v8 3771; GCN-NEXT: v_trunc_f32_e32 v6, v6 3772; GCN-NEXT: v_mad_f32 v7, -v6, v5, v7 3773; GCN-NEXT: v_cvt_i32_f32_e32 v6, v6 3774; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v5| 3775; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc 3776; GCN-NEXT: v_mul_lo_u32 v3, v3, s1 3777; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 3778; GCN-NEXT: v_mul_lo_u32 v1, v4, v1 3779; GCN-NEXT: v_and_b32_e32 v2, s3, v2 3780; GCN-NEXT: v_sub_i32_e32 v3, vcc, s8, v3 3781; GCN-NEXT: v_and_b32_e32 v3, s3, v3 3782; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 3783; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 3784; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3 3785; GCN-NEXT: v_or_b32_e32 v2, v2, v3 3786; GCN-NEXT: v_or_b32_e32 v0, v2, v0 3787; GCN-NEXT: v_and_b32_e32 v1, 0x1fff, v1 3788; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 3789; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 3790; GCN-NEXT: s_endpgm 3791 %r = srem <3 x i15> %x, %y 3792 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 3793 ret void 3794} 3795 3796define amdgpu_kernel void @udiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 3797; CHECK-LABEL: @udiv_i32_oddk_denom( 3798; CHECK-NEXT: [[R:%.*]] = udiv i32 [[X:%.*]], 1235195 3799; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 3800; CHECK-NEXT: ret void 3801; 3802; GCN-LABEL: udiv_i32_oddk_denom: 3803; GCN: ; %bb.0: 3804; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3805; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 3806; GCN-NEXT: v_mov_b32_e32 v0, 0xb2a50881 3807; 
GCN-NEXT: s_mov_b32 s7, 0xf000 3808; GCN-NEXT: s_mov_b32 s6, -1 3809; GCN-NEXT: s_waitcnt lgkmcnt(0) 3810; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 3811; GCN-NEXT: v_sub_i32_e32 v1, vcc, s0, v0 3812; GCN-NEXT: v_lshrrev_b32_e32 v1, 1, v1 3813; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 3814; GCN-NEXT: v_lshrrev_b32_e32 v0, 20, v0 3815; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 3816; GCN-NEXT: s_endpgm 3817 %r = udiv i32 %x, 1235195 3818 store i32 %r, i32 addrspace(1)* %out 3819 ret void 3820} 3821 3822define amdgpu_kernel void @udiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 3823; CHECK-LABEL: @udiv_i32_pow2k_denom( 3824; CHECK-NEXT: [[R:%.*]] = udiv i32 [[X:%.*]], 4096 3825; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 3826; CHECK-NEXT: ret void 3827; 3828; GCN-LABEL: udiv_i32_pow2k_denom: 3829; GCN: ; %bb.0: 3830; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3831; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 3832; GCN-NEXT: s_mov_b32 s7, 0xf000 3833; GCN-NEXT: s_mov_b32 s6, -1 3834; GCN-NEXT: s_waitcnt lgkmcnt(0) 3835; GCN-NEXT: s_lshr_b32 s0, s0, 12 3836; GCN-NEXT: v_mov_b32_e32 v0, s0 3837; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 3838; GCN-NEXT: s_endpgm 3839 %r = udiv i32 %x, 4096 3840 store i32 %r, i32 addrspace(1)* %out 3841 ret void 3842} 3843 3844define amdgpu_kernel void @udiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 3845; CHECK-LABEL: @udiv_i32_pow2_shl_denom( 3846; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 3847; CHECK-NEXT: [[R:%.*]] = udiv i32 [[X:%.*]], [[SHL_Y]] 3848; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 3849; CHECK-NEXT: ret void 3850; 3851; GCN-LABEL: udiv_i32_pow2_shl_denom: 3852; GCN: ; %bb.0: 3853; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3854; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3855; GCN-NEXT: s_mov_b32 s7, 0xf000 3856; GCN-NEXT: s_mov_b32 s6, -1 3857; GCN-NEXT: s_waitcnt lgkmcnt(0) 3858; GCN-NEXT: s_add_i32 s1, s1, 12 3859; GCN-NEXT: 
s_lshr_b32 s0, s0, s1 3860; GCN-NEXT: v_mov_b32_e32 v0, s0 3861; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 3862; GCN-NEXT: s_endpgm 3863 %shl.y = shl i32 4096, %y 3864 %r = udiv i32 %x, %shl.y 3865 store i32 %r, i32 addrspace(1)* %out 3866 ret void 3867} 3868 3869define amdgpu_kernel void @udiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 3870; CHECK-LABEL: @udiv_v2i32_pow2k_denom( 3871; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 3872; CHECK-NEXT: [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096 3873; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 3874; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 3875; CHECK-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP4]], 4096 3876; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 3877; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 3878; CHECK-NEXT: ret void 3879; 3880; GCN-LABEL: udiv_v2i32_pow2k_denom: 3881; GCN: ; %bb.0: 3882; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3883; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3884; GCN-NEXT: s_mov_b32 s7, 0xf000 3885; GCN-NEXT: s_mov_b32 s6, -1 3886; GCN-NEXT: s_waitcnt lgkmcnt(0) 3887; GCN-NEXT: s_lshr_b32 s0, s0, 12 3888; GCN-NEXT: s_lshr_b32 s1, s1, 12 3889; GCN-NEXT: v_mov_b32_e32 v0, s0 3890; GCN-NEXT: v_mov_b32_e32 v1, s1 3891; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 3892; GCN-NEXT: s_endpgm 3893 %r = udiv <2 x i32> %x, <i32 4096, i32 4096> 3894 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 3895 ret void 3896} 3897 3898define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 3899; CHECK-LABEL: @udiv_v2i32_mixed_pow2k_denom( 3900; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 3901; CHECK-NEXT: [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096 3902; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 3903; CHECK-NEXT: [[TMP4:%.*]] 
= extractelement <2 x i32> [[X]], i64 1 3904; CHECK-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP4]], 4095 3905; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 3906; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 3907; CHECK-NEXT: ret void 3908; 3909; GCN-LABEL: udiv_v2i32_mixed_pow2k_denom: 3910; GCN: ; %bb.0: 3911; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3912; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3913; GCN-NEXT: v_mov_b32_e32 v0, 0x100101 3914; GCN-NEXT: s_mov_b32 s7, 0xf000 3915; GCN-NEXT: s_mov_b32 s6, -1 3916; GCN-NEXT: s_waitcnt lgkmcnt(0) 3917; GCN-NEXT: v_mul_hi_u32 v0, s1, v0 3918; GCN-NEXT: s_lshr_b32 s0, s0, 12 3919; GCN-NEXT: v_sub_i32_e32 v1, vcc, s1, v0 3920; GCN-NEXT: v_lshrrev_b32_e32 v1, 1, v1 3921; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 3922; GCN-NEXT: v_lshrrev_b32_e32 v1, 11, v0 3923; GCN-NEXT: v_mov_b32_e32 v0, s0 3924; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 3925; GCN-NEXT: s_endpgm 3926 %r = udiv <2 x i32> %x, <i32 4096, i32 4095> 3927 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 3928 ret void 3929} 3930 3931define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 3932; CHECK-LABEL: @udiv_v2i32_pow2_shl_denom( 3933; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 3934; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 3935; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 3936; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 3937; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 3938; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 3939; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 3940; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 3941; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 3942; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 3943; 
CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 3944; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 3945; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 3946; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 3947; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 3948; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 3949; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 3950; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 3951; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 3952; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 3953; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 3954; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 3955; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 3956; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 3957; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 3958; CHECK-NEXT: [[TMP25:%.*]] = add i32 [[TMP21]], 1 3959; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]] 3960; CHECK-NEXT: [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]] 3961; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]] 3962; CHECK-NEXT: [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]] 3963; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP26]], 1 3964; CHECK-NEXT: [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]] 3965; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x i32> undef, i32 [[TMP31]], i64 0 3966; CHECK-NEXT: [[TMP33:%.*]] = extractelement <2 x i32> [[X]], i64 1 3967; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 3968; CHECK-NEXT: [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float 3969; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 3970; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000 3971; CHECK-NEXT: [[TMP38:%.*]] = fptoui float [[TMP37]] to i32 3972; CHECK-NEXT: [[TMP39:%.*]] = sub i32 0, [[TMP34]] 3973; 
CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]] 3974; CHECK-NEXT: [[TMP41:%.*]] = zext i32 [[TMP38]] to i64 3975; CHECK-NEXT: [[TMP42:%.*]] = zext i32 [[TMP40]] to i64 3976; CHECK-NEXT: [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]] 3977; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 3978; CHECK-NEXT: [[TMP45:%.*]] = lshr i64 [[TMP43]], 32 3979; CHECK-NEXT: [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32 3980; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]] 3981; CHECK-NEXT: [[TMP48:%.*]] = zext i32 [[TMP33]] to i64 3982; CHECK-NEXT: [[TMP49:%.*]] = zext i32 [[TMP47]] to i64 3983; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]] 3984; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 3985; CHECK-NEXT: [[TMP52:%.*]] = lshr i64 [[TMP50]], 32 3986; CHECK-NEXT: [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32 3987; CHECK-NEXT: [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]] 3988; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]] 3989; CHECK-NEXT: [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]] 3990; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP53]], 1 3991; CHECK-NEXT: [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]] 3992; CHECK-NEXT: [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]] 3993; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]] 3994; CHECK-NEXT: [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]] 3995; CHECK-NEXT: [[TMP62:%.*]] = add i32 [[TMP58]], 1 3996; CHECK-NEXT: [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]] 3997; CHECK-NEXT: [[TMP64:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP63]], i64 1 3998; CHECK-NEXT: store <2 x i32> [[TMP64]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 3999; CHECK-NEXT: ret void 4000; 4001; GCN-LABEL: udiv_v2i32_pow2_shl_denom: 4002; GCN: ; %bb.0: 4003; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 4004; GCN-NEXT: s_movk_i32 s4, 0x1000 4005; GCN-NEXT: s_mov_b32 s7, 0xf000 4006; GCN-NEXT: s_mov_b32 s6, -1 4007; GCN-NEXT: s_waitcnt 
lgkmcnt(0) 4008; GCN-NEXT: s_lshl_b32 s8, s4, s2 4009; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 4010; GCN-NEXT: s_lshl_b32 s9, s4, s3 4011; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 4012; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4013; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 4014; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 4015; GCN-NEXT: s_mov_b32 s0, 0x4f7ffffe 4016; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 4017; GCN-NEXT: v_mul_f32_e32 v0, s0, v0 4018; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 4019; GCN-NEXT: v_mul_f32_e32 v1, s0, v1 4020; GCN-NEXT: s_sub_i32 s0, 0, s8 4021; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 4022; GCN-NEXT: v_mul_lo_u32 v2, s0, v0 4023; GCN-NEXT: s_sub_i32 s0, 0, s9 4024; GCN-NEXT: v_mul_lo_u32 v3, s0, v1 4025; GCN-NEXT: v_mul_hi_u32 v2, v0, v2 4026; GCN-NEXT: v_mul_hi_u32 v3, v1, v3 4027; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 4028; GCN-NEXT: s_waitcnt lgkmcnt(0) 4029; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 4030; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 4031; GCN-NEXT: v_mul_hi_u32 v1, s3, v1 4032; GCN-NEXT: v_mul_lo_u32 v2, v0, s8 4033; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v0 4034; GCN-NEXT: v_mul_lo_u32 v4, v1, s9 4035; GCN-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 4036; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v2 4037; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] 4038; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s8, v2 4039; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 4040; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v0 4041; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 4042; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 4043; GCN-NEXT: v_sub_i32_e32 v2, vcc, s3, v4 4044; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v1 4045; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2 4046; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 4047; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s9, v2 4048; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 4049; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v1 4050; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 4051; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 4052; GCN-NEXT: buffer_store_dwordx2 v[0:1], 
off, s[4:7], 0 4053; GCN-NEXT: s_endpgm 4054 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 4055 %r = udiv <2 x i32> %x, %shl.y 4056 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 4057 ret void 4058} 4059 4060define amdgpu_kernel void @urem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 4061; CHECK-LABEL: @urem_i32_oddk_denom( 4062; CHECK-NEXT: [[R:%.*]] = urem i32 [[X:%.*]], 1235195 4063; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 4064; CHECK-NEXT: ret void 4065; 4066; GCN-LABEL: urem_i32_oddk_denom: 4067; GCN: ; %bb.0: 4068; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4069; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 4070; GCN-NEXT: v_mov_b32_e32 v0, 0xb2a50881 4071; GCN-NEXT: s_mov_b32 s7, 0xf000 4072; GCN-NEXT: s_mov_b32 s6, -1 4073; GCN-NEXT: s_waitcnt lgkmcnt(0) 4074; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 4075; GCN-NEXT: v_sub_i32_e32 v1, vcc, s0, v0 4076; GCN-NEXT: v_lshrrev_b32_e32 v1, 1, v1 4077; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 4078; GCN-NEXT: v_lshrrev_b32_e32 v0, 20, v0 4079; GCN-NEXT: v_mul_u32_u24_e32 v0, 0x12d8fb, v0 4080; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 4081; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 4082; GCN-NEXT: s_endpgm 4083 %r = urem i32 %x, 1235195 4084 store i32 %r, i32 addrspace(1)* %out 4085 ret void 4086} 4087 4088define amdgpu_kernel void @urem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 4089; CHECK-LABEL: @urem_i32_pow2k_denom( 4090; CHECK-NEXT: [[R:%.*]] = urem i32 [[X:%.*]], 4096 4091; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 4092; CHECK-NEXT: ret void 4093; 4094; GCN-LABEL: urem_i32_pow2k_denom: 4095; GCN: ; %bb.0: 4096; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4097; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 4098; GCN-NEXT: s_mov_b32 s7, 0xf000 4099; GCN-NEXT: s_mov_b32 s6, -1 4100; GCN-NEXT: s_waitcnt lgkmcnt(0) 4101; GCN-NEXT: s_and_b32 s0, s0, 0xfff 4102; GCN-NEXT: v_mov_b32_e32 v0, s0 4103; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 4104; GCN-NEXT: 
s_endpgm 4105 %r = urem i32 %x, 4096 4106 store i32 %r, i32 addrspace(1)* %out 4107 ret void 4108} 4109 4110define amdgpu_kernel void @urem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 4111; CHECK-LABEL: @urem_i32_pow2_shl_denom( 4112; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 4113; CHECK-NEXT: [[R:%.*]] = urem i32 [[X:%.*]], [[SHL_Y]] 4114; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 4115; CHECK-NEXT: ret void 4116; 4117; GCN-LABEL: urem_i32_pow2_shl_denom: 4118; GCN: ; %bb.0: 4119; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4120; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4121; GCN-NEXT: s_mov_b32 s7, 0xf000 4122; GCN-NEXT: s_mov_b32 s6, -1 4123; GCN-NEXT: s_waitcnt lgkmcnt(0) 4124; GCN-NEXT: s_lshl_b32 s1, 0x1000, s1 4125; GCN-NEXT: s_add_i32 s1, s1, -1 4126; GCN-NEXT: s_and_b32 s0, s0, s1 4127; GCN-NEXT: v_mov_b32_e32 v0, s0 4128; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 4129; GCN-NEXT: s_endpgm 4130 %shl.y = shl i32 4096, %y 4131 %r = urem i32 %x, %shl.y 4132 store i32 %r, i32 addrspace(1)* %out 4133 ret void 4134} 4135 4136define amdgpu_kernel void @urem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 4137; CHECK-LABEL: @urem_v2i32_pow2k_denom( 4138; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 4139; CHECK-NEXT: [[TMP2:%.*]] = urem i32 [[TMP1]], 4096 4140; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 4141; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 4142; CHECK-NEXT: [[TMP5:%.*]] = urem i32 [[TMP4]], 4096 4143; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 4144; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 4145; CHECK-NEXT: ret void 4146; 4147; GCN-LABEL: urem_v2i32_pow2k_denom: 4148; GCN: ; %bb.0: 4149; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4150; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4151; GCN-NEXT: s_movk_i32 s2, 0xfff 
4152; GCN-NEXT: s_mov_b32 s7, 0xf000 4153; GCN-NEXT: s_mov_b32 s6, -1 4154; GCN-NEXT: s_waitcnt lgkmcnt(0) 4155; GCN-NEXT: s_and_b32 s0, s0, s2 4156; GCN-NEXT: s_and_b32 s1, s1, s2 4157; GCN-NEXT: v_mov_b32_e32 v0, s0 4158; GCN-NEXT: v_mov_b32_e32 v1, s1 4159; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4160; GCN-NEXT: s_endpgm 4161 %r = urem <2 x i32> %x, <i32 4096, i32 4096> 4162 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 4163 ret void 4164} 4165 4166define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 4167; CHECK-LABEL: @urem_v2i32_pow2_shl_denom( 4168; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 4169; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 4170; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 4171; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 4172; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 4173; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 4174; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 4175; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 4176; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 4177; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 4178; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 4179; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 4180; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 4181; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 4182; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 4183; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 4184; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 4185; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 4186; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 4187; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 4188; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 4189; 
CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 4190; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 4191; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 4192; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 4193; CHECK-NEXT: [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]] 4194; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]] 4195; CHECK-NEXT: [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]] 4196; CHECK-NEXT: [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]] 4197; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]] 4198; CHECK-NEXT: [[TMP30:%.*]] = insertelement <2 x i32> undef, i32 [[TMP29]], i64 0 4199; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x i32> [[X]], i64 1 4200; CHECK-NEXT: [[TMP32:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 4201; CHECK-NEXT: [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float 4202; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 4203; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000 4204; CHECK-NEXT: [[TMP36:%.*]] = fptoui float [[TMP35]] to i32 4205; CHECK-NEXT: [[TMP37:%.*]] = sub i32 0, [[TMP32]] 4206; CHECK-NEXT: [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]] 4207; CHECK-NEXT: [[TMP39:%.*]] = zext i32 [[TMP36]] to i64 4208; CHECK-NEXT: [[TMP40:%.*]] = zext i32 [[TMP38]] to i64 4209; CHECK-NEXT: [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]] 4210; CHECK-NEXT: [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32 4211; CHECK-NEXT: [[TMP43:%.*]] = lshr i64 [[TMP41]], 32 4212; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 4213; CHECK-NEXT: [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]] 4214; CHECK-NEXT: [[TMP46:%.*]] = zext i32 [[TMP31]] to i64 4215; CHECK-NEXT: [[TMP47:%.*]] = zext i32 [[TMP45]] to i64 4216; CHECK-NEXT: [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]] 4217; CHECK-NEXT: [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32 4218; CHECK-NEXT: [[TMP50:%.*]] = lshr i64 [[TMP48]], 32 4219; CHECK-NEXT: 
[[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 4220; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]] 4221; CHECK-NEXT: [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]] 4222; CHECK-NEXT: [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]] 4223; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]] 4224; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]] 4225; CHECK-NEXT: [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]] 4226; CHECK-NEXT: [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]] 4227; CHECK-NEXT: [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]] 4228; CHECK-NEXT: [[TMP60:%.*]] = insertelement <2 x i32> [[TMP30]], i32 [[TMP59]], i64 1 4229; CHECK-NEXT: store <2 x i32> [[TMP60]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 4230; CHECK-NEXT: ret void 4231; 4232; GCN-LABEL: urem_v2i32_pow2_shl_denom: 4233; GCN: ; %bb.0: 4234; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 4235; GCN-NEXT: s_movk_i32 s4, 0x1000 4236; GCN-NEXT: s_mov_b32 s7, 0xf000 4237; GCN-NEXT: s_mov_b32 s6, -1 4238; GCN-NEXT: s_waitcnt lgkmcnt(0) 4239; GCN-NEXT: s_lshl_b32 s8, s4, s2 4240; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 4241; GCN-NEXT: s_lshl_b32 s3, s4, s3 4242; GCN-NEXT: v_cvt_f32_u32_e32 v1, s3 4243; GCN-NEXT: s_mov_b32 s4, 0x4f7ffffe 4244; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 4245; GCN-NEXT: s_sub_i32 s2, 0, s8 4246; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 4247; GCN-NEXT: v_mul_f32_e32 v0, s4, v0 4248; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 4249; GCN-NEXT: v_mul_f32_e32 v1, s4, v1 4250; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 4251; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4252; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4253; GCN-NEXT: v_mul_lo_u32 v2, s2, v0 4254; GCN-NEXT: s_sub_i32 s2, 0, s3 4255; GCN-NEXT: v_mul_lo_u32 v3, s2, v1 4256; GCN-NEXT: v_mul_hi_u32 v2, v0, v2 4257; GCN-NEXT: v_mul_hi_u32 v3, v1, v3 4258; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 4259; GCN-NEXT: s_waitcnt lgkmcnt(0) 4260; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 4261; GCN-NEXT: 
v_add_i32_e32 v1, vcc, v3, v1 4262; GCN-NEXT: v_mul_hi_u32 v1, s1, v1 4263; GCN-NEXT: v_mul_lo_u32 v0, v0, s8 4264; GCN-NEXT: v_mul_lo_u32 v1, v1, s3 4265; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 4266; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s8, v0 4267; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 4268; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4269; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s8, v0 4270; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 4271; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4272; GCN-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 4273; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 4274; GCN-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 4275; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4276; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 4277; GCN-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 4278; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4279; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4280; GCN-NEXT: s_endpgm 4281 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 4282 %r = urem <2 x i32> %x, %shl.y 4283 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 4284 ret void 4285} 4286 4287define amdgpu_kernel void @sdiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 4288; CHECK-LABEL: @sdiv_i32_oddk_denom( 4289; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], 1235195 4290; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 4291; CHECK-NEXT: ret void 4292; 4293; GCN-LABEL: sdiv_i32_oddk_denom: 4294; GCN: ; %bb.0: 4295; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4296; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 4297; GCN-NEXT: v_mov_b32_e32 v0, 0xd9528441 4298; GCN-NEXT: s_mov_b32 s7, 0xf000 4299; GCN-NEXT: s_mov_b32 s6, -1 4300; GCN-NEXT: s_waitcnt lgkmcnt(0) 4301; GCN-NEXT: v_mul_hi_i32 v0, s0, v0 4302; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0 4303; GCN-NEXT: v_lshrrev_b32_e32 v1, 31, v0 4304; GCN-NEXT: v_ashrrev_i32_e32 v0, 20, v0 4305; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 4306; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 4307; GCN-NEXT: s_endpgm 4308 %r = sdiv i32 %x, 1235195 
4309 store i32 %r, i32 addrspace(1)* %out 4310 ret void 4311} 4312 4313define amdgpu_kernel void @sdiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 4314; CHECK-LABEL: @sdiv_i32_pow2k_denom( 4315; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], 4096 4316; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 4317; CHECK-NEXT: ret void 4318; 4319; GCN-LABEL: sdiv_i32_pow2k_denom: 4320; GCN: ; %bb.0: 4321; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4322; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 4323; GCN-NEXT: s_mov_b32 s7, 0xf000 4324; GCN-NEXT: s_mov_b32 s6, -1 4325; GCN-NEXT: s_waitcnt lgkmcnt(0) 4326; GCN-NEXT: s_ashr_i32 s1, s0, 31 4327; GCN-NEXT: s_lshr_b32 s1, s1, 20 4328; GCN-NEXT: s_add_i32 s0, s0, s1 4329; GCN-NEXT: s_ashr_i32 s0, s0, 12 4330; GCN-NEXT: v_mov_b32_e32 v0, s0 4331; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 4332; GCN-NEXT: s_endpgm 4333 %r = sdiv i32 %x, 4096 4334 store i32 %r, i32 addrspace(1)* %out 4335 ret void 4336} 4337 4338define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 4339; CHECK-LABEL: @sdiv_i32_pow2_shl_denom( 4340; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 4341; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], [[SHL_Y]] 4342; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 4343; CHECK-NEXT: ret void 4344; 4345; GCN-LABEL: sdiv_i32_pow2_shl_denom: 4346; GCN: ; %bb.0: 4347; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 4348; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4349; GCN-NEXT: s_waitcnt lgkmcnt(0) 4350; GCN-NEXT: s_lshl_b32 s3, 0x1000, s3 4351; GCN-NEXT: s_ashr_i32 s4, s3, 31 4352; GCN-NEXT: s_add_i32 s3, s3, s4 4353; GCN-NEXT: s_xor_b32 s7, s3, s4 4354; GCN-NEXT: v_cvt_f32_u32_e32 v0, s7 4355; GCN-NEXT: s_sub_i32 s3, 0, s7 4356; GCN-NEXT: s_ashr_i32 s5, s2, 31 4357; GCN-NEXT: s_add_i32 s2, s2, s5 4358; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 4359; GCN-NEXT: s_xor_b32 s6, s2, s5 4360; GCN-NEXT: s_xor_b32 s4, s5, s4 4361; GCN-NEXT: s_mov_b32 
s2, -1 4362; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 4363; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 4364; GCN-NEXT: v_mul_lo_u32 v1, s3, v0 4365; GCN-NEXT: s_mov_b32 s3, 0xf000 4366; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 4367; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 4368; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 4369; GCN-NEXT: v_mul_lo_u32 v1, v0, s7 4370; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 4371; GCN-NEXT: v_sub_i32_e32 v1, vcc, s6, v1 4372; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s7, v1 4373; GCN-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 4374; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4375; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 4376; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 4377; GCN-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 4378; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 4379; GCN-NEXT: v_xor_b32_e32 v0, s4, v0 4380; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 4381; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 4382; GCN-NEXT: s_endpgm 4383 %shl.y = shl i32 4096, %y 4384 %r = sdiv i32 %x, %shl.y 4385 store i32 %r, i32 addrspace(1)* %out 4386 ret void 4387} 4388 4389define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 4390; CHECK-LABEL: @sdiv_v2i32_pow2k_denom( 4391; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 4392; CHECK-NEXT: [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096 4393; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 4394; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 4395; CHECK-NEXT: [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4096 4396; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 4397; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 4398; CHECK-NEXT: ret void 4399; 4400; GCN-LABEL: sdiv_v2i32_pow2k_denom: 4401; GCN: ; %bb.0: 4402; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4403; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4404; GCN-NEXT: s_mov_b32 s7, 0xf000 4405; GCN-NEXT: s_mov_b32 s6, -1 4406; 
GCN-NEXT: s_waitcnt lgkmcnt(0) 4407; GCN-NEXT: s_ashr_i32 s2, s0, 31 4408; GCN-NEXT: s_lshr_b32 s2, s2, 20 4409; GCN-NEXT: s_ashr_i32 s3, s1, 31 4410; GCN-NEXT: s_add_i32 s0, s0, s2 4411; GCN-NEXT: s_lshr_b32 s2, s3, 20 4412; GCN-NEXT: s_add_i32 s1, s1, s2 4413; GCN-NEXT: s_ashr_i32 s0, s0, 12 4414; GCN-NEXT: s_ashr_i32 s1, s1, 12 4415; GCN-NEXT: v_mov_b32_e32 v0, s0 4416; GCN-NEXT: v_mov_b32_e32 v1, s1 4417; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4418; GCN-NEXT: s_endpgm 4419 %r = sdiv <2 x i32> %x, <i32 4096, i32 4096> 4420 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 4421 ret void 4422} 4423 4424define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 4425; CHECK-LABEL: @ssdiv_v2i32_mixed_pow2k_denom( 4426; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 4427; CHECK-NEXT: [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096 4428; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 4429; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 4430; CHECK-NEXT: [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4095 4431; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 4432; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 4433; CHECK-NEXT: ret void 4434; 4435; GCN-LABEL: ssdiv_v2i32_mixed_pow2k_denom: 4436; GCN: ; %bb.0: 4437; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4438; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4439; GCN-NEXT: v_mov_b32_e32 v0, 0x80080081 4440; GCN-NEXT: s_mov_b32 s7, 0xf000 4441; GCN-NEXT: s_mov_b32 s6, -1 4442; GCN-NEXT: s_waitcnt lgkmcnt(0) 4443; GCN-NEXT: v_mul_hi_i32 v0, s1, v0 4444; GCN-NEXT: s_ashr_i32 s2, s0, 31 4445; GCN-NEXT: s_lshr_b32 s2, s2, 20 4446; GCN-NEXT: s_add_i32 s0, s0, s2 4447; GCN-NEXT: v_add_i32_e32 v0, vcc, s1, v0 4448; GCN-NEXT: v_lshrrev_b32_e32 v1, 31, v0 4449; GCN-NEXT: v_ashrrev_i32_e32 v0, 11, v0 4450; GCN-NEXT: s_ashr_i32 s0, s0, 12 4451; GCN-NEXT: 
v_add_i32_e32 v1, vcc, v1, v0 4452; GCN-NEXT: v_mov_b32_e32 v0, s0 4453; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4454; GCN-NEXT: s_endpgm 4455 %r = sdiv <2 x i32> %x, <i32 4096, i32 4095> 4456 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 4457 ret void 4458} 4459 4460define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 4461; CHECK-LABEL: @sdiv_v2i32_pow2_shl_denom( 4462; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 4463; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 4464; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 4465; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 4466; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 4467; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 4468; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]] 4469; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]] 4470; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]] 4471; CHECK-NEXT: [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]] 4472; CHECK-NEXT: [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float 4473; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]]) 4474; CHECK-NEXT: [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000 4475; CHECK-NEXT: [[TMP13:%.*]] = fptoui float [[TMP12]] to i32 4476; CHECK-NEXT: [[TMP14:%.*]] = sub i32 0, [[TMP9]] 4477; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]] 4478; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP13]] to i64 4479; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 4480; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 4481; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 4482; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 4483; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 4484; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]] 4485; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP8]] to i64 4486; CHECK-NEXT: 
[[TMP24:%.*]] = zext i32 [[TMP22]] to i64 4487; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]] 4488; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32 4489; CHECK-NEXT: [[TMP27:%.*]] = lshr i64 [[TMP25]], 32 4490; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 4491; CHECK-NEXT: [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]] 4492; CHECK-NEXT: [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]] 4493; CHECK-NEXT: [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]] 4494; CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP28]], 1 4495; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]] 4496; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]] 4497; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]] 4498; CHECK-NEXT: [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]] 4499; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP33]], 1 4500; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]] 4501; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]] 4502; CHECK-NEXT: [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]] 4503; CHECK-NEXT: [[TMP41:%.*]] = insertelement <2 x i32> undef, i32 [[TMP40]], i64 0 4504; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x i32> [[X]], i64 1 4505; CHECK-NEXT: [[TMP43:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 4506; CHECK-NEXT: [[TMP44:%.*]] = ashr i32 [[TMP42]], 31 4507; CHECK-NEXT: [[TMP45:%.*]] = ashr i32 [[TMP43]], 31 4508; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]] 4509; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]] 4510; CHECK-NEXT: [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]] 4511; CHECK-NEXT: [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]] 4512; CHECK-NEXT: [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]] 4513; CHECK-NEXT: [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float 4514; CHECK-NEXT: [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]]) 4515; CHECK-NEXT: [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000 4516; 
CHECK-NEXT: [[TMP54:%.*]] = fptoui float [[TMP53]] to i32 4517; CHECK-NEXT: [[TMP55:%.*]] = sub i32 0, [[TMP50]] 4518; CHECK-NEXT: [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]] 4519; CHECK-NEXT: [[TMP57:%.*]] = zext i32 [[TMP54]] to i64 4520; CHECK-NEXT: [[TMP58:%.*]] = zext i32 [[TMP56]] to i64 4521; CHECK-NEXT: [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]] 4522; CHECK-NEXT: [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32 4523; CHECK-NEXT: [[TMP61:%.*]] = lshr i64 [[TMP59]], 32 4524; CHECK-NEXT: [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32 4525; CHECK-NEXT: [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]] 4526; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[TMP49]] to i64 4527; CHECK-NEXT: [[TMP65:%.*]] = zext i32 [[TMP63]] to i64 4528; CHECK-NEXT: [[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]] 4529; CHECK-NEXT: [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32 4530; CHECK-NEXT: [[TMP68:%.*]] = lshr i64 [[TMP66]], 32 4531; CHECK-NEXT: [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32 4532; CHECK-NEXT: [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]] 4533; CHECK-NEXT: [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]] 4534; CHECK-NEXT: [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]] 4535; CHECK-NEXT: [[TMP73:%.*]] = add i32 [[TMP69]], 1 4536; CHECK-NEXT: [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]] 4537; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]] 4538; CHECK-NEXT: [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]] 4539; CHECK-NEXT: [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]] 4540; CHECK-NEXT: [[TMP78:%.*]] = add i32 [[TMP74]], 1 4541; CHECK-NEXT: [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]] 4542; CHECK-NEXT: [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]] 4543; CHECK-NEXT: [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]] 4544; CHECK-NEXT: [[TMP82:%.*]] = insertelement <2 x i32> [[TMP41]], i32 [[TMP81]], i64 1 4545; CHECK-NEXT: store <2 x i32> [[TMP82]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 4546; CHECK-NEXT: ret void 4547; 4548; 
GCN-LABEL: sdiv_v2i32_pow2_shl_denom: 4549; GCN: ; %bb.0: 4550; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 4551; GCN-NEXT: s_movk_i32 s10, 0x1000 4552; GCN-NEXT: s_mov_b32 s13, 0x4f7ffffe 4553; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4554; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb 4555; GCN-NEXT: s_mov_b32 s7, 0xf000 4556; GCN-NEXT: s_waitcnt lgkmcnt(0) 4557; GCN-NEXT: s_lshl_b32 s2, s10, s2 4558; GCN-NEXT: s_ashr_i32 s11, s2, 31 4559; GCN-NEXT: s_add_i32 s2, s2, s11 4560; GCN-NEXT: s_xor_b32 s12, s2, s11 4561; GCN-NEXT: v_cvt_f32_u32_e32 v0, s12 4562; GCN-NEXT: s_lshl_b32 s0, s10, s3 4563; GCN-NEXT: s_sub_i32 s3, 0, s12 4564; GCN-NEXT: s_ashr_i32 s2, s0, 31 4565; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 4566; GCN-NEXT: s_add_i32 s0, s0, s2 4567; GCN-NEXT: s_xor_b32 s10, s0, s2 4568; GCN-NEXT: v_cvt_f32_u32_e32 v2, s10 4569; GCN-NEXT: v_mul_f32_e32 v0, s13, v0 4570; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 4571; GCN-NEXT: s_ashr_i32 s1, s8, 31 4572; GCN-NEXT: s_add_i32 s0, s8, s1 4573; GCN-NEXT: s_xor_b32 s0, s0, s1 4574; GCN-NEXT: v_mul_lo_u32 v1, s3, v0 4575; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 4576; GCN-NEXT: s_xor_b32 s3, s1, s11 4577; GCN-NEXT: s_mov_b32 s6, -1 4578; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 4579; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 4580; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 4581; GCN-NEXT: v_mul_f32_e32 v1, s13, v2 4582; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 4583; GCN-NEXT: v_mul_lo_u32 v2, v0, s12 4584; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v0 4585; GCN-NEXT: v_sub_i32_e32 v2, vcc, s0, v2 4586; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v2 4587; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] 4588; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s12, v2 4589; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 4590; GCN-NEXT: s_sub_i32 s0, 0, s10 4591; GCN-NEXT: v_mul_lo_u32 v3, s0, v1 4592; GCN-NEXT: s_ashr_i32 s0, s9, 31 4593; GCN-NEXT: s_add_i32 s1, s9, s0 4594; GCN-NEXT: s_xor_b32 s1, s1, s0 4595; GCN-NEXT: v_mul_hi_u32 v3, v1, v3 4596; GCN-NEXT: v_add_i32_e32 v4, vcc, 
1, v0 4597; GCN-NEXT: s_xor_b32 s2, s0, s2 4598; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 4599; GCN-NEXT: v_mul_hi_u32 v1, s1, v1 4600; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v2 4601; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 4602; GCN-NEXT: v_xor_b32_e32 v0, s3, v0 4603; GCN-NEXT: v_mul_lo_u32 v2, v1, s10 4604; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v1 4605; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s3, v0 4606; GCN-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 4607; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2 4608; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 4609; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s10, v2 4610; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 4611; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v1 4612; GCN-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 4613; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 4614; GCN-NEXT: v_xor_b32_e32 v1, s2, v1 4615; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s2, v1 4616; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4617; GCN-NEXT: s_endpgm 4618 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 4619 %r = sdiv <2 x i32> %x, %shl.y 4620 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 4621 ret void 4622} 4623 4624define amdgpu_kernel void @srem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 4625; CHECK-LABEL: @srem_i32_oddk_denom( 4626; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], 1235195 4627; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 4628; CHECK-NEXT: ret void 4629; 4630; GCN-LABEL: srem_i32_oddk_denom: 4631; GCN: ; %bb.0: 4632; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4633; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 4634; GCN-NEXT: v_mov_b32_e32 v0, 0xd9528441 4635; GCN-NEXT: s_mov_b32 s7, 0xf000 4636; GCN-NEXT: s_mov_b32 s6, -1 4637; GCN-NEXT: s_waitcnt lgkmcnt(0) 4638; GCN-NEXT: v_mul_hi_i32 v0, s0, v0 4639; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0 4640; GCN-NEXT: v_lshrrev_b32_e32 v1, 31, v0 4641; GCN-NEXT: v_ashrrev_i32_e32 v0, 20, v0 4642; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 4643; GCN-NEXT: v_mul_i32_i24_e32 v0, 
0x12d8fb, v0 4644; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 4645; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 4646; GCN-NEXT: s_endpgm 4647 %r = srem i32 %x, 1235195 4648 store i32 %r, i32 addrspace(1)* %out 4649 ret void 4650} 4651 4652define amdgpu_kernel void @srem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 4653; CHECK-LABEL: @srem_i32_pow2k_denom( 4654; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], 4096 4655; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 4656; CHECK-NEXT: ret void 4657; 4658; GCN-LABEL: srem_i32_pow2k_denom: 4659; GCN: ; %bb.0: 4660; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4661; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 4662; GCN-NEXT: s_mov_b32 s7, 0xf000 4663; GCN-NEXT: s_mov_b32 s6, -1 4664; GCN-NEXT: s_waitcnt lgkmcnt(0) 4665; GCN-NEXT: s_ashr_i32 s1, s0, 31 4666; GCN-NEXT: s_lshr_b32 s1, s1, 20 4667; GCN-NEXT: s_add_i32 s1, s0, s1 4668; GCN-NEXT: s_and_b32 s1, s1, 0xfffff000 4669; GCN-NEXT: s_sub_i32 s0, s0, s1 4670; GCN-NEXT: v_mov_b32_e32 v0, s0 4671; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 4672; GCN-NEXT: s_endpgm 4673 %r = srem i32 %x, 4096 4674 store i32 %r, i32 addrspace(1)* %out 4675 ret void 4676} 4677 4678define amdgpu_kernel void @srem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 4679; CHECK-LABEL: @srem_i32_pow2_shl_denom( 4680; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 4681; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], [[SHL_Y]] 4682; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 4683; CHECK-NEXT: ret void 4684; 4685; GCN-LABEL: srem_i32_pow2_shl_denom: 4686; GCN: ; %bb.0: 4687; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 4688; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4689; GCN-NEXT: s_waitcnt lgkmcnt(0) 4690; GCN-NEXT: s_lshl_b32 s3, 0x1000, s3 4691; GCN-NEXT: s_ashr_i32 s4, s3, 31 4692; GCN-NEXT: s_add_i32 s3, s3, s4 4693; GCN-NEXT: s_xor_b32 s6, s3, s4 4694; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 4695; GCN-NEXT: s_sub_i32 s3, 0, s6 4696; 
GCN-NEXT: s_ashr_i32 s4, s2, 31 4697; GCN-NEXT: s_add_i32 s2, s2, s4 4698; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 4699; GCN-NEXT: s_xor_b32 s5, s2, s4 4700; GCN-NEXT: s_mov_b32 s2, -1 4701; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 4702; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 4703; GCN-NEXT: v_mul_lo_u32 v1, s3, v0 4704; GCN-NEXT: s_mov_b32 s3, 0xf000 4705; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 4706; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 4707; GCN-NEXT: v_mul_hi_u32 v0, s5, v0 4708; GCN-NEXT: v_mul_lo_u32 v0, v0, s6 4709; GCN-NEXT: v_sub_i32_e32 v0, vcc, s5, v0 4710; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s6, v0 4711; GCN-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 4712; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 4713; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s6, v0 4714; GCN-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 4715; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 4716; GCN-NEXT: v_xor_b32_e32 v0, s4, v0 4717; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 4718; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 4719; GCN-NEXT: s_endpgm 4720 %shl.y = shl i32 4096, %y 4721 %r = srem i32 %x, %shl.y 4722 store i32 %r, i32 addrspace(1)* %out 4723 ret void 4724} 4725 4726define amdgpu_kernel void @srem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 4727; CHECK-LABEL: @srem_v2i32_pow2k_denom( 4728; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 4729; CHECK-NEXT: [[TMP2:%.*]] = srem i32 [[TMP1]], 4096 4730; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 4731; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 4732; CHECK-NEXT: [[TMP5:%.*]] = srem i32 [[TMP4]], 4096 4733; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 4734; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 4735; CHECK-NEXT: ret void 4736; 4737; GCN-LABEL: srem_v2i32_pow2k_denom: 4738; GCN: ; %bb.0: 4739; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4740; GCN-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0xb 4741; GCN-NEXT: s_movk_i32 s2, 0xf000 4742; GCN-NEXT: s_mov_b32 s7, 0xf000 4743; GCN-NEXT: s_mov_b32 s6, -1 4744; GCN-NEXT: s_waitcnt lgkmcnt(0) 4745; GCN-NEXT: s_ashr_i32 s3, s0, 31 4746; GCN-NEXT: s_lshr_b32 s3, s3, 20 4747; GCN-NEXT: s_add_i32 s3, s0, s3 4748; GCN-NEXT: s_and_b32 s3, s3, s2 4749; GCN-NEXT: s_sub_i32 s0, s0, s3 4750; GCN-NEXT: s_ashr_i32 s3, s1, 31 4751; GCN-NEXT: s_lshr_b32 s3, s3, 20 4752; GCN-NEXT: s_add_i32 s3, s1, s3 4753; GCN-NEXT: s_and_b32 s2, s3, s2 4754; GCN-NEXT: s_sub_i32 s1, s1, s2 4755; GCN-NEXT: v_mov_b32_e32 v0, s0 4756; GCN-NEXT: v_mov_b32_e32 v1, s1 4757; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4758; GCN-NEXT: s_endpgm 4759 %r = srem <2 x i32> %x, <i32 4096, i32 4096> 4760 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 4761 ret void 4762} 4763 4764define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 4765; CHECK-LABEL: @srem_v2i32_pow2_shl_denom( 4766; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 4767; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 4768; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 4769; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 4770; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 4771; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]] 4772; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]] 4773; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]] 4774; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]] 4775; CHECK-NEXT: [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float 4776; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 4777; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000 4778; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP11]] to i32 4779; CHECK-NEXT: [[TMP13:%.*]] = sub i32 0, [[TMP8]] 4780; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]] 4781; CHECK-NEXT: 
[[TMP15:%.*]] = zext i32 [[TMP12]] to i64 4782; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 4783; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]] 4784; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 4785; CHECK-NEXT: [[TMP19:%.*]] = lshr i64 [[TMP17]], 32 4786; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 4787; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]] 4788; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP7]] to i64 4789; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP21]] to i64 4790; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]] 4791; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32 4792; CHECK-NEXT: [[TMP26:%.*]] = lshr i64 [[TMP24]], 32 4793; CHECK-NEXT: [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32 4794; CHECK-NEXT: [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]] 4795; CHECK-NEXT: [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]] 4796; CHECK-NEXT: [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]] 4797; CHECK-NEXT: [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]] 4798; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP30]], i32 [[TMP31]], i32 [[TMP29]] 4799; CHECK-NEXT: [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]] 4800; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]] 4801; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]] 4802; CHECK-NEXT: [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]] 4803; CHECK-NEXT: [[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]] 4804; CHECK-NEXT: [[TMP38:%.*]] = insertelement <2 x i32> undef, i32 [[TMP37]], i64 0 4805; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i32> [[X]], i64 1 4806; CHECK-NEXT: [[TMP40:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 4807; CHECK-NEXT: [[TMP41:%.*]] = ashr i32 [[TMP39]], 31 4808; CHECK-NEXT: [[TMP42:%.*]] = ashr i32 [[TMP40]], 31 4809; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]] 4810; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]] 4811; CHECK-NEXT: [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]] 4812; CHECK-NEXT: 
[[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]] 4813; CHECK-NEXT: [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float 4814; CHECK-NEXT: [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]]) 4815; CHECK-NEXT: [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000 4816; CHECK-NEXT: [[TMP50:%.*]] = fptoui float [[TMP49]] to i32 4817; CHECK-NEXT: [[TMP51:%.*]] = sub i32 0, [[TMP46]] 4818; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]] 4819; CHECK-NEXT: [[TMP53:%.*]] = zext i32 [[TMP50]] to i64 4820; CHECK-NEXT: [[TMP54:%.*]] = zext i32 [[TMP52]] to i64 4821; CHECK-NEXT: [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]] 4822; CHECK-NEXT: [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32 4823; CHECK-NEXT: [[TMP57:%.*]] = lshr i64 [[TMP55]], 32 4824; CHECK-NEXT: [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32 4825; CHECK-NEXT: [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]] 4826; CHECK-NEXT: [[TMP60:%.*]] = zext i32 [[TMP45]] to i64 4827; CHECK-NEXT: [[TMP61:%.*]] = zext i32 [[TMP59]] to i64 4828; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]] 4829; CHECK-NEXT: [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32 4830; CHECK-NEXT: [[TMP64:%.*]] = lshr i64 [[TMP62]], 32 4831; CHECK-NEXT: [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32 4832; CHECK-NEXT: [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]] 4833; CHECK-NEXT: [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]] 4834; CHECK-NEXT: [[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]] 4835; CHECK-NEXT: [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]] 4836; CHECK-NEXT: [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]] 4837; CHECK-NEXT: [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]] 4838; CHECK-NEXT: [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]] 4839; CHECK-NEXT: [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]] 4840; CHECK-NEXT: [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]] 4841; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]] 4842; CHECK-NEXT: [[TMP76:%.*]] = insertelement <2 x i32> [[TMP38]], 
i32 [[TMP75]], i64 1 4843; CHECK-NEXT: store <2 x i32> [[TMP76]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 4844; CHECK-NEXT: ret void 4845; 4846; GCN-LABEL: srem_v2i32_pow2_shl_denom: 4847; GCN: ; %bb.0: 4848; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 4849; GCN-NEXT: s_movk_i32 s6, 0x1000 4850; GCN-NEXT: s_mov_b32 s10, 0x4f7ffffe 4851; GCN-NEXT: s_mov_b32 s7, 0xf000 4852; GCN-NEXT: s_waitcnt lgkmcnt(0) 4853; GCN-NEXT: s_lshl_b32 s2, s6, s2 4854; GCN-NEXT: s_ashr_i32 s4, s2, 31 4855; GCN-NEXT: s_add_i32 s2, s2, s4 4856; GCN-NEXT: s_xor_b32 s9, s2, s4 4857; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 4858; GCN-NEXT: s_lshl_b32 s2, s6, s3 4859; GCN-NEXT: s_ashr_i32 s6, s2, 31 4860; GCN-NEXT: s_add_i32 s2, s2, s6 4861; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 4862; GCN-NEXT: s_sub_i32 s8, 0, s9 4863; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4864; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4865; GCN-NEXT: v_mul_f32_e32 v0, s10, v0 4866; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 4867; GCN-NEXT: s_waitcnt lgkmcnt(0) 4868; GCN-NEXT: s_ashr_i32 s3, s0, 31 4869; GCN-NEXT: s_add_i32 s0, s0, s3 4870; GCN-NEXT: v_mul_lo_u32 v1, s8, v0 4871; GCN-NEXT: s_xor_b32 s8, s2, s6 4872; GCN-NEXT: v_cvt_f32_u32_e32 v2, s8 4873; GCN-NEXT: s_xor_b32 s0, s0, s3 4874; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 4875; GCN-NEXT: s_sub_i32 s2, 0, s8 4876; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 4877; GCN-NEXT: s_mov_b32 s6, -1 4878; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 4879; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 4880; GCN-NEXT: v_mul_f32_e32 v1, s10, v2 4881; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 4882; GCN-NEXT: v_mul_lo_u32 v0, v0, s9 4883; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 4884; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 4885; GCN-NEXT: s_ashr_i32 s0, s1, 31 4886; GCN-NEXT: v_mul_hi_u32 v2, v1, v2 4887; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s9, v0 4888; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v0 4889; GCN-NEXT: s_add_i32 s1, s1, s0 4890; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 4891; GCN-NEXT: s_xor_b32 s1, s1, s0 4892; 
GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 4893; GCN-NEXT: v_mul_hi_u32 v1, s1, v1 4894; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s9, v0 4895; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v0 4896; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 4897; GCN-NEXT: v_mul_lo_u32 v1, v1, s8 4898; GCN-NEXT: v_xor_b32_e32 v0, s3, v0 4899; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s3, v0 4900; GCN-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 4901; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s8, v1 4902; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v1 4903; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4904; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s8, v1 4905; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v1 4906; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4907; GCN-NEXT: v_xor_b32_e32 v1, s0, v1 4908; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s0, v1 4909; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4910; GCN-NEXT: s_endpgm 4911 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 4912 %r = srem <2 x i32> %x, %shl.y 4913 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 4914 ret void 4915} 4916 4917define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 4918; CHECK-LABEL: @udiv_i64_oddk_denom( 4919; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], 1235195949943 4920; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 4921; CHECK-NEXT: ret void 4922; 4923; GCN-LABEL: udiv_i64_oddk_denom: 4924; GCN: ; %bb.0: 4925; GCN-NEXT: v_mov_b32_e32 v0, 0x4f176a73 4926; GCN-NEXT: v_mov_b32_e32 v1, 0x4f800000 4927; GCN-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 4928; GCN-NEXT: v_rcp_f32_e32 v0, v0 4929; GCN-NEXT: s_movk_i32 s2, 0xfee0 4930; GCN-NEXT: s_mov_b32 s3, 0x68958c89 4931; GCN-NEXT: v_mov_b32_e32 v8, 0 4932; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 4933; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 4934; GCN-NEXT: v_trunc_f32_e32 v1, v1 4935; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 4936; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 4937; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 4938; GCN-NEXT: v_mov_b32_e32 v7, 0 4939; GCN-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x9 4940; GCN-NEXT: v_mul_lo_u32 v2, v0, s2 4941; GCN-NEXT: v_mul_hi_u32 v3, v0, s3 4942; GCN-NEXT: v_mul_lo_u32 v4, v1, s3 4943; GCN-NEXT: s_mov_b32 s11, 0xf000 4944; GCN-NEXT: s_waitcnt lgkmcnt(0) 4945; GCN-NEXT: s_mov_b32 s8, s4 4946; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 4947; GCN-NEXT: v_mul_lo_u32 v3, v0, s3 4948; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 4949; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 4950; GCN-NEXT: v_mul_hi_u32 v4, v0, v2 4951; GCN-NEXT: v_mul_hi_u32 v6, v0, v3 4952; GCN-NEXT: v_mul_hi_u32 v9, v1, v2 4953; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 4954; GCN-NEXT: s_movk_i32 s4, 0x11e 4955; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 4956; GCN-NEXT: v_mul_lo_u32 v6, v1, v3 4957; GCN-NEXT: v_mul_hi_u32 v3, v1, v3 4958; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc 4959; GCN-NEXT: s_mov_b32 s10, -1 4960; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 4961; GCN-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc 4962; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc 4963; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 4964; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 4965; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 4966; GCN-NEXT: v_mul_lo_u32 v4, v0, s2 4967; GCN-NEXT: v_mul_hi_u32 v5, v0, s3 4968; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 4969; GCN-NEXT: v_mul_lo_u32 v6, v2, s3 4970; GCN-NEXT: s_mov_b32 s2, 0x976a7377 4971; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 4972; GCN-NEXT: v_mul_lo_u32 v5, v0, s3 4973; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 4974; GCN-NEXT: v_mul_lo_u32 v6, v0, v4 4975; GCN-NEXT: v_mul_hi_u32 v10, v0, v4 4976; GCN-NEXT: v_mul_hi_u32 v9, v0, v5 4977; GCN-NEXT: v_mul_hi_u32 v11, v2, v4 4978; GCN-NEXT: s_movk_i32 s3, 0x11f 4979; GCN-NEXT: s_mov_b32 s9, s5 4980; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6 4981; GCN-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc 4982; GCN-NEXT: v_mul_lo_u32 v10, v2, v5 4983; GCN-NEXT: v_mul_hi_u32 v5, v2, v5 4984; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 4985; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10 4986; 
GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v5, vcc 4987; GCN-NEXT: v_addc_u32_e32 v4, vcc, v11, v7, vcc 4988; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 4989; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc 4990; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 4991; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] 4992; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 4993; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 4994; GCN-NEXT: v_mul_lo_u32 v2, s6, v1 4995; GCN-NEXT: v_mul_hi_u32 v3, s6, v0 4996; GCN-NEXT: v_mul_hi_u32 v4, s6, v1 4997; GCN-NEXT: v_mul_hi_u32 v5, s7, v1 4998; GCN-NEXT: v_mul_lo_u32 v1, s7, v1 4999; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5000; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 5001; GCN-NEXT: v_mul_lo_u32 v4, s7, v0 5002; GCN-NEXT: v_mul_hi_u32 v0, s7, v0 5003; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 5004; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 5005; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc 5006; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 5007; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc 5008; GCN-NEXT: v_mul_lo_u32 v2, v0, s3 5009; GCN-NEXT: v_mul_hi_u32 v3, v0, s2 5010; GCN-NEXT: v_mul_lo_u32 v4, v1, s2 5011; GCN-NEXT: v_mov_b32_e32 v5, s3 5012; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5013; GCN-NEXT: v_mul_lo_u32 v3, v0, s2 5014; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 5015; GCN-NEXT: v_sub_i32_e32 v4, vcc, s7, v2 5016; GCN-NEXT: v_sub_i32_e32 v3, vcc, s6, v3 5017; GCN-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc 5018; GCN-NEXT: v_subrev_i32_e64 v5, s[0:1], s2, v3 5019; GCN-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] 5020; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s4, v4 5021; GCN-NEXT: s_mov_b32 s2, 0x976a7376 5022; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 5023; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s2, v5 5024; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 5025; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v4 5026; GCN-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] 5027; GCN-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 5028; GCN-NEXT: v_addc_u32_e64 v6, 
s[0:1], 0, v1, s[0:1] 5029; GCN-NEXT: v_add_i32_e64 v7, s[0:1], 1, v0 5030; GCN-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] 5031; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 5032; GCN-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] 5033; GCN-NEXT: v_mov_b32_e32 v6, s7 5034; GCN-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc 5035; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s4, v2 5036; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 5037; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s2, v3 5038; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 5039; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s3, v2 5040; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc 5041; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 5042; GCN-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] 5043; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 5044; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5045; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 5046; GCN-NEXT: s_endpgm 5047 %r = udiv i64 %x, 1235195949943 5048 store i64 %r, i64 addrspace(1)* %out 5049 ret void 5050} 5051 5052define amdgpu_kernel void @udiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 5053; CHECK-LABEL: @udiv_i64_pow2k_denom( 5054; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], 4096 5055; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 5056; CHECK-NEXT: ret void 5057; 5058; GCN-LABEL: udiv_i64_pow2k_denom: 5059; GCN: ; %bb.0: 5060; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 5061; GCN-NEXT: s_mov_b32 s7, 0xf000 5062; GCN-NEXT: s_mov_b32 s6, -1 5063; GCN-NEXT: s_waitcnt lgkmcnt(0) 5064; GCN-NEXT: s_mov_b32 s4, s0 5065; GCN-NEXT: s_mov_b32 s5, s1 5066; GCN-NEXT: s_lshr_b64 s[0:1], s[2:3], 12 5067; GCN-NEXT: v_mov_b32_e32 v0, s0 5068; GCN-NEXT: v_mov_b32_e32 v1, s1 5069; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 5070; GCN-NEXT: s_endpgm 5071 %r = udiv i64 %x, 4096 5072 store i64 %r, i64 addrspace(1)* %out 5073 ret void 5074} 5075 5076define amdgpu_kernel void @udiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 5077; CHECK-LABEL: 
@udiv_i64_pow2_shl_denom( 5078; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 5079; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], [[SHL_Y]] 5080; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 5081; CHECK-NEXT: ret void 5082; 5083; GCN-LABEL: udiv_i64_pow2_shl_denom: 5084; GCN: ; %bb.0: 5085; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 5086; GCN-NEXT: s_load_dword s8, s[0:1], 0xd 5087; GCN-NEXT: s_mov_b32 s3, 0xf000 5088; GCN-NEXT: s_mov_b32 s2, -1 5089; GCN-NEXT: s_waitcnt lgkmcnt(0) 5090; GCN-NEXT: s_mov_b32 s0, s4 5091; GCN-NEXT: s_add_i32 s8, s8, 12 5092; GCN-NEXT: s_mov_b32 s1, s5 5093; GCN-NEXT: s_lshr_b64 s[4:5], s[6:7], s8 5094; GCN-NEXT: v_mov_b32_e32 v0, s4 5095; GCN-NEXT: v_mov_b32_e32 v1, s5 5096; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5097; GCN-NEXT: s_endpgm 5098 %shl.y = shl i64 4096, %y 5099 %r = udiv i64 %x, %shl.y 5100 store i64 %r, i64 addrspace(1)* %out 5101 ret void 5102} 5103 5104define amdgpu_kernel void @udiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 5105; CHECK-LABEL: @udiv_v2i64_pow2k_denom( 5106; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 5107; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096 5108; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 5109; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 5110; CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[TMP4]], 4096 5111; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 5112; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 5113; CHECK-NEXT: ret void 5114; 5115; GCN-LABEL: udiv_v2i64_pow2k_denom: 5116; GCN: ; %bb.0: 5117; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5118; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 5119; GCN-NEXT: s_mov_b32 s7, 0xf000 5120; GCN-NEXT: s_mov_b32 s6, -1 5121; GCN-NEXT: s_waitcnt lgkmcnt(0) 5122; GCN-NEXT: s_lshr_b64 s[0:1], s[0:1], 12 5123; GCN-NEXT: s_lshr_b64 
s[2:3], s[2:3], 12 5124; GCN-NEXT: v_mov_b32_e32 v0, s0 5125; GCN-NEXT: v_mov_b32_e32 v1, s1 5126; GCN-NEXT: v_mov_b32_e32 v2, s2 5127; GCN-NEXT: v_mov_b32_e32 v3, s3 5128; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 5129; GCN-NEXT: s_endpgm 5130 %r = udiv <2 x i64> %x, <i64 4096, i64 4096> 5131 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 5132 ret void 5133} 5134 5135define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 5136; CHECK-LABEL: @udiv_v2i64_mixed_pow2k_denom( 5137; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 5138; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096 5139; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 5140; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 5141; CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[TMP4]], 4095 5142; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 5143; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 5144; CHECK-NEXT: ret void 5145; 5146; GCN-LABEL: udiv_v2i64_mixed_pow2k_denom: 5147; GCN: ; %bb.0: 5148; GCN-NEXT: v_mov_b32_e32 v0, 0x4f800000 5149; GCN-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 5150; GCN-NEXT: v_rcp_f32_e32 v0, v0 5151; GCN-NEXT: s_movk_i32 s6, 0xf001 5152; GCN-NEXT: v_mov_b32_e32 v7, 0 5153; GCN-NEXT: v_mov_b32_e32 v2, 0 5154; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 5155; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 5156; GCN-NEXT: v_trunc_f32_e32 v1, v1 5157; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 5158; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 5159; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 5160; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5161; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 5162; GCN-NEXT: s_movk_i32 s0, 0xfff 5163; GCN-NEXT: v_mul_hi_u32 v3, v0, s6 5164; GCN-NEXT: v_mul_lo_u32 v5, v1, s6 5165; GCN-NEXT: v_mul_lo_u32 v4, v0, s6 5166; GCN-NEXT: s_mov_b32 s7, 0xf000 5167; GCN-NEXT: v_subrev_i32_e32 
v3, vcc, v0, v3 5168; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 5169; GCN-NEXT: v_mul_hi_u32 v6, v0, v4 5170; GCN-NEXT: v_mul_lo_u32 v5, v0, v3 5171; GCN-NEXT: v_mul_hi_u32 v8, v0, v3 5172; GCN-NEXT: v_mul_hi_u32 v9, v1, v3 5173; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 5174; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 5175; GCN-NEXT: v_addc_u32_e32 v6, vcc, v7, v8, vcc 5176; GCN-NEXT: v_mul_lo_u32 v8, v1, v4 5177; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 5178; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5 5179; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v4, vcc 5180; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v2, vcc 5181; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 5182; GCN-NEXT: v_add_i32_e64 v0, s[2:3], v0, v3 5183; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v5, vcc 5184; GCN-NEXT: v_mul_hi_u32 v5, v0, s6 5185; GCN-NEXT: v_addc_u32_e64 v3, vcc, v1, v4, s[2:3] 5186; GCN-NEXT: v_mul_lo_u32 v6, v3, s6 5187; GCN-NEXT: v_mul_lo_u32 v8, v0, s6 5188; GCN-NEXT: v_subrev_i32_e32 v5, vcc, v0, v5 5189; GCN-NEXT: s_mov_b32 s6, -1 5190; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v6 5191; GCN-NEXT: v_mul_lo_u32 v6, v0, v5 5192; GCN-NEXT: v_mul_hi_u32 v9, v0, v8 5193; GCN-NEXT: v_mul_hi_u32 v10, v0, v5 5194; GCN-NEXT: v_mul_hi_u32 v11, v3, v5 5195; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6 5196; GCN-NEXT: v_addc_u32_e32 v9, vcc, v7, v10, vcc 5197; GCN-NEXT: v_mul_lo_u32 v10, v3, v8 5198; GCN-NEXT: v_mul_hi_u32 v8, v3, v8 5199; GCN-NEXT: v_mul_lo_u32 v3, v3, v5 5200; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10 5201; GCN-NEXT: v_addc_u32_e32 v6, vcc, v9, v8, vcc 5202; GCN-NEXT: v_addc_u32_e32 v5, vcc, v11, v2, vcc 5203; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 5204; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc 5205; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v4 5206; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[2:3] 5207; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 5208; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 5209; GCN-NEXT: s_waitcnt lgkmcnt(0) 5210; GCN-NEXT: v_mul_lo_u32 v3, s10, v1 5211; GCN-NEXT: v_mul_hi_u32 v4, s10, v0 5212; 
GCN-NEXT: v_mul_hi_u32 v5, s10, v1 5213; GCN-NEXT: v_mul_hi_u32 v6, s11, v1 5214; GCN-NEXT: v_mul_lo_u32 v1, s11, v1 5215; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 5216; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v5, vcc 5217; GCN-NEXT: v_mul_lo_u32 v5, s11, v0 5218; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 5219; GCN-NEXT: s_lshr_b64 s[2:3], s[8:9], 12 5220; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 5221; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v0, vcc 5222; GCN-NEXT: v_addc_u32_e32 v2, vcc, v6, v2, vcc 5223; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 5224; GCN-NEXT: v_addc_u32_e32 v1, vcc, v7, v2, vcc 5225; GCN-NEXT: v_mul_lo_u32 v2, v1, s0 5226; GCN-NEXT: v_mul_hi_u32 v3, v0, s0 5227; GCN-NEXT: v_mul_lo_u32 v4, v0, s0 5228; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5229; GCN-NEXT: v_mov_b32_e32 v3, s11 5230; GCN-NEXT: v_sub_i32_e32 v4, vcc, s10, v4 5231; GCN-NEXT: v_subb_u32_e32 v2, vcc, v3, v2, vcc 5232; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s0, v4 5233; GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v2, vcc 5234; GCN-NEXT: s_movk_i32 s0, 0xffe 5235; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s0, v3 5236; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 5237; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 5238; GCN-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc 5239; GCN-NEXT: v_add_i32_e32 v5, vcc, 2, v0 5240; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc 5241; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v0 5242; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v4 5243; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc 5244; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] 5245; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 5246; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 5247; GCN-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[0:1] 5248; GCN-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc 5249; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 5250; GCN-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[0:1] 5251; GCN-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc 5252; GCN-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[0:1] 5253; GCN-NEXT: v_mov_b32_e32 v0, s2 5254; GCN-NEXT: v_mov_b32_e32 v1, s3 5255; 
GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 5256; GCN-NEXT: s_endpgm 5257 %r = udiv <2 x i64> %x, <i64 4096, i64 4095> 5258 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 5259 ret void 5260} 5261 5262define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 5263; CHECK-LABEL: @udiv_v2i64_pow2_shl_denom( 5264; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 5265; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 5266; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 5267; CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[TMP1]], [[TMP2]] 5268; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 5269; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 5270; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 5271; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP5]], [[TMP6]] 5272; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 5273; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 5274; CHECK-NEXT: ret void 5275; 5276; GCN-LABEL: udiv_v2i64_pow2_shl_denom: 5277; GCN: ; %bb.0: 5278; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5279; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 5280; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 5281; GCN-NEXT: s_mov_b32 s7, 0xf000 5282; GCN-NEXT: s_mov_b32 s6, -1 5283; GCN-NEXT: s_waitcnt lgkmcnt(0) 5284; GCN-NEXT: s_add_i32 s0, s0, 12 5285; GCN-NEXT: s_add_i32 s2, s2, 12 5286; GCN-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 5287; GCN-NEXT: s_lshr_b64 s[2:3], s[10:11], s2 5288; GCN-NEXT: v_mov_b32_e32 v0, s0 5289; GCN-NEXT: v_mov_b32_e32 v1, s1 5290; GCN-NEXT: v_mov_b32_e32 v2, s2 5291; GCN-NEXT: v_mov_b32_e32 v3, s3 5292; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 5293; GCN-NEXT: s_endpgm 5294 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 5295 %r = udiv <2 x i64> %x, %shl.y 5296 store <2 x 
i64> %r, <2 x i64> addrspace(1)* %out 5297 ret void 5298} 5299 5300define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 5301; CHECK-LABEL: @urem_i64_oddk_denom( 5302; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], 1235195393993 5303; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 5304; CHECK-NEXT: ret void 5305; 5306; GCN-LABEL: urem_i64_oddk_denom: 5307; GCN: ; %bb.0: 5308; GCN-NEXT: v_mov_b32_e32 v0, 0x4f1761f8 5309; GCN-NEXT: v_mov_b32_e32 v1, 0x4f800000 5310; GCN-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 5311; GCN-NEXT: v_rcp_f32_e32 v0, v0 5312; GCN-NEXT: s_movk_i32 s2, 0xfee0 5313; GCN-NEXT: s_mov_b32 s3, 0x689e0837 5314; GCN-NEXT: v_mov_b32_e32 v8, 0 5315; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 5316; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 5317; GCN-NEXT: v_trunc_f32_e32 v1, v1 5318; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 5319; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 5320; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 5321; GCN-NEXT: v_mov_b32_e32 v7, 0 5322; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 5323; GCN-NEXT: v_mul_lo_u32 v2, v0, s2 5324; GCN-NEXT: v_mul_hi_u32 v3, v0, s3 5325; GCN-NEXT: v_mul_lo_u32 v4, v1, s3 5326; GCN-NEXT: s_movk_i32 s12, 0x11f 5327; GCN-NEXT: s_mov_b32 s13, 0x9761f7c9 5328; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5329; GCN-NEXT: v_mul_lo_u32 v3, v0, s3 5330; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 5331; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 5332; GCN-NEXT: v_mul_hi_u32 v4, v0, v2 5333; GCN-NEXT: v_mul_hi_u32 v6, v0, v3 5334; GCN-NEXT: v_mul_hi_u32 v9, v1, v2 5335; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 5336; GCN-NEXT: s_waitcnt lgkmcnt(0) 5337; GCN-NEXT: s_mov_b32 s9, s5 5338; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 5339; GCN-NEXT: v_mul_lo_u32 v6, v1, v3 5340; GCN-NEXT: v_mul_hi_u32 v3, v1, v3 5341; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc 5342; GCN-NEXT: s_movk_i32 s5, 0x11e 5343; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 5344; GCN-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc 5345; GCN-NEXT: 
v_addc_u32_e32 v4, vcc, v9, v7, vcc 5346; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5347; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 5348; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 5349; GCN-NEXT: v_mul_lo_u32 v4, v0, s2 5350; GCN-NEXT: v_mul_hi_u32 v5, v0, s3 5351; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 5352; GCN-NEXT: v_mul_lo_u32 v6, v2, s3 5353; GCN-NEXT: s_mov_b32 s8, s4 5354; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 5355; GCN-NEXT: v_mul_lo_u32 v5, v0, s3 5356; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 5357; GCN-NEXT: v_mul_lo_u32 v6, v0, v4 5358; GCN-NEXT: v_mul_hi_u32 v10, v0, v4 5359; GCN-NEXT: v_mul_hi_u32 v9, v0, v5 5360; GCN-NEXT: v_mul_hi_u32 v11, v2, v4 5361; GCN-NEXT: s_mov_b32 s4, 0x9761f7c8 5362; GCN-NEXT: s_mov_b32 s11, 0xf000 5363; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6 5364; GCN-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc 5365; GCN-NEXT: v_mul_lo_u32 v10, v2, v5 5366; GCN-NEXT: v_mul_hi_u32 v5, v2, v5 5367; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 5368; GCN-NEXT: s_mov_b32 s10, -1 5369; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10 5370; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v5, vcc 5371; GCN-NEXT: v_addc_u32_e32 v4, vcc, v11, v7, vcc 5372; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 5373; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc 5374; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 5375; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] 5376; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 5377; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 5378; GCN-NEXT: v_mul_lo_u32 v2, s6, v1 5379; GCN-NEXT: v_mul_hi_u32 v3, s6, v0 5380; GCN-NEXT: v_mul_hi_u32 v4, s6, v1 5381; GCN-NEXT: v_mul_hi_u32 v5, s7, v1 5382; GCN-NEXT: v_mul_lo_u32 v1, s7, v1 5383; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5384; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 5385; GCN-NEXT: v_mul_lo_u32 v4, s7, v0 5386; GCN-NEXT: v_mul_hi_u32 v0, s7, v0 5387; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 5388; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 5389; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc 5390; 
GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 5391; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc 5392; GCN-NEXT: v_mul_lo_u32 v2, v0, s12 5393; GCN-NEXT: v_mul_hi_u32 v3, v0, s13 5394; GCN-NEXT: v_mul_lo_u32 v1, v1, s13 5395; GCN-NEXT: v_mul_lo_u32 v0, v0, s13 5396; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5397; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 5398; GCN-NEXT: v_sub_i32_e32 v2, vcc, s7, v1 5399; GCN-NEXT: v_mov_b32_e32 v3, s12 5400; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 5401; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc 5402; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s13, v0 5403; GCN-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] 5404; GCN-NEXT: v_cmp_lt_u32_e64 s[2:3], s5, v5 5405; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] 5406; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] 5407; GCN-NEXT: v_cmp_lt_u32_e64 s[2:3], s4, v4 5408; GCN-NEXT: v_subrev_i32_e64 v3, s[0:1], s13, v4 5409; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 5410; GCN-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, v5 5411; GCN-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] 5412; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] 5413; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 5414; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] 5415; GCN-NEXT: v_mov_b32_e32 v5, s7 5416; GCN-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc 5417; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s5, v1 5418; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 5419; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s4, v0 5420; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 5421; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s12, v1 5422; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 5423; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 5424; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 5425; GCN-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] 5426; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5427; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 5428; GCN-NEXT: s_endpgm 5429 %r = urem i64 %x, 1235195393993 5430 store i64 %r, i64 addrspace(1)* %out 5431 ret void 5432} 5433 5434define 
amdgpu_kernel void @urem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 5435; CHECK-LABEL: @urem_i64_pow2k_denom( 5436; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], 4096 5437; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 5438; CHECK-NEXT: ret void 5439; 5440; GCN-LABEL: urem_i64_pow2k_denom: 5441; GCN: ; %bb.0: 5442; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 5443; GCN-NEXT: s_mov_b32 s3, 0xf000 5444; GCN-NEXT: s_mov_b32 s2, -1 5445; GCN-NEXT: v_mov_b32_e32 v1, 0 5446; GCN-NEXT: s_waitcnt lgkmcnt(0) 5447; GCN-NEXT: s_mov_b32 s0, s4 5448; GCN-NEXT: s_and_b32 s4, s6, 0xfff 5449; GCN-NEXT: s_mov_b32 s1, s5 5450; GCN-NEXT: v_mov_b32_e32 v0, s4 5451; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5452; GCN-NEXT: s_endpgm 5453 %r = urem i64 %x, 4096 5454 store i64 %r, i64 addrspace(1)* %out 5455 ret void 5456} 5457 5458define amdgpu_kernel void @urem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 5459; CHECK-LABEL: @urem_i64_pow2_shl_denom( 5460; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 5461; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], [[SHL_Y]] 5462; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 5463; CHECK-NEXT: ret void 5464; 5465; GCN-LABEL: urem_i64_pow2_shl_denom: 5466; GCN: ; %bb.0: 5467; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 5468; GCN-NEXT: s_load_dword s8, s[0:1], 0xd 5469; GCN-NEXT: s_mov_b32 s3, 0xf000 5470; GCN-NEXT: s_mov_b32 s2, -1 5471; GCN-NEXT: s_waitcnt lgkmcnt(0) 5472; GCN-NEXT: s_mov_b32 s0, s4 5473; GCN-NEXT: s_mov_b32 s1, s5 5474; GCN-NEXT: s_mov_b32 s5, 0 5475; GCN-NEXT: s_movk_i32 s4, 0x1000 5476; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], s8 5477; GCN-NEXT: s_add_u32 s4, s4, -1 5478; GCN-NEXT: s_addc_u32 s5, s5, -1 5479; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] 5480; GCN-NEXT: v_mov_b32_e32 v0, s4 5481; GCN-NEXT: v_mov_b32_e32 v1, s5 5482; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5483; GCN-NEXT: s_endpgm 5484 %shl.y = shl i64 4096, %y 5485 %r = urem 
i64 %x, %shl.y 5486 store i64 %r, i64 addrspace(1)* %out 5487 ret void 5488} 5489 5490define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 5491; CHECK-LABEL: @urem_v2i64_pow2k_denom( 5492; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 5493; CHECK-NEXT: [[TMP2:%.*]] = urem i64 [[TMP1]], 4096 5494; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 5495; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 5496; CHECK-NEXT: [[TMP5:%.*]] = urem i64 [[TMP4]], 4096 5497; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 5498; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 5499; CHECK-NEXT: ret void 5500; 5501; GCN-LABEL: urem_v2i64_pow2k_denom: 5502; GCN: ; %bb.0: 5503; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5504; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 5505; GCN-NEXT: s_movk_i32 s8, 0xfff 5506; GCN-NEXT: v_mov_b32_e32 v1, 0 5507; GCN-NEXT: s_mov_b32 s7, 0xf000 5508; GCN-NEXT: s_mov_b32 s6, -1 5509; GCN-NEXT: s_waitcnt lgkmcnt(0) 5510; GCN-NEXT: s_and_b32 s0, s0, s8 5511; GCN-NEXT: s_and_b32 s1, s2, s8 5512; GCN-NEXT: v_mov_b32_e32 v0, s0 5513; GCN-NEXT: v_mov_b32_e32 v2, s1 5514; GCN-NEXT: v_mov_b32_e32 v3, v1 5515; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 5516; GCN-NEXT: s_endpgm 5517 %r = urem <2 x i64> %x, <i64 4096, i64 4096> 5518 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 5519 ret void 5520} 5521 5522define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 5523; CHECK-LABEL: @urem_v2i64_pow2_shl_denom( 5524; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 5525; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 5526; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 5527; CHECK-NEXT: [[TMP3:%.*]] = urem i64 [[TMP1]], [[TMP2]] 5528; CHECK-NEXT: 
[[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 5529; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 5530; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 5531; CHECK-NEXT: [[TMP7:%.*]] = urem i64 [[TMP5]], [[TMP6]] 5532; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 5533; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 5534; CHECK-NEXT: ret void 5535; 5536; GCN-LABEL: urem_v2i64_pow2_shl_denom: 5537; GCN: ; %bb.0: 5538; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5539; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 5540; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 5541; GCN-NEXT: s_mov_b32 s13, 0 5542; GCN-NEXT: s_movk_i32 s12, 0x1000 5543; GCN-NEXT: s_mov_b32 s7, 0xf000 5544; GCN-NEXT: s_mov_b32 s6, -1 5545; GCN-NEXT: s_waitcnt lgkmcnt(0) 5546; GCN-NEXT: s_lshl_b64 s[2:3], s[12:13], s2 5547; GCN-NEXT: s_lshl_b64 s[0:1], s[12:13], s0 5548; GCN-NEXT: s_add_u32 s0, s0, -1 5549; GCN-NEXT: s_addc_u32 s1, s1, -1 5550; GCN-NEXT: s_and_b64 s[0:1], s[8:9], s[0:1] 5551; GCN-NEXT: s_add_u32 s2, s2, -1 5552; GCN-NEXT: s_addc_u32 s3, s3, -1 5553; GCN-NEXT: s_and_b64 s[2:3], s[10:11], s[2:3] 5554; GCN-NEXT: v_mov_b32_e32 v0, s0 5555; GCN-NEXT: v_mov_b32_e32 v1, s1 5556; GCN-NEXT: v_mov_b32_e32 v2, s2 5557; GCN-NEXT: v_mov_b32_e32 v3, s3 5558; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 5559; GCN-NEXT: s_endpgm 5560 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 5561 %r = urem <2 x i64> %x, %shl.y 5562 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 5563 ret void 5564} 5565 5566define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 5567; CHECK-LABEL: @sdiv_i64_oddk_denom( 5568; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], 1235195 5569; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 5570; CHECK-NEXT: ret void 5571; 5572; GCN-LABEL: sdiv_i64_oddk_denom: 5573; GCN: ; %bb.0: 5574; GCN-NEXT: 
v_mov_b32_e32 v0, 0x4f800000 5575; GCN-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 5576; GCN-NEXT: v_rcp_f32_e32 v0, v0 5577; GCN-NEXT: s_mov_b32 s2, 0xffed2705 5578; GCN-NEXT: v_mov_b32_e32 v8, 0 5579; GCN-NEXT: v_mov_b32_e32 v7, 0 5580; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 5581; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 5582; GCN-NEXT: v_trunc_f32_e32 v1, v1 5583; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 5584; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 5585; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 5586; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 5587; GCN-NEXT: s_mov_b32 s7, 0xf000 5588; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 5589; GCN-NEXT: v_mul_lo_u32 v2, v1, s2 5590; GCN-NEXT: v_mul_lo_u32 v4, v0, s2 5591; GCN-NEXT: s_mov_b32 s6, -1 5592; GCN-NEXT: s_waitcnt lgkmcnt(0) 5593; GCN-NEXT: s_mov_b32 s4, s8 5594; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5595; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 5596; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 5597; GCN-NEXT: v_mul_hi_u32 v6, v0, v4 5598; GCN-NEXT: v_mul_hi_u32 v3, v0, v2 5599; GCN-NEXT: v_mul_hi_u32 v9, v1, v2 5600; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 5601; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 5602; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 5603; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 5604; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v3, vcc 5605; GCN-NEXT: s_mov_b32 s5, s9 5606; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 5607; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc 5608; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc 5609; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5610; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 5611; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 5612; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 5613; GCN-NEXT: v_mul_lo_u32 v4, v2, s2 5614; GCN-NEXT: v_mul_hi_u32 v5, s2, v0 5615; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 5616; GCN-NEXT: v_mul_lo_u32 v5, v0, s2 5617; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v0, v4 5618; GCN-NEXT: v_mul_lo_u32 v10, v0, v4 5619; GCN-NEXT: v_mul_hi_u32 v12, v0, v4 5620; GCN-NEXT: v_mul_hi_u32 v11, 
v0, v5 5621; GCN-NEXT: v_mul_hi_u32 v9, v2, v5 5622; GCN-NEXT: v_mul_lo_u32 v5, v2, v5 5623; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 5624; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 5625; GCN-NEXT: v_addc_u32_e32 v11, vcc, v8, v12, vcc 5626; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 5627; GCN-NEXT: v_add_i32_e32 v5, vcc, v10, v5 5628; GCN-NEXT: v_addc_u32_e32 v5, vcc, v11, v9, vcc 5629; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v7, vcc 5630; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 5631; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc 5632; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 5633; GCN-NEXT: s_ashr_i32 s2, s11, 31 5634; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] 5635; GCN-NEXT: s_add_u32 s0, s10, s2 5636; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 5637; GCN-NEXT: s_mov_b32 s3, s2 5638; GCN-NEXT: s_addc_u32 s1, s11, s2 5639; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] 5640; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 5641; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 5642; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 5643; GCN-NEXT: v_mul_hi_u32 v4, s0, v1 5644; GCN-NEXT: v_mul_hi_u32 v5, s1, v1 5645; GCN-NEXT: v_mul_lo_u32 v1, s1, v1 5646; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5647; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 5648; GCN-NEXT: v_mul_lo_u32 v4, s1, v0 5649; GCN-NEXT: v_mul_hi_u32 v0, s1, v0 5650; GCN-NEXT: s_mov_b32 s3, 0x12d8fb 5651; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 5652; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 5653; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc 5654; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 5655; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc 5656; GCN-NEXT: v_mul_lo_u32 v2, v1, s3 5657; GCN-NEXT: v_mul_hi_u32 v3, s3, v0 5658; GCN-NEXT: v_mul_lo_u32 v4, v0, s3 5659; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5660; GCN-NEXT: v_sub_i32_e32 v4, vcc, s0, v4 5661; GCN-NEXT: v_mov_b32_e32 v3, s1 5662; GCN-NEXT: v_subb_u32_e32 v2, vcc, v3, v2, vcc 5663; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s3, v4 5664; GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v2, vcc 5665; 
GCN-NEXT: s_mov_b32 s0, 0x12d8fa 5666; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s0, v3 5667; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 5668; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 5669; GCN-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc 5670; GCN-NEXT: v_add_i32_e32 v5, vcc, 2, v0 5671; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc 5672; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v0 5673; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v4 5674; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc 5675; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] 5676; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 5677; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 5678; GCN-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[0:1] 5679; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 5680; GCN-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc 5681; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 5682; GCN-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc 5683; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 5684; GCN-NEXT: v_xor_b32_e32 v0, s2, v0 5685; GCN-NEXT: v_xor_b32_e32 v1, s2, v1 5686; GCN-NEXT: v_mov_b32_e32 v2, s2 5687; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 5688; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 5689; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 5690; GCN-NEXT: s_endpgm 5691 %r = sdiv i64 %x, 1235195 5692 store i64 %r, i64 addrspace(1)* %out 5693 ret void 5694} 5695 5696define amdgpu_kernel void @sdiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 5697; CHECK-LABEL: @sdiv_i64_pow2k_denom( 5698; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], 4096 5699; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 5700; CHECK-NEXT: ret void 5701; 5702; GCN-LABEL: sdiv_i64_pow2k_denom: 5703; GCN: ; %bb.0: 5704; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 5705; GCN-NEXT: s_mov_b32 s7, 0xf000 5706; GCN-NEXT: s_mov_b32 s6, -1 5707; GCN-NEXT: s_waitcnt lgkmcnt(0) 5708; GCN-NEXT: s_mov_b32 s4, s0 5709; GCN-NEXT: s_ashr_i32 s0, s3, 31 5710; GCN-NEXT: s_lshr_b32 s0, s0, 20 5711; GCN-NEXT: s_add_u32 s0, s2, s0 5712; GCN-NEXT: s_mov_b32 s5, s1 
5713; GCN-NEXT: s_addc_u32 s1, s3, 0 5714; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 5715; GCN-NEXT: v_mov_b32_e32 v0, s0 5716; GCN-NEXT: v_mov_b32_e32 v1, s1 5717; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 5718; GCN-NEXT: s_endpgm 5719 %r = sdiv i64 %x, 4096 5720 store i64 %r, i64 addrspace(1)* %out 5721 ret void 5722} 5723 5724define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 5725; CHECK-LABEL: @sdiv_i64_pow2_shl_denom( 5726; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 5727; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], [[SHL_Y]] 5728; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 5729; CHECK-NEXT: ret void 5730; 5731; GCN-LABEL: sdiv_i64_pow2_shl_denom: 5732; GCN: ; %bb.0: 5733; GCN-NEXT: s_load_dword s4, s[0:1], 0xd 5734; GCN-NEXT: s_mov_b32 s3, 0 5735; GCN-NEXT: s_movk_i32 s2, 0x1000 5736; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 5737; GCN-NEXT: s_mov_b32 s7, 0xf000 5738; GCN-NEXT: s_waitcnt lgkmcnt(0) 5739; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 5740; GCN-NEXT: s_ashr_i32 s12, s3, 31 5741; GCN-NEXT: s_add_u32 s2, s2, s12 5742; GCN-NEXT: s_mov_b32 s13, s12 5743; GCN-NEXT: s_addc_u32 s3, s3, s12 5744; GCN-NEXT: s_xor_b64 s[2:3], s[2:3], s[12:13] 5745; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 5746; GCN-NEXT: v_cvt_f32_u32_e32 v1, s3 5747; GCN-NEXT: s_sub_u32 s4, 0, s2 5748; GCN-NEXT: s_subb_u32 s5, 0, s3 5749; GCN-NEXT: s_ashr_i32 s14, s11, 31 5750; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 5751; GCN-NEXT: v_rcp_f32_e32 v0, v0 5752; GCN-NEXT: s_mov_b32 s15, s14 5753; GCN-NEXT: s_mov_b32 s6, -1 5754; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 5755; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 5756; GCN-NEXT: v_trunc_f32_e32 v1, v1 5757; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 5758; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 5759; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 5760; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 5761; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 5762; GCN-NEXT: v_mul_lo_u32 v5, s5, v0 5763; 
GCN-NEXT: v_mul_lo_u32 v4, s4, v0 5764; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5765; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 5766; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 5767; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 5768; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 5769; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 5770; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 5771; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 5772; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 5773; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 5774; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 5775; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 5776; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc 5777; GCN-NEXT: v_mov_b32_e32 v4, 0 5778; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc 5779; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5780; GCN-NEXT: v_mov_b32_e32 v6, 0 5781; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 5782; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc 5783; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 5784; GCN-NEXT: v_mul_lo_u32 v5, s4, v2 5785; GCN-NEXT: v_mul_hi_u32 v7, s4, v0 5786; GCN-NEXT: v_mul_lo_u32 v8, s5, v0 5787; GCN-NEXT: s_mov_b32 s5, s9 5788; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 5789; GCN-NEXT: v_mul_lo_u32 v7, s4, v0 5790; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5 5791; GCN-NEXT: v_mul_lo_u32 v10, v0, v5 5792; GCN-NEXT: v_mul_hi_u32 v12, v0, v5 5793; GCN-NEXT: v_mul_hi_u32 v11, v0, v7 5794; GCN-NEXT: v_mul_hi_u32 v9, v2, v7 5795; GCN-NEXT: v_mul_lo_u32 v7, v2, v7 5796; GCN-NEXT: v_mul_hi_u32 v8, v2, v5 5797; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 5798; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc 5799; GCN-NEXT: v_mul_lo_u32 v2, v2, v5 5800; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 5801; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc 5802; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc 5803; GCN-NEXT: v_add_i32_e32 v2, vcc, v7, v2 5804; GCN-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc 5805; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 5806; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[0:1] 5807; GCN-NEXT: s_add_u32 s0, s10, s14 5808; 
GCN-NEXT: s_addc_u32 s1, s11, s14 5809; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 5810; GCN-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] 5811; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 5812; GCN-NEXT: v_mul_lo_u32 v2, s10, v1 5813; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 5814; GCN-NEXT: v_mul_hi_u32 v5, s10, v1 5815; GCN-NEXT: v_mul_hi_u32 v7, s11, v1 5816; GCN-NEXT: v_mul_lo_u32 v1, s11, v1 5817; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5818; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 5819; GCN-NEXT: v_mul_lo_u32 v5, s11, v0 5820; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 5821; GCN-NEXT: s_mov_b32 s4, s8 5822; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 5823; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 5824; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc 5825; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 5826; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc 5827; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 5828; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 5829; GCN-NEXT: v_mul_lo_u32 v4, s3, v0 5830; GCN-NEXT: v_mov_b32_e32 v5, s3 5831; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5832; GCN-NEXT: v_mul_lo_u32 v3, s2, v0 5833; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 5834; GCN-NEXT: v_sub_i32_e32 v4, vcc, s11, v2 5835; GCN-NEXT: v_sub_i32_e32 v3, vcc, s10, v3 5836; GCN-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc 5837; GCN-NEXT: v_subrev_i32_e64 v5, s[0:1], s2, v3 5838; GCN-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] 5839; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v4 5840; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 5841; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v5 5842; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 5843; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v4 5844; GCN-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] 5845; GCN-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 5846; GCN-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] 5847; GCN-NEXT: v_add_i32_e64 v7, s[0:1], 1, v0 5848; GCN-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] 5849; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 5850; GCN-NEXT: v_cndmask_b32_e64 v4, v8, 
v6, s[0:1] 5851; GCN-NEXT: v_mov_b32_e32 v6, s11 5852; GCN-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc 5853; GCN-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 5854; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 5855; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 5856; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 5857; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s3, v2 5858; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc 5859; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 5860; GCN-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] 5861; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5862; GCN-NEXT: s_xor_b64 s[0:1], s[14:15], s[12:13] 5863; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 5864; GCN-NEXT: v_xor_b32_e32 v0, s0, v0 5865; GCN-NEXT: v_xor_b32_e32 v1, s1, v1 5866; GCN-NEXT: v_mov_b32_e32 v2, s1 5867; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 5868; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 5869; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 5870; GCN-NEXT: s_endpgm 5871 %shl.y = shl i64 4096, %y 5872 %r = sdiv i64 %x, %shl.y 5873 store i64 %r, i64 addrspace(1)* %out 5874 ret void 5875} 5876 5877define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 5878; CHECK-LABEL: @sdiv_v2i64_pow2k_denom( 5879; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 5880; CHECK-NEXT: [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096 5881; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 5882; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 5883; CHECK-NEXT: [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4096 5884; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 5885; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 5886; CHECK-NEXT: ret void 5887; 5888; GCN-LABEL: sdiv_v2i64_pow2k_denom: 5889; GCN: ; %bb.0: 5890; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5891; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 5892; GCN-NEXT: s_mov_b32 s7, 0xf000 5893; GCN-NEXT: s_mov_b32 s6, -1 
5894; GCN-NEXT: s_waitcnt lgkmcnt(0) 5895; GCN-NEXT: s_ashr_i32 s8, s1, 31 5896; GCN-NEXT: s_lshr_b32 s8, s8, 20 5897; GCN-NEXT: s_add_u32 s0, s0, s8 5898; GCN-NEXT: s_addc_u32 s1, s1, 0 5899; GCN-NEXT: s_ashr_i32 s8, s3, 31 5900; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 5901; GCN-NEXT: s_lshr_b32 s8, s8, 20 5902; GCN-NEXT: s_add_u32 s2, s2, s8 5903; GCN-NEXT: s_addc_u32 s3, s3, 0 5904; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 5905; GCN-NEXT: v_mov_b32_e32 v0, s0 5906; GCN-NEXT: v_mov_b32_e32 v1, s1 5907; GCN-NEXT: v_mov_b32_e32 v2, s2 5908; GCN-NEXT: v_mov_b32_e32 v3, s3 5909; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 5910; GCN-NEXT: s_endpgm 5911 %r = sdiv <2 x i64> %x, <i64 4096, i64 4096> 5912 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 5913 ret void 5914} 5915 5916define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 5917; CHECK-LABEL: @ssdiv_v2i64_mixed_pow2k_denom( 5918; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 5919; CHECK-NEXT: [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096 5920; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 5921; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 5922; CHECK-NEXT: [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4095 5923; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 5924; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 5925; CHECK-NEXT: ret void 5926; 5927; GCN-LABEL: ssdiv_v2i64_mixed_pow2k_denom: 5928; GCN: ; %bb.0: 5929; GCN-NEXT: v_mov_b32_e32 v0, 0x457ff000 5930; GCN-NEXT: v_mov_b32_e32 v1, 0x4f800000 5931; GCN-NEXT: v_mac_f32_e32 v0, 0, v1 5932; GCN-NEXT: v_rcp_f32_e32 v0, v0 5933; GCN-NEXT: s_movk_i32 s6, 0xf001 5934; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5935; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 5936; GCN-NEXT: s_mov_b32 s7, 0xf000 5937; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 5938; GCN-NEXT: v_mul_f32_e32 v1, 
0x2f800000, v0 5939; GCN-NEXT: v_trunc_f32_e32 v1, v1 5940; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 5941; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 5942; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 5943; GCN-NEXT: s_waitcnt lgkmcnt(0) 5944; GCN-NEXT: s_ashr_i32 s0, s9, 31 5945; GCN-NEXT: s_lshr_b32 s0, s0, 20 5946; GCN-NEXT: v_mul_hi_u32 v2, s6, v0 5947; GCN-NEXT: v_mul_lo_u32 v3, v1, s6 5948; GCN-NEXT: s_add_u32 s2, s8, s0 5949; GCN-NEXT: s_addc_u32 s3, s9, 0 5950; GCN-NEXT: s_ashr_i32 s8, s11, 31 5951; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 5952; GCN-NEXT: v_mul_lo_u32 v3, v0, s6 5953; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 5954; GCN-NEXT: v_mul_lo_u32 v4, v0, v2 5955; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 5956; GCN-NEXT: v_mul_hi_u32 v5, v0, v3 5957; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 5958; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 5959; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 5960; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 5961; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 5962; GCN-NEXT: v_mul_lo_u32 v6, v1, v3 5963; GCN-NEXT: v_mul_hi_u32 v3, v1, v3 5964; GCN-NEXT: s_mov_b32 s9, s8 5965; GCN-NEXT: v_add_i32_e32 v4, vcc, v6, v4 5966; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc 5967; GCN-NEXT: v_mov_b32_e32 v4, 0 5968; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc 5969; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5970; GCN-NEXT: v_mov_b32_e32 v6, 0 5971; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 5972; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc 5973; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 5974; GCN-NEXT: v_mul_lo_u32 v5, v2, s6 5975; GCN-NEXT: v_mul_hi_u32 v7, s6, v0 5976; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 5977; GCN-NEXT: v_mul_lo_u32 v7, v0, s6 5978; GCN-NEXT: v_subrev_i32_e32 v5, vcc, v0, v5 5979; GCN-NEXT: v_mul_lo_u32 v10, v0, v5 5980; GCN-NEXT: v_mul_hi_u32 v12, v0, v5 5981; GCN-NEXT: v_mul_hi_u32 v11, v0, v7 5982; GCN-NEXT: v_mul_hi_u32 v9, v2, v7 5983; GCN-NEXT: v_mul_lo_u32 v7, v2, v7 5984; GCN-NEXT: v_mul_hi_u32 v8, v2, v5 5985; GCN-NEXT: v_add_i32_e32 v10, 
vcc, v11, v10 5986; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc 5987; GCN-NEXT: v_mul_lo_u32 v2, v2, v5 5988; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 5989; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc 5990; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc 5991; GCN-NEXT: v_add_i32_e32 v2, vcc, v7, v2 5992; GCN-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc 5993; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 5994; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[0:1] 5995; GCN-NEXT: s_add_u32 s0, s10, s8 5996; GCN-NEXT: s_addc_u32 s1, s11, s8 5997; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 5998; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[8:9] 5999; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 6000; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 6001; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 6002; GCN-NEXT: v_mul_hi_u32 v5, s0, v1 6003; GCN-NEXT: v_mul_hi_u32 v7, s1, v1 6004; GCN-NEXT: v_mul_lo_u32 v1, s1, v1 6005; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6006; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 6007; GCN-NEXT: v_mul_lo_u32 v5, s1, v0 6008; GCN-NEXT: v_mul_hi_u32 v0, s1, v0 6009; GCN-NEXT: s_movk_i32 s9, 0xfff 6010; GCN-NEXT: s_mov_b32 s6, -1 6011; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 6012; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 6013; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc 6014; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 6015; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc 6016; GCN-NEXT: v_mul_lo_u32 v2, v1, s9 6017; GCN-NEXT: v_mul_hi_u32 v3, s9, v0 6018; GCN-NEXT: v_mul_lo_u32 v4, v0, s9 6019; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6020; GCN-NEXT: v_sub_i32_e32 v4, vcc, s0, v4 6021; GCN-NEXT: v_mov_b32_e32 v3, s1 6022; GCN-NEXT: v_subb_u32_e32 v2, vcc, v3, v2, vcc 6023; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s9, v4 6024; GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v2, vcc 6025; GCN-NEXT: s_movk_i32 s0, 0xffe 6026; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s0, v3 6027; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 6028; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 6029; GCN-NEXT: v_cndmask_b32_e32 v3, -1, 
v3, vcc 6030; GCN-NEXT: v_add_i32_e32 v5, vcc, 2, v0 6031; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc 6032; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v0 6033; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v4 6034; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc 6035; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] 6036; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 6037; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 6038; GCN-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[0:1] 6039; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 6040; GCN-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc 6041; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 6042; GCN-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc 6043; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 6044; GCN-NEXT: v_xor_b32_e32 v0, s8, v0 6045; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s8, v0 6046; GCN-NEXT: v_xor_b32_e32 v1, s8, v1 6047; GCN-NEXT: v_mov_b32_e32 v3, s8 6048; GCN-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc 6049; GCN-NEXT: v_mov_b32_e32 v0, s2 6050; GCN-NEXT: v_mov_b32_e32 v1, s3 6051; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 6052; GCN-NEXT: s_endpgm 6053 %r = sdiv <2 x i64> %x, <i64 4096, i64 4095> 6054 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 6055 ret void 6056} 6057 6058define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 6059; CHECK-LABEL: @sdiv_v2i64_pow2_shl_denom( 6060; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 6061; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 6062; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 6063; CHECK-NEXT: [[TMP3:%.*]] = sdiv i64 [[TMP1]], [[TMP2]] 6064; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 6065; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 6066; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 6067; CHECK-NEXT: [[TMP7:%.*]] = sdiv i64 [[TMP5]], [[TMP6]] 6068; CHECK-NEXT: [[TMP8:%.*]] = insertelement 
<2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 6069; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 6070; CHECK-NEXT: ret void 6071; 6072; GCN-LABEL: sdiv_v2i64_pow2_shl_denom: 6073; GCN: ; %bb.0: 6074; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11 6075; GCN-NEXT: s_mov_b32 s3, 0 6076; GCN-NEXT: s_movk_i32 s2, 0x1000 6077; GCN-NEXT: s_mov_b32 s18, 0x4f800000 6078; GCN-NEXT: s_mov_b32 s19, 0x5f7ffffc 6079; GCN-NEXT: s_waitcnt lgkmcnt(0) 6080; GCN-NEXT: s_lshl_b64 s[12:13], s[2:3], s6 6081; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 6082; GCN-NEXT: s_ashr_i32 s16, s3, 31 6083; GCN-NEXT: s_add_u32 s2, s2, s16 6084; GCN-NEXT: s_mov_b32 s17, s16 6085; GCN-NEXT: s_addc_u32 s3, s3, s16 6086; GCN-NEXT: s_xor_b64 s[14:15], s[2:3], s[16:17] 6087; GCN-NEXT: v_cvt_f32_u32_e32 v0, s14 6088; GCN-NEXT: v_cvt_f32_u32_e32 v1, s15 6089; GCN-NEXT: s_mov_b32 s20, 0x2f800000 6090; GCN-NEXT: s_mov_b32 s21, 0xcf800000 6091; GCN-NEXT: s_sub_u32 s6, 0, s14 6092; GCN-NEXT: v_mac_f32_e32 v0, s18, v1 6093; GCN-NEXT: v_rcp_f32_e32 v0, v0 6094; GCN-NEXT: s_subb_u32 s7, 0, s15 6095; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 6096; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 6097; GCN-NEXT: v_mul_f32_e32 v0, s19, v0 6098; GCN-NEXT: v_mul_f32_e32 v1, s20, v0 6099; GCN-NEXT: v_trunc_f32_e32 v1, v1 6100; GCN-NEXT: v_mac_f32_e32 v0, s21, v1 6101; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 6102; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 6103; GCN-NEXT: v_mul_hi_u32 v3, s6, v0 6104; GCN-NEXT: v_mul_lo_u32 v2, s6, v1 6105; GCN-NEXT: v_mul_lo_u32 v4, s7, v0 6106; GCN-NEXT: v_mul_lo_u32 v5, s6, v0 6107; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6108; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 6109; GCN-NEXT: v_mul_lo_u32 v3, v0, v2 6110; GCN-NEXT: v_mul_hi_u32 v4, v0, v5 6111; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 6112; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 6113; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 6114; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 6115; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc 6116; 
GCN-NEXT: v_mul_lo_u32 v6, v1, v5 6117; GCN-NEXT: v_mul_hi_u32 v5, v1, v5 6118; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 6119; GCN-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc 6120; GCN-NEXT: v_mov_b32_e32 v4, 0 6121; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc 6122; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6123; GCN-NEXT: v_mov_b32_e32 v6, 0 6124; GCN-NEXT: v_add_i32_e64 v0, s[2:3], v0, v2 6125; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc 6126; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[2:3] 6127; GCN-NEXT: v_mul_lo_u32 v5, s6, v2 6128; GCN-NEXT: v_mul_hi_u32 v7, s6, v0 6129; GCN-NEXT: v_mul_lo_u32 v8, s7, v0 6130; GCN-NEXT: s_mov_b32 s7, 0xf000 6131; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 6132; GCN-NEXT: v_mul_lo_u32 v7, s6, v0 6133; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5 6134; GCN-NEXT: v_mul_lo_u32 v10, v0, v5 6135; GCN-NEXT: v_mul_hi_u32 v12, v0, v5 6136; GCN-NEXT: v_mul_hi_u32 v11, v0, v7 6137; GCN-NEXT: v_mul_hi_u32 v9, v2, v7 6138; GCN-NEXT: v_mul_lo_u32 v7, v2, v7 6139; GCN-NEXT: v_mul_hi_u32 v8, v2, v5 6140; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 6141; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc 6142; GCN-NEXT: v_mul_lo_u32 v2, v2, v5 6143; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 6144; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc 6145; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc 6146; GCN-NEXT: v_add_i32_e32 v2, vcc, v7, v2 6147; GCN-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc 6148; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 6149; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[2:3] 6150; GCN-NEXT: s_waitcnt lgkmcnt(0) 6151; GCN-NEXT: s_ashr_i32 s2, s9, 31 6152; GCN-NEXT: s_add_u32 s0, s8, s2 6153; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 6154; GCN-NEXT: s_mov_b32 s3, s2 6155; GCN-NEXT: s_addc_u32 s1, s9, s2 6156; GCN-NEXT: s_xor_b64 s[8:9], s[0:1], s[2:3] 6157; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 6158; GCN-NEXT: v_mul_lo_u32 v2, s8, v1 6159; GCN-NEXT: v_mul_hi_u32 v3, s8, v0 6160; GCN-NEXT: v_mul_hi_u32 v5, s8, v1 6161; GCN-NEXT: 
v_mul_hi_u32 v7, s9, v1 6162; GCN-NEXT: v_mul_lo_u32 v1, s9, v1 6163; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6164; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 6165; GCN-NEXT: v_mul_lo_u32 v5, s9, v0 6166; GCN-NEXT: v_mul_hi_u32 v0, s9, v0 6167; GCN-NEXT: s_xor_b64 s[2:3], s[2:3], s[16:17] 6168; GCN-NEXT: s_mov_b32 s6, -1 6169; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 6170; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 6171; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc 6172; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 6173; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc 6174; GCN-NEXT: v_mul_lo_u32 v2, s14, v1 6175; GCN-NEXT: v_mul_hi_u32 v3, s14, v0 6176; GCN-NEXT: v_mul_lo_u32 v5, s15, v0 6177; GCN-NEXT: v_mov_b32_e32 v7, s15 6178; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6179; GCN-NEXT: v_mul_lo_u32 v3, s14, v0 6180; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 6181; GCN-NEXT: v_sub_i32_e32 v5, vcc, s9, v2 6182; GCN-NEXT: v_sub_i32_e32 v3, vcc, s8, v3 6183; GCN-NEXT: v_subb_u32_e64 v5, s[0:1], v5, v7, vcc 6184; GCN-NEXT: v_subrev_i32_e64 v7, s[0:1], s14, v3 6185; GCN-NEXT: v_subbrev_u32_e64 v5, s[0:1], 0, v5, s[0:1] 6186; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v5 6187; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] 6188; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v7 6189; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 6190; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v5 6191; GCN-NEXT: v_cndmask_b32_e64 v5, v8, v7, s[0:1] 6192; GCN-NEXT: v_add_i32_e64 v7, s[0:1], 2, v0 6193; GCN-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] 6194; GCN-NEXT: v_add_i32_e64 v9, s[0:1], 1, v0 6195; GCN-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v1, s[0:1] 6196; GCN-NEXT: s_ashr_i32 s8, s13, 31 6197; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 6198; GCN-NEXT: s_add_u32 s12, s12, s8 6199; GCN-NEXT: v_cndmask_b32_e64 v5, v10, v8, s[0:1] 6200; GCN-NEXT: v_mov_b32_e32 v8, s9 6201; GCN-NEXT: s_mov_b32 s9, s8 6202; GCN-NEXT: s_addc_u32 s13, s13, s8 6203; GCN-NEXT: s_xor_b64 s[12:13], s[12:13], s[8:9] 6204; GCN-NEXT: 
v_cvt_f32_u32_e32 v10, s12 6205; GCN-NEXT: v_cvt_f32_u32_e32 v11, s13 6206; GCN-NEXT: v_subb_u32_e32 v2, vcc, v8, v2, vcc 6207; GCN-NEXT: v_cmp_le_u32_e32 vcc, s15, v2 6208; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 6209; GCN-NEXT: v_cmp_le_u32_e32 vcc, s14, v3 6210; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 6211; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s15, v2 6212; GCN-NEXT: v_mac_f32_e32 v10, s18, v11 6213; GCN-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc 6214; GCN-NEXT: v_rcp_f32_e32 v3, v10 6215; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 6216; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 6217; GCN-NEXT: s_sub_u32 s14, 0, s12 6218; GCN-NEXT: v_mul_f32_e32 v3, s19, v3 6219; GCN-NEXT: v_mul_f32_e32 v5, s20, v3 6220; GCN-NEXT: v_trunc_f32_e32 v5, v5 6221; GCN-NEXT: v_mac_f32_e32 v3, s21, v5 6222; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 6223; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5 6224; GCN-NEXT: v_cndmask_b32_e64 v2, v9, v7, s[0:1] 6225; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6226; GCN-NEXT: v_mul_hi_u32 v2, s14, v3 6227; GCN-NEXT: v_mul_lo_u32 v7, s14, v5 6228; GCN-NEXT: s_subb_u32 s15, 0, s13 6229; GCN-NEXT: v_mul_lo_u32 v8, s15, v3 6230; GCN-NEXT: v_xor_b32_e32 v0, s2, v0 6231; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v7 6232; GCN-NEXT: v_mul_lo_u32 v7, s14, v3 6233; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v8 6234; GCN-NEXT: v_mul_lo_u32 v8, v3, v2 6235; GCN-NEXT: v_mul_hi_u32 v10, v3, v2 6236; GCN-NEXT: v_mul_hi_u32 v9, v3, v7 6237; GCN-NEXT: v_mul_hi_u32 v11, v5, v2 6238; GCN-NEXT: v_mul_lo_u32 v2, v5, v2 6239; GCN-NEXT: v_xor_b32_e32 v1, s3, v1 6240; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 6241; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc 6242; GCN-NEXT: v_mul_lo_u32 v10, v5, v7 6243; GCN-NEXT: v_mul_hi_u32 v7, v5, v7 6244; GCN-NEXT: v_add_i32_e32 v8, vcc, v10, v8 6245; GCN-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc 6246; GCN-NEXT: v_addc_u32_e32 v8, vcc, v11, v4, vcc 6247; GCN-NEXT: v_add_i32_e32 v2, vcc, v7, v2 6248; GCN-NEXT: v_add_i32_e64 v2, s[0:1], v3, v2 6249; GCN-NEXT: 
v_addc_u32_e32 v7, vcc, v6, v8, vcc 6250; GCN-NEXT: v_addc_u32_e64 v3, vcc, v5, v7, s[0:1] 6251; GCN-NEXT: v_mul_lo_u32 v8, s14, v3 6252; GCN-NEXT: v_mul_hi_u32 v9, s14, v2 6253; GCN-NEXT: v_mul_lo_u32 v10, s15, v2 6254; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 6255; GCN-NEXT: v_mul_lo_u32 v9, s14, v2 6256; GCN-NEXT: v_add_i32_e32 v8, vcc, v10, v8 6257; GCN-NEXT: v_mul_lo_u32 v12, v2, v8 6258; GCN-NEXT: v_mul_hi_u32 v14, v2, v8 6259; GCN-NEXT: v_mul_hi_u32 v13, v2, v9 6260; GCN-NEXT: v_mul_hi_u32 v11, v3, v9 6261; GCN-NEXT: v_mul_lo_u32 v9, v3, v9 6262; GCN-NEXT: v_mul_hi_u32 v10, v3, v8 6263; GCN-NEXT: v_add_i32_e32 v12, vcc, v13, v12 6264; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v14, vcc 6265; GCN-NEXT: v_mul_lo_u32 v3, v3, v8 6266; GCN-NEXT: v_add_i32_e32 v9, vcc, v12, v9 6267; GCN-NEXT: v_addc_u32_e32 v9, vcc, v13, v11, vcc 6268; GCN-NEXT: v_addc_u32_e32 v8, vcc, v10, v4, vcc 6269; GCN-NEXT: v_add_i32_e32 v3, vcc, v9, v3 6270; GCN-NEXT: v_addc_u32_e32 v8, vcc, v6, v8, vcc 6271; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7 6272; GCN-NEXT: s_ashr_i32 s14, s11, 31 6273; GCN-NEXT: v_addc_u32_e64 v5, vcc, v5, v8, s[0:1] 6274; GCN-NEXT: s_add_u32 s0, s10, s14 6275; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 6276; GCN-NEXT: s_mov_b32 s15, s14 6277; GCN-NEXT: s_addc_u32 s1, s11, s14 6278; GCN-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] 6279; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 6280; GCN-NEXT: v_mul_lo_u32 v5, s10, v3 6281; GCN-NEXT: v_mul_hi_u32 v7, s10, v2 6282; GCN-NEXT: v_mul_hi_u32 v9, s10, v3 6283; GCN-NEXT: v_mul_hi_u32 v10, s11, v3 6284; GCN-NEXT: v_mul_lo_u32 v3, s11, v3 6285; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 6286; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc 6287; GCN-NEXT: v_mul_lo_u32 v9, s11, v2 6288; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 6289; GCN-NEXT: v_mov_b32_e32 v8, s3 6290; GCN-NEXT: v_add_i32_e32 v5, vcc, v9, v5 6291; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v2, vcc 6292; GCN-NEXT: v_addc_u32_e32 v4, vcc, v10, v4, vcc 6293; GCN-NEXT: v_add_i32_e32 v2, 
vcc, v2, v3 6294; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v4, vcc 6295; GCN-NEXT: v_mul_lo_u32 v4, s12, v3 6296; GCN-NEXT: v_mul_hi_u32 v5, s12, v2 6297; GCN-NEXT: v_mul_lo_u32 v6, s13, v2 6298; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 6299; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc 6300; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 6301; GCN-NEXT: v_mul_lo_u32 v5, s12, v2 6302; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 6303; GCN-NEXT: v_sub_i32_e32 v6, vcc, s11, v4 6304; GCN-NEXT: v_mov_b32_e32 v7, s13 6305; GCN-NEXT: v_sub_i32_e32 v5, vcc, s10, v5 6306; GCN-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v7, vcc 6307; GCN-NEXT: v_subrev_i32_e64 v7, s[0:1], s12, v5 6308; GCN-NEXT: v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1] 6309; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v6 6310; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] 6311; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v7 6312; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 6313; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v6 6314; GCN-NEXT: v_cndmask_b32_e64 v6, v8, v7, s[0:1] 6315; GCN-NEXT: v_add_i32_e64 v7, s[0:1], 2, v2 6316; GCN-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1] 6317; GCN-NEXT: v_add_i32_e64 v9, s[0:1], 1, v2 6318; GCN-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] 6319; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 6320; GCN-NEXT: v_cndmask_b32_e64 v6, v10, v8, s[0:1] 6321; GCN-NEXT: v_mov_b32_e32 v8, s11 6322; GCN-NEXT: v_subb_u32_e32 v4, vcc, v8, v4, vcc 6323; GCN-NEXT: v_cmp_le_u32_e32 vcc, s13, v4 6324; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 6325; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v5 6326; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 6327; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s13, v4 6328; GCN-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc 6329; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 6330; GCN-NEXT: v_cndmask_b32_e64 v4, v9, v7, s[0:1] 6331; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 6332; GCN-NEXT: s_xor_b64 s[0:1], s[14:15], s[8:9] 6333; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 6334; GCN-NEXT: v_xor_b32_e32 v2, s0, 
v2 6335; GCN-NEXT: v_xor_b32_e32 v3, s1, v3 6336; GCN-NEXT: v_mov_b32_e32 v4, s1 6337; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s0, v2 6338; GCN-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc 6339; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 6340; GCN-NEXT: s_endpgm 6341 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 6342 %r = sdiv <2 x i64> %x, %shl.y 6343 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 6344 ret void 6345} 6346 6347define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 6348; CHECK-LABEL: @srem_i64_oddk_denom( 6349; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], 1235195 6350; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 6351; CHECK-NEXT: ret void 6352; 6353; GCN-LABEL: srem_i64_oddk_denom: 6354; GCN: ; %bb.0: 6355; GCN-NEXT: v_mov_b32_e32 v0, 0x4f800000 6356; GCN-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 6357; GCN-NEXT: v_rcp_f32_e32 v0, v0 6358; GCN-NEXT: s_mov_b32 s2, 0xffed2705 6359; GCN-NEXT: v_mov_b32_e32 v8, 0 6360; GCN-NEXT: v_mov_b32_e32 v7, 0 6361; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 6362; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 6363; GCN-NEXT: v_trunc_f32_e32 v1, v1 6364; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 6365; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 6366; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 6367; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 6368; GCN-NEXT: s_mov_b32 s7, 0xf000 6369; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 6370; GCN-NEXT: v_mul_lo_u32 v2, v1, s2 6371; GCN-NEXT: v_mul_lo_u32 v4, v0, s2 6372; GCN-NEXT: s_mov_b32 s6, -1 6373; GCN-NEXT: s_waitcnt lgkmcnt(0) 6374; GCN-NEXT: s_mov_b32 s4, s8 6375; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6376; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 6377; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 6378; GCN-NEXT: v_mul_hi_u32 v6, v0, v4 6379; GCN-NEXT: v_mul_hi_u32 v3, v0, v2 6380; GCN-NEXT: v_mul_hi_u32 v9, v1, v2 6381; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 6382; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 6383; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 6384; 
GCN-NEXT: v_mul_hi_u32 v4, v1, v4 6385; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v3, vcc 6386; GCN-NEXT: s_mov_b32 s5, s9 6387; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 6388; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc 6389; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc 6390; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6391; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 6392; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 6393; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 6394; GCN-NEXT: v_mul_lo_u32 v4, v2, s2 6395; GCN-NEXT: v_mul_hi_u32 v5, s2, v0 6396; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 6397; GCN-NEXT: v_mul_lo_u32 v5, v0, s2 6398; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v0, v4 6399; GCN-NEXT: v_mul_lo_u32 v10, v0, v4 6400; GCN-NEXT: v_mul_hi_u32 v12, v0, v4 6401; GCN-NEXT: v_mul_hi_u32 v11, v0, v5 6402; GCN-NEXT: v_mul_hi_u32 v9, v2, v5 6403; GCN-NEXT: v_mul_lo_u32 v5, v2, v5 6404; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 6405; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 6406; GCN-NEXT: v_addc_u32_e32 v11, vcc, v8, v12, vcc 6407; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 6408; GCN-NEXT: v_add_i32_e32 v5, vcc, v10, v5 6409; GCN-NEXT: v_addc_u32_e32 v5, vcc, v11, v9, vcc 6410; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v7, vcc 6411; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 6412; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc 6413; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 6414; GCN-NEXT: s_ashr_i32 s2, s11, 31 6415; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] 6416; GCN-NEXT: s_add_u32 s0, s10, s2 6417; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 6418; GCN-NEXT: s_mov_b32 s3, s2 6419; GCN-NEXT: s_addc_u32 s1, s11, s2 6420; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] 6421; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 6422; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 6423; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 6424; GCN-NEXT: v_mul_hi_u32 v4, s0, v1 6425; GCN-NEXT: v_mul_hi_u32 v5, s1, v1 6426; GCN-NEXT: v_mul_lo_u32 v1, s1, v1 6427; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6428; GCN-NEXT: v_addc_u32_e32 v3, 
vcc, v8, v4, vcc 6429; GCN-NEXT: v_mul_lo_u32 v4, s1, v0 6430; GCN-NEXT: v_mul_hi_u32 v0, s1, v0 6431; GCN-NEXT: s_mov_b32 s3, 0x12d8fb 6432; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 6433; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 6434; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc 6435; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 6436; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc 6437; GCN-NEXT: v_mul_hi_u32 v2, s3, v0 6438; GCN-NEXT: v_mul_lo_u32 v1, v1, s3 6439; GCN-NEXT: v_mul_lo_u32 v0, v0, s3 6440; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 6441; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 6442; GCN-NEXT: v_mov_b32_e32 v2, s1 6443; GCN-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 6444; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s3, v0 6445; GCN-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v1, vcc 6446; GCN-NEXT: v_subrev_i32_e32 v4, vcc, s3, v2 6447; GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v3, vcc 6448; GCN-NEXT: s_mov_b32 s0, 0x12d8fa 6449; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s0, v2 6450; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 6451; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 6452; GCN-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc 6453; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 6454; GCN-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v0 6455; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 6456; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 6457; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 6458; GCN-NEXT: v_cndmask_b32_e64 v5, -1, v5, s[0:1] 6459; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 6460; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 6461; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 6462; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 6463; GCN-NEXT: v_xor_b32_e32 v0, s2, v0 6464; GCN-NEXT: v_xor_b32_e32 v1, s2, v1 6465; GCN-NEXT: v_mov_b32_e32 v2, s2 6466; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 6467; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 6468; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 6469; GCN-NEXT: s_endpgm 6470 %r = srem i64 %x, 1235195 6471 store i64 %r, i64 addrspace(1)* %out 6472 
ret void 6473} 6474 6475define amdgpu_kernel void @srem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 6476; CHECK-LABEL: @srem_i64_pow2k_denom( 6477; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], 4096 6478; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 6479; CHECK-NEXT: ret void 6480; 6481; GCN-LABEL: srem_i64_pow2k_denom: 6482; GCN: ; %bb.0: 6483; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 6484; GCN-NEXT: s_mov_b32 s3, 0xf000 6485; GCN-NEXT: s_mov_b32 s2, -1 6486; GCN-NEXT: s_waitcnt lgkmcnt(0) 6487; GCN-NEXT: s_mov_b32 s0, s4 6488; GCN-NEXT: s_ashr_i32 s4, s7, 31 6489; GCN-NEXT: s_lshr_b32 s4, s4, 20 6490; GCN-NEXT: s_add_u32 s4, s6, s4 6491; GCN-NEXT: s_mov_b32 s1, s5 6492; GCN-NEXT: s_addc_u32 s5, s7, 0 6493; GCN-NEXT: s_and_b32 s4, s4, 0xfffff000 6494; GCN-NEXT: s_sub_u32 s4, s6, s4 6495; GCN-NEXT: s_subb_u32 s5, s7, s5 6496; GCN-NEXT: v_mov_b32_e32 v0, s4 6497; GCN-NEXT: v_mov_b32_e32 v1, s5 6498; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6499; GCN-NEXT: s_endpgm 6500 %r = srem i64 %x, 4096 6501 store i64 %r, i64 addrspace(1)* %out 6502 ret void 6503} 6504 6505define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 6506; CHECK-LABEL: @srem_i64_pow2_shl_denom( 6507; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 6508; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], [[SHL_Y]] 6509; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 6510; CHECK-NEXT: ret void 6511; 6512; GCN-LABEL: srem_i64_pow2_shl_denom: 6513; GCN: ; %bb.0: 6514; GCN-NEXT: s_load_dword s4, s[0:1], 0xd 6515; GCN-NEXT: s_mov_b32 s3, 0 6516; GCN-NEXT: s_movk_i32 s2, 0x1000 6517; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 6518; GCN-NEXT: s_mov_b32 s7, 0xf000 6519; GCN-NEXT: s_waitcnt lgkmcnt(0) 6520; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 6521; GCN-NEXT: s_ashr_i32 s4, s3, 31 6522; GCN-NEXT: s_add_u32 s2, s2, s4 6523; GCN-NEXT: s_mov_b32 s5, s4 6524; GCN-NEXT: s_addc_u32 s3, s3, s4 6525; GCN-NEXT: 
s_xor_b64 s[12:13], s[2:3], s[4:5] 6526; GCN-NEXT: v_cvt_f32_u32_e32 v0, s12 6527; GCN-NEXT: v_cvt_f32_u32_e32 v1, s13 6528; GCN-NEXT: s_sub_u32 s2, 0, s12 6529; GCN-NEXT: s_subb_u32 s3, 0, s13 6530; GCN-NEXT: s_ashr_i32 s14, s11, 31 6531; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 6532; GCN-NEXT: v_rcp_f32_e32 v0, v0 6533; GCN-NEXT: s_mov_b32 s15, s14 6534; GCN-NEXT: s_mov_b32 s6, -1 6535; GCN-NEXT: s_mov_b32 s4, s8 6536; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 6537; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 6538; GCN-NEXT: v_trunc_f32_e32 v1, v1 6539; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 6540; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 6541; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 6542; GCN-NEXT: s_mov_b32 s5, s9 6543; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 6544; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 6545; GCN-NEXT: v_mul_lo_u32 v5, s3, v0 6546; GCN-NEXT: v_mul_lo_u32 v4, s2, v0 6547; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6548; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 6549; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 6550; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 6551; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 6552; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 6553; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 6554; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 6555; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 6556; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 6557; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 6558; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 6559; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc 6560; GCN-NEXT: v_mov_b32_e32 v4, 0 6561; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc 6562; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6563; GCN-NEXT: v_mov_b32_e32 v6, 0 6564; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 6565; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc 6566; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 6567; GCN-NEXT: v_mul_lo_u32 v5, s2, v2 6568; GCN-NEXT: v_mul_hi_u32 v7, s2, v0 6569; GCN-NEXT: v_mul_lo_u32 v8, s3, v0 6570; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 6571; GCN-NEXT: v_mul_lo_u32 v7, s2, v0 6572; GCN-NEXT: 
v_add_i32_e32 v5, vcc, v8, v5 6573; GCN-NEXT: v_mul_lo_u32 v10, v0, v5 6574; GCN-NEXT: v_mul_hi_u32 v12, v0, v5 6575; GCN-NEXT: v_mul_hi_u32 v11, v0, v7 6576; GCN-NEXT: v_mul_hi_u32 v9, v2, v7 6577; GCN-NEXT: v_mul_lo_u32 v7, v2, v7 6578; GCN-NEXT: v_mul_hi_u32 v8, v2, v5 6579; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 6580; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc 6581; GCN-NEXT: v_mul_lo_u32 v2, v2, v5 6582; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 6583; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc 6584; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc 6585; GCN-NEXT: v_add_i32_e32 v2, vcc, v7, v2 6586; GCN-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc 6587; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 6588; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[0:1] 6589; GCN-NEXT: s_add_u32 s0, s10, s14 6590; GCN-NEXT: s_addc_u32 s1, s11, s14 6591; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 6592; GCN-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] 6593; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 6594; GCN-NEXT: v_mul_lo_u32 v2, s10, v1 6595; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 6596; GCN-NEXT: v_mul_hi_u32 v5, s10, v1 6597; GCN-NEXT: v_mul_hi_u32 v7, s11, v1 6598; GCN-NEXT: v_mul_lo_u32 v1, s11, v1 6599; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6600; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 6601; GCN-NEXT: v_mul_lo_u32 v5, s11, v0 6602; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 6603; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 6604; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 6605; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc 6606; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 6607; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc 6608; GCN-NEXT: v_mul_lo_u32 v1, s12, v1 6609; GCN-NEXT: v_mul_hi_u32 v2, s12, v0 6610; GCN-NEXT: v_mul_lo_u32 v3, s13, v0 6611; GCN-NEXT: v_mul_lo_u32 v0, s12, v0 6612; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 6613; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 6614; GCN-NEXT: v_sub_i32_e32 v2, vcc, s11, v1 6615; GCN-NEXT: v_mov_b32_e32 v3, s13 6616; GCN-NEXT: v_sub_i32_e32 
v0, vcc, s10, v0
; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
; GCN-NEXT:    v_subrev_i32_e64 v4, s[0:1], s12, v0
; GCN-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s13, v5
; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v4
; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s12, v4
; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s13, v5
; GCN-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v5, s11
; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s13, v1
; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v1
; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
; GCN-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GCN-NEXT:    v_xor_b32_e32 v0, s14, v0
; GCN-NEXT:    v_xor_b32_e32 v1, s14, v1
; GCN-NEXT:    v_mov_b32_e32 v2, s14
; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s14, v0
; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT:    s_endpgm
  %shl.y = shl i64 4096, %y
  %r = srem i64 %x, %shl.y
  store i64 %r, i64 addrspace(1)* %out
  ret void
}

; Test: signed remainder of a <2 x i64> value by the constant splat
; <i64 4096, i64 4096>.
;
; The opt assertions (amdgpu-codegenprepare run) expect the vector srem to be
; split into one scalar "srem i64 ..., 4096" per element, with the results
; re-packed via insertelement before the store.
;
; The llc assertions expect a purely scalar-ALU sequence per element
; (s_ashr_i32 by 31, s_lshr_b32 by 20, add with carry, s_and_b32 with the
; 0xf000 constant, then s_sub_u32/s_subb_u32); no v_rcp_f32 or multiply chain
; appears in the expected output for this function.
;
; The assertion lines below are tool-generated (see the NOTE lines at the top
; of the file); regenerate them with the update scripts rather than editing
; them by hand.
define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) {
; CHECK-LABEL: @srem_v2i64_pow2k_denom(
; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
; CHECK-NEXT:    [[TMP2:%.*]] = srem i64 [[TMP1]], 4096
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0
; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1
; CHECK-NEXT:    [[TMP5:%.*]] = srem i64 [[TMP4]], 4096
; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1
; CHECK-NEXT:    store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
; CHECK-NEXT:    ret void
;
; GCN-LABEL: srem_v2i64_pow2k_denom:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
; GCN-NEXT:    s_movk_i32 s8, 0xf000
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_ashr_i32 s9, s1, 31
; GCN-NEXT:    s_lshr_b32 s9, s9, 20
; GCN-NEXT:    s_add_u32 s9, s0, s9
; GCN-NEXT:    s_addc_u32 s10, s1, 0
; GCN-NEXT:    s_and_b32 s9, s9, s8
; GCN-NEXT:    s_sub_u32 s0, s0, s9
; GCN-NEXT:    s_subb_u32 s1, s1, s10
; GCN-NEXT:    s_ashr_i32 s9, s3, 31
; GCN-NEXT:    s_lshr_b32 s9, s9, 20
; GCN-NEXT:    s_add_u32 s9, s2, s9
; GCN-NEXT:    s_addc_u32 s10, s3, 0
; GCN-NEXT:    s_and_b32 s8, s9, s8
; GCN-NEXT:    s_sub_u32 s2, s2, s8
; GCN-NEXT:    s_subb_u32 s3, s3, s10
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    v_mov_b32_e32 v2, s2
; GCN-NEXT:    v_mov_b32_e32 v3, s3
; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCN-NEXT:    s_endpgm
  ; Divisor is a compile-time constant splat of 4096.
  %r = srem <2 x i64> %x, <i64 4096, i64 4096>
  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
  ret void
}

; Test: signed remainder of a <2 x i64> value by (<4096, 4096> shl %y), i.e.
; the divisor is computed at run time from the %y kernel argument.
;
; The opt assertions (amdgpu-codegenprepare run) expect the shl to stay a
; vector operation and the srem to be split into one scalar
; "srem i64 elt(x), elt(shl.y)" per element, re-packed via insertelement.
;
; The llc assertions expect, once per element, a long expansion that starts
; with v_cvt_f32_u32/v_rcp_f32, continues with v_mul_lo/v_mul_hi/v_addc
; chains, and ends with v_cmp/v_cndmask selects followed by xor/sub against
; the s_ashr_i32 ... 31 value of the dividend's high word.
; NOTE(review): this looks like the generic 64-bit signed remainder lowering
; (no power-of-two shortcut is taken) -- confirm against the backend.
;
; The assertion lines below are tool-generated (see the NOTE lines at the top
; of the file); regenerate them with the update scripts rather than editing
; them by hand.
define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) {
; CHECK-LABEL: @srem_v2i64_pow2_shl_denom(
; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]]
; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0
; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0
; CHECK-NEXT:    [[TMP3:%.*]] = srem i64 [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0
; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1
; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1
; CHECK-NEXT:    [[TMP7:%.*]] = srem i64 [[TMP5]], [[TMP6]]
; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1
; CHECK-NEXT:    store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16
; CHECK-NEXT:    ret void
;
; GCN-LABEL: srem_v2i64_pow2_shl_denom:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x11
; GCN-NEXT:    s_mov_b32 s3, 0
; GCN-NEXT:    s_movk_i32 s2, 0x1000
; GCN-NEXT:    s_mov_b32 s18, 0x4f800000
; GCN-NEXT:    s_mov_b32 s19, 0x5f7ffffc
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_lshl_b64 s[14:15], s[2:3], s6
; GCN-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
; GCN-NEXT:    s_ashr_i32 s4, s3, 31
; GCN-NEXT:    s_add_u32 s2, s2, s4
; GCN-NEXT:    s_mov_b32 s5, s4
; GCN-NEXT:    s_addc_u32 s3, s3, s4
; GCN-NEXT:    s_xor_b64 s[16:17], s[2:3], s[4:5]
; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s16
; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s17
; GCN-NEXT:    s_mov_b32 s20, 0x2f800000
; GCN-NEXT:    s_mov_b32 s21, 0xcf800000
; GCN-NEXT:    s_sub_u32 s6, 0, s16
; GCN-NEXT:    v_mac_f32_e32 v0, s18, v1
; GCN-NEXT:    v_rcp_f32_e32 v0, v0
; GCN-NEXT:    s_subb_u32 s7, 0, s17
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
; GCN-NEXT:    v_mul_f32_e32 v0, s19, v0
; GCN-NEXT:    v_mul_f32_e32 v1, s20, v0
; GCN-NEXT:    v_trunc_f32_e32 v1, v1
; GCN-NEXT:    v_mac_f32_e32 v0, s21, v1
; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_ashr_i32 s12, s9, 31
; GCN-NEXT:    s_add_u32 s0, s8, s12
; GCN-NEXT:    v_mul_hi_u32 v3, s6, v0
; GCN-NEXT:    v_mul_lo_u32 v2, s6, v1
; GCN-NEXT:    v_mul_lo_u32 v4, s7, v0
; GCN-NEXT:    v_mul_lo_u32 v5, s6, v0
; GCN-NEXT:    s_mov_b32 s13, s12
; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
; GCN-NEXT:    v_mul_lo_u32 v3, v0, v2
; GCN-NEXT:    v_mul_hi_u32 v4, v0, v5
; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
; GCN-NEXT:    v_mul_hi_u32 v7, v1, v2
; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v6, vcc
; GCN-NEXT:    v_mul_lo_u32 v6, v1, v5
; GCN-NEXT:    v_mul_hi_u32 v5, v1, v5
; GCN-NEXT:    s_addc_u32 s1, s9, s12
; GCN-NEXT:    s_xor_b64 s[8:9], s[0:1], s[12:13]
; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v4, v5, vcc
; GCN-NEXT:    v_mov_b32_e32 v4, 0
; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v4, vcc
; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
; GCN-NEXT:    v_mov_b32_e32 v6, 0
; GCN-NEXT:    v_add_i32_e64 v0, s[2:3], v0, v2
; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[2:3]
; GCN-NEXT:    v_mul_lo_u32 v5, s6, v2
; GCN-NEXT:    v_mul_hi_u32 v7, s6, v0
; GCN-NEXT:    v_mul_lo_u32 v8, s7, v0
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
; GCN-NEXT:    v_mul_lo_u32 v7, s6, v0
; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
; GCN-NEXT:    v_mul_lo_u32 v10, v0, v5
; GCN-NEXT:    v_mul_hi_u32 v12, v0, v5
; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
; GCN-NEXT:    v_mul_hi_u32 v9, v2, v7
; GCN-NEXT:    v_mul_lo_u32 v7, v2, v7
; GCN-NEXT:    v_mul_hi_u32 v8, v2, v5
; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
; GCN-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
; GCN-NEXT:    v_mul_lo_u32 v2, v2, v5
; GCN-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
; GCN-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[2:3]
; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN-NEXT:    v_mul_lo_u32 v2, s8, v1
; GCN-NEXT:    v_mul_hi_u32 v3, s8, v0
; GCN-NEXT:    v_mul_hi_u32 v5, s8, v1
; GCN-NEXT:    v_mul_hi_u32 v7, s9, v1
; GCN-NEXT:    v_mul_lo_u32 v1, s9, v1
; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
; GCN-NEXT:    v_mul_lo_u32 v5, s9, v0
; GCN-NEXT:    v_mul_hi_u32 v0, s9, v0
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
; GCN-NEXT:    v_mul_lo_u32 v1, s16, v1
; GCN-NEXT:    v_mul_hi_u32 v2, s16, v0
; GCN-NEXT:    v_mul_lo_u32 v3, s17, v0
; GCN-NEXT:    v_mul_lo_u32 v0, s16, v0
; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s9, v1
; GCN-NEXT:    v_mov_b32_e32 v3, s17
; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s8, v0
; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
; GCN-NEXT:    v_subrev_i32_e64 v5, s[0:1], s16, v0
; GCN-NEXT:    v_subbrev_u32_e64 v7, s[2:3], 0, v2, s[0:1]
; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s17, v7
; GCN-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s16, v5
; GCN-NEXT:    v_subrev_i32_e64 v3, s[0:1], s16, v5
; GCN-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s17, v7
; GCN-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
; GCN-NEXT:    s_ashr_i32 s2, s15, 31
; GCN-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
; GCN-NEXT:    s_add_u32 s8, s14, s2
; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v2, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v7, s9
; GCN-NEXT:    s_mov_b32 s3, s2
; GCN-NEXT:    s_addc_u32 s9, s15, s2
; GCN-NEXT:    s_xor_b64 s[8:9], s[8:9], s[2:3]
; GCN-NEXT:    v_cvt_f32_u32_e32 v8, s8
; GCN-NEXT:    v_cvt_f32_u32_e32 v9, s9
; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v7, v1, vcc
; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s17, v1
; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
; GCN-NEXT:    v_mac_f32_e32 v8, s18, v9
; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s16, v0
; GCN-NEXT:    v_rcp_f32_e32 v8, v8
; GCN-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s17, v1
; GCN-NEXT:    v_cndmask_b32_e32 v7, v7, v10, vcc
; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v3, s[0:1]
; GCN-NEXT:    v_mul_f32_e32 v3, s19, v8
; GCN-NEXT:    v_mul_f32_e32 v5, s20, v3
; GCN-NEXT:    v_trunc_f32_e32 v5, v5
; GCN-NEXT:    v_mac_f32_e32 v3, s21, v5
; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
; GCN-NEXT:    v_cvt_u32_f32_e32 v5, v5
; GCN-NEXT:    s_sub_u32 s2, 0, s8
; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GCN-NEXT:    v_mul_hi_u32 v2, s2, v3
; GCN-NEXT:    v_mul_lo_u32 v7, s2, v5
; GCN-NEXT:    s_subb_u32 s3, 0, s9
; GCN-NEXT:    v_mul_lo_u32 v8, s3, v3
; GCN-NEXT:    s_ashr_i32 s14, s11, 31
; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
; GCN-NEXT:    v_mul_lo_u32 v7, s2, v3
; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
; GCN-NEXT:    v_mul_lo_u32 v8, v3, v2
; GCN-NEXT:    v_mul_hi_u32 v10, v3, v2
; GCN-NEXT:    v_mul_hi_u32 v9, v3, v7
; GCN-NEXT:    v_mul_hi_u32 v11, v5, v2
; GCN-NEXT:    v_mul_lo_u32 v2, v5, v2
; GCN-NEXT:    s_mov_b32 s15, s14
; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
; GCN-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
; GCN-NEXT:    v_mul_lo_u32 v10, v5, v7
; GCN-NEXT:    v_mul_hi_u32 v7, v5, v7
; GCN-NEXT:    v_xor_b32_e32 v0, s12, v0
; GCN-NEXT:    v_xor_b32_e32 v1, s12, v1
; GCN-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v9, v7, vcc
; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v11, v4, vcc
; GCN-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
; GCN-NEXT:    v_add_i32_e64 v2, s[0:1], v3, v2
; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v6, v8, vcc
; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v5, v7, s[0:1]
; GCN-NEXT:    v_mul_lo_u32 v8, s2, v3
; GCN-NEXT:    v_mul_hi_u32 v9, s2, v2
; GCN-NEXT:    v_mul_lo_u32 v10, s3, v2
; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
; GCN-NEXT:    v_mul_lo_u32 v9, s2, v2
; GCN-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
; GCN-NEXT:    v_mul_lo_u32 v12, v2, v8
; GCN-NEXT:    v_mul_hi_u32 v14, v2, v8
; GCN-NEXT:    v_mul_hi_u32 v13, v2, v9
; GCN-NEXT:    v_mul_hi_u32 v11, v3, v9
; GCN-NEXT:    v_mul_lo_u32 v9, v3, v9
; GCN-NEXT:    v_mul_hi_u32 v10, v3, v8
; GCN-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
; GCN-NEXT:    v_addc_u32_e32 v13, vcc, 0, v14, vcc
; GCN-NEXT:    v_mul_lo_u32 v3, v3, v8
; GCN-NEXT:    v_add_i32_e32 v9, vcc, v12, v9
; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v13, v11, vcc
; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v10, v4, vcc
; GCN-NEXT:    v_add_i32_e32 v3, vcc, v9, v3
; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v6, v8, vcc
; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
; GCN-NEXT:    v_addc_u32_e64 v5, vcc, v5, v8, s[0:1]
; GCN-NEXT:    s_add_u32 s0, s10, s14
; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
; GCN-NEXT:    s_addc_u32 s1, s11, s14
; GCN-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
; GCN-NEXT:    v_mul_lo_u32 v5, s10, v3
; GCN-NEXT:    v_mul_hi_u32 v7, s10, v2
; GCN-NEXT:    v_mul_hi_u32 v9, s10, v3
; GCN-NEXT:    v_mul_hi_u32 v10, s11, v3
; GCN-NEXT:    v_mul_lo_u32 v3, s11, v3
; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v9, vcc
; GCN-NEXT:    v_mul_lo_u32 v9, s11, v2
; GCN-NEXT:    v_mul_hi_u32 v2, s11, v2
; GCN-NEXT:    v_mov_b32_e32 v8, s12
; GCN-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v7, v2, vcc
; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v10, v4, vcc
; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v6, v4, vcc
; GCN-NEXT:    v_mul_lo_u32 v3, s8, v3
; GCN-NEXT:    v_mul_hi_u32 v4, s8, v2
; GCN-NEXT:    v_mul_lo_u32 v5, s9, v2
; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s12, v0
; GCN-NEXT:    v_mul_lo_u32 v2, s8, v2
; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v8, vcc
; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s11, v3
; GCN-NEXT:    v_mov_b32_e32 v5, s9
; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s10, v2
; GCN-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
; GCN-NEXT:    v_subrev_i32_e64 v6, s[0:1], s8, v2
; GCN-NEXT:    v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1]
; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s9, v7
; GCN-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, s[0:1]
; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
; GCN-NEXT:    v_cmp_le_u32_e64 s[2:3], s8, v6
; GCN-NEXT:    v_subrev_i32_e64 v5, s[0:1], s8, v6
; GCN-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
; GCN-NEXT:    v_cmp_eq_u32_e64 s[2:3], s9, v7
; GCN-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
; GCN-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
; GCN-NEXT:    v_cndmask_b32_e64 v4, v7, v4, s[0:1]
; GCN-NEXT:    v_mov_b32_e32 v7, s11
; GCN-NEXT:    v_subb_u32_e32 v3, vcc, v7, v3, vcc
; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
; GCN-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v3
; GCN-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
; GCN-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
; GCN-NEXT:    v_xor_b32_e32 v2, s14, v2
; GCN-NEXT:    v_xor_b32_e32 v3, s14, v3
; GCN-NEXT:    v_mov_b32_e32 v4, s14
; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, s14, v2
; GCN-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCN-NEXT:    s_endpgm
  ; Divisor is only known at run time: the constant 4096 splat shifted left by %y.
  %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
  %r = srem <2 x i64> %x, %shl.y
  store <2 x i64> %r, <2 x i64> addrspace(1)* %out
  ret void
}