1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py 2; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 3; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare -amdgpu-bypass-slow-div=0 %s | FileCheck %s 4; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GCN %s 5 6define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) { 7; CHECK-LABEL: @udiv_i32( 8; CHECK-NEXT: [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float 9; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]]) 10; CHECK-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000 11; CHECK-NEXT: [[TMP4:%.*]] = fptoui float [[TMP3]] to i32 12; CHECK-NEXT: [[TMP5:%.*]] = sub i32 0, [[Y]] 13; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]] 14; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP4]] to i64 15; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP6]] to i64 16; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]] 17; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 18; CHECK-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP9]], 32 19; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 20; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]] 21; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64 22; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP13]] to i64 23; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]] 24; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32 25; CHECK-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP16]], 32 26; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 27; CHECK-NEXT: [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]] 28; CHECK-NEXT: [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]] 29; CHECK-NEXT: [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]] 30; CHECK-NEXT: [[TMP23:%.*]] = add i32 [[TMP19]], 1 31; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP19]] 32; CHECK-NEXT: [[TMP25:%.*]] = sub i32 [[TMP21]], [[Y]] 33; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP22]], i32 [[TMP25]], i32 [[TMP21]] 34; CHECK-NEXT: [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[Y]] 35; CHECK-NEXT: [[TMP28:%.*]] = add i32 [[TMP24]], 1 36; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP24]] 37; CHECK-NEXT: store i32 [[TMP29]], i32 addrspace(1)* [[OUT:%.*]], align 4 38; CHECK-NEXT: ret void 39; 40; GCN-LABEL: udiv_i32: 41; GCN: ; %bb.0: 42; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 43; GCN-NEXT: s_mov_b32 s7, 0xf000 44; GCN-NEXT: s_mov_b32 s6, -1 45; GCN-NEXT: s_waitcnt lgkmcnt(0) 46; GCN-NEXT: v_cvt_f32_u32_e32 v0, s3 47; GCN-NEXT: s_sub_i32 s4, 0, s3 48; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 49; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 50; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 51; GCN-NEXT: v_mul_lo_u32 v1, s4, v0 52; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 53; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 54; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 55; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 56; GCN-NEXT: v_mul_lo_u32 v1, v0, s3 57; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 58; GCN-NEXT: v_sub_i32_e32 v1, vcc, s2, v1 59; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1 60; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 61; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 62; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] 63; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 64; GCN-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 65; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 66; GCN-NEXT: s_waitcnt lgkmcnt(0) 67; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 68; GCN-NEXT: s_endpgm 69 %r = udiv i32 %x, %y 70 store i32 %r, i32 addrspace(1)* %out 71 ret void 72} 73 74define amdgpu_kernel void @urem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) { 75; CHECK-LABEL: @urem_i32( 76; CHECK-NEXT: [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float 77; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]]) 78; CHECK-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000 79; CHECK-NEXT: [[TMP4:%.*]] = fptoui float [[TMP3]] to i32 80; CHECK-NEXT: [[TMP5:%.*]] = sub i32 0, [[Y]] 81; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]] 82; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP4]] to i64 83; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP6]] to i64 84; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]] 85; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 86; CHECK-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP9]], 32 87; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 88; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]] 89; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64 90; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP13]] to i64 91; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]] 92; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32 93; CHECK-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP16]], 32 94; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 95; CHECK-NEXT: [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]] 96; CHECK-NEXT: [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]] 97; CHECK-NEXT: [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]] 98; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP21]], [[Y]] 99; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP21]] 100; CHECK-NEXT: [[TMP25:%.*]] = icmp uge i32 [[TMP24]], [[Y]] 101; CHECK-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], [[Y]] 102; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP25]], i32 [[TMP26]], i32 [[TMP24]] 103; CHECK-NEXT: store i32 [[TMP27]], i32 addrspace(1)* [[OUT:%.*]], align 4 104; CHECK-NEXT: ret void 105; 106; GCN-LABEL: urem_i32: 107; GCN: ; %bb.0: 108; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 109; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 110; GCN-NEXT: s_mov_b32 s3, 0xf000 111; GCN-NEXT: s_waitcnt lgkmcnt(0) 112; GCN-NEXT: v_cvt_f32_u32_e32 v0, s5 113; GCN-NEXT: s_sub_i32 s2, 0, s5 114; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 115; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 116; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 117; GCN-NEXT: v_mul_lo_u32 v1, s2, v0 118; GCN-NEXT: s_mov_b32 s2, -1 119; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 120; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 121; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 122; GCN-NEXT: v_mul_lo_u32 v0, v0, s5 123; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 124; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s5, v0 125; GCN-NEXT: v_cmp_le_u32_e32 vcc, s5, v0 126; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 127; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s5, v0 128; GCN-NEXT: v_cmp_le_u32_e32 vcc, s5, v0 129; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 130; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 131; GCN-NEXT: s_endpgm 132 %r = urem i32 %x, %y 133 store i32 %r, i32 addrspace(1)* %out 134 ret void 135} 136 137define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) { 138; CHECK-LABEL: @sdiv_i32( 139; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31 140; CHECK-NEXT: [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31 141; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 142; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[X]], [[TMP1]] 143; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[Y]], [[TMP2]] 144; CHECK-NEXT: [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP1]] 145; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP2]] 146; CHECK-NEXT: [[TMP8:%.*]] = uitofp i32 [[TMP7]] to float 147; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP8]]) 148; CHECK-NEXT: [[TMP10:%.*]] = fmul fast float [[TMP9]], 0x41EFFFFFC0000000 149; CHECK-NEXT: [[TMP11:%.*]] = fptoui float [[TMP10]] to i32 150; CHECK-NEXT: [[TMP12:%.*]] = sub i32 0, [[TMP7]] 151; CHECK-NEXT: [[TMP13:%.*]] = mul i32 [[TMP12]], [[TMP11]] 152; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP11]] to i64 153; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP13]] to i64 154; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]] 155; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32 156; CHECK-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP16]], 32 157; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 158; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP11]], [[TMP19]] 159; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP6]] to i64 160; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP20]] to i64 161; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP21]], [[TMP22]] 162; CHECK-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32 163; CHECK-NEXT: [[TMP25:%.*]] = lshr i64 [[TMP23]], 32 164; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32 165; CHECK-NEXT: [[TMP27:%.*]] = mul i32 [[TMP26]], [[TMP7]] 166; CHECK-NEXT: [[TMP28:%.*]] = sub i32 [[TMP6]], [[TMP27]] 167; CHECK-NEXT: [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP7]] 168; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP26]], 1 169; CHECK-NEXT: [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]] 170; CHECK-NEXT: [[TMP32:%.*]] = sub i32 [[TMP28]], [[TMP7]] 171; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP29]], i32 [[TMP32]], i32 [[TMP28]] 172; CHECK-NEXT: [[TMP34:%.*]] = icmp uge i32 [[TMP33]], [[TMP7]] 173; CHECK-NEXT: [[TMP35:%.*]] = add i32 [[TMP31]], 1 174; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP34]], i32 [[TMP35]], i32 [[TMP31]] 175; CHECK-NEXT: [[TMP37:%.*]] = xor i32 [[TMP36]], [[TMP3]] 176; CHECK-NEXT: [[TMP38:%.*]] = sub i32 [[TMP37]], [[TMP3]] 177; CHECK-NEXT: store i32 [[TMP38]], i32 addrspace(1)* [[OUT:%.*]], align 4 178; CHECK-NEXT: ret void 179; 180; GCN-LABEL: sdiv_i32: 181; GCN: ; %bb.0: 182; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 183; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 184; GCN-NEXT: s_mov_b32 s7, 0xf000 185; GCN-NEXT: s_mov_b32 s6, -1 186; GCN-NEXT: s_waitcnt lgkmcnt(0) 187; GCN-NEXT: s_ashr_i32 s8, s3, 31 188; GCN-NEXT: s_add_i32 s3, s3, s8 189; GCN-NEXT: s_xor_b32 s9, s3, s8 190; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 191; GCN-NEXT: s_sub_i32 s3, 0, s9 192; GCN-NEXT: s_ashr_i32 s0, s2, 31 193; GCN-NEXT: s_add_i32 s1, s2, s0 194; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 195; GCN-NEXT: s_xor_b32 s1, s1, s0 196; GCN-NEXT: s_xor_b32 s2, s0, s8 197; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 198; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 199; GCN-NEXT: v_mul_lo_u32 v1, s3, v0 200; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 201; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 202; GCN-NEXT: v_mul_hi_u32 v0, s1, v0 203; GCN-NEXT: v_mul_lo_u32 v1, v0, s9 204; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 205; GCN-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 206; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v1 207; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 208; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s9, v1 209; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] 210; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 211; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 212; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 213; GCN-NEXT: v_xor_b32_e32 v0, s2, v0 214; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 215; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 216; GCN-NEXT: s_endpgm 217 %r = sdiv i32 %x, %y 218 store i32 %r, i32 addrspace(1)* %out 219 ret void 220} 221 222define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) { 223; CHECK-LABEL: @srem_i32( 224; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31 225; CHECK-NEXT: [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31 226; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[X]], [[TMP1]] 227; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[Y]], [[TMP2]] 228; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP1]] 229; CHECK-NEXT: [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP2]] 230; CHECK-NEXT: [[TMP7:%.*]] = uitofp i32 [[TMP6]] to float 231; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 232; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP8]], 0x41EFFFFFC0000000 233; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP9]] to i32 234; CHECK-NEXT: [[TMP11:%.*]] = sub i32 0, [[TMP6]] 235; CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], [[TMP10]] 236; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP10]] to i64 237; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP12]] to i64 238; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP13]], [[TMP14]] 239; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32 240; CHECK-NEXT: [[TMP17:%.*]] = lshr i64 [[TMP15]], 32 241; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 242; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP10]], [[TMP18]] 243; CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP5]] to i64 244; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP19]] to i64 245; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP20]], [[TMP21]] 246; CHECK-NEXT: [[TMP23:%.*]] = trunc i64 [[TMP22]] to i32 247; CHECK-NEXT: [[TMP24:%.*]] = lshr i64 [[TMP22]], 32 248; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32 249; CHECK-NEXT: [[TMP26:%.*]] = mul i32 [[TMP25]], [[TMP6]] 250; CHECK-NEXT: [[TMP27:%.*]] = sub i32 [[TMP5]], [[TMP26]] 251; CHECK-NEXT: [[TMP28:%.*]] = icmp uge i32 [[TMP27]], [[TMP6]] 252; CHECK-NEXT: [[TMP29:%.*]] = sub i32 [[TMP27]], [[TMP6]] 253; CHECK-NEXT: [[TMP30:%.*]] = select i1 [[TMP28]], i32 [[TMP29]], i32 [[TMP27]] 254; CHECK-NEXT: [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP6]] 255; CHECK-NEXT: [[TMP32:%.*]] = sub i32 [[TMP30]], [[TMP6]] 256; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP30]] 257; CHECK-NEXT: [[TMP34:%.*]] = xor i32 [[TMP33]], [[TMP1]] 258; CHECK-NEXT: [[TMP35:%.*]] = sub i32 [[TMP34]], [[TMP1]] 259; CHECK-NEXT: store i32 [[TMP35]], i32 addrspace(1)* [[OUT:%.*]], align 4 260; CHECK-NEXT: ret void 261; 262; GCN-LABEL: srem_i32: 263; GCN: ; %bb.0: 264; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 265; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 266; GCN-NEXT: s_waitcnt lgkmcnt(0) 267; GCN-NEXT: s_ashr_i32 s4, s3, 31 268; GCN-NEXT: s_add_i32 s3, s3, s4 269; GCN-NEXT: s_xor_b32 s4, s3, s4 270; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 271; GCN-NEXT: s_sub_i32 s3, 0, s4 272; GCN-NEXT: s_ashr_i32 s5, s2, 31 273; GCN-NEXT: s_add_i32 s2, s2, s5 274; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 275; GCN-NEXT: s_xor_b32 s6, s2, s5 276; GCN-NEXT: s_mov_b32 s2, -1 277; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 278; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 279; GCN-NEXT: v_mul_lo_u32 v1, s3, v0 280; GCN-NEXT: s_mov_b32 s3, 0xf000 281; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 282; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 283; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 284; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 285; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 286; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0 287; GCN-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 288; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 289; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0 290; GCN-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 291; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 292; GCN-NEXT: v_xor_b32_e32 v0, s5, v0 293; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s5, v0 294; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 295; GCN-NEXT: s_endpgm 296 %r = srem i32 %x, %y 297 store i32 %r, i32 addrspace(1)* %out 298 ret void 299} 300 301define amdgpu_kernel void @udiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) { 302; CHECK-LABEL: @udiv_i16( 303; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32 304; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32 305; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 306; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 307; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 308; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 309; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 310; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 311; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 312; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 313; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 314; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 315; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 316; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 317; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 318; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 65535 319; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16 320; CHECK-NEXT: store i16 [[TMP17]], i16 addrspace(1)* [[OUT:%.*]], align 2 321; CHECK-NEXT: ret void 322; 323; GCN-LABEL: udiv_i16: 324; GCN: ; %bb.0: 325; GCN-NEXT: s_load_dword s2, s[0:1], 0xb 326; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 327; GCN-NEXT: s_waitcnt lgkmcnt(0) 328; GCN-NEXT: s_lshr_b32 s3, s2, 16 329; GCN-NEXT: v_cvt_f32_u32_e32 v0, s3 330; GCN-NEXT: s_and_b32 s2, s2, 0xffff 331; GCN-NEXT: v_cvt_f32_u32_e32 v1, s2 332; GCN-NEXT: s_mov_b32 s3, 0xf000 333; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 334; GCN-NEXT: s_mov_b32 s2, -1 335; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 336; GCN-NEXT: v_trunc_f32_e32 v2, v2 337; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 338; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 339; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 340; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 341; GCN-NEXT: buffer_store_short v0, off, s[0:3], 0 342; GCN-NEXT: s_endpgm 343 %r = udiv i16 %x, %y 344 store i16 %r, i16 addrspace(1)* %out 345 ret void 346} 347 348define amdgpu_kernel void @urem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) { 349; CHECK-LABEL: @urem_i16( 350; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32 351; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32 352; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 353; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 354; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 355; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 356; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 357; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 358; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 359; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 360; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 361; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 362; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 363; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 364; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 365; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]] 366; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]] 367; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 65535 368; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16 369; CHECK-NEXT: store i16 [[TMP19]], i16 addrspace(1)* [[OUT:%.*]], align 2 370; CHECK-NEXT: ret void 371; 372; GCN-LABEL: urem_i16: 373; GCN: ; %bb.0: 374; GCN-NEXT: s_load_dword s4, s[0:1], 0xb 375; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 376; GCN-NEXT: s_waitcnt lgkmcnt(0) 377; GCN-NEXT: s_lshr_b32 s2, s4, 16 378; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 379; GCN-NEXT: s_and_b32 s3, s4, 0xffff 380; GCN-NEXT: v_cvt_f32_u32_e32 v1, s3 381; GCN-NEXT: s_mov_b32 s3, 0xf000 382; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 383; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 384; GCN-NEXT: v_trunc_f32_e32 v2, v2 385; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 386; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 387; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 388; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 389; GCN-NEXT: v_mul_lo_u32 v0, v0, s2 390; GCN-NEXT: s_mov_b32 s2, -1 391; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 392; GCN-NEXT: buffer_store_short v0, off, s[0:3], 0 393; GCN-NEXT: s_endpgm 394 %r = urem i16 %x, %y 395 store i16 %r, i16 addrspace(1)* %out 396 ret void 397} 398 399define amdgpu_kernel void @sdiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) { 400; CHECK-LABEL: @sdiv_i16( 401; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32 402; CHECK-NEXT: [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32 403; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 404; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 405; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 406; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 407; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 408; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 409; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 410; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 411; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 412; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 413; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 414; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 415; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 416; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 417; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 418; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 419; CHECK-NEXT: [[TMP19:%.*]] = shl i32 [[TMP18]], 16 420; CHECK-NEXT: [[TMP20:%.*]] = ashr i32 [[TMP19]], 16 421; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16 422; CHECK-NEXT: store i16 [[TMP21]], i16 addrspace(1)* [[OUT:%.*]], align 2 423; CHECK-NEXT: ret void 424; 425; GCN-LABEL: sdiv_i16: 426; GCN: ; %bb.0: 427; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 428; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 429; GCN-NEXT: s_mov_b32 s7, 0xf000 430; GCN-NEXT: s_mov_b32 s6, -1 431; GCN-NEXT: s_waitcnt lgkmcnt(0) 432; GCN-NEXT: s_ashr_i32 s1, s0, 16 433; GCN-NEXT: v_cvt_f32_i32_e32 v0, s1 434; GCN-NEXT: s_sext_i32_i16 s0, s0 435; GCN-NEXT: v_cvt_f32_i32_e32 v1, s0 436; GCN-NEXT: s_xor_b32 s0, s0, s1 437; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 438; GCN-NEXT: s_ashr_i32 s0, s0, 30 439; GCN-NEXT: s_or_b32 s2, s0, 1 440; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 441; GCN-NEXT: v_trunc_f32_e32 v2, v2 442; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 443; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 444; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 445; GCN-NEXT: s_cmp_lg_u32 s0, 0 446; GCN-NEXT: s_cselect_b32 s0, s2, 0 447; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v2 448; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0 449; GCN-NEXT: s_endpgm 450 %r = sdiv i16 %x, %y 451 store i16 %r, i16 addrspace(1)* %out 452 ret void 453} 454 455define amdgpu_kernel void @srem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) { 456; CHECK-LABEL: @srem_i16( 457; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32 458; CHECK-NEXT: [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32 459; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 460; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 461; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 462; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 463; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 464; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 465; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 466; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 467; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 468; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 469; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 470; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 471; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 472; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 473; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 474; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 475; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]] 476; CHECK-NEXT: [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]] 477; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 16 478; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 16 479; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16 480; CHECK-NEXT: store i16 [[TMP23]], i16 addrspace(1)* [[OUT:%.*]], align 2 481; CHECK-NEXT: ret void 482; 483; GCN-LABEL: srem_i16: 484; GCN: ; %bb.0: 485; GCN-NEXT: s_load_dword s4, s[0:1], 0xb 486; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 487; GCN-NEXT: s_waitcnt lgkmcnt(0) 488; GCN-NEXT: s_ashr_i32 s5, s4, 16 489; GCN-NEXT: v_cvt_f32_i32_e32 v0, s5 490; GCN-NEXT: s_sext_i32_i16 s2, s4 491; GCN-NEXT: v_cvt_f32_i32_e32 v1, s2 492; GCN-NEXT: s_xor_b32 s2, s2, s5 493; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 494; GCN-NEXT: s_ashr_i32 s2, s2, 30 495; GCN-NEXT: s_or_b32 s6, s2, 1 496; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 497; GCN-NEXT: v_trunc_f32_e32 v2, v2 498; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 499; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 500; GCN-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| 501; GCN-NEXT: s_cmp_lg_u32 s2, 0 502; GCN-NEXT: s_cselect_b32 s2, s6, 0 503; GCN-NEXT: v_add_i32_e32 v0, vcc, s2, v2 504; GCN-NEXT: v_mul_lo_u32 v0, v0, s5 505; GCN-NEXT: s_mov_b32 s3, 0xf000 506; GCN-NEXT: s_mov_b32 s2, -1 507; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 508; GCN-NEXT: buffer_store_short v0, off, s[0:3], 0 509; GCN-NEXT: s_endpgm 510 %r = srem i16 %x, %y 511 store i16 %r, i16 addrspace(1)* %out 512 ret void 513} 514 515define amdgpu_kernel void @udiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) { 516; CHECK-LABEL: @udiv_i8( 517; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32 518; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32 519; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 520; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 521; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 522; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 523; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 524; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 525; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 526; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 527; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 528; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 529; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 530; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 531; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 532; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 255 533; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 534; CHECK-NEXT: store i8 [[TMP17]], i8 addrspace(1)* [[OUT:%.*]], align 1 535; CHECK-NEXT: ret void 536; 537; GCN-LABEL: udiv_i8: 538; GCN: ; %bb.0: 539; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 540; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 541; GCN-NEXT: s_mov_b32 s7, 0xf000 542; GCN-NEXT: s_mov_b32 s6, -1 543; GCN-NEXT: s_waitcnt lgkmcnt(0) 544; GCN-NEXT: v_cvt_f32_ubyte1_e32 v0, s0 545; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 546; GCN-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 547; GCN-NEXT: v_mul_f32_e32 v1, v2, v1 548; GCN-NEXT: v_trunc_f32_e32 v1, v1 549; GCN-NEXT: v_cvt_u32_f32_e32 v3, v1 550; GCN-NEXT: v_mad_f32 v1, -v1, v0, v2 551; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 552; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 553; GCN-NEXT: buffer_store_byte v0, off, s[4:7], 0 554; GCN-NEXT: s_endpgm 555 %r = udiv i8 %x, %y 556 store i8 %r, i8 addrspace(1)* %out 557 ret void 558} 559 560define amdgpu_kernel void @urem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) { 561; CHECK-LABEL: @urem_i8( 562; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32 563; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32 564; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 565; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 566; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 567; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 568; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 569; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 570; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 571; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 572; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 573; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 574; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 575; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 576; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 577; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]] 578; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]] 579; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 255 580; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i8 581; CHECK-NEXT: store i8 [[TMP19]], i8 addrspace(1)* [[OUT:%.*]], align 1 582; CHECK-NEXT: ret void 583; 584; GCN-LABEL: urem_i8: 585; GCN: ; %bb.0: 586; GCN-NEXT: s_load_dword s4, s[0:1], 0xb 587; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 588; GCN-NEXT: s_mov_b32 s3, 0xf000 589; GCN-NEXT: s_waitcnt lgkmcnt(0) 590; GCN-NEXT: v_cvt_f32_ubyte1_e32 v0, s4 591; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 592; GCN-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 593; GCN-NEXT: s_lshr_b32 s2, s4, 8 594; GCN-NEXT: v_mul_f32_e32 v1, v2, v1 595; GCN-NEXT: v_trunc_f32_e32 v1, v1 596; GCN-NEXT: v_cvt_u32_f32_e32 v3, v1 597; GCN-NEXT: v_mad_f32 v1, -v1, v0, v2 598; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 599; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 600; GCN-NEXT: v_mul_lo_u32 v0, v0, s2 601; GCN-NEXT: s_mov_b32 s2, -1 602; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 603; GCN-NEXT: buffer_store_byte v0, off, s[0:3], 0 604; GCN-NEXT: s_endpgm 605 %r = urem i8 %x, %y 606 store i8 %r, i8 addrspace(1)* %out 607 ret void 608} 609 610define amdgpu_kernel void @sdiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) { 611; CHECK-LABEL: @sdiv_i8( 612; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32 613; CHECK-NEXT: [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32 614; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 615; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 616; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 617; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 618; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 619; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 620; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 621; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 622; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 623; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 624; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 625; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 626; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 627; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 628; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 629; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 630; CHECK-NEXT: [[TMP19:%.*]] = shl i32 [[TMP18]], 24 631; CHECK-NEXT: [[TMP20:%.*]] = ashr i32 [[TMP19]], 24 632; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i8 633; CHECK-NEXT: store i8 [[TMP21]], i8 addrspace(1)* [[OUT:%.*]], align 1 634; CHECK-NEXT: ret void 635; 636; GCN-LABEL: sdiv_i8: 637; GCN: ; %bb.0: 638; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 639; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 640; GCN-NEXT: s_mov_b32 s7, 0xf000 641; GCN-NEXT: s_mov_b32 s6, -1 642; GCN-NEXT: s_waitcnt lgkmcnt(0) 643; GCN-NEXT: s_bfe_i32 s1, s0, 0x80008 644; GCN-NEXT: v_cvt_f32_i32_e32 v0, s1 645; GCN-NEXT: s_sext_i32_i8 s0, s0 646; GCN-NEXT: v_cvt_f32_i32_e32 v1, s0 647; GCN-NEXT: s_xor_b32 s0, s0, s1 648; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 649; GCN-NEXT: s_ashr_i32 s0, s0, 30 650; GCN-NEXT: s_or_b32 s2, s0, 1 651; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 652; GCN-NEXT: v_trunc_f32_e32 v2, v2 653; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 654; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 655; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 656; GCN-NEXT: s_cmp_lg_u32 s0, 0 657; GCN-NEXT: s_cselect_b32 s0, s2, 0 658; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v2 659; GCN-NEXT: buffer_store_byte v0, off, s[4:7], 0 660; GCN-NEXT: s_endpgm 661 %r = sdiv i8 %x, %y 662 store i8 %r, i8 addrspace(1)* %out 663 ret void 664} 665 666define amdgpu_kernel void @srem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) { 667; CHECK-LABEL: @srem_i8( 668; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32 669; CHECK-NEXT: [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32 670; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 671; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 672; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 673; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 674; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 675; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 676; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 677; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 678; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 679; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 680; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 681; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 682; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 683; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 684; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 685; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 686; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]] 687; CHECK-NEXT: [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]] 688; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 24 689; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 24 690; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i8 691; CHECK-NEXT: store i8 [[TMP23]], i8 addrspace(1)* [[OUT:%.*]], align 1 692; CHECK-NEXT: ret void 693; 694; GCN-LABEL: srem_i8: 695; GCN: ; %bb.0: 696; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 697; GCN-NEXT: s_load_dword s2, s[0:1], 0xb 698; GCN-NEXT: s_mov_b32 s7, 0xf000 699; GCN-NEXT: s_waitcnt lgkmcnt(0) 700; GCN-NEXT: s_bfe_i32 s0, s2, 0x80008 701; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0 702; GCN-NEXT: s_sext_i32_i8 s1, s2 703; GCN-NEXT: v_cvt_f32_i32_e32 v1, s1 704; GCN-NEXT: s_xor_b32 s0, s1, s0 705; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 706; GCN-NEXT: s_ashr_i32 s0, s0, 30 707; GCN-NEXT: s_lshr_b32 s3, s2, 8 708; GCN-NEXT: s_or_b32 s6, s0, 1 709; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 710; GCN-NEXT: v_trunc_f32_e32 v2, v2 711; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 712; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 713; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 714; GCN-NEXT: s_cmp_lg_u32 s0, 0 715; GCN-NEXT: s_cselect_b32 s0, s6, 0 716; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v2 717; GCN-NEXT: v_mul_lo_u32 v0, v0, s3 718; GCN-NEXT: s_mov_b32 s6, -1 719; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 720; GCN-NEXT: buffer_store_byte v0, off, s[4:7], 0 721; GCN-NEXT: s_endpgm 722 %r = srem i8 %x, %y 723 store i8 %r, i8 addrspace(1)* %out 724 ret void 725} 726 727define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 728; CHECK-LABEL: @udiv_v4i32( 729; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 730; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 731; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 732; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 733; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 734; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 735; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 736; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 737; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 738; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 739; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 740; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 741; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 742; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 743; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 744; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 745; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 746; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 747; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 748; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 749; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 750; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 751; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 752; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 753; CHECK-NEXT: [[TMP25:%.*]] = add i32 [[TMP21]], 1 754; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]] 755; CHECK-NEXT: [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]] 756; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]] 757; CHECK-NEXT: [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]] 758; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP26]], 1 759; CHECK-NEXT: [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]] 760; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> undef, i32 [[TMP31]], i64 0 761; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i32> [[X]], i64 1 762; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[Y]], i64 1 763; CHECK-NEXT: [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float 764; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 765; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000 766; CHECK-NEXT: [[TMP38:%.*]] = fptoui float [[TMP37]] to i32 767; CHECK-NEXT: [[TMP39:%.*]] = sub i32 0, [[TMP34]] 768; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]] 769; CHECK-NEXT: [[TMP41:%.*]] = zext i32 [[TMP38]] to i64 770; CHECK-NEXT: [[TMP42:%.*]] = zext i32 [[TMP40]] to i64 771; CHECK-NEXT: [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]] 772; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 773; CHECK-NEXT: [[TMP45:%.*]] = lshr i64 [[TMP43]], 32 774; CHECK-NEXT: [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32 775; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]] 776; CHECK-NEXT: [[TMP48:%.*]] = zext i32 [[TMP33]] to i64 777; CHECK-NEXT: [[TMP49:%.*]] = zext i32 [[TMP47]] to i64 778; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]] 779; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 780; CHECK-NEXT: [[TMP52:%.*]] = lshr i64 [[TMP50]], 32 781; CHECK-NEXT: [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32 782; CHECK-NEXT: [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]] 783; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]] 784; CHECK-NEXT: [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]] 785; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP53]], 1 786; CHECK-NEXT: [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]] 787; CHECK-NEXT: [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]] 788; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]] 789; CHECK-NEXT: [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]] 790; CHECK-NEXT: [[TMP62:%.*]] = add i32 [[TMP58]], 1 791; CHECK-NEXT: [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]] 792; CHECK-NEXT: [[TMP64:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP63]], i64 1 793; CHECK-NEXT: [[TMP65:%.*]] = extractelement <4 x i32> [[X]], i64 2 794; CHECK-NEXT: [[TMP66:%.*]] = extractelement <4 x i32> [[Y]], i64 2 795; CHECK-NEXT: [[TMP67:%.*]] = uitofp i32 [[TMP66]] to float 796; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP67]]) 797; CHECK-NEXT: [[TMP69:%.*]] = fmul fast float [[TMP68]], 0x41EFFFFFC0000000 798; CHECK-NEXT: [[TMP70:%.*]] = fptoui float [[TMP69]] to i32 799; CHECK-NEXT: [[TMP71:%.*]] = sub i32 0, [[TMP66]] 800; CHECK-NEXT: [[TMP72:%.*]] = mul i32 [[TMP71]], [[TMP70]] 801; CHECK-NEXT: [[TMP73:%.*]] = zext i32 [[TMP70]] to i64 802; CHECK-NEXT: [[TMP74:%.*]] = zext i32 [[TMP72]] to i64 803; CHECK-NEXT: [[TMP75:%.*]] = mul i64 [[TMP73]], [[TMP74]] 804; CHECK-NEXT: [[TMP76:%.*]] = trunc i64 [[TMP75]] to i32 805; CHECK-NEXT: [[TMP77:%.*]] = lshr i64 [[TMP75]], 32 806; CHECK-NEXT: [[TMP78:%.*]] = trunc i64 [[TMP77]] to i32 807; CHECK-NEXT: [[TMP79:%.*]] = add i32 [[TMP70]], [[TMP78]] 808; CHECK-NEXT: [[TMP80:%.*]] = zext i32 [[TMP65]] to i64 809; CHECK-NEXT: [[TMP81:%.*]] = zext i32 [[TMP79]] to i64 810; CHECK-NEXT: [[TMP82:%.*]] = mul i64 [[TMP80]], [[TMP81]] 811; CHECK-NEXT: [[TMP83:%.*]] = trunc i64 [[TMP82]] to i32 812; CHECK-NEXT: [[TMP84:%.*]] = lshr i64 [[TMP82]], 32 813; CHECK-NEXT: [[TMP85:%.*]] = trunc i64 [[TMP84]] to i32 814; CHECK-NEXT: [[TMP86:%.*]] = mul i32 [[TMP85]], [[TMP66]] 815; CHECK-NEXT: [[TMP87:%.*]] = sub i32 [[TMP65]], [[TMP86]] 816; CHECK-NEXT: [[TMP88:%.*]] = icmp uge i32 [[TMP87]], [[TMP66]] 817; CHECK-NEXT: [[TMP89:%.*]] = add i32 [[TMP85]], 1 818; CHECK-NEXT: [[TMP90:%.*]] = select i1 [[TMP88]], i32 [[TMP89]], i32 [[TMP85]] 819; CHECK-NEXT: [[TMP91:%.*]] = sub i32 [[TMP87]], [[TMP66]] 820; CHECK-NEXT: [[TMP92:%.*]] = select i1 [[TMP88]], i32 [[TMP91]], i32 [[TMP87]] 821; CHECK-NEXT: [[TMP93:%.*]] = icmp uge i32 [[TMP92]], [[TMP66]] 822; CHECK-NEXT: [[TMP94:%.*]] = add i32 [[TMP90]], 1 823; CHECK-NEXT: [[TMP95:%.*]] = select i1 [[TMP93]], i32 [[TMP94]], i32 [[TMP90]] 824; CHECK-NEXT: [[TMP96:%.*]] = insertelement <4 x i32> [[TMP64]], i32 [[TMP95]], i64 2 825; CHECK-NEXT: [[TMP97:%.*]] = extractelement <4 x i32> [[X]], i64 3 826; CHECK-NEXT: [[TMP98:%.*]] = extractelement <4 x i32> [[Y]], i64 3 827; CHECK-NEXT: [[TMP99:%.*]] = uitofp i32 [[TMP98]] to float 828; CHECK-NEXT: [[TMP100:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP99]]) 829; CHECK-NEXT: [[TMP101:%.*]] = fmul fast float [[TMP100]], 0x41EFFFFFC0000000 830; CHECK-NEXT: [[TMP102:%.*]] = fptoui float [[TMP101]] to i32 831; CHECK-NEXT: [[TMP103:%.*]] = sub i32 0, [[TMP98]] 832; CHECK-NEXT: [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP102]] 833; CHECK-NEXT: [[TMP105:%.*]] = zext i32 [[TMP102]] to i64 834; CHECK-NEXT: [[TMP106:%.*]] = zext i32 [[TMP104]] to i64 835; CHECK-NEXT: [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]] 836; CHECK-NEXT: [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32 837; CHECK-NEXT: [[TMP109:%.*]] = lshr i64 [[TMP107]], 32 838; CHECK-NEXT: [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32 839; CHECK-NEXT: [[TMP111:%.*]] = add i32 [[TMP102]], [[TMP110]] 840; CHECK-NEXT: [[TMP112:%.*]] = zext i32 [[TMP97]] to i64 841; CHECK-NEXT: [[TMP113:%.*]] = zext i32 [[TMP111]] to i64 842; CHECK-NEXT: [[TMP114:%.*]] = mul i64 [[TMP112]], [[TMP113]] 843; CHECK-NEXT: [[TMP115:%.*]] = trunc i64 [[TMP114]] to i32 844; CHECK-NEXT: [[TMP116:%.*]] = lshr i64 [[TMP114]], 32 845; CHECK-NEXT: [[TMP117:%.*]] = trunc i64 [[TMP116]] to i32 846; CHECK-NEXT: [[TMP118:%.*]] = mul i32 [[TMP117]], [[TMP98]] 847; CHECK-NEXT: [[TMP119:%.*]] = sub i32 [[TMP97]], [[TMP118]] 848; CHECK-NEXT: [[TMP120:%.*]] = icmp uge i32 [[TMP119]], [[TMP98]] 849; CHECK-NEXT: [[TMP121:%.*]] = add i32 [[TMP117]], 1 850; CHECK-NEXT: [[TMP122:%.*]] = select i1 [[TMP120]], i32 [[TMP121]], i32 [[TMP117]] 851; CHECK-NEXT: [[TMP123:%.*]] = sub i32 [[TMP119]], [[TMP98]] 852; CHECK-NEXT: [[TMP124:%.*]] = select i1 [[TMP120]], i32 [[TMP123]], i32 [[TMP119]] 853; CHECK-NEXT: [[TMP125:%.*]] = icmp uge i32 [[TMP124]], [[TMP98]] 854; CHECK-NEXT: [[TMP126:%.*]] = add i32 [[TMP122]], 1 855; CHECK-NEXT: [[TMP127:%.*]] = select i1 [[TMP125]], i32 [[TMP126]], i32 [[TMP122]] 856; CHECK-NEXT: [[TMP128:%.*]] = insertelement <4 x i32> [[TMP96]], i32 [[TMP127]], i64 3 857; CHECK-NEXT: store <4 x i32> [[TMP128]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 858; CHECK-NEXT: ret void 859; 860; GCN-LABEL: udiv_v4i32: 861; GCN: ; %bb.0: 862; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 863; GCN-NEXT: s_mov_b32 s12, 0x4f7ffffe 864; GCN-NEXT: s_mov_b32 s15, 0xf000 865; GCN-NEXT: s_mov_b32 s14, -1 866; GCN-NEXT: s_waitcnt lgkmcnt(0) 867; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 868; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 869; GCN-NEXT: s_sub_i32 s2, 0, s8 870; GCN-NEXT: v_cvt_f32_u32_e32 v3, s10 871; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 872; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 873; GCN-NEXT: v_mul_f32_e32 v0, s12, v0 874; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 875; GCN-NEXT: v_mul_f32_e32 v1, s12, v1 876; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 877; GCN-NEXT: v_mul_lo_u32 v2, s2, v0 878; GCN-NEXT: s_sub_i32 s2, 0, s9 879; GCN-NEXT: v_mul_lo_u32 v4, s2, v1 880; GCN-NEXT: v_mul_hi_u32 v2, v0, v2 881; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 882; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 883; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v3 884; GCN-NEXT: v_mul_hi_u32 v3, v1, v4 885; GCN-NEXT: v_mul_lo_u32 v4, v0, s8 886; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v0 887; GCN-NEXT: v_mul_f32_e32 v2, s12, v2 888; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 889; GCN-NEXT: v_sub_i32_e32 v4, vcc, s4, v4 890; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v4 891; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[2:3] 892; GCN-NEXT: v_subrev_i32_e32 v5, vcc, s8, v4 893; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[2:3] 894; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v0 895; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v4 896; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc 897; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 898; GCN-NEXT: v_mul_hi_u32 v1, s5, v1 899; GCN-NEXT: s_sub_i32 s4, 0, s10 900; GCN-NEXT: v_mul_lo_u32 v5, s4, v2 901; GCN-NEXT: s_sub_i32 s4, 0, s11 902; GCN-NEXT: v_mul_lo_u32 v3, v1, s9 903; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v1 904; GCN-NEXT: v_mul_hi_u32 v5, v2, v5 905; GCN-NEXT: v_sub_i32_e32 v3, vcc, s5, v3 906; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v3 907; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[2:3] 908; GCN-NEXT: v_subrev_i32_e32 v4, vcc, s9, v3 909; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[2:3] 910; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v1 911; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 912; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 913; GCN-NEXT: v_cvt_f32_u32_e32 v4, s11 914; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 915; GCN-NEXT: v_mul_hi_u32 v2, s6, v2 916; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v4 917; GCN-NEXT: v_mul_lo_u32 v3, v2, s10 918; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v2 919; GCN-NEXT: v_mul_f32_e32 v4, s12, v4 920; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 921; GCN-NEXT: v_sub_i32_e32 v3, vcc, s6, v3 922; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s10, v3 923; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[2:3] 924; GCN-NEXT: v_mul_lo_u32 v6, s4, v4 925; GCN-NEXT: v_subrev_i32_e32 v5, vcc, s10, v3 926; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[2:3] 927; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v2 928; GCN-NEXT: v_mul_hi_u32 v6, v4, v6 929; GCN-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 930; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 931; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 932; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v4 933; GCN-NEXT: v_mul_hi_u32 v3, s7, v3 934; GCN-NEXT: v_mul_lo_u32 v4, v3, s11 935; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v3 936; GCN-NEXT: v_sub_i32_e32 v4, vcc, s7, v4 937; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v4 938; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 939; GCN-NEXT: v_subrev_i32_e32 v5, vcc, s11, v4 940; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] 941; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v3 942; GCN-NEXT: v_cmp_le_u32_e32 vcc, s11, v4 943; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 944; GCN-NEXT: s_waitcnt lgkmcnt(0) 945; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 946; GCN-NEXT: s_endpgm 947 %r = udiv <4 x i32> %x, %y 948 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 949 ret void 950} 951 952define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 953; CHECK-LABEL: @urem_v4i32( 954; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 955; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 956; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 957; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 958; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 959; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 960; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 961; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 962; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 963; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 964; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 965; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 966; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 967; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 968; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 969; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 970; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 971; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 972; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 973; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 974; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 975; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 976; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 977; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 978; CHECK-NEXT: [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]] 979; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]] 980; CHECK-NEXT: [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]] 981; CHECK-NEXT: [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]] 982; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]] 983; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x i32> undef, i32 [[TMP29]], i64 0 984; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[X]], i64 1 985; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[Y]], i64 1 986; CHECK-NEXT: [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float 987; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 988; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000 989; CHECK-NEXT: [[TMP36:%.*]] = fptoui float [[TMP35]] to i32 990; CHECK-NEXT: [[TMP37:%.*]] = sub i32 0, [[TMP32]] 991; CHECK-NEXT: [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]] 992; CHECK-NEXT: [[TMP39:%.*]] = zext i32 [[TMP36]] to i64 993; CHECK-NEXT: [[TMP40:%.*]] = zext i32 [[TMP38]] to i64 994; CHECK-NEXT: [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]] 995; CHECK-NEXT: [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32 996; CHECK-NEXT: [[TMP43:%.*]] = lshr i64 [[TMP41]], 32 997; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 998; CHECK-NEXT: [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]] 999; CHECK-NEXT: [[TMP46:%.*]] = zext i32 [[TMP31]] to i64 1000; CHECK-NEXT: [[TMP47:%.*]] = zext i32 [[TMP45]] to i64 1001; CHECK-NEXT: [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]] 1002; CHECK-NEXT: [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32 1003; CHECK-NEXT: [[TMP50:%.*]] = lshr i64 [[TMP48]], 32 1004; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 1005; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]] 1006; CHECK-NEXT: [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]] 1007; CHECK-NEXT: [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]] 1008; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]] 1009; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]] 1010; CHECK-NEXT: [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]] 1011; CHECK-NEXT: [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]] 1012; CHECK-NEXT: [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]] 1013; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP59]], i64 1 1014; CHECK-NEXT: [[TMP61:%.*]] = extractelement <4 x i32> [[X]], i64 2 1015; CHECK-NEXT: [[TMP62:%.*]] = extractelement <4 x i32> [[Y]], i64 2 1016; CHECK-NEXT: [[TMP63:%.*]] = uitofp i32 [[TMP62]] to float 1017; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP63]]) 1018; CHECK-NEXT: [[TMP65:%.*]] = fmul fast float [[TMP64]], 0x41EFFFFFC0000000 1019; CHECK-NEXT: [[TMP66:%.*]] = fptoui float [[TMP65]] to i32 1020; CHECK-NEXT: [[TMP67:%.*]] = sub i32 0, [[TMP62]] 1021; CHECK-NEXT: [[TMP68:%.*]] = mul i32 [[TMP67]], [[TMP66]] 1022; CHECK-NEXT: [[TMP69:%.*]] = zext i32 [[TMP66]] to i64 1023; CHECK-NEXT: [[TMP70:%.*]] = zext i32 [[TMP68]] to i64 1024; CHECK-NEXT: [[TMP71:%.*]] = mul i64 [[TMP69]], [[TMP70]] 1025; CHECK-NEXT: [[TMP72:%.*]] = trunc i64 [[TMP71]] to i32 1026; CHECK-NEXT: [[TMP73:%.*]] = lshr i64 [[TMP71]], 32 1027; CHECK-NEXT: [[TMP74:%.*]] = trunc i64 [[TMP73]] to i32 1028; CHECK-NEXT: [[TMP75:%.*]] = add i32 [[TMP66]], [[TMP74]] 1029; CHECK-NEXT: [[TMP76:%.*]] = zext i32 [[TMP61]] to i64 1030; CHECK-NEXT: [[TMP77:%.*]] = zext i32 [[TMP75]] to i64 1031; CHECK-NEXT: [[TMP78:%.*]] = mul i64 [[TMP76]], [[TMP77]] 1032; CHECK-NEXT: [[TMP79:%.*]] = trunc i64 [[TMP78]] to i32 1033; CHECK-NEXT: [[TMP80:%.*]] = lshr i64 [[TMP78]], 32 1034; CHECK-NEXT: [[TMP81:%.*]] = trunc i64 [[TMP80]] to i32 1035; CHECK-NEXT: [[TMP82:%.*]] = mul i32 [[TMP81]], [[TMP62]] 1036; CHECK-NEXT: [[TMP83:%.*]] = sub i32 [[TMP61]], [[TMP82]] 1037; CHECK-NEXT: [[TMP84:%.*]] = icmp uge i32 [[TMP83]], [[TMP62]] 1038; CHECK-NEXT: [[TMP85:%.*]] = sub i32 [[TMP83]], [[TMP62]] 1039; CHECK-NEXT: [[TMP86:%.*]] = select i1 [[TMP84]], i32 [[TMP85]], i32 [[TMP83]] 1040; CHECK-NEXT: [[TMP87:%.*]] = icmp uge i32 [[TMP86]], [[TMP62]] 1041; CHECK-NEXT: [[TMP88:%.*]] = sub i32 [[TMP86]], [[TMP62]] 1042; CHECK-NEXT: [[TMP89:%.*]] = select i1 [[TMP87]], i32 [[TMP88]], i32 [[TMP86]] 1043; CHECK-NEXT: [[TMP90:%.*]] = insertelement <4 x i32> [[TMP60]], i32 [[TMP89]], i64 2 1044; CHECK-NEXT: [[TMP91:%.*]] = extractelement <4 x i32> [[X]], i64 3 1045; CHECK-NEXT: [[TMP92:%.*]] = extractelement <4 x i32> [[Y]], i64 3 1046; CHECK-NEXT: [[TMP93:%.*]] = uitofp i32 [[TMP92]] to float 1047; CHECK-NEXT: [[TMP94:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP93]]) 1048; CHECK-NEXT: [[TMP95:%.*]] = fmul fast float [[TMP94]], 0x41EFFFFFC0000000 1049; CHECK-NEXT: [[TMP96:%.*]] = fptoui float [[TMP95]] to i32 1050; CHECK-NEXT: [[TMP97:%.*]] = sub i32 0, [[TMP92]] 1051; CHECK-NEXT: [[TMP98:%.*]] = mul i32 [[TMP97]], [[TMP96]] 1052; CHECK-NEXT: [[TMP99:%.*]] = zext i32 [[TMP96]] to i64 1053; CHECK-NEXT: [[TMP100:%.*]] = zext i32 [[TMP98]] to i64 1054; CHECK-NEXT: [[TMP101:%.*]] = mul i64 [[TMP99]], [[TMP100]] 1055; CHECK-NEXT: [[TMP102:%.*]] = trunc i64 [[TMP101]] to i32 1056; CHECK-NEXT: [[TMP103:%.*]] = lshr i64 [[TMP101]], 32 1057; CHECK-NEXT: [[TMP104:%.*]] = trunc i64 [[TMP103]] to i32 1058; CHECK-NEXT: [[TMP105:%.*]] = add i32 [[TMP96]], [[TMP104]] 1059; CHECK-NEXT: [[TMP106:%.*]] = zext i32 [[TMP91]] to i64 1060; CHECK-NEXT: [[TMP107:%.*]] = zext i32 [[TMP105]] to i64 1061; CHECK-NEXT: [[TMP108:%.*]] = mul i64 [[TMP106]], [[TMP107]] 1062; CHECK-NEXT: [[TMP109:%.*]] = trunc i64 [[TMP108]] to i32 1063; CHECK-NEXT: [[TMP110:%.*]] = lshr i64 [[TMP108]], 32 1064; CHECK-NEXT: [[TMP111:%.*]] = trunc i64 [[TMP110]] to i32 1065; CHECK-NEXT: [[TMP112:%.*]] = mul i32 [[TMP111]], [[TMP92]] 1066; CHECK-NEXT: [[TMP113:%.*]] = sub i32 [[TMP91]], [[TMP112]] 1067; CHECK-NEXT: [[TMP114:%.*]] = icmp uge i32 [[TMP113]], [[TMP92]] 1068; CHECK-NEXT: [[TMP115:%.*]] = sub i32 [[TMP113]], [[TMP92]] 1069; CHECK-NEXT: [[TMP116:%.*]] = select i1 [[TMP114]], i32 [[TMP115]], i32 [[TMP113]] 1070; CHECK-NEXT: [[TMP117:%.*]] = icmp uge i32 [[TMP116]], [[TMP92]] 1071; CHECK-NEXT: [[TMP118:%.*]] = sub i32 [[TMP116]], [[TMP92]] 1072; CHECK-NEXT: [[TMP119:%.*]] = select i1 [[TMP117]], i32 [[TMP118]], i32 [[TMP116]] 1073; CHECK-NEXT: [[TMP120:%.*]] = insertelement <4 x i32> [[TMP90]], i32 [[TMP119]], i64 3 1074; CHECK-NEXT: store <4 x i32> [[TMP120]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 1075; CHECK-NEXT: ret void 1076; 1077; GCN-LABEL: urem_v4i32: 1078; GCN: ; %bb.0: 1079; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 1080; GCN-NEXT: s_mov_b32 s12, 0x4f7ffffe 1081; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1082; GCN-NEXT: s_waitcnt lgkmcnt(0) 1083; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 1084; GCN-NEXT: s_sub_i32 s2, 0, s8 1085; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 1086; GCN-NEXT: v_cvt_f32_u32_e32 v4, s11 1087; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 1088; GCN-NEXT: s_sub_i32 s3, 0, s9 1089; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 1090; GCN-NEXT: v_cvt_f32_u32_e32 v2, s10 1091; GCN-NEXT: v_mul_f32_e32 v0, s12, v0 1092; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 1093; GCN-NEXT: v_mul_f32_e32 v1, s12, v1 1094; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 1095; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 1096; GCN-NEXT: v_mul_lo_u32 v3, s2, v0 1097; GCN-NEXT: s_sub_i32 s2, 0, s10 1098; GCN-NEXT: v_mul_f32_e32 v2, s12, v2 1099; GCN-NEXT: v_mul_hi_u32 v3, v0, v3 1100; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 1101; GCN-NEXT: v_add_i32_e32 v0, vcc, v3, v0 1102; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 1103; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v4 1104; GCN-NEXT: v_mul_lo_u32 v4, s3, v1 1105; GCN-NEXT: s_mov_b32 s3, 0xf000 1106; GCN-NEXT: v_mul_lo_u32 v0, v0, s8 1107; GCN-NEXT: v_mul_f32_e32 v3, s12, v3 1108; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 1109; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 1110; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 1111; GCN-NEXT: v_subrev_i32_e32 v5, vcc, s8, v0 1112; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 1113; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc 1114; GCN-NEXT: v_subrev_i32_e32 v5, vcc, s8, v0 1115; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 1116; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc 1117; GCN-NEXT: v_add_i32_e32 v1, vcc, v4, v1 1118; GCN-NEXT: v_mul_hi_u32 v1, s5, v1 1119; GCN-NEXT: v_mul_lo_u32 v4, s2, v2 1120; GCN-NEXT: s_sub_i32 s2, 0, s11 1121; GCN-NEXT: v_mul_lo_u32 v1, v1, s9 1122; GCN-NEXT: v_mul_hi_u32 v4, v2, v4 1123; GCN-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 1124; GCN-NEXT: v_subrev_i32_e32 v5, vcc, s9, v1 1125; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 1126; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 1127; GCN-NEXT: v_subrev_i32_e32 v5, vcc, s9, v1 1128; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 1129; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 1130; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 1131; GCN-NEXT: v_mul_hi_u32 v2, s6, v2 1132; GCN-NEXT: v_mul_lo_u32 v4, s2, v3 1133; GCN-NEXT: s_mov_b32 s2, -1 1134; GCN-NEXT: v_mul_lo_u32 v2, v2, s10 1135; GCN-NEXT: v_mul_hi_u32 v4, v3, v4 1136; GCN-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 1137; GCN-NEXT: v_subrev_i32_e32 v5, vcc, s10, v2 1138; GCN-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 1139; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 1140; GCN-NEXT: v_subrev_i32_e32 v5, vcc, s10, v2 1141; GCN-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 1142; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 1143; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 1144; GCN-NEXT: v_mul_hi_u32 v3, s7, v3 1145; GCN-NEXT: v_mul_lo_u32 v3, v3, s11 1146; GCN-NEXT: v_sub_i32_e32 v3, vcc, s7, v3 1147; GCN-NEXT: v_subrev_i32_e32 v4, vcc, s11, v3 1148; GCN-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1149; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 1150; GCN-NEXT: v_subrev_i32_e32 v4, vcc, s11, v3 1151; GCN-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1152; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 1153; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1154; GCN-NEXT: s_endpgm 1155 %r = urem <4 x i32> %x, %y 1156 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 1157 ret void 1158} 1159 1160define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 1161; CHECK-LABEL: @sdiv_v4i32( 1162; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 1163; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 1164; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 1165; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 1166; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 1167; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]] 1168; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]] 1169; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]] 1170; CHECK-NEXT: [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]] 1171; CHECK-NEXT: [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float 1172; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]]) 1173; CHECK-NEXT: [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000 1174; CHECK-NEXT: [[TMP13:%.*]] = fptoui float [[TMP12]] to i32 1175; CHECK-NEXT: [[TMP14:%.*]] = sub i32 0, [[TMP9]] 1176; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]] 1177; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP13]] to i64 1178; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 1179; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 1180; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 1181; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 1182; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 1183; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]] 1184; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP8]] to i64 1185; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[TMP22]] to i64 1186; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]] 1187; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32 1188; CHECK-NEXT: [[TMP27:%.*]] = lshr i64 [[TMP25]], 32 1189; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 1190; CHECK-NEXT: [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]] 1191; CHECK-NEXT: [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]] 1192; CHECK-NEXT: [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]] 1193; CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP28]], 1 1194; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]] 1195; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]] 1196; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]] 1197; CHECK-NEXT: [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]] 1198; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP33]], 1 1199; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]] 1200; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]] 1201; CHECK-NEXT: [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]] 1202; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x i32> undef, i32 [[TMP40]], i64 0 1203; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i32> [[X]], i64 1 1204; CHECK-NEXT: [[TMP43:%.*]] = extractelement <4 x i32> [[Y]], i64 1 1205; CHECK-NEXT: [[TMP44:%.*]] = ashr i32 [[TMP42]], 31 1206; CHECK-NEXT: [[TMP45:%.*]] = ashr i32 [[TMP43]], 31 1207; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]] 1208; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]] 1209; CHECK-NEXT: [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]] 1210; CHECK-NEXT: [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]] 1211; CHECK-NEXT: [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]] 1212; CHECK-NEXT: [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float 1213; CHECK-NEXT: [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]]) 1214; CHECK-NEXT: [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000 1215; CHECK-NEXT: [[TMP54:%.*]] = fptoui float [[TMP53]] to i32 1216; CHECK-NEXT: [[TMP55:%.*]] = sub i32 0, [[TMP50]] 1217; CHECK-NEXT: [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]] 1218; CHECK-NEXT: [[TMP57:%.*]] = zext i32 [[TMP54]] to i64 1219; CHECK-NEXT: [[TMP58:%.*]] = zext i32 [[TMP56]] to i64 1220; CHECK-NEXT: [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]] 1221; CHECK-NEXT: [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32 1222; CHECK-NEXT: [[TMP61:%.*]] = lshr i64 [[TMP59]], 32 1223; CHECK-NEXT: [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32 1224; CHECK-NEXT: [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]] 1225; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[TMP49]] to i64 1226; CHECK-NEXT: [[TMP65:%.*]] = zext i32 [[TMP63]] to i64 1227; CHECK-NEXT: [[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]] 1228; CHECK-NEXT: [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32 1229; CHECK-NEXT: [[TMP68:%.*]] = lshr i64 [[TMP66]], 32 1230; CHECK-NEXT: [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32 1231; CHECK-NEXT: [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]] 1232; CHECK-NEXT: [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]] 1233; CHECK-NEXT: [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]] 1234; CHECK-NEXT: [[TMP73:%.*]] = add i32 [[TMP69]], 1 1235; CHECK-NEXT: [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]] 1236; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]] 1237; CHECK-NEXT: [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]] 1238; CHECK-NEXT: [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]] 1239; CHECK-NEXT: [[TMP78:%.*]] = add i32 [[TMP74]], 1 1240; CHECK-NEXT: [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]] 1241; CHECK-NEXT: [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]] 1242; CHECK-NEXT: [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]] 1243; CHECK-NEXT: [[TMP82:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[TMP81]], i64 1 1244; CHECK-NEXT: [[TMP83:%.*]] = extractelement <4 x i32> [[X]], i64 2 1245; CHECK-NEXT: [[TMP84:%.*]] = extractelement <4 x i32> [[Y]], i64 2 1246; CHECK-NEXT: [[TMP85:%.*]] = ashr i32 [[TMP83]], 31 1247; CHECK-NEXT: [[TMP86:%.*]] = ashr i32 [[TMP84]], 31 1248; CHECK-NEXT: [[TMP87:%.*]] = xor i32 [[TMP85]], [[TMP86]] 1249; CHECK-NEXT: [[TMP88:%.*]] = add i32 [[TMP83]], [[TMP85]] 1250; CHECK-NEXT: [[TMP89:%.*]] = add i32 [[TMP84]], [[TMP86]] 1251; CHECK-NEXT: [[TMP90:%.*]] = xor i32 [[TMP88]], [[TMP85]] 1252; CHECK-NEXT: [[TMP91:%.*]] = xor i32 [[TMP89]], [[TMP86]] 1253; CHECK-NEXT: [[TMP92:%.*]] = uitofp i32 [[TMP91]] to float 1254; CHECK-NEXT: [[TMP93:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP92]]) 1255; CHECK-NEXT: [[TMP94:%.*]] = fmul fast float [[TMP93]], 0x41EFFFFFC0000000 1256; CHECK-NEXT: [[TMP95:%.*]] = fptoui float [[TMP94]] to i32 1257; CHECK-NEXT: [[TMP96:%.*]] = sub i32 0, [[TMP91]] 1258; CHECK-NEXT: [[TMP97:%.*]] = mul i32 [[TMP96]], [[TMP95]] 1259; CHECK-NEXT: [[TMP98:%.*]] = zext i32 [[TMP95]] to i64 1260; CHECK-NEXT: [[TMP99:%.*]] = zext i32 [[TMP97]] to i64 1261; CHECK-NEXT: [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]] 1262; CHECK-NEXT: [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32 1263; CHECK-NEXT: [[TMP102:%.*]] = lshr i64 [[TMP100]], 32 1264; CHECK-NEXT: [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32 1265; CHECK-NEXT: [[TMP104:%.*]] = add i32 [[TMP95]], [[TMP103]] 1266; CHECK-NEXT: [[TMP105:%.*]] = zext i32 [[TMP90]] to i64 1267; CHECK-NEXT: [[TMP106:%.*]] = zext i32 [[TMP104]] to i64 1268; CHECK-NEXT: [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]] 1269; CHECK-NEXT: [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32 1270; CHECK-NEXT: [[TMP109:%.*]] = lshr i64 [[TMP107]], 32 1271; CHECK-NEXT: [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32 1272; CHECK-NEXT: [[TMP111:%.*]] = mul i32 [[TMP110]], [[TMP91]] 1273; CHECK-NEXT: [[TMP112:%.*]] = sub i32 [[TMP90]], [[TMP111]] 1274; CHECK-NEXT: [[TMP113:%.*]] = icmp uge i32 [[TMP112]], [[TMP91]] 1275; CHECK-NEXT: [[TMP114:%.*]] = add i32 [[TMP110]], 1 1276; CHECK-NEXT: [[TMP115:%.*]] = select i1 [[TMP113]], i32 [[TMP114]], i32 [[TMP110]] 1277; CHECK-NEXT: [[TMP116:%.*]] = sub i32 [[TMP112]], [[TMP91]] 1278; CHECK-NEXT: [[TMP117:%.*]] = select i1 [[TMP113]], i32 [[TMP116]], i32 [[TMP112]] 1279; CHECK-NEXT: [[TMP118:%.*]] = icmp uge i32 [[TMP117]], [[TMP91]] 1280; CHECK-NEXT: [[TMP119:%.*]] = add i32 [[TMP115]], 1 1281; CHECK-NEXT: [[TMP120:%.*]] = select i1 [[TMP118]], i32 [[TMP119]], i32 [[TMP115]] 1282; CHECK-NEXT: [[TMP121:%.*]] = xor i32 [[TMP120]], [[TMP87]] 1283; CHECK-NEXT: [[TMP122:%.*]] = sub i32 [[TMP121]], [[TMP87]] 1284; CHECK-NEXT: [[TMP123:%.*]] = insertelement <4 x i32> [[TMP82]], i32 [[TMP122]], i64 2 1285; CHECK-NEXT: [[TMP124:%.*]] = extractelement <4 x i32> [[X]], i64 3 1286; CHECK-NEXT: [[TMP125:%.*]] = extractelement <4 x i32> [[Y]], i64 3 1287; CHECK-NEXT: [[TMP126:%.*]] = ashr i32 [[TMP124]], 31 1288; CHECK-NEXT: [[TMP127:%.*]] = ashr i32 [[TMP125]], 31 1289; CHECK-NEXT: [[TMP128:%.*]] = xor i32 [[TMP126]], [[TMP127]] 1290; CHECK-NEXT: [[TMP129:%.*]] = add i32 [[TMP124]], [[TMP126]] 1291; CHECK-NEXT: [[TMP130:%.*]] = add i32 [[TMP125]], [[TMP127]] 1292; CHECK-NEXT: [[TMP131:%.*]] = xor i32 [[TMP129]], [[TMP126]] 1293; CHECK-NEXT: [[TMP132:%.*]] = xor i32 [[TMP130]], [[TMP127]] 1294; CHECK-NEXT: [[TMP133:%.*]] = uitofp i32 [[TMP132]] to float 1295; CHECK-NEXT: [[TMP134:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP133]]) 1296; CHECK-NEXT: [[TMP135:%.*]] = fmul fast float [[TMP134]], 0x41EFFFFFC0000000 1297; CHECK-NEXT: [[TMP136:%.*]] = fptoui float [[TMP135]] to i32 1298; CHECK-NEXT: [[TMP137:%.*]] = sub i32 0, [[TMP132]] 1299; CHECK-NEXT: [[TMP138:%.*]] = mul i32 [[TMP137]], [[TMP136]] 1300; CHECK-NEXT: [[TMP139:%.*]] = zext i32 [[TMP136]] to i64 1301; CHECK-NEXT: [[TMP140:%.*]] = zext i32 [[TMP138]] to i64 1302; CHECK-NEXT: [[TMP141:%.*]] = mul i64 [[TMP139]], [[TMP140]] 1303; CHECK-NEXT: [[TMP142:%.*]] = trunc i64 [[TMP141]] to i32 1304; CHECK-NEXT: [[TMP143:%.*]] = lshr i64 [[TMP141]], 32 1305; CHECK-NEXT: [[TMP144:%.*]] = trunc i64 [[TMP143]] to i32 1306; CHECK-NEXT: [[TMP145:%.*]] = add i32 [[TMP136]], [[TMP144]] 1307; CHECK-NEXT: [[TMP146:%.*]] = zext i32 [[TMP131]] to i64 1308; CHECK-NEXT: [[TMP147:%.*]] = zext i32 [[TMP145]] to i64 1309; CHECK-NEXT: [[TMP148:%.*]] = mul i64 [[TMP146]], [[TMP147]] 1310; CHECK-NEXT: [[TMP149:%.*]] = trunc i64 [[TMP148]] to i32 1311; CHECK-NEXT: [[TMP150:%.*]] = lshr i64 [[TMP148]], 32 1312; CHECK-NEXT: [[TMP151:%.*]] = trunc i64 [[TMP150]] to i32 1313; CHECK-NEXT: [[TMP152:%.*]] = mul i32 [[TMP151]], [[TMP132]] 1314; CHECK-NEXT: [[TMP153:%.*]] = sub i32 [[TMP131]], [[TMP152]] 1315; CHECK-NEXT: [[TMP154:%.*]] = icmp uge i32 [[TMP153]], [[TMP132]] 1316; CHECK-NEXT: [[TMP155:%.*]] = add i32 [[TMP151]], 1 1317; CHECK-NEXT: [[TMP156:%.*]] = select i1 [[TMP154]], i32 [[TMP155]], i32 [[TMP151]] 1318; CHECK-NEXT: [[TMP157:%.*]] = sub i32 [[TMP153]], [[TMP132]] 1319; CHECK-NEXT: [[TMP158:%.*]] = select i1 [[TMP154]], i32 [[TMP157]], i32 [[TMP153]] 1320; CHECK-NEXT: [[TMP159:%.*]] = icmp uge i32 [[TMP158]], [[TMP132]] 1321; CHECK-NEXT: [[TMP160:%.*]] = add i32 [[TMP156]], 1 1322; CHECK-NEXT: [[TMP161:%.*]] = select i1 [[TMP159]], i32 [[TMP160]], i32 [[TMP156]] 1323; CHECK-NEXT: [[TMP162:%.*]] = xor i32 [[TMP161]], [[TMP128]] 1324; CHECK-NEXT: [[TMP163:%.*]] = sub i32 [[TMP162]], [[TMP128]] 1325; CHECK-NEXT: [[TMP164:%.*]] = insertelement <4 x i32> [[TMP123]], i32 [[TMP163]], i64 3 1326; CHECK-NEXT: store <4 x i32> [[TMP164]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 1327; CHECK-NEXT: ret void 1328; 1329; GCN-LABEL: sdiv_v4i32: 1330; GCN: ; %bb.0: 1331; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 1332; GCN-NEXT: s_mov_b32 s16, 0x4f7ffffe 1333; GCN-NEXT: s_waitcnt lgkmcnt(0) 1334; GCN-NEXT: s_ashr_i32 s14, s8, 31 1335; GCN-NEXT: s_add_i32 s2, s8, s14 1336; GCN-NEXT: s_xor_b32 s12, s2, s14 1337; GCN-NEXT: v_cvt_f32_u32_e32 v0, s12 1338; GCN-NEXT: s_ashr_i32 s8, s9, 31 1339; GCN-NEXT: s_add_i32 s2, s9, s8 1340; GCN-NEXT: s_xor_b32 s15, s2, s8 1341; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 1342; GCN-NEXT: v_cvt_f32_u32_e32 v1, s15 1343; GCN-NEXT: s_sub_i32 s3, 0, s12 1344; GCN-NEXT: s_ashr_i32 s9, s4, 31 1345; GCN-NEXT: v_mul_f32_e32 v0, s16, v0 1346; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 1347; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 1348; GCN-NEXT: s_add_i32 s2, s4, s9 1349; GCN-NEXT: s_xor_b32 s2, s2, s9 1350; GCN-NEXT: v_mul_lo_u32 v2, s3, v0 1351; GCN-NEXT: v_mul_f32_e32 v1, s16, v1 1352; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 1353; GCN-NEXT: s_sub_i32 s3, 0, s15 1354; GCN-NEXT: v_mul_hi_u32 v2, v0, v2 1355; GCN-NEXT: s_ashr_i32 s4, s5, 31 1356; GCN-NEXT: v_mul_lo_u32 v3, s3, v1 1357; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 1358; GCN-NEXT: v_mul_hi_u32 v0, s2, v0 1359; GCN-NEXT: v_mul_hi_u32 v2, v1, v3 1360; GCN-NEXT: v_mul_lo_u32 v3, v0, s12 1361; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v0 1362; GCN-NEXT: v_sub_i32_e32 v3, vcc, s2, v3 1363; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v3 1364; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[2:3] 1365; GCN-NEXT: v_subrev_i32_e32 v4, vcc, s12, v3 1366; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[2:3] 1367; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v0 1368; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 1369; GCN-NEXT: s_add_i32 s2, s5, s4 1370; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 1371; GCN-NEXT: s_xor_b32 s2, s2, s4 1372; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 1373; GCN-NEXT: v_mul_hi_u32 v1, s2, v1 1374; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 1375; GCN-NEXT: s_xor_b32 s0, s9, s14 1376; GCN-NEXT: v_xor_b32_e32 v0, s0, v0 1377; GCN-NEXT: v_mul_lo_u32 v2, v1, s15 1378; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 1379; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v1 1380; GCN-NEXT: s_ashr_i32 s3, s6, 31 1381; GCN-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 1382; GCN-NEXT: s_ashr_i32 s2, s10, 31 1383; GCN-NEXT: s_add_i32 s0, s10, s2 1384; GCN-NEXT: s_xor_b32 s5, s0, s2 1385; GCN-NEXT: v_cvt_f32_u32_e32 v3, s5 1386; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v2 1387; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] 1388; GCN-NEXT: v_subrev_i32_e32 v4, vcc, s15, v2 1389; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3 1390; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] 1391; GCN-NEXT: s_sub_i32 s0, 0, s5 1392; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v1 1393; GCN-NEXT: v_mul_f32_e32 v3, s16, v3 1394; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 1395; GCN-NEXT: v_cmp_le_u32_e32 vcc, s15, v2 1396; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 1397; GCN-NEXT: s_xor_b32 s1, s4, s8 1398; GCN-NEXT: v_mul_lo_u32 v5, s0, v3 1399; GCN-NEXT: s_add_i32 s0, s6, s3 1400; GCN-NEXT: s_xor_b32 s0, s0, s3 1401; GCN-NEXT: s_ashr_i32 s4, s11, 31 1402; GCN-NEXT: v_mul_hi_u32 v2, v3, v5 1403; GCN-NEXT: v_xor_b32_e32 v1, s1, v1 1404; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s1, v1 1405; GCN-NEXT: s_xor_b32 s2, s3, s2 1406; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 1407; GCN-NEXT: v_mul_hi_u32 v2, s0, v2 1408; GCN-NEXT: s_mov_b32 s15, 0xf000 1409; GCN-NEXT: s_mov_b32 s14, -1 1410; GCN-NEXT: v_mul_lo_u32 v3, v2, s5 1411; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v2 1412; GCN-NEXT: v_sub_i32_e32 v3, vcc, s0, v3 1413; GCN-NEXT: s_add_i32 s0, s11, s4 1414; GCN-NEXT: s_xor_b32 s6, s0, s4 1415; GCN-NEXT: v_cvt_f32_u32_e32 v4, s6 1416; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v3 1417; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] 1418; GCN-NEXT: v_subrev_i32_e32 v5, vcc, s5, v3 1419; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v4 1420; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 1421; GCN-NEXT: s_sub_i32 s0, 0, s6 1422; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v2 1423; GCN-NEXT: v_mul_f32_e32 v4, s16, v4 1424; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 1425; GCN-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 1426; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 1427; GCN-NEXT: v_xor_b32_e32 v2, s2, v2 1428; GCN-NEXT: v_mul_lo_u32 v6, s0, v4 1429; GCN-NEXT: s_ashr_i32 s0, s7, 31 1430; GCN-NEXT: s_add_i32 s1, s7, s0 1431; GCN-NEXT: s_xor_b32 s1, s1, s0 1432; GCN-NEXT: v_mul_hi_u32 v3, v4, v6 1433; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s2, v2 1434; GCN-NEXT: s_xor_b32 s2, s0, s4 1435; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 1436; GCN-NEXT: v_mul_hi_u32 v3, s1, v3 1437; GCN-NEXT: v_mul_lo_u32 v4, v3, s6 1438; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v3 1439; GCN-NEXT: v_sub_i32_e32 v4, vcc, s1, v4 1440; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v4 1441; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 1442; GCN-NEXT: v_subrev_i32_e32 v5, vcc, s6, v4 1443; GCN-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] 1444; GCN-NEXT: v_add_i32_e32 v5, vcc, 1, v3 1445; GCN-NEXT: v_cmp_le_u32_e32 vcc, s6, v4 1446; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 1447; GCN-NEXT: v_xor_b32_e32 v3, s2, v3 1448; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s2, v3 1449; GCN-NEXT: s_waitcnt lgkmcnt(0) 1450; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 1451; GCN-NEXT: s_endpgm 1452 %r = sdiv <4 x i32> %x, %y 1453 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 1454 ret void 1455} 1456 1457define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 1458; CHECK-LABEL: @srem_v4i32( 1459; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 1460; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 1461; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 1462; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 1463; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]] 1464; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]] 1465; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]] 1466; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]] 1467; CHECK-NEXT: [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float 1468; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 1469; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000 1470; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP11]] to i32 1471; CHECK-NEXT: [[TMP13:%.*]] = sub i32 0, [[TMP8]] 1472; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]] 1473; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP12]] to i64 1474; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 1475; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]] 1476; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 1477; CHECK-NEXT: [[TMP19:%.*]] = lshr i64 [[TMP17]], 32 1478; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 1479; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]] 1480; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP7]] to i64 1481; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP21]] to i64 1482; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]] 1483; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32 1484; CHECK-NEXT: [[TMP26:%.*]] = lshr i64 [[TMP24]], 32 1485; CHECK-NEXT: [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32 1486; CHECK-NEXT: [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]] 1487; CHECK-NEXT: [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]] 1488; CHECK-NEXT: [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]] 1489; CHECK-NEXT: [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]] 1490; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP30]], i32 [[TMP31]], i32 [[TMP29]] 1491; CHECK-NEXT: [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]] 1492; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]] 1493; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]] 1494; CHECK-NEXT: [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]] 1495; CHECK-NEXT: [[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]] 1496; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i32> undef, i32 [[TMP37]], i64 0 1497; CHECK-NEXT: [[TMP39:%.*]] = extractelement <4 x i32> [[X]], i64 1 1498; CHECK-NEXT: [[TMP40:%.*]] = extractelement <4 x i32> [[Y]], i64 1 1499; CHECK-NEXT: [[TMP41:%.*]] = ashr i32 [[TMP39]], 31 1500; CHECK-NEXT: [[TMP42:%.*]] = ashr i32 [[TMP40]], 31 1501; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]] 1502; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]] 1503; CHECK-NEXT: [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]] 1504; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]] 1505; CHECK-NEXT: [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float 1506; CHECK-NEXT: [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]]) 1507; CHECK-NEXT: [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000 1508; CHECK-NEXT: [[TMP50:%.*]] = fptoui float [[TMP49]] to i32 1509; CHECK-NEXT: [[TMP51:%.*]] = sub i32 0, [[TMP46]] 1510; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]] 1511; CHECK-NEXT: [[TMP53:%.*]] = zext i32 [[TMP50]] to i64 1512; CHECK-NEXT: [[TMP54:%.*]] = zext i32 [[TMP52]] to i64 1513; CHECK-NEXT: [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]] 1514; CHECK-NEXT: [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32 1515; CHECK-NEXT: [[TMP57:%.*]] = lshr i64 [[TMP55]], 32 1516; CHECK-NEXT: [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32 1517; CHECK-NEXT: [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]] 1518; CHECK-NEXT: [[TMP60:%.*]] = zext i32 [[TMP45]] to i64 1519; CHECK-NEXT: [[TMP61:%.*]] = zext i32 [[TMP59]] to i64 1520; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]] 1521; CHECK-NEXT: [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32 1522; CHECK-NEXT: [[TMP64:%.*]] = lshr i64 [[TMP62]], 32 1523; CHECK-NEXT: [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32 1524; CHECK-NEXT: [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]] 1525; CHECK-NEXT: [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]] 1526; CHECK-NEXT: [[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]] 1527; CHECK-NEXT: [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]] 1528; CHECK-NEXT: [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]] 1529; CHECK-NEXT: [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]] 1530; CHECK-NEXT: [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]] 1531; CHECK-NEXT: [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]] 1532; CHECK-NEXT: [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]] 1533; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]] 1534; CHECK-NEXT: [[TMP76:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP75]], i64 1 1535; CHECK-NEXT: [[TMP77:%.*]] = extractelement <4 x i32> [[X]], i64 2 1536; CHECK-NEXT: [[TMP78:%.*]] = extractelement <4 x i32> [[Y]], i64 2 1537; CHECK-NEXT: [[TMP79:%.*]] = ashr i32 [[TMP77]], 31 1538; CHECK-NEXT: [[TMP80:%.*]] = ashr i32 [[TMP78]], 31 1539; CHECK-NEXT: [[TMP81:%.*]] = add i32 [[TMP77]], [[TMP79]] 1540; CHECK-NEXT: [[TMP82:%.*]] = add i32 [[TMP78]], [[TMP80]] 1541; CHECK-NEXT: [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP79]] 1542; CHECK-NEXT: [[TMP84:%.*]] = xor i32 [[TMP82]], [[TMP80]] 1543; CHECK-NEXT: [[TMP85:%.*]] = uitofp i32 [[TMP84]] to float 1544; CHECK-NEXT: [[TMP86:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP85]]) 1545; CHECK-NEXT: [[TMP87:%.*]] = fmul fast float [[TMP86]], 0x41EFFFFFC0000000 1546; CHECK-NEXT: [[TMP88:%.*]] = fptoui float [[TMP87]] to i32 1547; CHECK-NEXT: [[TMP89:%.*]] = sub i32 0, [[TMP84]] 1548; CHECK-NEXT: [[TMP90:%.*]] = mul i32 [[TMP89]], [[TMP88]] 1549; CHECK-NEXT: [[TMP91:%.*]] = zext i32 [[TMP88]] to i64 1550; CHECK-NEXT: [[TMP92:%.*]] = zext i32 [[TMP90]] to i64 1551; CHECK-NEXT: [[TMP93:%.*]] = mul i64 [[TMP91]], [[TMP92]] 1552; CHECK-NEXT: [[TMP94:%.*]] = trunc i64 [[TMP93]] to i32 1553; CHECK-NEXT: [[TMP95:%.*]] = lshr i64 [[TMP93]], 32 1554; CHECK-NEXT: [[TMP96:%.*]] = trunc i64 [[TMP95]] to i32 1555; CHECK-NEXT: [[TMP97:%.*]] = add i32 [[TMP88]], [[TMP96]] 1556; CHECK-NEXT: [[TMP98:%.*]] = zext i32 [[TMP83]] to i64 1557; CHECK-NEXT: [[TMP99:%.*]] = zext i32 [[TMP97]] to i64 1558; CHECK-NEXT: [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]] 1559; CHECK-NEXT: [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32 1560; CHECK-NEXT: [[TMP102:%.*]] = lshr i64 [[TMP100]], 32 1561; CHECK-NEXT: [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32 1562; CHECK-NEXT: [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP84]] 1563; CHECK-NEXT: [[TMP105:%.*]] = sub i32 [[TMP83]], [[TMP104]] 1564; CHECK-NEXT: [[TMP106:%.*]] = icmp uge i32 [[TMP105]], [[TMP84]] 1565; CHECK-NEXT: [[TMP107:%.*]] = sub i32 [[TMP105]], [[TMP84]] 1566; CHECK-NEXT: [[TMP108:%.*]] = select i1 [[TMP106]], i32 [[TMP107]], i32 [[TMP105]] 1567; CHECK-NEXT: [[TMP109:%.*]] = icmp uge i32 [[TMP108]], [[TMP84]] 1568; CHECK-NEXT: [[TMP110:%.*]] = sub i32 [[TMP108]], [[TMP84]] 1569; CHECK-NEXT: [[TMP111:%.*]] = select i1 [[TMP109]], i32 [[TMP110]], i32 [[TMP108]] 1570; CHECK-NEXT: [[TMP112:%.*]] = xor i32 [[TMP111]], [[TMP79]] 1571; CHECK-NEXT: [[TMP113:%.*]] = sub i32 [[TMP112]], [[TMP79]] 1572; CHECK-NEXT: [[TMP114:%.*]] = insertelement <4 x i32> [[TMP76]], i32 [[TMP113]], i64 2 1573; CHECK-NEXT: [[TMP115:%.*]] = extractelement <4 x i32> [[X]], i64 3 1574; CHECK-NEXT: [[TMP116:%.*]] = extractelement <4 x i32> [[Y]], i64 3 1575; CHECK-NEXT: [[TMP117:%.*]] = ashr i32 [[TMP115]], 31 1576; CHECK-NEXT: [[TMP118:%.*]] = ashr i32 [[TMP116]], 31 1577; CHECK-NEXT: [[TMP119:%.*]] = add i32 [[TMP115]], [[TMP117]] 1578; CHECK-NEXT: [[TMP120:%.*]] = add i32 [[TMP116]], [[TMP118]] 1579; CHECK-NEXT: [[TMP121:%.*]] = xor i32 [[TMP119]], [[TMP117]] 1580; CHECK-NEXT: [[TMP122:%.*]] = xor i32 [[TMP120]], [[TMP118]] 1581; CHECK-NEXT: [[TMP123:%.*]] = uitofp i32 [[TMP122]] to float 1582; CHECK-NEXT: [[TMP124:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP123]]) 1583; CHECK-NEXT: [[TMP125:%.*]] = fmul fast float [[TMP124]], 0x41EFFFFFC0000000 1584; CHECK-NEXT: [[TMP126:%.*]] = fptoui float [[TMP125]] to i32 1585; CHECK-NEXT: [[TMP127:%.*]] = sub i32 0, [[TMP122]] 1586; CHECK-NEXT: [[TMP128:%.*]] = mul i32 [[TMP127]], [[TMP126]] 1587; CHECK-NEXT: [[TMP129:%.*]] = zext i32 [[TMP126]] to i64 1588; CHECK-NEXT: [[TMP130:%.*]] = zext i32 [[TMP128]] to i64 1589; CHECK-NEXT: [[TMP131:%.*]] = mul i64 [[TMP129]], [[TMP130]] 1590; CHECK-NEXT: [[TMP132:%.*]] = trunc i64 [[TMP131]] to i32 1591; CHECK-NEXT: [[TMP133:%.*]] = lshr i64 [[TMP131]], 32 1592; CHECK-NEXT: [[TMP134:%.*]] = trunc i64 [[TMP133]] to i32 1593; CHECK-NEXT: [[TMP135:%.*]] = add i32 [[TMP126]], [[TMP134]] 1594; CHECK-NEXT: [[TMP136:%.*]] = zext i32 [[TMP121]] to i64 1595; CHECK-NEXT: [[TMP137:%.*]] = zext i32 [[TMP135]] to i64 1596; CHECK-NEXT: [[TMP138:%.*]] = mul i64 [[TMP136]], [[TMP137]] 1597; CHECK-NEXT: [[TMP139:%.*]] = trunc i64 [[TMP138]] to i32 1598; CHECK-NEXT: [[TMP140:%.*]] = lshr i64 [[TMP138]], 32 1599; CHECK-NEXT: [[TMP141:%.*]] = trunc i64 [[TMP140]] to i32 1600; CHECK-NEXT: [[TMP142:%.*]] = mul i32 [[TMP141]], [[TMP122]] 1601; CHECK-NEXT: [[TMP143:%.*]] = sub i32 [[TMP121]], [[TMP142]] 1602; CHECK-NEXT: [[TMP144:%.*]] = icmp uge i32 [[TMP143]], [[TMP122]] 1603; CHECK-NEXT: [[TMP145:%.*]] = sub i32 [[TMP143]], [[TMP122]] 1604; CHECK-NEXT: [[TMP146:%.*]] = select i1 [[TMP144]], i32 [[TMP145]], i32 [[TMP143]] 1605; CHECK-NEXT: [[TMP147:%.*]] = icmp uge i32 [[TMP146]], [[TMP122]] 1606; CHECK-NEXT: [[TMP148:%.*]] = sub i32 [[TMP146]], [[TMP122]] 1607; CHECK-NEXT: [[TMP149:%.*]] = select i1 [[TMP147]], i32 [[TMP148]], i32 [[TMP146]] 1608; CHECK-NEXT: [[TMP150:%.*]] = xor i32 [[TMP149]], [[TMP117]] 1609; CHECK-NEXT: [[TMP151:%.*]] = sub i32 [[TMP150]], [[TMP117]] 1610; CHECK-NEXT: [[TMP152:%.*]] = insertelement <4 x i32> [[TMP114]], i32 [[TMP151]], i64 3 1611; CHECK-NEXT: store <4 x i32> [[TMP152]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 1612; CHECK-NEXT: ret void 1613; 1614; GCN-LABEL: srem_v4i32: 1615; GCN: ; %bb.0: 1616; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 1617; GCN-NEXT: s_mov_b32 s14, 0x4f7ffffe 1618; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1619; GCN-NEXT: s_waitcnt lgkmcnt(0) 1620; GCN-NEXT: s_ashr_i32 s2, s8, 31 1621; GCN-NEXT: s_add_i32 s3, s8, s2 1622; GCN-NEXT: s_xor_b32 s2, s3, s2 1623; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 1624; GCN-NEXT: s_sub_i32 s13, 0, s2 1625; GCN-NEXT: s_ashr_i32 s12, s9, 31 1626; GCN-NEXT: s_add_i32 s9, s9, s12 1627; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 1628; GCN-NEXT: s_xor_b32 s9, s9, s12 1629; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 1630; GCN-NEXT: s_ashr_i32 s3, s4, 31 1631; GCN-NEXT: v_mul_f32_e32 v0, s14, v0 1632; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 1633; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 1634; GCN-NEXT: s_add_i32 s4, s4, s3 1635; GCN-NEXT: s_xor_b32 s4, s4, s3 1636; GCN-NEXT: v_mul_lo_u32 v2, s13, v0 1637; GCN-NEXT: v_mul_f32_e32 v1, s14, v1 1638; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 1639; GCN-NEXT: s_sub_i32 s13, 0, s9 1640; GCN-NEXT: v_mul_hi_u32 v2, v0, v2 1641; GCN-NEXT: s_ashr_i32 s12, s10, 31 1642; GCN-NEXT: s_ashr_i32 s8, s5, 31 1643; GCN-NEXT: s_add_i32 s5, s5, s8 1644; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 1645; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 1646; GCN-NEXT: v_mul_lo_u32 v2, s13, v1 1647; GCN-NEXT: s_xor_b32 s5, s5, s8 1648; GCN-NEXT: v_mul_lo_u32 v0, v0, s2 1649; GCN-NEXT: v_mul_hi_u32 v2, v1, v2 1650; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 1651; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s2, v0 1652; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 1653; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1654; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s2, v0 1655; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 1656; GCN-NEXT: s_add_i32 s2, s10, s12 1657; GCN-NEXT: s_xor_b32 s2, s2, s12 1658; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1659; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 1660; GCN-NEXT: v_cvt_f32_u32_e32 v2, s2 1661; GCN-NEXT: v_mul_hi_u32 v1, s5, v1 1662; GCN-NEXT: v_xor_b32_e32 v0, s3, v0 1663; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s3, v0 1664; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 1665; GCN-NEXT: v_mul_lo_u32 v1, v1, s9 1666; GCN-NEXT: s_sub_i32 s3, 0, s2 1667; GCN-NEXT: s_ashr_i32 s4, s6, 31 1668; GCN-NEXT: v_mul_f32_e32 v2, s14, v2 1669; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 1670; GCN-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 1671; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s9, v1 1672; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 1673; GCN-NEXT: v_mul_lo_u32 v4, s3, v2 1674; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 1675; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s9, v1 1676; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 1677; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 1678; GCN-NEXT: v_mul_hi_u32 v3, v2, v4 1679; GCN-NEXT: s_ashr_i32 s5, s11, 31 1680; GCN-NEXT: s_add_i32 s3, s6, s4 1681; GCN-NEXT: s_add_i32 s6, s11, s5 1682; GCN-NEXT: s_xor_b32 s5, s6, s5 1683; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 1684; GCN-NEXT: v_cvt_f32_u32_e32 v3, s5 1685; GCN-NEXT: s_xor_b32 s3, s3, s4 1686; GCN-NEXT: v_mul_hi_u32 v2, s3, v2 1687; GCN-NEXT: v_xor_b32_e32 v1, s8, v1 1688; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3 1689; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s8, v1 1690; GCN-NEXT: v_mul_lo_u32 v2, v2, s2 1691; GCN-NEXT: s_ashr_i32 s6, s7, 31 1692; GCN-NEXT: v_mul_f32_e32 v3, s14, v3 1693; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 1694; GCN-NEXT: v_sub_i32_e32 v2, vcc, s3, v2 1695; GCN-NEXT: v_subrev_i32_e32 v4, vcc, s2, v2 1696; GCN-NEXT: s_sub_i32 s3, 0, s5 1697; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 1698; GCN-NEXT: v_mul_lo_u32 v5, s3, v3 1699; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 1700; GCN-NEXT: v_subrev_i32_e32 v4, vcc, s2, v2 1701; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 1702; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 1703; GCN-NEXT: v_mul_hi_u32 v4, v3, v5 1704; GCN-NEXT: s_add_i32 s2, s7, s6 1705; GCN-NEXT: s_xor_b32 s7, s2, s6 1706; GCN-NEXT: v_xor_b32_e32 v2, s4, v2 1707; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 1708; GCN-NEXT: v_mul_hi_u32 v3, s7, v3 1709; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s4, v2 1710; GCN-NEXT: s_mov_b32 s3, 0xf000 1711; GCN-NEXT: s_mov_b32 s2, -1 1712; GCN-NEXT: v_mul_lo_u32 v3, v3, s5 1713; GCN-NEXT: v_sub_i32_e32 v3, vcc, s7, v3 1714; GCN-NEXT: v_subrev_i32_e32 v4, vcc, s5, v3 1715; GCN-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 1716; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 1717; GCN-NEXT: v_subrev_i32_e32 v4, vcc, s5, v3 1718; GCN-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 1719; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 1720; GCN-NEXT: v_xor_b32_e32 v3, s6, v3 1721; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s6, v3 1722; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1723; GCN-NEXT: s_endpgm 1724 %r = srem <4 x i32> %x, %y 1725 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 1726 ret void 1727} 1728 1729define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 1730; CHECK-LABEL: @udiv_v4i16( 1731; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 1732; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 1733; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 1734; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 1735; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 1736; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 1737; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 1738; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 1739; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 1740; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 1741; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 1742; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 1743; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 1744; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 1745; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 1746; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 1747; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 1748; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 65535 1749; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16 1750; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i16> undef, i16 [[TMP19]], i64 0 1751; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i16> [[X]], i64 1 1752; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i16> [[Y]], i64 1 1753; CHECK-NEXT: [[TMP23:%.*]] = zext i16 [[TMP21]] to i32 1754; CHECK-NEXT: [[TMP24:%.*]] = zext i16 [[TMP22]] to i32 1755; CHECK-NEXT: [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float 1756; CHECK-NEXT: [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float 1757; CHECK-NEXT: [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]]) 1758; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]] 1759; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]]) 1760; CHECK-NEXT: [[TMP30:%.*]] = fneg fast float [[TMP29]] 1761; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]]) 1762; CHECK-NEXT: [[TMP32:%.*]] = fptoui float [[TMP29]] to i32 1763; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]]) 1764; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]]) 1765; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]] 1766; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0 1767; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]] 1768; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 65535 1769; CHECK-NEXT: [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16 1770; CHECK-NEXT: [[TMP40:%.*]] = insertelement <4 x i16> [[TMP20]], i16 [[TMP39]], i64 1 1771; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i16> [[X]], i64 2 1772; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i16> [[Y]], i64 2 1773; CHECK-NEXT: [[TMP43:%.*]] = zext i16 [[TMP41]] to i32 1774; CHECK-NEXT: [[TMP44:%.*]] = zext i16 [[TMP42]] to i32 1775; CHECK-NEXT: [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float 1776; CHECK-NEXT: [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float 1777; CHECK-NEXT: [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]]) 1778; CHECK-NEXT: [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]] 1779; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]]) 1780; CHECK-NEXT: [[TMP50:%.*]] = fneg fast float [[TMP49]] 1781; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]]) 1782; CHECK-NEXT: [[TMP52:%.*]] = fptoui float [[TMP49]] to i32 1783; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]]) 1784; CHECK-NEXT: [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]]) 1785; CHECK-NEXT: [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]] 1786; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0 1787; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]] 1788; CHECK-NEXT: [[TMP58:%.*]] = and i32 [[TMP57]], 65535 1789; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16 1790; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i16> [[TMP40]], i16 [[TMP59]], i64 2 1791; CHECK-NEXT: [[TMP61:%.*]] = extractelement <4 x i16> [[X]], i64 3 1792; CHECK-NEXT: [[TMP62:%.*]] = extractelement <4 x i16> [[Y]], i64 3 1793; CHECK-NEXT: [[TMP63:%.*]] = zext i16 [[TMP61]] to i32 1794; CHECK-NEXT: [[TMP64:%.*]] = zext i16 [[TMP62]] to i32 1795; CHECK-NEXT: [[TMP65:%.*]] = uitofp i32 [[TMP63]] to float 1796; CHECK-NEXT: [[TMP66:%.*]] = uitofp i32 [[TMP64]] to float 1797; CHECK-NEXT: [[TMP67:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP66]]) 1798; CHECK-NEXT: [[TMP68:%.*]] = fmul fast float [[TMP65]], [[TMP67]] 1799; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.trunc.f32(float [[TMP68]]) 1800; CHECK-NEXT: [[TMP70:%.*]] = fneg fast float [[TMP69]] 1801; CHECK-NEXT: [[TMP71:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP70]], float [[TMP66]], float [[TMP65]]) 1802; CHECK-NEXT: [[TMP72:%.*]] = fptoui float [[TMP69]] to i32 1803; CHECK-NEXT: [[TMP73:%.*]] = call fast float @llvm.fabs.f32(float [[TMP71]]) 1804; CHECK-NEXT: [[TMP74:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 1805; CHECK-NEXT: [[TMP75:%.*]] = fcmp fast oge float [[TMP73]], [[TMP74]] 1806; CHECK-NEXT: [[TMP76:%.*]] = select i1 [[TMP75]], i32 1, i32 0 1807; CHECK-NEXT: [[TMP77:%.*]] = add i32 [[TMP72]], [[TMP76]] 1808; CHECK-NEXT: [[TMP78:%.*]] = and i32 [[TMP77]], 65535 1809; CHECK-NEXT: [[TMP79:%.*]] = trunc i32 [[TMP78]] to i16 1810; CHECK-NEXT: [[TMP80:%.*]] = insertelement <4 x i16> [[TMP60]], i16 [[TMP79]], i64 3 1811; CHECK-NEXT: store <4 x i16> [[TMP80]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 1812; CHECK-NEXT: ret void 1813; 1814; GCN-LABEL: udiv_v4i16: 1815; GCN: ; %bb.0: 1816; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1817; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 1818; GCN-NEXT: s_mov_b32 s8, 0xffff 1819; GCN-NEXT: s_mov_b32 s7, 0xf000 1820; GCN-NEXT: s_mov_b32 s6, -1 1821; GCN-NEXT: s_waitcnt lgkmcnt(0) 1822; GCN-NEXT: s_and_b32 s9, s2, s8 1823; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 1824; GCN-NEXT: s_lshr_b32 s9, s0, 16 1825; GCN-NEXT: s_and_b32 s0, s0, s8 1826; GCN-NEXT: s_lshr_b32 s2, s2, 16 1827; GCN-NEXT: v_cvt_f32_u32_e32 v3, s2 1828; GCN-NEXT: v_cvt_f32_u32_e32 v1, s0 1829; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 1830; GCN-NEXT: v_cvt_f32_u32_e32 v4, s9 1831; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v3 1832; GCN-NEXT: s_and_b32 s2, s3, s8 1833; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 1834; GCN-NEXT: v_trunc_f32_e32 v2, v2 1835; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 1836; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 1837; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 1838; GCN-NEXT: v_mul_f32_e32 v1, v4, v5 1839; GCN-NEXT: v_trunc_f32_e32 v1, v1 1840; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc 1841; GCN-NEXT: v_mad_f32 v2, -v1, v3, v4 1842; GCN-NEXT: v_cvt_f32_u32_e32 v4, s2 1843; GCN-NEXT: s_lshr_b32 s0, s1, 16 1844; GCN-NEXT: s_and_b32 s1, s1, s8 1845; GCN-NEXT: s_lshr_b32 s10, s3, 16 1846; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 1847; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 1848; GCN-NEXT: v_cvt_f32_u32_e32 v3, s10 1849; GCN-NEXT: v_cvt_f32_u32_e32 v5, s1 1850; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v4 1851; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v1, vcc 1852; GCN-NEXT: v_rcp_iflag_f32_e32 v7, v3 1853; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 1854; GCN-NEXT: v_mul_f32_e32 v1, v5, v6 1855; GCN-NEXT: v_cvt_f32_u32_e32 v6, s0 1856; GCN-NEXT: v_trunc_f32_e32 v1, v1 1857; GCN-NEXT: v_mad_f32 v5, -v1, v4, v5 1858; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 1859; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 1860; GCN-NEXT: v_mul_f32_e32 v4, v6, v7 1861; GCN-NEXT: v_trunc_f32_e32 v4, v4 1862; GCN-NEXT: v_cvt_u32_f32_e32 v5, v4 1863; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 1864; GCN-NEXT: v_mad_f32 v4, -v4, v3, v6 1865; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v3 1866; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 1867; GCN-NEXT: v_and_b32_e32 v0, s8, v0 1868; GCN-NEXT: v_lshlrev_b32_e32 v3, 16, v3 1869; GCN-NEXT: v_and_b32_e32 v1, s8, v1 1870; GCN-NEXT: v_or_b32_e32 v1, v1, v3 1871; GCN-NEXT: v_or_b32_e32 v0, v0, v2 1872; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 1873; GCN-NEXT: s_endpgm 1874 %r = udiv <4 x i16> %x, %y 1875 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 1876 ret void 1877} 1878 1879define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 1880; CHECK-LABEL: @urem_v4i16( 1881; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 1882; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 1883; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 1884; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 1885; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 1886; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 1887; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 1888; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 1889; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 1890; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 1891; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 1892; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 1893; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 1894; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 1895; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 1896; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 1897; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 1898; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]] 1899; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]] 1900; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 65535 1901; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16 1902; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i16> undef, i16 [[TMP21]], i64 0 1903; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i16> [[X]], i64 1 1904; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i16> [[Y]], i64 1 1905; CHECK-NEXT: [[TMP25:%.*]] = zext i16 [[TMP23]] to i32 1906; CHECK-NEXT: [[TMP26:%.*]] = zext i16 [[TMP24]] to i32 1907; CHECK-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float 1908; CHECK-NEXT: [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float 1909; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]]) 1910; CHECK-NEXT: [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]] 1911; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]]) 1912; CHECK-NEXT: [[TMP32:%.*]] = fneg fast float [[TMP31]] 1913; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]]) 1914; CHECK-NEXT: [[TMP34:%.*]] = fptoui float [[TMP31]] to i32 1915; CHECK-NEXT: [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 1916; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]]) 1917; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]] 1918; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0 1919; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]] 1920; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]] 1921; CHECK-NEXT: [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]] 1922; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], 65535 1923; CHECK-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16 1924; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i16> [[TMP22]], i16 [[TMP43]], i64 1 1925; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i16> [[X]], i64 2 1926; CHECK-NEXT: [[TMP46:%.*]] = extractelement <4 x i16> [[Y]], i64 2 1927; CHECK-NEXT: [[TMP47:%.*]] = zext i16 [[TMP45]] to i32 1928; CHECK-NEXT: [[TMP48:%.*]] = zext i16 [[TMP46]] to i32 1929; CHECK-NEXT: [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float 1930; CHECK-NEXT: [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float 1931; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]]) 1932; CHECK-NEXT: [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]] 1933; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]]) 1934; CHECK-NEXT: [[TMP54:%.*]] = fneg fast float [[TMP53]] 1935; CHECK-NEXT: [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]]) 1936; CHECK-NEXT: [[TMP56:%.*]] = fptoui float [[TMP53]] to i32 1937; CHECK-NEXT: [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]]) 1938; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]]) 1939; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]] 1940; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0 1941; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]] 1942; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]] 1943; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]] 1944; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 65535 1945; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16 1946; CHECK-NEXT: [[TMP66:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[TMP65]], i64 2 1947; CHECK-NEXT: [[TMP67:%.*]] = extractelement <4 x i16> [[X]], i64 3 1948; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x i16> [[Y]], i64 3 1949; CHECK-NEXT: [[TMP69:%.*]] = zext i16 [[TMP67]] to i32 1950; CHECK-NEXT: [[TMP70:%.*]] = zext i16 [[TMP68]] to i32 1951; CHECK-NEXT: [[TMP71:%.*]] = uitofp i32 [[TMP69]] to float 1952; CHECK-NEXT: [[TMP72:%.*]] = uitofp i32 [[TMP70]] to float 1953; CHECK-NEXT: [[TMP73:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP72]]) 1954; CHECK-NEXT: [[TMP74:%.*]] = fmul fast float [[TMP71]], [[TMP73]] 1955; CHECK-NEXT: [[TMP75:%.*]] = call fast float @llvm.trunc.f32(float [[TMP74]]) 1956; CHECK-NEXT: [[TMP76:%.*]] = fneg fast float [[TMP75]] 1957; CHECK-NEXT: [[TMP77:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP76]], float [[TMP72]], float [[TMP71]]) 1958; CHECK-NEXT: [[TMP78:%.*]] = fptoui float [[TMP75]] to i32 1959; CHECK-NEXT: [[TMP79:%.*]] = call fast float @llvm.fabs.f32(float [[TMP77]]) 1960; CHECK-NEXT: [[TMP80:%.*]] = call fast float @llvm.fabs.f32(float [[TMP72]]) 1961; CHECK-NEXT: [[TMP81:%.*]] = fcmp fast oge float [[TMP79]], [[TMP80]] 1962; CHECK-NEXT: [[TMP82:%.*]] = select i1 [[TMP81]], i32 1, i32 0 1963; CHECK-NEXT: [[TMP83:%.*]] = add i32 [[TMP78]], [[TMP82]] 1964; CHECK-NEXT: [[TMP84:%.*]] = mul i32 [[TMP83]], [[TMP70]] 1965; CHECK-NEXT: [[TMP85:%.*]] = sub i32 [[TMP69]], [[TMP84]] 1966; CHECK-NEXT: [[TMP86:%.*]] = and i32 [[TMP85]], 65535 1967; CHECK-NEXT: [[TMP87:%.*]] = trunc i32 [[TMP86]] to i16 1968; CHECK-NEXT: [[TMP88:%.*]] = insertelement <4 x i16> [[TMP66]], i16 [[TMP87]], i64 3 1969; CHECK-NEXT: store <4 x i16> [[TMP88]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 1970; CHECK-NEXT: ret void 1971; 1972; GCN-LABEL: urem_v4i16: 1973; GCN: ; %bb.0: 1974; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1975; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 1976; GCN-NEXT: s_mov_b32 s8, 0xffff 1977; GCN-NEXT: s_mov_b32 s7, 0xf000 1978; GCN-NEXT: s_mov_b32 s6, -1 1979; GCN-NEXT: s_waitcnt lgkmcnt(0) 1980; GCN-NEXT: s_and_b32 s9, s2, s8 1981; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 1982; GCN-NEXT: s_and_b32 s10, s0, s8 1983; GCN-NEXT: s_lshr_b32 s11, s2, 16 1984; GCN-NEXT: v_cvt_f32_u32_e32 v1, s10 1985; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 1986; GCN-NEXT: v_cvt_f32_u32_e32 v3, s11 1987; GCN-NEXT: s_lshr_b32 s9, s0, 16 1988; GCN-NEXT: v_cvt_f32_u32_e32 v4, s9 1989; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 1990; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v3 1991; GCN-NEXT: v_trunc_f32_e32 v2, v2 1992; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 1993; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 1994; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 1995; GCN-NEXT: v_mul_f32_e32 v1, v4, v5 1996; GCN-NEXT: v_trunc_f32_e32 v1, v1 1997; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc 1998; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1 1999; GCN-NEXT: v_mad_f32 v1, -v1, v3, v4 2000; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v3 2001; GCN-NEXT: v_mul_lo_u32 v0, v0, s2 2002; GCN-NEXT: s_and_b32 s2, s3, s8 2003; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 2004; GCN-NEXT: v_cvt_f32_u32_e32 v2, s2 2005; GCN-NEXT: s_and_b32 s2, s1, s8 2006; GCN-NEXT: v_mul_lo_u32 v1, v1, s11 2007; GCN-NEXT: v_cvt_f32_u32_e32 v3, s2 2008; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 2009; GCN-NEXT: s_lshr_b32 s12, s3, 16 2010; GCN-NEXT: v_sub_i32_e32 v5, vcc, s9, v1 2011; GCN-NEXT: s_lshr_b32 s10, s1, 16 2012; GCN-NEXT: v_mul_f32_e32 v1, v3, v4 2013; GCN-NEXT: v_cvt_f32_u32_e32 v4, s12 2014; GCN-NEXT: v_cvt_f32_u32_e32 v6, s10 2015; GCN-NEXT: v_trunc_f32_e32 v1, v1 2016; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 2017; GCN-NEXT: v_rcp_iflag_f32_e32 v7, v4 2018; GCN-NEXT: v_mad_f32 v3, -v1, v2, v3 2019; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 2020; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 2021; GCN-NEXT: v_mul_f32_e32 v2, v6, v7 2022; GCN-NEXT: v_trunc_f32_e32 v2, v2 2023; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 2024; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2025; GCN-NEXT: v_mad_f32 v2, -v2, v4, v6 2026; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 2027; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc 2028; GCN-NEXT: v_mul_lo_u32 v1, v1, s3 2029; GCN-NEXT: v_mul_lo_u32 v2, v2, s12 2030; GCN-NEXT: v_and_b32_e32 v0, s8, v0 2031; GCN-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 2032; GCN-NEXT: v_sub_i32_e32 v2, vcc, s10, v2 2033; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 2034; GCN-NEXT: v_and_b32_e32 v1, s8, v1 2035; GCN-NEXT: v_or_b32_e32 v1, v1, v2 2036; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v5 2037; GCN-NEXT: v_or_b32_e32 v0, v0, v2 2038; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2039; GCN-NEXT: s_endpgm 2040 %r = urem <4 x i16> %x, %y 2041 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 2042 ret void 2043} 2044 2045define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 2046; CHECK-LABEL: @sdiv_v4i16( 2047; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 2048; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 2049; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 2050; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 2051; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 2052; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 2053; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 2054; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 2055; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 2056; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 2057; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 2058; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 2059; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 2060; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 2061; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 2062; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 2063; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 2064; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 2065; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 2066; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 2067; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 16 2068; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 16 2069; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16 2070; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x i16> undef, i16 [[TMP23]], i64 0 2071; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[X]], i64 1 2072; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[Y]], i64 1 2073; CHECK-NEXT: [[TMP27:%.*]] = sext i16 [[TMP25]] to i32 2074; CHECK-NEXT: [[TMP28:%.*]] = sext i16 [[TMP26]] to i32 2075; CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]] 2076; CHECK-NEXT: [[TMP30:%.*]] = ashr i32 [[TMP29]], 30 2077; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 1 2078; CHECK-NEXT: [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float 2079; CHECK-NEXT: [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float 2080; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 2081; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]] 2082; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]]) 2083; CHECK-NEXT: [[TMP37:%.*]] = fneg fast float [[TMP36]] 2084; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]]) 2085; CHECK-NEXT: [[TMP39:%.*]] = fptosi float [[TMP36]] to i32 2086; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]]) 2087; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 2088; CHECK-NEXT: [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]] 2089; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0 2090; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]] 2091; CHECK-NEXT: [[TMP45:%.*]] = shl i32 [[TMP44]], 16 2092; CHECK-NEXT: [[TMP46:%.*]] = ashr i32 [[TMP45]], 16 2093; CHECK-NEXT: [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16 2094; CHECK-NEXT: [[TMP48:%.*]] = insertelement <4 x i16> [[TMP24]], i16 [[TMP47]], i64 1 2095; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i16> [[X]], i64 2 2096; CHECK-NEXT: [[TMP50:%.*]] = extractelement <4 x i16> [[Y]], i64 2 2097; CHECK-NEXT: [[TMP51:%.*]] = sext i16 [[TMP49]] to i32 2098; CHECK-NEXT: [[TMP52:%.*]] = sext i16 [[TMP50]] to i32 2099; CHECK-NEXT: [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]] 2100; CHECK-NEXT: [[TMP54:%.*]] = ashr i32 [[TMP53]], 30 2101; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 1 2102; CHECK-NEXT: [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float 2103; CHECK-NEXT: [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float 2104; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]]) 2105; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]] 2106; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]]) 2107; CHECK-NEXT: [[TMP61:%.*]] = fneg fast float [[TMP60]] 2108; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]]) 2109; CHECK-NEXT: [[TMP63:%.*]] = fptosi float [[TMP60]] to i32 2110; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]]) 2111; CHECK-NEXT: [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]]) 2112; CHECK-NEXT: [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]] 2113; CHECK-NEXT: [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0 2114; CHECK-NEXT: [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]] 2115; CHECK-NEXT: [[TMP69:%.*]] = shl i32 [[TMP68]], 16 2116; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 16 2117; CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16 2118; CHECK-NEXT: [[TMP72:%.*]] = insertelement <4 x i16> [[TMP48]], i16 [[TMP71]], i64 2 2119; CHECK-NEXT: [[TMP73:%.*]] = extractelement <4 x i16> [[X]], i64 3 2120; CHECK-NEXT: [[TMP74:%.*]] = extractelement <4 x i16> [[Y]], i64 3 2121; CHECK-NEXT: [[TMP75:%.*]] = sext i16 [[TMP73]] to i32 2122; CHECK-NEXT: [[TMP76:%.*]] = sext i16 [[TMP74]] to i32 2123; CHECK-NEXT: [[TMP77:%.*]] = xor i32 [[TMP75]], [[TMP76]] 2124; CHECK-NEXT: [[TMP78:%.*]] = ashr i32 [[TMP77]], 30 2125; CHECK-NEXT: [[TMP79:%.*]] = or i32 [[TMP78]], 1 2126; CHECK-NEXT: [[TMP80:%.*]] = sitofp i32 [[TMP75]] to float 2127; CHECK-NEXT: [[TMP81:%.*]] = sitofp i32 [[TMP76]] to float 2128; CHECK-NEXT: [[TMP82:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP81]]) 2129; CHECK-NEXT: [[TMP83:%.*]] = fmul fast float [[TMP80]], [[TMP82]] 2130; CHECK-NEXT: [[TMP84:%.*]] = call fast float @llvm.trunc.f32(float [[TMP83]]) 2131; CHECK-NEXT: [[TMP85:%.*]] = fneg fast float [[TMP84]] 2132; CHECK-NEXT: [[TMP86:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP85]], float [[TMP81]], float [[TMP80]]) 2133; CHECK-NEXT: [[TMP87:%.*]] = fptosi float [[TMP84]] to i32 2134; CHECK-NEXT: [[TMP88:%.*]] = call fast float @llvm.fabs.f32(float [[TMP86]]) 2135; CHECK-NEXT: [[TMP89:%.*]] = call fast float @llvm.fabs.f32(float [[TMP81]]) 2136; CHECK-NEXT: [[TMP90:%.*]] = fcmp fast oge float [[TMP88]], [[TMP89]] 2137; CHECK-NEXT: [[TMP91:%.*]] = select i1 [[TMP90]], i32 [[TMP79]], i32 0 2138; CHECK-NEXT: [[TMP92:%.*]] = add i32 [[TMP87]], [[TMP91]] 2139; CHECK-NEXT: [[TMP93:%.*]] = shl i32 [[TMP92]], 16 2140; CHECK-NEXT: [[TMP94:%.*]] = ashr i32 [[TMP93]], 16 2141; CHECK-NEXT: [[TMP95:%.*]] = trunc i32 [[TMP94]] to i16 2142; CHECK-NEXT: [[TMP96:%.*]] = insertelement <4 x i16> [[TMP72]], i16 [[TMP95]], i64 3 2143; CHECK-NEXT: store <4 x i16> [[TMP96]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 2144; CHECK-NEXT: ret void 2145; 2146; GCN-LABEL: sdiv_v4i16: 2147; GCN: ; %bb.0: 2148; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2149; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 2150; GCN-NEXT: s_mov_b32 s7, 0xf000 2151; GCN-NEXT: s_mov_b32 s6, -1 2152; GCN-NEXT: s_waitcnt lgkmcnt(0) 2153; GCN-NEXT: s_sext_i32_i16 s8, s2 2154; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 2155; GCN-NEXT: s_sext_i32_i16 s9, s0 2156; GCN-NEXT: v_cvt_f32_i32_e32 v1, s9 2157; GCN-NEXT: s_xor_b32 s8, s9, s8 2158; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 2159; GCN-NEXT: s_ashr_i32 s8, s8, 30 2160; GCN-NEXT: s_or_b32 s10, s8, 1 2161; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 2162; GCN-NEXT: v_trunc_f32_e32 v2, v2 2163; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 2164; GCN-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, |v0| 2165; GCN-NEXT: s_cmp_lg_u32 s8, 0 2166; GCN-NEXT: s_cselect_b32 s8, s10, 0 2167; GCN-NEXT: s_ashr_i32 s2, s2, 16 2168; GCN-NEXT: v_cvt_f32_i32_e32 v0, s2 2169; GCN-NEXT: s_ashr_i32 s0, s0, 16 2170; GCN-NEXT: v_cvt_f32_i32_e32 v1, s0 2171; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 2172; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v0 2173; GCN-NEXT: s_xor_b32 s0, s0, s2 2174; GCN-NEXT: s_ashr_i32 s0, s0, 30 2175; GCN-NEXT: s_sext_i32_i16 s2, s3 2176; GCN-NEXT: v_mul_f32_e32 v3, v1, v3 2177; GCN-NEXT: v_trunc_f32_e32 v3, v3 2178; GCN-NEXT: v_mad_f32 v1, -v3, v0, v1 2179; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 2180; GCN-NEXT: v_add_i32_e32 v2, vcc, s8, v2 2181; GCN-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, |v0| 2182; GCN-NEXT: s_or_b32 s0, s0, 1 2183; GCN-NEXT: v_cvt_f32_i32_e32 v0, s2 2184; GCN-NEXT: s_cmp_lg_u32 s8, 0 2185; GCN-NEXT: s_cselect_b32 s0, s0, 0 2186; GCN-NEXT: v_add_i32_e32 v3, vcc, s0, v3 2187; GCN-NEXT: s_sext_i32_i16 s0, s1 2188; GCN-NEXT: v_cvt_f32_i32_e32 v1, s0 2189; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v0 2190; GCN-NEXT: s_xor_b32 s0, s0, s2 2191; GCN-NEXT: s_ashr_i32 s0, s0, 30 2192; GCN-NEXT: s_or_b32 s0, s0, 1 2193; GCN-NEXT: v_mul_f32_e32 v4, v1, v4 2194; GCN-NEXT: v_trunc_f32_e32 v4, v4 2195; GCN-NEXT: v_mad_f32 v1, -v4, v0, v1 2196; GCN-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, |v0| 2197; GCN-NEXT: s_cmp_lg_u32 s8, 0 2198; GCN-NEXT: s_cselect_b32 s0, s0, 0 2199; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 2200; GCN-NEXT: s_ashr_i32 s2, s3, 16 2201; GCN-NEXT: v_cvt_f32_i32_e32 v0, s2 2202; GCN-NEXT: v_add_i32_e32 v1, vcc, s0, v4 2203; GCN-NEXT: s_ashr_i32 s0, s1, 16 2204; GCN-NEXT: v_cvt_f32_i32_e32 v4, s0 2205; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v0 2206; GCN-NEXT: s_xor_b32 s0, s0, s2 2207; GCN-NEXT: s_ashr_i32 s0, s0, 30 2208; GCN-NEXT: s_or_b32 s2, s0, 1 2209; GCN-NEXT: v_mul_f32_e32 v5, v4, v5 2210; GCN-NEXT: v_trunc_f32_e32 v5, v5 2211; GCN-NEXT: v_mad_f32 v4, -v5, v0, v4 2212; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 2213; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v0| 2214; GCN-NEXT: s_cmp_lg_u32 s0, 0 2215; GCN-NEXT: s_cselect_b32 s0, s2, 0 2216; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v5 2217; GCN-NEXT: s_mov_b32 s0, 0xffff 2218; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2219; GCN-NEXT: v_and_b32_e32 v1, s0, v1 2220; GCN-NEXT: v_or_b32_e32 v1, v1, v0 2221; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v3 2222; GCN-NEXT: v_and_b32_e32 v2, s0, v2 2223; GCN-NEXT: v_or_b32_e32 v0, v2, v0 2224; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2225; GCN-NEXT: s_endpgm 2226 %r = sdiv <4 x i16> %x, %y 2227 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 2228 ret void 2229} 2230 2231define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 2232; CHECK-LABEL: @srem_v4i16( 2233; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 2234; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 2235; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 2236; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 2237; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 2238; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 2239; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 2240; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 2241; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 2242; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 2243; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 2244; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 2245; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 2246; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 2247; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 2248; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 2249; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 2250; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 2251; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 2252; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 2253; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]] 2254; CHECK-NEXT: [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]] 2255; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 16 2256; CHECK-NEXT: [[TMP24:%.*]] = ashr i32 [[TMP23]], 16 2257; CHECK-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16 2258; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i16> undef, i16 [[TMP25]], i64 0 2259; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[X]], i64 1 2260; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i16> [[Y]], i64 1 2261; CHECK-NEXT: [[TMP29:%.*]] = sext i16 [[TMP27]] to i32 2262; CHECK-NEXT: [[TMP30:%.*]] = sext i16 [[TMP28]] to i32 2263; CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]] 2264; CHECK-NEXT: [[TMP32:%.*]] = ashr i32 [[TMP31]], 30 2265; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP32]], 1 2266; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float 2267; CHECK-NEXT: [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float 2268; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 2269; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]] 2270; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]]) 2271; CHECK-NEXT: [[TMP39:%.*]] = fneg fast float [[TMP38]] 2272; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]]) 2273; CHECK-NEXT: [[TMP41:%.*]] = fptosi float [[TMP38]] to i32 2274; CHECK-NEXT: [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]]) 2275; CHECK-NEXT: [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]]) 2276; CHECK-NEXT: [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]] 2277; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0 2278; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]] 2279; CHECK-NEXT: [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]] 2280; CHECK-NEXT: [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]] 2281; CHECK-NEXT: [[TMP49:%.*]] = shl i32 [[TMP48]], 16 2282; CHECK-NEXT: [[TMP50:%.*]] = ashr i32 [[TMP49]], 16 2283; CHECK-NEXT: [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16 2284; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i16> [[TMP26]], i16 [[TMP51]], i64 1 2285; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i16> [[X]], i64 2 2286; CHECK-NEXT: [[TMP54:%.*]] = extractelement <4 x i16> [[Y]], i64 2 2287; CHECK-NEXT: [[TMP55:%.*]] = sext i16 [[TMP53]] to i32 2288; CHECK-NEXT: [[TMP56:%.*]] = sext i16 [[TMP54]] to i32 2289; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]] 2290; CHECK-NEXT: [[TMP58:%.*]] = ashr i32 [[TMP57]], 30 2291; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 1 2292; CHECK-NEXT: [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float 2293; CHECK-NEXT: [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float 2294; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]]) 2295; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]] 2296; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]]) 2297; CHECK-NEXT: [[TMP65:%.*]] = fneg fast float [[TMP64]] 2298; CHECK-NEXT: [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]]) 2299; CHECK-NEXT: [[TMP67:%.*]] = fptosi float [[TMP64]] to i32 2300; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 2301; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]]) 2302; CHECK-NEXT: [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]] 2303; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0 2304; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]] 2305; CHECK-NEXT: [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]] 2306; CHECK-NEXT: [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]] 2307; CHECK-NEXT: [[TMP75:%.*]] = shl i32 [[TMP74]], 16 2308; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 [[TMP75]], 16 2309; CHECK-NEXT: [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16 2310; CHECK-NEXT: [[TMP78:%.*]] = insertelement <4 x i16> [[TMP52]], i16 [[TMP77]], i64 2 2311; CHECK-NEXT: [[TMP79:%.*]] = extractelement <4 x i16> [[X]], i64 3 2312; CHECK-NEXT: [[TMP80:%.*]] = extractelement <4 x i16> [[Y]], i64 3 2313; CHECK-NEXT: [[TMP81:%.*]] = sext i16 [[TMP79]] to i32 2314; CHECK-NEXT: [[TMP82:%.*]] = sext i16 [[TMP80]] to i32 2315; CHECK-NEXT: [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP82]] 2316; CHECK-NEXT: [[TMP84:%.*]] = ashr i32 [[TMP83]], 30 2317; CHECK-NEXT: [[TMP85:%.*]] = or i32 [[TMP84]], 1 2318; CHECK-NEXT: [[TMP86:%.*]] = sitofp i32 [[TMP81]] to float 2319; CHECK-NEXT: [[TMP87:%.*]] = sitofp i32 [[TMP82]] to float 2320; CHECK-NEXT: [[TMP88:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP87]]) 2321; CHECK-NEXT: [[TMP89:%.*]] = fmul fast float [[TMP86]], [[TMP88]] 2322; CHECK-NEXT: [[TMP90:%.*]] = call fast float @llvm.trunc.f32(float [[TMP89]]) 2323; CHECK-NEXT: [[TMP91:%.*]] = fneg fast float [[TMP90]] 2324; CHECK-NEXT: [[TMP92:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP91]], float [[TMP87]], float [[TMP86]]) 2325; CHECK-NEXT: [[TMP93:%.*]] = fptosi float [[TMP90]] to i32 2326; CHECK-NEXT: [[TMP94:%.*]] = call fast float @llvm.fabs.f32(float [[TMP92]]) 2327; CHECK-NEXT: [[TMP95:%.*]] = call fast float @llvm.fabs.f32(float [[TMP87]]) 2328; CHECK-NEXT: [[TMP96:%.*]] = fcmp fast oge float [[TMP94]], [[TMP95]] 2329; CHECK-NEXT: [[TMP97:%.*]] = select i1 [[TMP96]], i32 [[TMP85]], i32 0 2330; CHECK-NEXT: [[TMP98:%.*]] = add i32 [[TMP93]], [[TMP97]] 2331; CHECK-NEXT: [[TMP99:%.*]] = mul i32 [[TMP98]], [[TMP82]] 2332; CHECK-NEXT: [[TMP100:%.*]] = sub i32 [[TMP81]], [[TMP99]] 2333; CHECK-NEXT: [[TMP101:%.*]] = shl i32 [[TMP100]], 16 2334; CHECK-NEXT: [[TMP102:%.*]] = ashr i32 [[TMP101]], 16 2335; CHECK-NEXT: [[TMP103:%.*]] = trunc i32 [[TMP102]] to i16 2336; CHECK-NEXT: [[TMP104:%.*]] = insertelement <4 x i16> [[TMP78]], i16 [[TMP103]], i64 3 2337; CHECK-NEXT: store <4 x i16> [[TMP104]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 2338; CHECK-NEXT: ret void 2339; 2340; GCN-LABEL: srem_v4i16: 2341; GCN: ; %bb.0: 2342; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2343; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 2344; GCN-NEXT: s_mov_b32 s7, 0xf000 2345; GCN-NEXT: s_mov_b32 s6, -1 2346; GCN-NEXT: s_waitcnt lgkmcnt(0) 2347; GCN-NEXT: s_sext_i32_i16 s8, s2 2348; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 2349; GCN-NEXT: s_sext_i32_i16 s9, s0 2350; GCN-NEXT: v_cvt_f32_i32_e32 v1, s9 2351; GCN-NEXT: s_xor_b32 s8, s9, s8 2352; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 2353; GCN-NEXT: s_ashr_i32 s8, s8, 30 2354; GCN-NEXT: s_or_b32 s10, s8, 1 2355; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 2356; GCN-NEXT: v_trunc_f32_e32 v2, v2 2357; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 2358; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 2359; GCN-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, |v0| 2360; GCN-NEXT: s_cmp_lg_u32 s8, 0 2361; GCN-NEXT: s_cselect_b32 s8, s10, 0 2362; GCN-NEXT: v_add_i32_e32 v0, vcc, s8, v2 2363; GCN-NEXT: v_mul_lo_u32 v0, v0, s2 2364; GCN-NEXT: s_ashr_i32 s2, s2, 16 2365; GCN-NEXT: v_cvt_f32_i32_e32 v1, s2 2366; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 2367; GCN-NEXT: s_ashr_i32 s0, s0, 16 2368; GCN-NEXT: v_cvt_f32_i32_e32 v2, s0 2369; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v1 2370; GCN-NEXT: s_xor_b32 s8, s0, s2 2371; GCN-NEXT: s_ashr_i32 s8, s8, 30 2372; GCN-NEXT: s_or_b32 s10, s8, 1 2373; GCN-NEXT: v_mul_f32_e32 v3, v2, v3 2374; GCN-NEXT: v_trunc_f32_e32 v3, v3 2375; GCN-NEXT: v_mad_f32 v2, -v3, v1, v2 2376; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 2377; GCN-NEXT: v_cmp_ge_f32_e64 s[8:9], |v2|, |v1| 2378; GCN-NEXT: s_cmp_lg_u32 s8, 0 2379; GCN-NEXT: s_cselect_b32 s8, s10, 0 2380; GCN-NEXT: v_add_i32_e32 v1, vcc, s8, v3 2381; GCN-NEXT: v_mul_lo_u32 v1, v1, s2 2382; GCN-NEXT: s_sext_i32_i16 s2, s3 2383; GCN-NEXT: v_cvt_f32_i32_e32 v2, s2 2384; GCN-NEXT: v_sub_i32_e32 v3, vcc, s0, v1 2385; GCN-NEXT: s_sext_i32_i16 s0, s1 2386; GCN-NEXT: v_cvt_f32_i32_e32 v1, s0 2387; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 2388; GCN-NEXT: s_xor_b32 s0, s0, s2 2389; GCN-NEXT: s_ashr_i32 s0, s0, 30 2390; GCN-NEXT: s_or_b32 s0, s0, 1 2391; GCN-NEXT: v_mul_f32_e32 v4, v1, v4 2392; GCN-NEXT: v_trunc_f32_e32 v4, v4 2393; GCN-NEXT: v_mad_f32 v1, -v4, v2, v1 2394; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 2395; GCN-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, |v2| 2396; GCN-NEXT: s_cmp_lg_u32 s8, 0 2397; GCN-NEXT: s_cselect_b32 s0, s0, 0 2398; GCN-NEXT: v_add_i32_e32 v1, vcc, s0, v4 2399; GCN-NEXT: s_ashr_i32 s0, s3, 16 2400; GCN-NEXT: v_cvt_f32_i32_e32 v2, s0 2401; GCN-NEXT: s_ashr_i32 s8, s1, 16 2402; GCN-NEXT: v_cvt_f32_i32_e32 v4, s8 2403; GCN-NEXT: s_xor_b32 s2, s8, s0 2404; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v2 2405; GCN-NEXT: s_ashr_i32 s2, s2, 30 2406; GCN-NEXT: v_mul_lo_u32 v1, v1, s3 2407; GCN-NEXT: s_or_b32 s9, s2, 1 2408; GCN-NEXT: v_mul_f32_e32 v5, v4, v5 2409; GCN-NEXT: v_trunc_f32_e32 v5, v5 2410; GCN-NEXT: v_mad_f32 v4, -v5, v2, v4 2411; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 2412; GCN-NEXT: v_cmp_ge_f32_e64 s[2:3], |v4|, |v2| 2413; GCN-NEXT: s_cmp_lg_u32 s2, 0 2414; GCN-NEXT: s_cselect_b32 s2, s9, 0 2415; GCN-NEXT: v_add_i32_e32 v2, vcc, s2, v5 2416; GCN-NEXT: v_mul_lo_u32 v2, v2, s0 2417; GCN-NEXT: s_mov_b32 s0, 0xffff 2418; GCN-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 2419; GCN-NEXT: v_and_b32_e32 v1, s0, v1 2420; GCN-NEXT: v_sub_i32_e32 v2, vcc, s8, v2 2421; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 2422; GCN-NEXT: v_or_b32_e32 v1, v1, v2 2423; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v3 2424; GCN-NEXT: v_and_b32_e32 v0, s0, v0 2425; GCN-NEXT: v_or_b32_e32 v0, v0, v2 2426; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2427; GCN-NEXT: s_endpgm 2428 %r = srem <4 x i16> %x, %y 2429 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 2430 ret void 2431} 2432 2433define amdgpu_kernel void @udiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 2434; CHECK-LABEL: @udiv_i3( 2435; CHECK-NEXT: [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32 2436; CHECK-NEXT: [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32 2437; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 2438; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 2439; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 2440; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 2441; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 2442; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 2443; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 2444; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 2445; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 2446; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 2447; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 2448; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 2449; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 2450; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 7 2451; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i3 2452; CHECK-NEXT: store i3 [[TMP17]], i3 addrspace(1)* [[OUT:%.*]], align 1 2453; CHECK-NEXT: ret void 2454; 2455; GCN-LABEL: udiv_i3: 2456; GCN: ; %bb.0: 2457; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2458; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 2459; GCN-NEXT: s_mov_b32 s7, 0xf000 2460; GCN-NEXT: s_mov_b32 s6, -1 2461; GCN-NEXT: s_waitcnt lgkmcnt(0) 2462; GCN-NEXT: s_bfe_u32 s1, s0, 0x30008 2463; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, s1 2464; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 2465; GCN-NEXT: s_and_b32 s0, s0, 7 2466; GCN-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 2467; GCN-NEXT: v_mul_f32_e32 v1, v2, v1 2468; GCN-NEXT: v_trunc_f32_e32 v1, v1 2469; GCN-NEXT: v_cvt_u32_f32_e32 v3, v1 2470; GCN-NEXT: v_mad_f32 v1, -v1, v0, v2 2471; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 2472; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 2473; GCN-NEXT: v_and_b32_e32 v0, 7, v0 2474; GCN-NEXT: buffer_store_byte v0, off, s[4:7], 0 2475; GCN-NEXT: s_endpgm 2476 %r = udiv i3 %x, %y 2477 store i3 %r, i3 addrspace(1)* %out 2478 ret void 2479} 2480 2481define amdgpu_kernel void @urem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 2482; CHECK-LABEL: @urem_i3( 2483; CHECK-NEXT: [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32 2484; CHECK-NEXT: [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32 2485; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 2486; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 2487; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 2488; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 2489; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 2490; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 2491; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 2492; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 2493; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 2494; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 2495; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 2496; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 2497; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 2498; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]] 2499; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]] 2500; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 7 2501; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i3 2502; CHECK-NEXT: store i3 [[TMP19]], i3 addrspace(1)* [[OUT:%.*]], align 1 2503; CHECK-NEXT: ret void 2504; 2505; GCN-LABEL: urem_i3: 2506; GCN: ; %bb.0: 2507; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2508; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 2509; GCN-NEXT: s_mov_b32 s7, 0xf000 2510; GCN-NEXT: s_mov_b32 s6, -1 2511; GCN-NEXT: s_waitcnt lgkmcnt(0) 2512; GCN-NEXT: s_bfe_u32 s1, s0, 0x30008 2513; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, s1 2514; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 2515; GCN-NEXT: s_and_b32 s2, s0, 7 2516; GCN-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 2517; GCN-NEXT: s_lshr_b32 s1, s0, 8 2518; GCN-NEXT: v_mul_f32_e32 v1, v2, v1 2519; GCN-NEXT: v_trunc_f32_e32 v1, v1 2520; GCN-NEXT: v_cvt_u32_f32_e32 v3, v1 2521; GCN-NEXT: v_mad_f32 v1, -v1, v0, v2 2522; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 2523; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 2524; GCN-NEXT: v_mul_lo_u32 v0, v0, s1 2525; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 2526; GCN-NEXT: v_and_b32_e32 v0, 7, v0 2527; GCN-NEXT: buffer_store_byte v0, off, s[4:7], 0 2528; GCN-NEXT: s_endpgm 2529 %r = urem i3 %x, %y 2530 store i3 %r, i3 addrspace(1)* %out 2531 ret void 2532} 2533 2534define amdgpu_kernel void @sdiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 2535; CHECK-LABEL: @sdiv_i3( 2536; CHECK-NEXT: [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32 2537; CHECK-NEXT: [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32 2538; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 2539; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 2540; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 2541; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 2542; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 2543; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 2544; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 2545; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 2546; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 2547; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 2548; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 2549; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 2550; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 2551; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 2552; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 2553; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 2554; CHECK-NEXT: [[TMP19:%.*]] = shl i32 [[TMP18]], 29 2555; CHECK-NEXT: [[TMP20:%.*]] = ashr i32 [[TMP19]], 29 2556; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i3 2557; CHECK-NEXT: store i3 [[TMP21]], i3 addrspace(1)* [[OUT:%.*]], align 1 2558; CHECK-NEXT: ret void 2559; 2560; GCN-LABEL: sdiv_i3: 2561; GCN: ; %bb.0: 2562; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2563; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 2564; GCN-NEXT: s_mov_b32 s7, 0xf000 2565; GCN-NEXT: s_mov_b32 s6, -1 2566; GCN-NEXT: s_waitcnt lgkmcnt(0) 2567; GCN-NEXT: s_bfe_i32 s1, s0, 0x30008 2568; GCN-NEXT: v_cvt_f32_i32_e32 v0, s1 2569; GCN-NEXT: s_bfe_i32 s0, s0, 0x30000 2570; GCN-NEXT: v_cvt_f32_i32_e32 v1, s0 2571; GCN-NEXT: s_xor_b32 s0, s0, s1 2572; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 2573; GCN-NEXT: s_ashr_i32 s0, s0, 30 2574; GCN-NEXT: s_or_b32 s2, s0, 1 2575; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 2576; GCN-NEXT: v_trunc_f32_e32 v2, v2 2577; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 2578; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 2579; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 2580; GCN-NEXT: s_cmp_lg_u32 s0, 0 2581; GCN-NEXT: s_cselect_b32 s0, s2, 0 2582; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v2 2583; GCN-NEXT: v_and_b32_e32 v0, 7, v0 2584; GCN-NEXT: buffer_store_byte v0, off, s[4:7], 0 2585; GCN-NEXT: s_endpgm 2586 %r = sdiv i3 %x, %y 2587 store i3 %r, i3 addrspace(1)* %out 2588 ret void 2589} 2590 2591define amdgpu_kernel void @srem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 2592; CHECK-LABEL: @srem_i3( 2593; CHECK-NEXT: [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32 2594; CHECK-NEXT: [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32 2595; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 2596; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 2597; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 2598; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 2599; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 2600; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 2601; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 2602; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 2603; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 2604; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 2605; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 2606; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 2607; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 2608; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 2609; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 2610; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 2611; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]] 2612; CHECK-NEXT: [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]] 2613; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 29 2614; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 29 2615; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i3 2616; CHECK-NEXT: store i3 [[TMP23]], i3 addrspace(1)* [[OUT:%.*]], align 1 2617; CHECK-NEXT: ret void 2618; 2619; GCN-LABEL: srem_i3: 2620; GCN: ; %bb.0: 2621; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2622; GCN-NEXT: s_load_dword s2, s[0:1], 0xb 2623; GCN-NEXT: s_mov_b32 s7, 0xf000 2624; GCN-NEXT: s_waitcnt lgkmcnt(0) 2625; GCN-NEXT: s_bfe_i32 s0, s2, 0x30008 2626; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0 2627; GCN-NEXT: s_bfe_i32 s1, s2, 0x30000 2628; GCN-NEXT: v_cvt_f32_i32_e32 v1, s1 2629; GCN-NEXT: s_xor_b32 s0, s1, s0 2630; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 2631; GCN-NEXT: s_ashr_i32 s0, s0, 30 2632; GCN-NEXT: s_lshr_b32 s3, s2, 8 2633; GCN-NEXT: s_or_b32 s6, s0, 1 2634; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 2635; GCN-NEXT: v_trunc_f32_e32 v2, v2 2636; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 2637; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 2638; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 2639; GCN-NEXT: s_cmp_lg_u32 s0, 0 2640; GCN-NEXT: s_cselect_b32 s0, s6, 0 2641; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v2 2642; GCN-NEXT: v_mul_lo_u32 v0, v0, s3 2643; GCN-NEXT: s_mov_b32 s6, -1 2644; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 2645; GCN-NEXT: v_and_b32_e32 v0, 7, v0 2646; GCN-NEXT: buffer_store_byte v0, off, s[4:7], 0 2647; GCN-NEXT: s_endpgm 2648 %r = srem i3 %x, %y 2649 store i3 %r, i3 addrspace(1)* %out 2650 ret void 2651} 2652 2653define amdgpu_kernel void @udiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 2654; CHECK-LABEL: @udiv_v3i16( 2655; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 2656; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 2657; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 2658; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 2659; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 2660; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 2661; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 2662; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 2663; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 2664; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 2665; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 2666; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 2667; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 2668; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 2669; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 2670; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 2671; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 2672; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 65535 2673; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16 2674; CHECK-NEXT: [[TMP20:%.*]] = insertelement <3 x i16> undef, i16 [[TMP19]], i64 0 2675; CHECK-NEXT: [[TMP21:%.*]] = extractelement <3 x i16> [[X]], i64 1 2676; CHECK-NEXT: [[TMP22:%.*]] = extractelement <3 x i16> [[Y]], i64 1 2677; CHECK-NEXT: [[TMP23:%.*]] = zext i16 [[TMP21]] to i32 2678; CHECK-NEXT: [[TMP24:%.*]] = zext i16 [[TMP22]] to i32 2679; CHECK-NEXT: [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float 2680; CHECK-NEXT: [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float 2681; CHECK-NEXT: [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]]) 2682; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]] 2683; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]]) 2684; CHECK-NEXT: [[TMP30:%.*]] = fneg fast float [[TMP29]] 2685; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]]) 2686; CHECK-NEXT: [[TMP32:%.*]] = fptoui float [[TMP29]] to i32 2687; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]]) 2688; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]]) 2689; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]] 2690; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0 2691; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]] 2692; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 65535 2693; CHECK-NEXT: [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16 2694; CHECK-NEXT: [[TMP40:%.*]] = insertelement <3 x i16> [[TMP20]], i16 [[TMP39]], i64 1 2695; CHECK-NEXT: [[TMP41:%.*]] = extractelement <3 x i16> [[X]], i64 2 2696; CHECK-NEXT: [[TMP42:%.*]] = extractelement <3 x i16> [[Y]], i64 2 2697; CHECK-NEXT: [[TMP43:%.*]] = zext i16 [[TMP41]] to i32 2698; CHECK-NEXT: [[TMP44:%.*]] = zext i16 [[TMP42]] to i32 2699; CHECK-NEXT: [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float 2700; CHECK-NEXT: [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float 2701; CHECK-NEXT: [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]]) 2702; CHECK-NEXT: [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]] 2703; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]]) 2704; CHECK-NEXT: [[TMP50:%.*]] = fneg fast float [[TMP49]] 2705; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]]) 2706; CHECK-NEXT: [[TMP52:%.*]] = fptoui float [[TMP49]] to i32 2707; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]]) 2708; CHECK-NEXT: [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]]) 2709; CHECK-NEXT: [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]] 2710; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0 2711; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]] 2712; CHECK-NEXT: [[TMP58:%.*]] = and i32 [[TMP57]], 65535 2713; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16 2714; CHECK-NEXT: [[TMP60:%.*]] = insertelement <3 x i16> [[TMP40]], i16 [[TMP59]], i64 2 2715; CHECK-NEXT: store <3 x i16> [[TMP60]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 2716; CHECK-NEXT: ret void 2717; 2718; GCN-LABEL: udiv_v3i16: 2719; GCN: ; %bb.0: 2720; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2721; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 2722; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 2723; GCN-NEXT: s_mov_b32 s8, 0xffff 2724; GCN-NEXT: s_mov_b32 s7, 0xf000 2725; GCN-NEXT: s_waitcnt lgkmcnt(0) 2726; GCN-NEXT: s_and_b32 s6, s0, s8 2727; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 2728; GCN-NEXT: s_and_b32 s6, s2, s8 2729; GCN-NEXT: s_lshr_b32 s0, s0, 16 2730; GCN-NEXT: v_cvt_f32_u32_e32 v3, s0 2731; GCN-NEXT: v_cvt_f32_u32_e32 v1, s6 2732; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 2733; GCN-NEXT: s_lshr_b32 s0, s2, 16 2734; GCN-NEXT: v_cvt_f32_u32_e32 v4, s0 2735; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v3 2736; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 2737; GCN-NEXT: v_trunc_f32_e32 v2, v2 2738; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 2739; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 2740; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 2741; GCN-NEXT: v_mul_f32_e32 v1, v4, v5 2742; GCN-NEXT: v_trunc_f32_e32 v1, v1 2743; GCN-NEXT: s_and_b32 s0, s1, s8 2744; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc 2745; GCN-NEXT: v_mad_f32 v2, -v1, v3, v4 2746; GCN-NEXT: v_cvt_f32_u32_e32 v4, s0 2747; GCN-NEXT: s_and_b32 s0, s3, s8 2748; GCN-NEXT: v_cvt_f32_u32_e32 v5, s0 2749; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 2750; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v4 2751; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 2752; GCN-NEXT: s_mov_b32 s6, -1 2753; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2754; GCN-NEXT: v_mul_f32_e32 v2, v5, v6 2755; GCN-NEXT: v_trunc_f32_e32 v2, v2 2756; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 2757; GCN-NEXT: v_mad_f32 v2, -v2, v4, v5 2758; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 2759; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2760; GCN-NEXT: v_and_b32_e32 v0, s8, v0 2761; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc 2762; GCN-NEXT: v_or_b32_e32 v0, v0, v1 2763; GCN-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 2764; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 2765; GCN-NEXT: s_endpgm 2766 %r = udiv <3 x i16> %x, %y 2767 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 2768 ret void 2769} 2770 2771define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 2772; CHECK-LABEL: @urem_v3i16( 2773; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 2774; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 2775; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 2776; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 2777; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 2778; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 2779; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 2780; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 2781; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 2782; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 2783; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 2784; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 2785; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 2786; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 2787; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 2788; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 2789; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 2790; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]] 2791; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]] 2792; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 65535 2793; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16 2794; CHECK-NEXT: [[TMP22:%.*]] = insertelement <3 x i16> undef, i16 [[TMP21]], i64 0 2795; CHECK-NEXT: [[TMP23:%.*]] = extractelement <3 x i16> [[X]], i64 1 2796; CHECK-NEXT: [[TMP24:%.*]] = extractelement <3 x i16> [[Y]], i64 1 2797; CHECK-NEXT: [[TMP25:%.*]] = zext i16 [[TMP23]] to i32 2798; CHECK-NEXT: [[TMP26:%.*]] = zext i16 [[TMP24]] to i32 2799; CHECK-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float 2800; CHECK-NEXT: [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float 2801; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]]) 2802; CHECK-NEXT: [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]] 2803; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]]) 2804; CHECK-NEXT: [[TMP32:%.*]] = fneg fast float [[TMP31]] 2805; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]]) 2806; CHECK-NEXT: [[TMP34:%.*]] = fptoui float [[TMP31]] to i32 2807; CHECK-NEXT: [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 2808; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]]) 2809; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]] 2810; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0 2811; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]] 2812; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]] 2813; CHECK-NEXT: [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]] 2814; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], 65535 2815; CHECK-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16 2816; CHECK-NEXT: [[TMP44:%.*]] = insertelement <3 x i16> [[TMP22]], i16 [[TMP43]], i64 1 2817; CHECK-NEXT: [[TMP45:%.*]] = extractelement <3 x i16> [[X]], i64 2 2818; CHECK-NEXT: [[TMP46:%.*]] = extractelement <3 x i16> [[Y]], i64 2 2819; CHECK-NEXT: [[TMP47:%.*]] = zext i16 [[TMP45]] to i32 2820; CHECK-NEXT: [[TMP48:%.*]] = zext i16 [[TMP46]] to i32 2821; CHECK-NEXT: [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float 2822; CHECK-NEXT: [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float 2823; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]]) 2824; CHECK-NEXT: [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]] 2825; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]]) 2826; CHECK-NEXT: [[TMP54:%.*]] = fneg fast float [[TMP53]] 2827; CHECK-NEXT: [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]]) 2828; CHECK-NEXT: [[TMP56:%.*]] = fptoui float [[TMP53]] to i32 2829; CHECK-NEXT: [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]]) 2830; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]]) 2831; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]] 2832; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0 2833; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]] 2834; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]] 2835; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]] 2836; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 65535 2837; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16 2838; CHECK-NEXT: [[TMP66:%.*]] = insertelement <3 x i16> [[TMP44]], i16 [[TMP65]], i64 2 2839; CHECK-NEXT: store <3 x i16> [[TMP66]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 2840; CHECK-NEXT: ret void 2841; 2842; GCN-LABEL: urem_v3i16: 2843; GCN: ; %bb.0: 2844; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2845; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 2846; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 2847; GCN-NEXT: s_mov_b32 s8, 0xffff 2848; GCN-NEXT: s_mov_b32 s7, 0xf000 2849; GCN-NEXT: s_waitcnt lgkmcnt(0) 2850; GCN-NEXT: v_mov_b32_e32 v1, s2 2851; GCN-NEXT: s_and_b32 s6, s0, s8 2852; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6 2853; GCN-NEXT: s_and_b32 s6, s2, s8 2854; GCN-NEXT: v_cvt_f32_u32_e32 v2, s6 2855; GCN-NEXT: v_mov_b32_e32 v4, s0 2856; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v0 2857; GCN-NEXT: v_alignbit_b32 v4, s1, v4, 16 2858; GCN-NEXT: v_and_b32_e32 v5, s8, v4 2859; GCN-NEXT: v_alignbit_b32 v1, s3, v1, 16 2860; GCN-NEXT: v_mul_f32_e32 v3, v2, v3 2861; GCN-NEXT: v_trunc_f32_e32 v3, v3 2862; GCN-NEXT: v_mad_f32 v2, -v3, v0, v2 2863; GCN-NEXT: v_cvt_u32_f32_e32 v6, v3 2864; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 2865; GCN-NEXT: v_cvt_f32_u32_e32 v2, v5 2866; GCN-NEXT: v_and_b32_e32 v3, s8, v1 2867; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc 2868; GCN-NEXT: v_mul_lo_u32 v0, v0, s0 2869; GCN-NEXT: s_and_b32 s0, s1, s8 2870; GCN-NEXT: v_cvt_f32_u32_e32 v3, v3 2871; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v2 2872; GCN-NEXT: v_cvt_f32_u32_e32 v6, s0 2873; GCN-NEXT: s_and_b32 s0, s3, s8 2874; GCN-NEXT: v_cvt_f32_u32_e32 v7, s0 2875; GCN-NEXT: v_mul_f32_e32 v5, v3, v5 2876; GCN-NEXT: v_trunc_f32_e32 v5, v5 2877; GCN-NEXT: v_rcp_iflag_f32_e32 v8, v6 2878; GCN-NEXT: v_mad_f32 v3, -v5, v2, v3 2879; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5 2880; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 2881; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 2882; GCN-NEXT: v_mul_f32_e32 v3, v7, v8 2883; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc 2884; GCN-NEXT: v_trunc_f32_e32 v3, v3 2885; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 2886; GCN-NEXT: v_cvt_u32_f32_e32 v4, v3 2887; GCN-NEXT: v_mad_f32 v3, -v3, v6, v7 2888; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v6 2889; GCN-NEXT: s_mov_b32 s6, -1 2890; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 2891; GCN-NEXT: v_mul_lo_u32 v3, v3, s1 2892; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 2893; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2894; GCN-NEXT: v_and_b32_e32 v0, s8, v0 2895; GCN-NEXT: v_sub_i32_e32 v2, vcc, s3, v3 2896; GCN-NEXT: v_or_b32_e32 v0, v0, v1 2897; GCN-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 2898; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 2899; GCN-NEXT: s_endpgm 2900 %r = urem <3 x i16> %x, %y 2901 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 2902 ret void 2903} 2904 2905define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 2906; CHECK-LABEL: @sdiv_v3i16( 2907; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 2908; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 2909; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 2910; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 2911; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 2912; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 2913; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 2914; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 2915; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 2916; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 2917; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 2918; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 2919; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 2920; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 2921; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 2922; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 2923; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 2924; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 2925; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 2926; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 2927; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 16 2928; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 16 2929; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16 2930; CHECK-NEXT: [[TMP24:%.*]] = insertelement <3 x i16> undef, i16 [[TMP23]], i64 0 2931; CHECK-NEXT: [[TMP25:%.*]] = extractelement <3 x i16> [[X]], i64 1 2932; CHECK-NEXT: [[TMP26:%.*]] = extractelement <3 x i16> [[Y]], i64 1 2933; CHECK-NEXT: [[TMP27:%.*]] = sext i16 [[TMP25]] to i32 2934; CHECK-NEXT: [[TMP28:%.*]] = sext i16 [[TMP26]] to i32 2935; CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]] 2936; CHECK-NEXT: [[TMP30:%.*]] = ashr i32 [[TMP29]], 30 2937; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 1 2938; CHECK-NEXT: [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float 2939; CHECK-NEXT: [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float 2940; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 2941; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]] 2942; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]]) 2943; CHECK-NEXT: [[TMP37:%.*]] = fneg fast float [[TMP36]] 2944; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]]) 2945; CHECK-NEXT: [[TMP39:%.*]] = fptosi float [[TMP36]] to i32 2946; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]]) 2947; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 2948; CHECK-NEXT: [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]] 2949; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0 2950; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]] 2951; CHECK-NEXT: [[TMP45:%.*]] = shl i32 [[TMP44]], 16 2952; CHECK-NEXT: [[TMP46:%.*]] = ashr i32 [[TMP45]], 16 2953; CHECK-NEXT: [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16 2954; CHECK-NEXT: [[TMP48:%.*]] = insertelement <3 x i16> [[TMP24]], i16 [[TMP47]], i64 1 2955; CHECK-NEXT: [[TMP49:%.*]] = extractelement <3 x i16> [[X]], i64 2 2956; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x i16> [[Y]], i64 2 2957; CHECK-NEXT: [[TMP51:%.*]] = sext i16 [[TMP49]] to i32 2958; CHECK-NEXT: [[TMP52:%.*]] = sext i16 [[TMP50]] to i32 2959; CHECK-NEXT: [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]] 2960; CHECK-NEXT: [[TMP54:%.*]] = ashr i32 [[TMP53]], 30 2961; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 1 2962; CHECK-NEXT: [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float 2963; CHECK-NEXT: [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float 2964; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]]) 2965; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]] 2966; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]]) 2967; CHECK-NEXT: [[TMP61:%.*]] = fneg fast float [[TMP60]] 2968; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]]) 2969; CHECK-NEXT: [[TMP63:%.*]] = fptosi float [[TMP60]] to i32 2970; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]]) 2971; CHECK-NEXT: [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]]) 2972; CHECK-NEXT: [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]] 2973; CHECK-NEXT: [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0 2974; CHECK-NEXT: [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]] 2975; CHECK-NEXT: [[TMP69:%.*]] = shl i32 [[TMP68]], 16 2976; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 16 2977; CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16 2978; CHECK-NEXT: [[TMP72:%.*]] = insertelement <3 x i16> [[TMP48]], i16 [[TMP71]], i64 2 2979; CHECK-NEXT: store <3 x i16> [[TMP72]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 2980; CHECK-NEXT: ret void 2981; 2982; GCN-LABEL: sdiv_v3i16: 2983; GCN: ; %bb.0: 2984; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2985; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 2986; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 2987; GCN-NEXT: s_mov_b32 s7, 0xf000 2988; GCN-NEXT: s_mov_b32 s6, -1 2989; GCN-NEXT: s_waitcnt lgkmcnt(0) 2990; GCN-NEXT: s_sext_i32_i16 s9, s2 2991; GCN-NEXT: s_sext_i32_i16 s8, s0 2992; GCN-NEXT: v_cvt_f32_i32_e32 v0, s8 2993; GCN-NEXT: v_cvt_f32_i32_e32 v1, s9 2994; GCN-NEXT: s_xor_b32 s8, s9, s8 2995; GCN-NEXT: s_ashr_i32 s8, s8, 30 2996; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 2997; GCN-NEXT: s_or_b32 s10, s8, 1 2998; GCN-NEXT: s_sext_i32_i16 s1, s1 2999; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 3000; GCN-NEXT: v_trunc_f32_e32 v2, v2 3001; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 3002; GCN-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, |v0| 3003; GCN-NEXT: s_cmp_lg_u32 s8, 0 3004; GCN-NEXT: s_cselect_b32 s8, s10, 0 3005; GCN-NEXT: s_ashr_i32 s0, s0, 16 3006; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 3007; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0 3008; GCN-NEXT: s_ashr_i32 s2, s2, 16 3009; GCN-NEXT: s_xor_b32 s0, s2, s0 3010; GCN-NEXT: v_add_i32_e32 v1, vcc, s8, v2 3011; GCN-NEXT: v_cvt_f32_i32_e32 v2, s2 3012; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v0 3013; GCN-NEXT: s_ashr_i32 s0, s0, 30 3014; GCN-NEXT: s_or_b32 s0, s0, 1 3015; GCN-NEXT: v_and_b32_e32 v1, 0xffff, v1 3016; GCN-NEXT: v_mul_f32_e32 v3, v2, v3 3017; GCN-NEXT: v_trunc_f32_e32 v3, v3 3018; GCN-NEXT: v_mad_f32 v2, -v3, v0, v2 3019; GCN-NEXT: v_cvt_i32_f32_e32 v3, v3 3020; GCN-NEXT: v_cmp_ge_f32_e64 s[8:9], |v2|, |v0| 3021; GCN-NEXT: v_cvt_f32_i32_e32 v0, s1 3022; GCN-NEXT: s_cmp_lg_u32 s8, 0 3023; GCN-NEXT: s_cselect_b32 s0, s0, 0 3024; GCN-NEXT: v_add_i32_e32 v2, vcc, s0, v3 3025; GCN-NEXT: s_sext_i32_i16 s0, s3 3026; GCN-NEXT: v_cvt_f32_i32_e32 v3, s0 3027; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v0 3028; GCN-NEXT: s_xor_b32 s0, s0, s1 3029; GCN-NEXT: s_ashr_i32 s0, s0, 30 3030; GCN-NEXT: s_or_b32 s2, s0, 1 3031; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 3032; GCN-NEXT: v_trunc_f32_e32 v4, v4 3033; GCN-NEXT: v_mad_f32 v3, -v4, v0, v3 3034; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 3035; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v0| 3036; GCN-NEXT: s_cmp_lg_u32 s0, 0 3037; GCN-NEXT: s_cselect_b32 s0, s2, 0 3038; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2 3039; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v4 3040; GCN-NEXT: v_or_b32_e32 v1, v1, v2 3041; GCN-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 3042; GCN-NEXT: buffer_store_dword v1, off, s[4:7], 0 3043; GCN-NEXT: s_endpgm 3044 %r = sdiv <3 x i16> %x, %y 3045 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 3046 ret void 3047} 3048 3049define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 3050; CHECK-LABEL: @srem_v3i16( 3051; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 3052; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 3053; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 3054; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 3055; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 3056; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 3057; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 3058; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 3059; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 3060; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 3061; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 3062; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 3063; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 3064; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 3065; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 3066; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 3067; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 3068; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 3069; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 3070; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 3071; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]] 3072; CHECK-NEXT: [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]] 3073; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 16 3074; CHECK-NEXT: [[TMP24:%.*]] = ashr i32 [[TMP23]], 16 3075; CHECK-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16 3076; CHECK-NEXT: [[TMP26:%.*]] = insertelement <3 x i16> undef, i16 [[TMP25]], i64 0 3077; CHECK-NEXT: [[TMP27:%.*]] = extractelement <3 x i16> [[X]], i64 1 3078; CHECK-NEXT: [[TMP28:%.*]] = extractelement <3 x i16> [[Y]], i64 1 3079; CHECK-NEXT: [[TMP29:%.*]] = sext i16 [[TMP27]] to i32 3080; CHECK-NEXT: [[TMP30:%.*]] = sext i16 [[TMP28]] to i32 3081; CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]] 3082; CHECK-NEXT: [[TMP32:%.*]] = ashr i32 [[TMP31]], 30 3083; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP32]], 1 3084; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float 3085; CHECK-NEXT: [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float 3086; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 3087; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]] 3088; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]]) 3089; CHECK-NEXT: [[TMP39:%.*]] = fneg fast float [[TMP38]] 3090; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]]) 3091; CHECK-NEXT: [[TMP41:%.*]] = fptosi float [[TMP38]] to i32 3092; CHECK-NEXT: [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]]) 3093; CHECK-NEXT: [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]]) 3094; CHECK-NEXT: [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]] 3095; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0 3096; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]] 3097; CHECK-NEXT: [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]] 3098; CHECK-NEXT: [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]] 3099; CHECK-NEXT: [[TMP49:%.*]] = shl i32 [[TMP48]], 16 3100; CHECK-NEXT: [[TMP50:%.*]] = ashr i32 [[TMP49]], 16 3101; CHECK-NEXT: [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16 3102; CHECK-NEXT: [[TMP52:%.*]] = insertelement <3 x i16> [[TMP26]], i16 [[TMP51]], i64 1 3103; CHECK-NEXT: [[TMP53:%.*]] = extractelement <3 x i16> [[X]], i64 2 3104; CHECK-NEXT: [[TMP54:%.*]] = extractelement <3 x i16> [[Y]], i64 2 3105; CHECK-NEXT: [[TMP55:%.*]] = sext i16 [[TMP53]] to i32 3106; CHECK-NEXT: [[TMP56:%.*]] = sext i16 [[TMP54]] to i32 3107; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]] 3108; CHECK-NEXT: [[TMP58:%.*]] = ashr i32 [[TMP57]], 30 3109; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 1 3110; CHECK-NEXT: [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float 3111; CHECK-NEXT: [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float 3112; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]]) 3113; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]] 3114; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]]) 3115; CHECK-NEXT: [[TMP65:%.*]] = fneg fast float [[TMP64]] 3116; CHECK-NEXT: [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]]) 3117; CHECK-NEXT: [[TMP67:%.*]] = fptosi float [[TMP64]] to i32 3118; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 3119; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]]) 3120; CHECK-NEXT: [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]] 3121; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0 3122; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]] 3123; CHECK-NEXT: [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]] 3124; CHECK-NEXT: [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]] 3125; CHECK-NEXT: [[TMP75:%.*]] = shl i32 [[TMP74]], 16 3126; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 [[TMP75]], 16 3127; CHECK-NEXT: [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16 3128; CHECK-NEXT: [[TMP78:%.*]] = insertelement <3 x i16> [[TMP52]], i16 [[TMP77]], i64 2 3129; CHECK-NEXT: store <3 x i16> [[TMP78]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 3130; CHECK-NEXT: ret void 3131; 3132; GCN-LABEL: srem_v3i16: 3133; GCN: ; %bb.0: 3134; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3135; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 3136; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 3137; GCN-NEXT: s_mov_b32 s7, 0xf000 3138; GCN-NEXT: s_waitcnt lgkmcnt(0) 3139; GCN-NEXT: s_sext_i32_i16 s8, s2 3140; GCN-NEXT: s_sext_i32_i16 s6, s0 3141; GCN-NEXT: v_cvt_f32_i32_e32 v0, s6 3142; GCN-NEXT: v_cvt_f32_i32_e32 v1, s8 3143; GCN-NEXT: s_xor_b32 s6, s8, s6 3144; GCN-NEXT: s_ashr_i32 s6, s6, 30 3145; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 3146; GCN-NEXT: s_or_b32 s6, s6, 1 3147; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 3148; GCN-NEXT: v_trunc_f32_e32 v2, v2 3149; GCN-NEXT: v_mad_f32 v1, -v2, v0, v1 3150; GCN-NEXT: v_cvt_i32_f32_e32 v2, v2 3151; GCN-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, |v0| 3152; GCN-NEXT: s_cmp_lg_u32 s8, 0 3153; GCN-NEXT: s_cselect_b32 s6, s6, 0 3154; GCN-NEXT: v_add_i32_e32 v0, vcc, s6, v2 3155; GCN-NEXT: v_mov_b32_e32 v2, s0 3156; GCN-NEXT: v_alignbit_b32 v2, s1, v2, 16 3157; GCN-NEXT: v_bfe_i32 v3, v2, 0, 16 3158; GCN-NEXT: v_cvt_f32_i32_e32 v4, v3 3159; GCN-NEXT: v_mov_b32_e32 v1, s2 3160; GCN-NEXT: v_alignbit_b32 v1, s3, v1, 16 3161; GCN-NEXT: v_bfe_i32 v5, v1, 0, 16 3162; GCN-NEXT: v_cvt_f32_i32_e32 v6, v5 3163; GCN-NEXT: v_rcp_iflag_f32_e32 v7, v4 3164; GCN-NEXT: v_mul_lo_u32 v0, v0, s0 3165; GCN-NEXT: v_xor_b32_e32 v3, v5, v3 3166; GCN-NEXT: s_sext_i32_i16 s0, s1 3167; GCN-NEXT: v_mul_f32_e32 v5, v6, v7 3168; GCN-NEXT: v_trunc_f32_e32 v5, v5 3169; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 3170; GCN-NEXT: v_mad_f32 v6, -v5, v4, v6 3171; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 3172; GCN-NEXT: v_ashrrev_i32_e32 v3, 30, v3 3173; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, |v4| 3174; GCN-NEXT: v_cvt_f32_i32_e32 v4, s0 3175; GCN-NEXT: v_or_b32_e32 v3, 1, v3 3176; GCN-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc 3177; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 3178; GCN-NEXT: s_sext_i32_i16 s2, s3 3179; GCN-NEXT: v_mul_lo_u32 v2, v3, v2 3180; GCN-NEXT: v_cvt_f32_i32_e32 v3, s2 3181; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v4 3182; GCN-NEXT: s_xor_b32 s0, s2, s0 3183; GCN-NEXT: s_ashr_i32 s0, s0, 30 3184; GCN-NEXT: s_or_b32 s0, s0, 1 3185; GCN-NEXT: v_mul_f32_e32 v5, v3, v5 3186; GCN-NEXT: v_trunc_f32_e32 v5, v5 3187; GCN-NEXT: v_mad_f32 v3, -v5, v4, v3 3188; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 3189; GCN-NEXT: v_cmp_ge_f32_e64 s[8:9], |v3|, |v4| 3190; GCN-NEXT: s_cmp_lg_u32 s8, 0 3191; GCN-NEXT: s_cselect_b32 s0, s0, 0 3192; GCN-NEXT: v_add_i32_e32 v3, vcc, s0, v5 3193; GCN-NEXT: v_mul_lo_u32 v3, v3, s1 3194; GCN-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 3195; GCN-NEXT: s_mov_b32 s6, -1 3196; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 3197; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0 3198; GCN-NEXT: v_sub_i32_e32 v2, vcc, s3, v3 3199; GCN-NEXT: v_or_b32_e32 v0, v0, v1 3200; GCN-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 3201; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 3202; GCN-NEXT: s_endpgm 3203 %r = srem <3 x i16> %x, %y 3204 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 3205 ret void 3206} 3207 3208define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 3209; CHECK-LABEL: @udiv_v3i15( 3210; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 3211; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 3212; CHECK-NEXT: [[TMP3:%.*]] = zext i15 [[TMP1]] to i32 3213; CHECK-NEXT: [[TMP4:%.*]] = zext i15 [[TMP2]] to i32 3214; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 3215; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 3216; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 3217; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 3218; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 3219; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 3220; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 3221; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 3222; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 3223; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 3224; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 3225; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 3226; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 3227; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 32767 3228; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i15 3229; CHECK-NEXT: [[TMP20:%.*]] = insertelement <3 x i15> undef, i15 [[TMP19]], i64 0 3230; CHECK-NEXT: [[TMP21:%.*]] = extractelement <3 x i15> [[X]], i64 1 3231; CHECK-NEXT: [[TMP22:%.*]] = extractelement <3 x i15> [[Y]], i64 1 3232; CHECK-NEXT: [[TMP23:%.*]] = zext i15 [[TMP21]] to i32 3233; CHECK-NEXT: [[TMP24:%.*]] = zext i15 [[TMP22]] to i32 3234; CHECK-NEXT: [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float 3235; CHECK-NEXT: [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float 3236; CHECK-NEXT: [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]]) 3237; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]] 3238; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]]) 3239; CHECK-NEXT: [[TMP30:%.*]] = fneg fast float [[TMP29]] 3240; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]]) 3241; CHECK-NEXT: [[TMP32:%.*]] = fptoui float [[TMP29]] to i32 3242; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]]) 3243; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]]) 3244; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]] 3245; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0 3246; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]] 3247; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 32767 3248; CHECK-NEXT: [[TMP39:%.*]] = trunc i32 [[TMP38]] to i15 3249; CHECK-NEXT: [[TMP40:%.*]] = insertelement <3 x i15> [[TMP20]], i15 [[TMP39]], i64 1 3250; CHECK-NEXT: [[TMP41:%.*]] = extractelement <3 x i15> [[X]], i64 2 3251; CHECK-NEXT: [[TMP42:%.*]] = extractelement <3 x i15> [[Y]], i64 2 3252; CHECK-NEXT: [[TMP43:%.*]] = zext i15 [[TMP41]] to i32 3253; CHECK-NEXT: [[TMP44:%.*]] = zext i15 [[TMP42]] to i32 3254; CHECK-NEXT: [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float 3255; CHECK-NEXT: [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float 3256; CHECK-NEXT: [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]]) 3257; CHECK-NEXT: [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]] 3258; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]]) 3259; CHECK-NEXT: [[TMP50:%.*]] = fneg fast float [[TMP49]] 3260; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]]) 3261; CHECK-NEXT: [[TMP52:%.*]] = fptoui float [[TMP49]] to i32 3262; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]]) 3263; CHECK-NEXT: [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]]) 3264; CHECK-NEXT: [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]] 3265; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0 3266; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]] 3267; CHECK-NEXT: [[TMP58:%.*]] = and i32 [[TMP57]], 32767 3268; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i15 3269; CHECK-NEXT: [[TMP60:%.*]] = insertelement <3 x i15> [[TMP40]], i15 [[TMP59]], i64 2 3270; CHECK-NEXT: store <3 x i15> [[TMP60]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 3271; CHECK-NEXT: ret void 3272; 3273; GCN-LABEL: udiv_v3i15: 3274; GCN: ; %bb.0: 3275; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3276; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 3277; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 3278; GCN-NEXT: s_mov_b32 s7, 0xf000 3279; GCN-NEXT: s_mov_b32 s6, -1 3280; GCN-NEXT: s_waitcnt lgkmcnt(0) 3281; GCN-NEXT: v_mov_b32_e32 v0, s2 3282; GCN-NEXT: v_alignbit_b32 v0, s3, v0, 30 3283; GCN-NEXT: s_movk_i32 s3, 0x7fff 3284; GCN-NEXT: s_and_b32 s9, s0, s3 3285; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 3286; GCN-NEXT: v_mov_b32_e32 v2, s0 3287; GCN-NEXT: s_and_b32 s8, s2, s3 3288; GCN-NEXT: s_bfe_u32 s0, s0, 0xf000f 3289; GCN-NEXT: v_cvt_f32_u32_e32 v5, s0 3290; GCN-NEXT: v_cvt_f32_u32_e32 v3, s8 3291; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v1 3292; GCN-NEXT: s_bfe_u32 s2, s2, 0xf000f 3293; GCN-NEXT: v_alignbit_b32 v2, s1, v2, 30 3294; GCN-NEXT: v_cvt_f32_u32_e32 v6, s2 3295; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 3296; GCN-NEXT: v_rcp_iflag_f32_e32 v7, v5 3297; GCN-NEXT: v_and_b32_e32 v2, s3, v2 3298; GCN-NEXT: v_trunc_f32_e32 v4, v4 3299; GCN-NEXT: v_mad_f32 v3, -v4, v1, v3 3300; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 3301; GCN-NEXT: v_cvt_f32_u32_e32 v2, v2 3302; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 3303; GCN-NEXT: v_mul_f32_e32 v1, v6, v7 3304; GCN-NEXT: v_and_b32_e32 v0, s3, v0 3305; GCN-NEXT: v_trunc_f32_e32 v1, v1 3306; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 3307; GCN-NEXT: v_mad_f32 v4, -v1, v5, v6 3308; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 3309; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 3310; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v2 3311; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v5 3312; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 3313; GCN-NEXT: v_mul_f32_e32 v1, v0, v6 3314; GCN-NEXT: v_trunc_f32_e32 v1, v1 3315; GCN-NEXT: v_cvt_u32_f32_e32 v5, v1 3316; GCN-NEXT: v_mad_f32 v0, -v1, v2, v0 3317; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v2 3318; GCN-NEXT: v_and_b32_e32 v2, s3, v3 3319; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc 3320; GCN-NEXT: v_and_b32_e32 v3, s3, v4 3321; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 3322; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3 3323; GCN-NEXT: v_or_b32_e32 v2, v2, v3 3324; GCN-NEXT: v_or_b32_e32 v0, v2, v0 3325; GCN-NEXT: v_and_b32_e32 v1, 0x1fff, v1 3326; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 3327; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 3328; GCN-NEXT: s_endpgm 3329 %r = udiv <3 x i15> %x, %y 3330 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 3331 ret void 3332} 3333 3334define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 3335; CHECK-LABEL: @urem_v3i15( 3336; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 3337; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 3338; CHECK-NEXT: [[TMP3:%.*]] = zext i15 [[TMP1]] to i32 3339; CHECK-NEXT: [[TMP4:%.*]] = zext i15 [[TMP2]] to i32 3340; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 3341; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 3342; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 3343; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 3344; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 3345; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 3346; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 3347; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 3348; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 3349; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 3350; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 3351; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 3352; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 3353; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]] 3354; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]] 3355; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 32767 3356; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i15 3357; CHECK-NEXT: [[TMP22:%.*]] = insertelement <3 x i15> undef, i15 [[TMP21]], i64 0 3358; CHECK-NEXT: [[TMP23:%.*]] = extractelement <3 x i15> [[X]], i64 1 3359; CHECK-NEXT: [[TMP24:%.*]] = extractelement <3 x i15> [[Y]], i64 1 3360; CHECK-NEXT: [[TMP25:%.*]] = zext i15 [[TMP23]] to i32 3361; CHECK-NEXT: [[TMP26:%.*]] = zext i15 [[TMP24]] to i32 3362; CHECK-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float 3363; CHECK-NEXT: [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float 3364; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]]) 3365; CHECK-NEXT: [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]] 3366; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]]) 3367; CHECK-NEXT: [[TMP32:%.*]] = fneg fast float [[TMP31]] 3368; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]]) 3369; CHECK-NEXT: [[TMP34:%.*]] = fptoui float [[TMP31]] to i32 3370; CHECK-NEXT: [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 3371; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]]) 3372; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]] 3373; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0 3374; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]] 3375; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]] 3376; CHECK-NEXT: [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]] 3377; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], 32767 3378; CHECK-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i15 3379; CHECK-NEXT: [[TMP44:%.*]] = insertelement <3 x i15> [[TMP22]], i15 [[TMP43]], i64 1 3380; CHECK-NEXT: [[TMP45:%.*]] = extractelement <3 x i15> [[X]], i64 2 3381; CHECK-NEXT: [[TMP46:%.*]] = extractelement <3 x i15> [[Y]], i64 2 3382; CHECK-NEXT: [[TMP47:%.*]] = zext i15 [[TMP45]] to i32 3383; CHECK-NEXT: [[TMP48:%.*]] = zext i15 [[TMP46]] to i32 3384; CHECK-NEXT: [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float 3385; CHECK-NEXT: [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float 3386; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]]) 3387; CHECK-NEXT: [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]] 3388; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]]) 3389; CHECK-NEXT: [[TMP54:%.*]] = fneg fast float [[TMP53]] 3390; CHECK-NEXT: [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]]) 3391; CHECK-NEXT: [[TMP56:%.*]] = fptoui float [[TMP53]] to i32 3392; CHECK-NEXT: [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]]) 3393; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]]) 3394; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]] 3395; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0 3396; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]] 3397; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]] 3398; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]] 3399; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 32767 3400; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i15 3401; CHECK-NEXT: [[TMP66:%.*]] = insertelement <3 x i15> [[TMP44]], i15 [[TMP65]], i64 2 3402; CHECK-NEXT: store <3 x i15> [[TMP66]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 3403; CHECK-NEXT: ret void 3404; 3405; GCN-LABEL: urem_v3i15: 3406; GCN: ; %bb.0: 3407; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3408; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 3409; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 3410; GCN-NEXT: s_mov_b32 s7, 0xf000 3411; GCN-NEXT: s_mov_b32 s6, -1 3412; GCN-NEXT: s_waitcnt lgkmcnt(0) 3413; GCN-NEXT: v_mov_b32_e32 v0, s2 3414; GCN-NEXT: v_alignbit_b32 v0, s3, v0, 30 3415; GCN-NEXT: s_movk_i32 s3, 0x7fff 3416; GCN-NEXT: s_and_b32 s10, s0, s3 3417; GCN-NEXT: v_cvt_f32_u32_e32 v1, s10 3418; GCN-NEXT: s_and_b32 s9, s2, s3 3419; GCN-NEXT: v_cvt_f32_u32_e32 v3, s9 3420; GCN-NEXT: v_mov_b32_e32 v2, s0 3421; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v1 3422; GCN-NEXT: v_alignbit_b32 v2, s1, v2, 30 3423; GCN-NEXT: s_bfe_u32 s1, s0, 0xf000f 3424; GCN-NEXT: v_cvt_f32_u32_e32 v5, s1 3425; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 3426; GCN-NEXT: v_trunc_f32_e32 v4, v4 3427; GCN-NEXT: v_mad_f32 v3, -v4, v1, v3 3428; GCN-NEXT: v_cvt_u32_f32_e32 v4, v4 3429; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 3430; GCN-NEXT: s_bfe_u32 s10, s2, 0xf000f 3431; GCN-NEXT: v_cvt_f32_u32_e32 v3, s10 3432; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc 3433; GCN-NEXT: v_mul_lo_u32 v1, v1, s0 3434; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v5 3435; GCN-NEXT: v_and_b32_e32 v2, s3, v2 3436; GCN-NEXT: v_and_b32_e32 v0, s3, v0 3437; GCN-NEXT: v_sub_i32_e32 v6, vcc, s2, v1 3438; GCN-NEXT: v_mul_f32_e32 v1, v3, v4 3439; GCN-NEXT: v_cvt_f32_u32_e32 v4, v2 3440; GCN-NEXT: v_cvt_f32_u32_e32 v7, v0 3441; GCN-NEXT: v_trunc_f32_e32 v1, v1 3442; GCN-NEXT: v_mad_f32 v3, -v1, v5, v3 3443; GCN-NEXT: v_rcp_iflag_f32_e32 v8, v4 3444; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 3445; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 3446; GCN-NEXT: s_lshr_b32 s0, s0, 15 3447; GCN-NEXT: v_mul_f32_e32 v3, v7, v8 3448; GCN-NEXT: v_trunc_f32_e32 v3, v3 3449; GCN-NEXT: v_cvt_u32_f32_e32 v5, v3 3450; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3451; GCN-NEXT: v_mad_f32 v3, -v3, v4, v7 3452; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 3453; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 3454; GCN-NEXT: v_mul_lo_u32 v1, v1, s0 3455; GCN-NEXT: v_mul_lo_u32 v2, v3, v2 3456; GCN-NEXT: s_lshr_b32 s8, s2, 15 3457; GCN-NEXT: v_sub_i32_e32 v3, vcc, s8, v1 3458; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v2, v0 3459; GCN-NEXT: v_and_b32_e32 v3, s3, v3 3460; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 3461; GCN-NEXT: v_and_b32_e32 v2, s3, v6 3462; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3 3463; GCN-NEXT: v_or_b32_e32 v2, v2, v3 3464; GCN-NEXT: v_or_b32_e32 v0, v2, v0 3465; GCN-NEXT: v_and_b32_e32 v1, 0x1fff, v1 3466; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 3467; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 3468; GCN-NEXT: s_endpgm 3469 %r = urem <3 x i15> %x, %y 3470 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 3471 ret void 3472} 3473 3474define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 3475; CHECK-LABEL: @sdiv_v3i15( 3476; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 3477; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 3478; CHECK-NEXT: [[TMP3:%.*]] = sext i15 [[TMP1]] to i32 3479; CHECK-NEXT: [[TMP4:%.*]] = sext i15 [[TMP2]] to i32 3480; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 3481; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 3482; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 3483; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 3484; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 3485; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 3486; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 3487; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 3488; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 3489; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 3490; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 3491; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 3492; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 3493; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 3494; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 3495; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 3496; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 17 3497; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 17 3498; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i15 3499; CHECK-NEXT: [[TMP24:%.*]] = insertelement <3 x i15> undef, i15 [[TMP23]], i64 0 3500; CHECK-NEXT: [[TMP25:%.*]] = extractelement <3 x i15> [[X]], i64 1 3501; CHECK-NEXT: [[TMP26:%.*]] = extractelement <3 x i15> [[Y]], i64 1 3502; CHECK-NEXT: [[TMP27:%.*]] = sext i15 [[TMP25]] to i32 3503; CHECK-NEXT: [[TMP28:%.*]] = sext i15 [[TMP26]] to i32 3504; CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]] 3505; CHECK-NEXT: [[TMP30:%.*]] = ashr i32 [[TMP29]], 30 3506; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 1 3507; CHECK-NEXT: [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float 3508; CHECK-NEXT: [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float 3509; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 3510; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]] 3511; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]]) 3512; CHECK-NEXT: [[TMP37:%.*]] = fneg fast float [[TMP36]] 3513; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]]) 3514; CHECK-NEXT: [[TMP39:%.*]] = fptosi float [[TMP36]] to i32 3515; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]]) 3516; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 3517; CHECK-NEXT: [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]] 3518; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0 3519; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]] 3520; CHECK-NEXT: [[TMP45:%.*]] = shl i32 [[TMP44]], 17 3521; CHECK-NEXT: [[TMP46:%.*]] = ashr i32 [[TMP45]], 17 3522; CHECK-NEXT: [[TMP47:%.*]] = trunc i32 [[TMP46]] to i15 3523; CHECK-NEXT: [[TMP48:%.*]] = insertelement <3 x i15> [[TMP24]], i15 [[TMP47]], i64 1 3524; CHECK-NEXT: [[TMP49:%.*]] = extractelement <3 x i15> [[X]], i64 2 3525; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x i15> [[Y]], i64 2 3526; CHECK-NEXT: [[TMP51:%.*]] = sext i15 [[TMP49]] to i32 3527; CHECK-NEXT: [[TMP52:%.*]] = sext i15 [[TMP50]] to i32 3528; CHECK-NEXT: [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]] 3529; CHECK-NEXT: [[TMP54:%.*]] = ashr i32 [[TMP53]], 30 3530; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 1 3531; CHECK-NEXT: [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float 3532; CHECK-NEXT: [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float 3533; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]]) 3534; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]] 3535; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]]) 3536; CHECK-NEXT: [[TMP61:%.*]] = fneg fast float [[TMP60]] 3537; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]]) 3538; CHECK-NEXT: [[TMP63:%.*]] = fptosi float [[TMP60]] to i32 3539; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]]) 3540; CHECK-NEXT: [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]]) 3541; CHECK-NEXT: [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]] 3542; CHECK-NEXT: [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0 3543; CHECK-NEXT: [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]] 3544; CHECK-NEXT: [[TMP69:%.*]] = shl i32 [[TMP68]], 17 3545; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 17 3546; CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i15 3547; CHECK-NEXT: [[TMP72:%.*]] = insertelement <3 x i15> [[TMP48]], i15 [[TMP71]], i64 2 3548; CHECK-NEXT: store <3 x i15> [[TMP72]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 3549; CHECK-NEXT: ret void 3550; 3551; GCN-LABEL: sdiv_v3i15: 3552; GCN: ; %bb.0: 3553; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3554; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 3555; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 3556; GCN-NEXT: s_mov_b32 s7, 0xf000 3557; GCN-NEXT: s_mov_b32 s6, -1 3558; GCN-NEXT: s_waitcnt lgkmcnt(0) 3559; GCN-NEXT: v_mov_b32_e32 v0, s2 3560; GCN-NEXT: v_alignbit_b32 v0, s3, v0, 30 3561; GCN-NEXT: s_bfe_i32 s3, s0, 0xf0000 3562; GCN-NEXT: v_cvt_f32_i32_e32 v2, s3 3563; GCN-NEXT: v_mov_b32_e32 v1, s0 3564; GCN-NEXT: v_alignbit_b32 v1, s1, v1, 30 3565; GCN-NEXT: s_bfe_i32 s1, s2, 0xf0000 3566; GCN-NEXT: v_cvt_f32_i32_e32 v3, s1 3567; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 3568; GCN-NEXT: s_xor_b32 s1, s1, s3 3569; GCN-NEXT: s_ashr_i32 s1, s1, 30 3570; GCN-NEXT: s_or_b32 s1, s1, 1 3571; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 3572; GCN-NEXT: v_trunc_f32_e32 v4, v4 3573; GCN-NEXT: v_mad_f32 v3, -v4, v2, v3 3574; GCN-NEXT: v_cmp_ge_f32_e64 s[8:9], |v3|, |v2| 3575; GCN-NEXT: s_cmp_lg_u32 s8, 0 3576; GCN-NEXT: s_cselect_b32 s1, s1, 0 3577; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 3578; GCN-NEXT: s_bfe_i32 s0, s0, 0xf000f 3579; GCN-NEXT: v_cvt_f32_i32_e32 v2, s0 3580; GCN-NEXT: v_bfe_i32 v1, v1, 0, 15 3581; GCN-NEXT: v_add_i32_e32 v3, vcc, s1, v4 3582; GCN-NEXT: s_bfe_i32 s1, s2, 0xf000f 3583; GCN-NEXT: v_cvt_f32_i32_e32 v4, s1 3584; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v2 3585; GCN-NEXT: s_xor_b32 s0, s1, s0 3586; GCN-NEXT: s_ashr_i32 s0, s0, 30 3587; GCN-NEXT: s_or_b32 s2, s0, 1 3588; GCN-NEXT: v_mul_f32_e32 v5, v4, v5 3589; GCN-NEXT: v_trunc_f32_e32 v5, v5 3590; GCN-NEXT: v_mad_f32 v4, -v5, v2, v4 3591; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v2| 3592; GCN-NEXT: v_cvt_f32_i32_e32 v2, v1 3593; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 3594; GCN-NEXT: s_cmp_lg_u32 s0, 0 3595; GCN-NEXT: s_cselect_b32 s0, s2, 0 3596; GCN-NEXT: v_bfe_i32 v0, v0, 0, 15 3597; GCN-NEXT: v_add_i32_e32 v4, vcc, s0, v5 3598; GCN-NEXT: v_cvt_f32_i32_e32 v5, v0 3599; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v2 3600; GCN-NEXT: v_xor_b32_e32 v0, v0, v1 3601; GCN-NEXT: v_ashrrev_i32_e32 v0, 30, v0 3602; GCN-NEXT: v_or_b32_e32 v0, 1, v0 3603; GCN-NEXT: v_mul_f32_e32 v1, v5, v6 3604; GCN-NEXT: v_trunc_f32_e32 v1, v1 3605; GCN-NEXT: v_mad_f32 v5, -v1, v2, v5 3606; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1 3607; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v2| 3608; GCN-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 3609; GCN-NEXT: s_movk_i32 s0, 0x7fff 3610; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 3611; GCN-NEXT: v_and_b32_e32 v2, s0, v3 3612; GCN-NEXT: v_and_b32_e32 v3, s0, v4 3613; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 3614; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3 3615; GCN-NEXT: v_or_b32_e32 v2, v2, v3 3616; GCN-NEXT: v_or_b32_e32 v0, v2, v0 3617; GCN-NEXT: v_and_b32_e32 v1, 0x1fff, v1 3618; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 3619; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 3620; GCN-NEXT: s_endpgm 3621 %r = sdiv <3 x i15> %x, %y 3622 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 3623 ret void 3624} 3625 3626define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 3627; CHECK-LABEL: @srem_v3i15( 3628; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 3629; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 3630; CHECK-NEXT: [[TMP3:%.*]] = sext i15 [[TMP1]] to i32 3631; CHECK-NEXT: [[TMP4:%.*]] = sext i15 [[TMP2]] to i32 3632; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 3633; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 3634; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 3635; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 3636; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 3637; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 3638; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 3639; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 3640; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 3641; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 3642; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 3643; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 3644; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 3645; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 3646; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 3647; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 3648; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]] 3649; CHECK-NEXT: [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]] 3650; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 17 3651; CHECK-NEXT: [[TMP24:%.*]] = ashr i32 [[TMP23]], 17 3652; CHECK-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i15 3653; CHECK-NEXT: [[TMP26:%.*]] = insertelement <3 x i15> undef, i15 [[TMP25]], i64 0 3654; CHECK-NEXT: [[TMP27:%.*]] = extractelement <3 x i15> [[X]], i64 1 3655; CHECK-NEXT: [[TMP28:%.*]] = extractelement <3 x i15> [[Y]], i64 1 3656; CHECK-NEXT: [[TMP29:%.*]] = sext i15 [[TMP27]] to i32 3657; CHECK-NEXT: [[TMP30:%.*]] = sext i15 [[TMP28]] to i32 3658; CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]] 3659; CHECK-NEXT: [[TMP32:%.*]] = ashr i32 [[TMP31]], 30 3660; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP32]], 1 3661; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float 3662; CHECK-NEXT: [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float 3663; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 3664; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]] 3665; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]]) 3666; CHECK-NEXT: [[TMP39:%.*]] = fneg fast float [[TMP38]] 3667; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]]) 3668; CHECK-NEXT: [[TMP41:%.*]] = fptosi float [[TMP38]] to i32 3669; CHECK-NEXT: [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]]) 3670; CHECK-NEXT: [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]]) 3671; CHECK-NEXT: [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]] 3672; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0 3673; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]] 3674; CHECK-NEXT: [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]] 3675; CHECK-NEXT: [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]] 3676; CHECK-NEXT: [[TMP49:%.*]] = shl i32 [[TMP48]], 17 3677; CHECK-NEXT: [[TMP50:%.*]] = ashr i32 [[TMP49]], 17 3678; CHECK-NEXT: [[TMP51:%.*]] = trunc i32 [[TMP50]] to i15 3679; CHECK-NEXT: [[TMP52:%.*]] = insertelement <3 x i15> [[TMP26]], i15 [[TMP51]], i64 1 3680; CHECK-NEXT: [[TMP53:%.*]] = extractelement <3 x i15> [[X]], i64 2 3681; CHECK-NEXT: [[TMP54:%.*]] = extractelement <3 x i15> [[Y]], i64 2 3682; CHECK-NEXT: [[TMP55:%.*]] = sext i15 [[TMP53]] to i32 3683; CHECK-NEXT: [[TMP56:%.*]] = sext i15 [[TMP54]] to i32 3684; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]] 3685; CHECK-NEXT: [[TMP58:%.*]] = ashr i32 [[TMP57]], 30 3686; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 1 3687; CHECK-NEXT: [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float 3688; CHECK-NEXT: [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float 3689; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]]) 3690; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]] 3691; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]]) 3692; CHECK-NEXT: [[TMP65:%.*]] = fneg fast float [[TMP64]] 3693; CHECK-NEXT: [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]]) 3694; CHECK-NEXT: [[TMP67:%.*]] = fptosi float [[TMP64]] to i32 3695; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 3696; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]]) 3697; CHECK-NEXT: [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]] 3698; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0 3699; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]] 3700; CHECK-NEXT: [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]] 3701; CHECK-NEXT: [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]] 3702; CHECK-NEXT: [[TMP75:%.*]] = shl i32 [[TMP74]], 17 3703; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 [[TMP75]], 17 3704; CHECK-NEXT: [[TMP77:%.*]] = trunc i32 [[TMP76]] to i15 3705; CHECK-NEXT: [[TMP78:%.*]] = insertelement <3 x i15> [[TMP52]], i15 [[TMP77]], i64 2 3706; CHECK-NEXT: store <3 x i15> [[TMP78]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 3707; CHECK-NEXT: ret void 3708; 3709; GCN-LABEL: srem_v3i15: 3710; GCN: ; %bb.0: 3711; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3712; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 3713; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 3714; GCN-NEXT: s_mov_b32 s7, 0xf000 3715; GCN-NEXT: s_mov_b32 s6, -1 3716; GCN-NEXT: s_waitcnt lgkmcnt(0) 3717; GCN-NEXT: v_mov_b32_e32 v0, s2 3718; GCN-NEXT: v_mov_b32_e32 v1, s0 3719; GCN-NEXT: v_alignbit_b32 v0, s3, v0, 30 3720; GCN-NEXT: s_movk_i32 s3, 0x7fff 3721; GCN-NEXT: v_alignbit_b32 v1, s1, v1, 30 3722; GCN-NEXT: s_and_b32 s1, s0, s3 3723; GCN-NEXT: s_bfe_i32 s1, s1, 0xf0000 3724; GCN-NEXT: v_cvt_f32_i32_e32 v2, s1 3725; GCN-NEXT: s_and_b32 s8, s2, s3 3726; GCN-NEXT: s_bfe_i32 s8, s8, 0xf0000 3727; GCN-NEXT: v_cvt_f32_i32_e32 v3, s8 3728; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 3729; GCN-NEXT: s_xor_b32 s1, s8, s1 3730; GCN-NEXT: s_ashr_i32 s1, s1, 30 3731; GCN-NEXT: s_lshr_b32 s10, s2, 15 3732; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 3733; GCN-NEXT: v_trunc_f32_e32 v4, v4 3734; GCN-NEXT: v_mad_f32 v3, -v4, v2, v3 3735; GCN-NEXT: v_cvt_i32_f32_e32 v4, v4 3736; GCN-NEXT: s_bfe_u32 s11, s2, 0xf000f 3737; GCN-NEXT: s_lshr_b32 s12, s0, 15 3738; GCN-NEXT: s_bfe_u32 s13, s0, 0xf000f 3739; GCN-NEXT: s_or_b32 s1, s1, 1 3740; GCN-NEXT: v_cmp_ge_f32_e64 s[8:9], |v3|, |v2| 3741; GCN-NEXT: s_cmp_lg_u32 s8, 0 3742; GCN-NEXT: s_cselect_b32 s1, s1, 0 3743; GCN-NEXT: v_add_i32_e32 v2, vcc, s1, v4 3744; GCN-NEXT: v_mul_lo_u32 v2, v2, s0 3745; GCN-NEXT: s_bfe_i32 s0, s13, 0xf0000 3746; GCN-NEXT: v_cvt_f32_i32_e32 v3, s0 3747; GCN-NEXT: s_bfe_i32 s1, s11, 0xf0000 3748; GCN-NEXT: v_cvt_f32_i32_e32 v4, s1 3749; GCN-NEXT: s_xor_b32 s0, s1, s0 3750; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v3 3751; GCN-NEXT: s_ashr_i32 s0, s0, 30 3752; GCN-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 3753; GCN-NEXT: s_or_b32 s2, s0, 1 3754; GCN-NEXT: v_mul_f32_e32 v5, v4, v5 3755; GCN-NEXT: v_trunc_f32_e32 v5, v5 3756; GCN-NEXT: v_mad_f32 v4, -v5, v3, v4 3757; GCN-NEXT: v_cvt_i32_f32_e32 v5, v5 3758; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| 3759; GCN-NEXT: s_cmp_lg_u32 s0, 0 3760; GCN-NEXT: v_and_b32_e32 v1, s3, v1 3761; GCN-NEXT: s_cselect_b32 s0, s2, 0 3762; GCN-NEXT: v_bfe_i32 v4, v1, 0, 15 3763; GCN-NEXT: v_add_i32_e32 v3, vcc, s0, v5 3764; GCN-NEXT: v_cvt_f32_i32_e32 v5, v4 3765; GCN-NEXT: v_and_b32_e32 v0, s3, v0 3766; GCN-NEXT: v_bfe_i32 v6, v0, 0, 15 3767; GCN-NEXT: v_cvt_f32_i32_e32 v7, v6 3768; GCN-NEXT: v_rcp_iflag_f32_e32 v8, v5 3769; GCN-NEXT: v_xor_b32_e32 v4, v6, v4 3770; GCN-NEXT: v_ashrrev_i32_e32 v4, 30, v4 3771; GCN-NEXT: v_or_b32_e32 v4, 1, v4 3772; GCN-NEXT: v_mul_f32_e32 v6, v7, v8 3773; GCN-NEXT: v_trunc_f32_e32 v6, v6 3774; GCN-NEXT: v_mad_f32 v7, -v6, v5, v7 3775; GCN-NEXT: v_cvt_i32_f32_e32 v6, v6 3776; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v5| 3777; GCN-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc 3778; GCN-NEXT: v_mul_lo_u32 v3, v3, s12 3779; GCN-NEXT: v_add_i32_e32 v4, vcc, v6, v4 3780; GCN-NEXT: v_mul_lo_u32 v1, v4, v1 3781; GCN-NEXT: v_and_b32_e32 v2, s3, v2 3782; GCN-NEXT: v_sub_i32_e32 v3, vcc, s10, v3 3783; GCN-NEXT: v_and_b32_e32 v3, s3, v3 3784; GCN-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 3785; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 3786; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3 3787; GCN-NEXT: v_or_b32_e32 v2, v2, v3 3788; GCN-NEXT: v_or_b32_e32 v0, v2, v0 3789; GCN-NEXT: v_and_b32_e32 v1, 0x1fff, v1 3790; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 3791; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 3792; GCN-NEXT: s_endpgm 3793 %r = srem <3 x i15> %x, %y 3794 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 3795 ret void 3796} 3797 3798define amdgpu_kernel void @udiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 3799; CHECK-LABEL: @udiv_i32_oddk_denom( 3800; CHECK-NEXT: [[R:%.*]] = udiv i32 [[X:%.*]], 1235195 3801; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 3802; CHECK-NEXT: ret void 3803; 3804; GCN-LABEL: udiv_i32_oddk_denom: 3805; GCN: ; %bb.0: 3806; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3807; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 3808; GCN-NEXT: v_mov_b32_e32 v0, 0xb2a50881 3809; GCN-NEXT: s_mov_b32 s7, 0xf000 3810; GCN-NEXT: s_mov_b32 s6, -1 3811; GCN-NEXT: s_waitcnt lgkmcnt(0) 3812; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 3813; GCN-NEXT: v_sub_i32_e32 v1, vcc, s0, v0 3814; GCN-NEXT: v_lshrrev_b32_e32 v1, 1, v1 3815; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 3816; GCN-NEXT: v_lshrrev_b32_e32 v0, 20, v0 3817; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 3818; GCN-NEXT: s_endpgm 3819 %r = udiv i32 %x, 1235195 3820 store i32 %r, i32 addrspace(1)* %out 3821 ret void 3822} 3823 3824define amdgpu_kernel void @udiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 3825; CHECK-LABEL: @udiv_i32_pow2k_denom( 3826; CHECK-NEXT: [[R:%.*]] = udiv i32 [[X:%.*]], 4096 3827; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 3828; CHECK-NEXT: ret void 3829; 3830; GCN-LABEL: udiv_i32_pow2k_denom: 3831; GCN: ; %bb.0: 3832; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3833; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 3834; GCN-NEXT: s_mov_b32 s7, 0xf000 3835; GCN-NEXT: s_mov_b32 s6, -1 3836; GCN-NEXT: s_waitcnt lgkmcnt(0) 3837; GCN-NEXT: s_lshr_b32 s0, s0, 12 3838; GCN-NEXT: v_mov_b32_e32 v0, s0 3839; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 3840; GCN-NEXT: s_endpgm 3841 %r = udiv i32 %x, 4096 3842 store i32 %r, i32 addrspace(1)* %out 3843 ret void 3844} 3845 3846define amdgpu_kernel void @udiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 3847; CHECK-LABEL: @udiv_i32_pow2_shl_denom( 3848; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 3849; CHECK-NEXT: [[R:%.*]] = udiv i32 [[X:%.*]], [[SHL_Y]] 3850; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 3851; CHECK-NEXT: ret void 3852; 3853; GCN-LABEL: udiv_i32_pow2_shl_denom: 3854; GCN: ; %bb.0: 3855; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3856; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3857; GCN-NEXT: s_mov_b32 s7, 0xf000 3858; GCN-NEXT: s_mov_b32 s6, -1 3859; GCN-NEXT: s_waitcnt lgkmcnt(0) 3860; GCN-NEXT: s_add_i32 s1, s1, 12 3861; GCN-NEXT: s_lshr_b32 s0, s0, s1 3862; GCN-NEXT: v_mov_b32_e32 v0, s0 3863; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 3864; GCN-NEXT: s_endpgm 3865 %shl.y = shl i32 4096, %y 3866 %r = udiv i32 %x, %shl.y 3867 store i32 %r, i32 addrspace(1)* %out 3868 ret void 3869} 3870 3871define amdgpu_kernel void @udiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 3872; CHECK-LABEL: @udiv_v2i32_pow2k_denom( 3873; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 3874; CHECK-NEXT: [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096 3875; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 3876; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 3877; CHECK-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP4]], 4096 3878; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 3879; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 3880; CHECK-NEXT: ret void 3881; 3882; GCN-LABEL: udiv_v2i32_pow2k_denom: 3883; GCN: ; %bb.0: 3884; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3885; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3886; GCN-NEXT: s_mov_b32 s7, 0xf000 3887; GCN-NEXT: s_mov_b32 s6, -1 3888; GCN-NEXT: s_waitcnt lgkmcnt(0) 3889; GCN-NEXT: s_lshr_b32 s0, s0, 12 3890; GCN-NEXT: s_lshr_b32 s1, s1, 12 3891; GCN-NEXT: v_mov_b32_e32 v0, s0 3892; GCN-NEXT: v_mov_b32_e32 v1, s1 3893; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 3894; GCN-NEXT: s_endpgm 3895 %r = udiv <2 x i32> %x, <i32 4096, i32 4096> 3896 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 3897 ret void 3898} 3899 3900define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 3901; CHECK-LABEL: @udiv_v2i32_mixed_pow2k_denom( 3902; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 3903; CHECK-NEXT: [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096 3904; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 3905; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 3906; CHECK-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP4]], 4095 3907; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 3908; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 3909; CHECK-NEXT: ret void 3910; 3911; GCN-LABEL: udiv_v2i32_mixed_pow2k_denom: 3912; GCN: ; %bb.0: 3913; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3914; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 3915; GCN-NEXT: v_mov_b32_e32 v0, 0x100101 3916; GCN-NEXT: s_mov_b32 s7, 0xf000 3917; GCN-NEXT: s_mov_b32 s6, -1 3918; GCN-NEXT: s_waitcnt lgkmcnt(0) 3919; GCN-NEXT: v_mul_hi_u32 v0, s1, v0 3920; GCN-NEXT: s_lshr_b32 s0, s0, 12 3921; GCN-NEXT: v_sub_i32_e32 v1, vcc, s1, v0 3922; GCN-NEXT: v_lshrrev_b32_e32 v1, 1, v1 3923; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 3924; GCN-NEXT: v_lshrrev_b32_e32 v1, 11, v0 3925; GCN-NEXT: v_mov_b32_e32 v0, s0 3926; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 3927; GCN-NEXT: s_endpgm 3928 %r = udiv <2 x i32> %x, <i32 4096, i32 4095> 3929 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 3930 ret void 3931} 3932 3933define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 3934; CHECK-LABEL: @udiv_v2i32_pow2_shl_denom( 3935; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 3936; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 3937; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 3938; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 3939; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 3940; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 3941; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 3942; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 3943; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 3944; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 3945; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 3946; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 3947; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 3948; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 3949; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 3950; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 3951; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 3952; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 3953; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 3954; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 3955; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 3956; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 3957; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 3958; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 3959; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 3960; CHECK-NEXT: [[TMP25:%.*]] = add i32 [[TMP21]], 1 3961; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]] 3962; CHECK-NEXT: [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]] 3963; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]] 3964; CHECK-NEXT: [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]] 3965; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP26]], 1 3966; CHECK-NEXT: [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]] 3967; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x i32> undef, i32 [[TMP31]], i64 0 3968; CHECK-NEXT: [[TMP33:%.*]] = extractelement <2 x i32> [[X]], i64 1 3969; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 3970; CHECK-NEXT: [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float 3971; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 3972; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000 3973; CHECK-NEXT: [[TMP38:%.*]] = fptoui float [[TMP37]] to i32 3974; CHECK-NEXT: [[TMP39:%.*]] = sub i32 0, [[TMP34]] 3975; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]] 3976; CHECK-NEXT: [[TMP41:%.*]] = zext i32 [[TMP38]] to i64 3977; CHECK-NEXT: [[TMP42:%.*]] = zext i32 [[TMP40]] to i64 3978; CHECK-NEXT: [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]] 3979; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 3980; CHECK-NEXT: [[TMP45:%.*]] = lshr i64 [[TMP43]], 32 3981; CHECK-NEXT: [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32 3982; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]] 3983; CHECK-NEXT: [[TMP48:%.*]] = zext i32 [[TMP33]] to i64 3984; CHECK-NEXT: [[TMP49:%.*]] = zext i32 [[TMP47]] to i64 3985; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]] 3986; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 3987; CHECK-NEXT: [[TMP52:%.*]] = lshr i64 [[TMP50]], 32 3988; CHECK-NEXT: [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32 3989; CHECK-NEXT: [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]] 3990; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]] 3991; CHECK-NEXT: [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]] 3992; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP53]], 1 3993; CHECK-NEXT: [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]] 3994; CHECK-NEXT: [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]] 3995; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]] 3996; CHECK-NEXT: [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]] 3997; CHECK-NEXT: [[TMP62:%.*]] = add i32 [[TMP58]], 1 3998; CHECK-NEXT: [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]] 3999; CHECK-NEXT: [[TMP64:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP63]], i64 1 4000; CHECK-NEXT: store <2 x i32> [[TMP64]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 4001; CHECK-NEXT: ret void 4002; 4003; GCN-LABEL: udiv_v2i32_pow2_shl_denom: 4004; GCN: ; %bb.0: 4005; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 4006; GCN-NEXT: s_movk_i32 s4, 0x1000 4007; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb 4008; GCN-NEXT: s_mov_b32 s7, 0xf000 4009; GCN-NEXT: s_mov_b32 s6, -1 4010; GCN-NEXT: s_waitcnt lgkmcnt(0) 4011; GCN-NEXT: s_lshl_b32 s5, s4, s2 4012; GCN-NEXT: v_cvt_f32_u32_e32 v0, s5 4013; GCN-NEXT: s_lshl_b32 s10, s4, s3 4014; GCN-NEXT: s_mov_b32 s3, 0x4f7ffffe 4015; GCN-NEXT: v_cvt_f32_u32_e32 v1, s10 4016; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 4017; GCN-NEXT: s_sub_i32 s2, 0, s5 4018; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 4019; GCN-NEXT: v_mul_f32_e32 v0, s3, v0 4020; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 4021; GCN-NEXT: v_mul_f32_e32 v1, s3, v1 4022; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 4023; GCN-NEXT: v_mul_lo_u32 v2, s2, v0 4024; GCN-NEXT: s_sub_i32 s2, 0, s10 4025; GCN-NEXT: v_mul_lo_u32 v3, s2, v1 4026; GCN-NEXT: v_mul_hi_u32 v2, v0, v2 4027; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 4028; GCN-NEXT: v_mul_hi_u32 v0, s8, v0 4029; GCN-NEXT: v_mul_hi_u32 v2, v1, v3 4030; GCN-NEXT: v_mul_lo_u32 v3, v0, s5 4031; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v0 4032; GCN-NEXT: v_sub_i32_e32 v3, vcc, s8, v3 4033; GCN-NEXT: v_cmp_le_u32_e64 s[2:3], s5, v3 4034; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[2:3] 4035; GCN-NEXT: v_subrev_i32_e32 v4, vcc, s5, v3 4036; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[2:3] 4037; GCN-NEXT: v_add_i32_e32 v4, vcc, 1, v0 4038; GCN-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 4039; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 4040; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 4041; GCN-NEXT: v_mul_hi_u32 v1, s9, v1 4042; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4043; GCN-NEXT: v_mul_lo_u32 v2, v1, s10 4044; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v1 4045; GCN-NEXT: v_sub_i32_e32 v2, vcc, s9, v2 4046; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2 4047; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 4048; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s10, v2 4049; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 4050; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v1 4051; GCN-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 4052; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 4053; GCN-NEXT: s_waitcnt lgkmcnt(0) 4054; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4055; GCN-NEXT: s_endpgm 4056 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 4057 %r = udiv <2 x i32> %x, %shl.y 4058 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 4059 ret void 4060} 4061 4062define amdgpu_kernel void @urem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 4063; CHECK-LABEL: @urem_i32_oddk_denom( 4064; CHECK-NEXT: [[R:%.*]] = urem i32 [[X:%.*]], 1235195 4065; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 4066; CHECK-NEXT: ret void 4067; 4068; GCN-LABEL: urem_i32_oddk_denom: 4069; GCN: ; %bb.0: 4070; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4071; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 4072; GCN-NEXT: v_mov_b32_e32 v0, 0xb2a50881 4073; GCN-NEXT: s_mov_b32 s7, 0xf000 4074; GCN-NEXT: s_mov_b32 s6, -1 4075; GCN-NEXT: s_waitcnt lgkmcnt(0) 4076; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 4077; GCN-NEXT: v_sub_i32_e32 v1, vcc, s0, v0 4078; GCN-NEXT: v_lshrrev_b32_e32 v1, 1, v1 4079; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 4080; GCN-NEXT: v_lshrrev_b32_e32 v0, 20, v0 4081; GCN-NEXT: v_mul_u32_u24_e32 v0, 0x12d8fb, v0 4082; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 4083; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 4084; GCN-NEXT: s_endpgm 4085 %r = urem i32 %x, 1235195 4086 store i32 %r, i32 addrspace(1)* %out 4087 ret void 4088} 4089 4090define amdgpu_kernel void @urem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 4091; CHECK-LABEL: @urem_i32_pow2k_denom( 4092; CHECK-NEXT: [[R:%.*]] = urem i32 [[X:%.*]], 4096 4093; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 4094; CHECK-NEXT: ret void 4095; 4096; GCN-LABEL: urem_i32_pow2k_denom: 4097; GCN: ; %bb.0: 4098; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4099; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 4100; GCN-NEXT: s_mov_b32 s7, 0xf000 4101; GCN-NEXT: s_mov_b32 s6, -1 4102; GCN-NEXT: s_waitcnt lgkmcnt(0) 4103; GCN-NEXT: s_and_b32 s0, s0, 0xfff 4104; GCN-NEXT: v_mov_b32_e32 v0, s0 4105; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 4106; GCN-NEXT: s_endpgm 4107 %r = urem i32 %x, 4096 4108 store i32 %r, i32 addrspace(1)* %out 4109 ret void 4110} 4111 4112define amdgpu_kernel void @urem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 4113; CHECK-LABEL: @urem_i32_pow2_shl_denom( 4114; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 4115; CHECK-NEXT: [[R:%.*]] = urem i32 [[X:%.*]], [[SHL_Y]] 4116; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 4117; CHECK-NEXT: ret void 4118; 4119; GCN-LABEL: urem_i32_pow2_shl_denom: 4120; GCN: ; %bb.0: 4121; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4122; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4123; GCN-NEXT: s_mov_b32 s7, 0xf000 4124; GCN-NEXT: s_mov_b32 s6, -1 4125; GCN-NEXT: s_waitcnt lgkmcnt(0) 4126; GCN-NEXT: s_lshl_b32 s1, 0x1000, s1 4127; GCN-NEXT: s_add_i32 s1, s1, -1 4128; GCN-NEXT: s_and_b32 s0, s0, s1 4129; GCN-NEXT: v_mov_b32_e32 v0, s0 4130; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 4131; GCN-NEXT: s_endpgm 4132 %shl.y = shl i32 4096, %y 4133 %r = urem i32 %x, %shl.y 4134 store i32 %r, i32 addrspace(1)* %out 4135 ret void 4136} 4137 4138define amdgpu_kernel void @urem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 4139; CHECK-LABEL: @urem_v2i32_pow2k_denom( 4140; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 4141; CHECK-NEXT: [[TMP2:%.*]] = urem i32 [[TMP1]], 4096 4142; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 4143; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 4144; CHECK-NEXT: [[TMP5:%.*]] = urem i32 [[TMP4]], 4096 4145; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 4146; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 4147; CHECK-NEXT: ret void 4148; 4149; GCN-LABEL: urem_v2i32_pow2k_denom: 4150; GCN: ; %bb.0: 4151; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4152; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4153; GCN-NEXT: s_movk_i32 s2, 0xfff 4154; GCN-NEXT: s_mov_b32 s7, 0xf000 4155; GCN-NEXT: s_mov_b32 s6, -1 4156; GCN-NEXT: s_waitcnt lgkmcnt(0) 4157; GCN-NEXT: s_and_b32 s0, s0, s2 4158; GCN-NEXT: s_and_b32 s1, s1, s2 4159; GCN-NEXT: v_mov_b32_e32 v0, s0 4160; GCN-NEXT: v_mov_b32_e32 v1, s1 4161; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4162; GCN-NEXT: s_endpgm 4163 %r = urem <2 x i32> %x, <i32 4096, i32 4096> 4164 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 4165 ret void 4166} 4167 4168define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 4169; CHECK-LABEL: @urem_v2i32_pow2_shl_denom( 4170; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 4171; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 4172; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 4173; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 4174; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 4175; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 4176; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 4177; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 4178; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 4179; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 4180; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 4181; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 4182; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 4183; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 4184; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 4185; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 4186; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 4187; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 4188; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 4189; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 4190; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 4191; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 4192; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 4193; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 4194; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 4195; CHECK-NEXT: [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]] 4196; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]] 4197; CHECK-NEXT: [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]] 4198; CHECK-NEXT: [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]] 4199; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]] 4200; CHECK-NEXT: [[TMP30:%.*]] = insertelement <2 x i32> undef, i32 [[TMP29]], i64 0 4201; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x i32> [[X]], i64 1 4202; CHECK-NEXT: [[TMP32:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 4203; CHECK-NEXT: [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float 4204; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 4205; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000 4206; CHECK-NEXT: [[TMP36:%.*]] = fptoui float [[TMP35]] to i32 4207; CHECK-NEXT: [[TMP37:%.*]] = sub i32 0, [[TMP32]] 4208; CHECK-NEXT: [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]] 4209; CHECK-NEXT: [[TMP39:%.*]] = zext i32 [[TMP36]] to i64 4210; CHECK-NEXT: [[TMP40:%.*]] = zext i32 [[TMP38]] to i64 4211; CHECK-NEXT: [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]] 4212; CHECK-NEXT: [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32 4213; CHECK-NEXT: [[TMP43:%.*]] = lshr i64 [[TMP41]], 32 4214; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 4215; CHECK-NEXT: [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]] 4216; CHECK-NEXT: [[TMP46:%.*]] = zext i32 [[TMP31]] to i64 4217; CHECK-NEXT: [[TMP47:%.*]] = zext i32 [[TMP45]] to i64 4218; CHECK-NEXT: [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]] 4219; CHECK-NEXT: [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32 4220; CHECK-NEXT: [[TMP50:%.*]] = lshr i64 [[TMP48]], 32 4221; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 4222; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]] 4223; CHECK-NEXT: [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]] 4224; CHECK-NEXT: [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]] 4225; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]] 4226; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]] 4227; CHECK-NEXT: [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]] 4228; CHECK-NEXT: [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]] 4229; CHECK-NEXT: [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]] 4230; CHECK-NEXT: [[TMP60:%.*]] = insertelement <2 x i32> [[TMP30]], i32 [[TMP59]], i64 1 4231; CHECK-NEXT: store <2 x i32> [[TMP60]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 4232; CHECK-NEXT: ret void 4233; 4234; GCN-LABEL: urem_v2i32_pow2_shl_denom: 4235; GCN: ; %bb.0: 4236; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 4237; GCN-NEXT: s_movk_i32 s4, 0x1000 4238; GCN-NEXT: s_mov_b32 s7, 0x4f7ffffe 4239; GCN-NEXT: s_waitcnt lgkmcnt(0) 4240; GCN-NEXT: s_lshl_b32 s2, s4, s2 4241; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 4242; GCN-NEXT: s_lshl_b32 s6, s4, s3 4243; GCN-NEXT: s_sub_i32 s3, 0, s2 4244; GCN-NEXT: v_cvt_f32_u32_e32 v1, s6 4245; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 4246; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 4247; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4248; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 4249; GCN-NEXT: v_mul_f32_e32 v0, s7, v0 4250; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 4251; GCN-NEXT: v_mul_f32_e32 v1, s7, v1 4252; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 4253; GCN-NEXT: v_mul_lo_u32 v2, s3, v0 4254; GCN-NEXT: s_sub_i32 s3, 0, s6 4255; GCN-NEXT: v_mul_hi_u32 v2, v0, v2 4256; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 4257; GCN-NEXT: s_waitcnt lgkmcnt(0) 4258; GCN-NEXT: v_mul_hi_u32 v0, s4, v0 4259; GCN-NEXT: v_mul_lo_u32 v2, s3, v1 4260; GCN-NEXT: s_mov_b32 s3, 0xf000 4261; GCN-NEXT: v_mul_lo_u32 v0, v0, s2 4262; GCN-NEXT: v_mul_hi_u32 v2, v1, v2 4263; GCN-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 4264; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s2, v0 4265; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 4266; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 4267; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s2, v0 4268; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 4269; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 4270; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 4271; GCN-NEXT: v_mul_hi_u32 v1, s5, v1 4272; GCN-NEXT: s_mov_b32 s2, -1 4273; GCN-NEXT: v_mul_lo_u32 v1, v1, s6 4274; GCN-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 4275; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s6, v1 4276; GCN-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 4277; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4278; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s6, v1 4279; GCN-NEXT: v_cmp_le_u32_e32 vcc, s6, v1 4280; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4281; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4282; GCN-NEXT: s_endpgm 4283 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 4284 %r = urem <2 x i32> %x, %shl.y 4285 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 4286 ret void 4287} 4288 4289define amdgpu_kernel void @sdiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 4290; CHECK-LABEL: @sdiv_i32_oddk_denom( 4291; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], 1235195 4292; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 4293; CHECK-NEXT: ret void 4294; 4295; GCN-LABEL: sdiv_i32_oddk_denom: 4296; GCN: ; %bb.0: 4297; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4298; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 4299; GCN-NEXT: v_mov_b32_e32 v0, 0xd9528441 4300; GCN-NEXT: s_mov_b32 s7, 0xf000 4301; GCN-NEXT: s_mov_b32 s6, -1 4302; GCN-NEXT: s_waitcnt lgkmcnt(0) 4303; GCN-NEXT: v_mul_hi_i32 v0, s0, v0 4304; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0 4305; GCN-NEXT: v_lshrrev_b32_e32 v1, 31, v0 4306; GCN-NEXT: v_ashrrev_i32_e32 v0, 20, v0 4307; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 4308; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 4309; GCN-NEXT: s_endpgm 4310 %r = sdiv i32 %x, 1235195 4311 store i32 %r, i32 addrspace(1)* %out 4312 ret void 4313} 4314 4315define amdgpu_kernel void @sdiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 4316; CHECK-LABEL: @sdiv_i32_pow2k_denom( 4317; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], 4096 4318; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 4319; CHECK-NEXT: ret void 4320; 4321; GCN-LABEL: sdiv_i32_pow2k_denom: 4322; GCN: ; %bb.0: 4323; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4324; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 4325; GCN-NEXT: s_mov_b32 s7, 0xf000 4326; GCN-NEXT: s_mov_b32 s6, -1 4327; GCN-NEXT: s_waitcnt lgkmcnt(0) 4328; GCN-NEXT: s_ashr_i32 s1, s0, 31 4329; GCN-NEXT: s_lshr_b32 s1, s1, 20 4330; GCN-NEXT: s_add_i32 s0, s0, s1 4331; GCN-NEXT: s_ashr_i32 s0, s0, 12 4332; GCN-NEXT: v_mov_b32_e32 v0, s0 4333; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 4334; GCN-NEXT: s_endpgm 4335 %r = sdiv i32 %x, 4096 4336 store i32 %r, i32 addrspace(1)* %out 4337 ret void 4338} 4339 4340define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 4341; CHECK-LABEL: @sdiv_i32_pow2_shl_denom( 4342; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 4343; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], [[SHL_Y]] 4344; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 4345; CHECK-NEXT: ret void 4346; 4347; GCN-LABEL: sdiv_i32_pow2_shl_denom: 4348; GCN: ; %bb.0: 4349; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 4350; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4351; GCN-NEXT: s_mov_b32 s7, 0xf000 4352; GCN-NEXT: s_mov_b32 s6, -1 4353; GCN-NEXT: s_waitcnt lgkmcnt(0) 4354; GCN-NEXT: s_lshl_b32 s3, 0x1000, s3 4355; GCN-NEXT: s_ashr_i32 s8, s3, 31 4356; GCN-NEXT: s_add_i32 s3, s3, s8 4357; GCN-NEXT: s_xor_b32 s9, s3, s8 4358; GCN-NEXT: v_cvt_f32_u32_e32 v0, s9 4359; GCN-NEXT: s_sub_i32 s3, 0, s9 4360; GCN-NEXT: s_ashr_i32 s0, s2, 31 4361; GCN-NEXT: s_add_i32 s1, s2, s0 4362; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 4363; GCN-NEXT: s_xor_b32 s1, s1, s0 4364; GCN-NEXT: s_xor_b32 s2, s0, s8 4365; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 4366; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 4367; GCN-NEXT: v_mul_lo_u32 v1, s3, v0 4368; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 4369; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 4370; GCN-NEXT: v_mul_hi_u32 v0, s1, v0 4371; GCN-NEXT: v_mul_lo_u32 v1, v0, s9 4372; GCN-NEXT: v_add_i32_e32 v2, vcc, 1, v0 4373; GCN-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 4374; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v1 4375; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 4376; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s9, v1 4377; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v0 4378; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] 4379; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 4380; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 4381; GCN-NEXT: v_xor_b32_e32 v0, s2, v0 4382; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 4383; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 4384; GCN-NEXT: s_endpgm 4385 %shl.y = shl i32 4096, %y 4386 %r = sdiv i32 %x, %shl.y 4387 store i32 %r, i32 addrspace(1)* %out 4388 ret void 4389} 4390 4391define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 4392; CHECK-LABEL: @sdiv_v2i32_pow2k_denom( 4393; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 4394; CHECK-NEXT: [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096 4395; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 4396; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 4397; CHECK-NEXT: [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4096 4398; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 4399; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 4400; CHECK-NEXT: ret void 4401; 4402; GCN-LABEL: sdiv_v2i32_pow2k_denom: 4403; GCN: ; %bb.0: 4404; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4405; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4406; GCN-NEXT: s_mov_b32 s7, 0xf000 4407; GCN-NEXT: s_mov_b32 s6, -1 4408; GCN-NEXT: s_waitcnt lgkmcnt(0) 4409; GCN-NEXT: s_ashr_i32 s2, s0, 31 4410; GCN-NEXT: s_lshr_b32 s2, s2, 20 4411; GCN-NEXT: s_ashr_i32 s3, s1, 31 4412; GCN-NEXT: s_add_i32 s0, s0, s2 4413; GCN-NEXT: s_lshr_b32 s2, s3, 20 4414; GCN-NEXT: s_add_i32 s1, s1, s2 4415; GCN-NEXT: s_ashr_i32 s0, s0, 12 4416; GCN-NEXT: s_ashr_i32 s1, s1, 12 4417; GCN-NEXT: v_mov_b32_e32 v0, s0 4418; GCN-NEXT: v_mov_b32_e32 v1, s1 4419; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4420; GCN-NEXT: s_endpgm 4421 %r = sdiv <2 x i32> %x, <i32 4096, i32 4096> 4422 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 4423 ret void 4424} 4425 4426define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 4427; CHECK-LABEL: @ssdiv_v2i32_mixed_pow2k_denom( 4428; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 4429; CHECK-NEXT: [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096 4430; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 4431; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 4432; CHECK-NEXT: [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4095 4433; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 4434; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 4435; CHECK-NEXT: ret void 4436; 4437; GCN-LABEL: ssdiv_v2i32_mixed_pow2k_denom: 4438; GCN: ; %bb.0: 4439; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4440; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4441; GCN-NEXT: v_mov_b32_e32 v0, 0x80080081 4442; GCN-NEXT: s_mov_b32 s7, 0xf000 4443; GCN-NEXT: s_mov_b32 s6, -1 4444; GCN-NEXT: s_waitcnt lgkmcnt(0) 4445; GCN-NEXT: v_mul_hi_i32 v0, s1, v0 4446; GCN-NEXT: s_ashr_i32 s2, s0, 31 4447; GCN-NEXT: s_lshr_b32 s2, s2, 20 4448; GCN-NEXT: s_add_i32 s0, s0, s2 4449; GCN-NEXT: v_add_i32_e32 v0, vcc, s1, v0 4450; GCN-NEXT: v_lshrrev_b32_e32 v1, 31, v0 4451; GCN-NEXT: v_ashrrev_i32_e32 v0, 11, v0 4452; GCN-NEXT: s_ashr_i32 s0, s0, 12 4453; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v0 4454; GCN-NEXT: v_mov_b32_e32 v0, s0 4455; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4456; GCN-NEXT: s_endpgm 4457 %r = sdiv <2 x i32> %x, <i32 4096, i32 4095> 4458 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 4459 ret void 4460} 4461 4462define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 4463; CHECK-LABEL: @sdiv_v2i32_pow2_shl_denom( 4464; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 4465; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 4466; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 4467; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 4468; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 4469; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 4470; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]] 4471; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]] 4472; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]] 4473; CHECK-NEXT: [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]] 4474; CHECK-NEXT: [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float 4475; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]]) 4476; CHECK-NEXT: [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000 4477; CHECK-NEXT: [[TMP13:%.*]] = fptoui float [[TMP12]] to i32 4478; CHECK-NEXT: [[TMP14:%.*]] = sub i32 0, [[TMP9]] 4479; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]] 4480; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP13]] to i64 4481; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 4482; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 4483; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 4484; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 4485; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 4486; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]] 4487; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP8]] to i64 4488; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[TMP22]] to i64 4489; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]] 4490; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32 4491; CHECK-NEXT: [[TMP27:%.*]] = lshr i64 [[TMP25]], 32 4492; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 4493; CHECK-NEXT: [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]] 4494; CHECK-NEXT: [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]] 4495; CHECK-NEXT: [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]] 4496; CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP28]], 1 4497; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]] 4498; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]] 4499; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]] 4500; CHECK-NEXT: [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]] 4501; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP33]], 1 4502; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]] 4503; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]] 4504; CHECK-NEXT: [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]] 4505; CHECK-NEXT: [[TMP41:%.*]] = insertelement <2 x i32> undef, i32 [[TMP40]], i64 0 4506; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x i32> [[X]], i64 1 4507; CHECK-NEXT: [[TMP43:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 4508; CHECK-NEXT: [[TMP44:%.*]] = ashr i32 [[TMP42]], 31 4509; CHECK-NEXT: [[TMP45:%.*]] = ashr i32 [[TMP43]], 31 4510; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]] 4511; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]] 4512; CHECK-NEXT: [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]] 4513; CHECK-NEXT: [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]] 4514; CHECK-NEXT: [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]] 4515; CHECK-NEXT: [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float 4516; CHECK-NEXT: [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]]) 4517; CHECK-NEXT: [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000 4518; CHECK-NEXT: [[TMP54:%.*]] = fptoui float [[TMP53]] to i32 4519; CHECK-NEXT: [[TMP55:%.*]] = sub i32 0, [[TMP50]] 4520; CHECK-NEXT: [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]] 4521; CHECK-NEXT: [[TMP57:%.*]] = zext i32 [[TMP54]] to i64 4522; CHECK-NEXT: [[TMP58:%.*]] = zext i32 [[TMP56]] to i64 4523; CHECK-NEXT: [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]] 4524; CHECK-NEXT: [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32 4525; CHECK-NEXT: [[TMP61:%.*]] = lshr i64 [[TMP59]], 32 4526; CHECK-NEXT: [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32 4527; CHECK-NEXT: [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]] 4528; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[TMP49]] to i64 4529; CHECK-NEXT: [[TMP65:%.*]] = zext i32 [[TMP63]] to i64 4530; CHECK-NEXT: [[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]] 4531; CHECK-NEXT: [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32 4532; CHECK-NEXT: [[TMP68:%.*]] = lshr i64 [[TMP66]], 32 4533; CHECK-NEXT: [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32 4534; CHECK-NEXT: [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]] 4535; CHECK-NEXT: [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]] 4536; CHECK-NEXT: [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]] 4537; CHECK-NEXT: [[TMP73:%.*]] = add i32 [[TMP69]], 1 4538; CHECK-NEXT: [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]] 4539; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]] 4540; CHECK-NEXT: [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]] 4541; CHECK-NEXT: [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]] 4542; CHECK-NEXT: [[TMP78:%.*]] = add i32 [[TMP74]], 1 4543; CHECK-NEXT: [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]] 4544; CHECK-NEXT: [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]] 4545; CHECK-NEXT: [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]] 4546; CHECK-NEXT: [[TMP82:%.*]] = insertelement <2 x i32> [[TMP41]], i32 [[TMP81]], i64 1 4547; CHECK-NEXT: store <2 x i32> [[TMP82]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 4548; CHECK-NEXT: ret void 4549; 4550; GCN-LABEL: sdiv_v2i32_pow2_shl_denom: 4551; GCN: ; %bb.0: 4552; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 4553; GCN-NEXT: s_movk_i32 s6, 0x1000 4554; GCN-NEXT: s_mov_b32 s12, 0x4f7ffffe 4555; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4556; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb 4557; GCN-NEXT: s_mov_b32 s7, 0xf000 4558; GCN-NEXT: s_waitcnt lgkmcnt(0) 4559; GCN-NEXT: s_lshl_b32 s2, s6, s2 4560; GCN-NEXT: s_ashr_i32 s10, s2, 31 4561; GCN-NEXT: s_add_i32 s2, s2, s10 4562; GCN-NEXT: s_xor_b32 s11, s2, s10 4563; GCN-NEXT: v_cvt_f32_u32_e32 v0, s11 4564; GCN-NEXT: s_sub_i32 s1, 0, s11 4565; GCN-NEXT: s_lshl_b32 s0, s6, s3 4566; GCN-NEXT: s_ashr_i32 s3, s0, 31 4567; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 4568; GCN-NEXT: s_add_i32 s0, s0, s3 4569; GCN-NEXT: s_xor_b32 s13, s0, s3 4570; GCN-NEXT: v_cvt_f32_u32_e32 v2, s13 4571; GCN-NEXT: v_mul_f32_e32 v0, s12, v0 4572; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 4573; GCN-NEXT: s_ashr_i32 s2, s8, 31 4574; GCN-NEXT: s_add_i32 s0, s8, s2 4575; GCN-NEXT: s_xor_b32 s0, s0, s2 4576; GCN-NEXT: v_mul_lo_u32 v1, s1, v0 4577; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 4578; GCN-NEXT: s_xor_b32 s2, s2, s10 4579; GCN-NEXT: s_mov_b32 s6, -1 4580; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 4581; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 4582; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 4583; GCN-NEXT: v_mul_f32_e32 v1, s12, v2 4584; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 4585; GCN-NEXT: v_mul_lo_u32 v2, v0, s11 4586; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v0 4587; GCN-NEXT: v_sub_i32_e32 v2, vcc, s0, v2 4588; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v2 4589; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] 4590; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s11, v2 4591; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 4592; GCN-NEXT: s_sub_i32 s0, 0, s13 4593; GCN-NEXT: v_mul_lo_u32 v4, s0, v1 4594; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v0 4595; GCN-NEXT: v_cmp_le_u32_e32 vcc, s11, v2 4596; GCN-NEXT: s_ashr_i32 s0, s9, 31 4597; GCN-NEXT: v_mul_hi_u32 v2, v1, v4 4598; GCN-NEXT: s_add_i32 s1, s9, s0 4599; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 4600; GCN-NEXT: s_xor_b32 s1, s1, s0 4601; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 4602; GCN-NEXT: v_mul_hi_u32 v1, s1, v1 4603; GCN-NEXT: v_xor_b32_e32 v0, s2, v0 4604; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 4605; GCN-NEXT: s_xor_b32 s2, s0, s3 4606; GCN-NEXT: v_mul_lo_u32 v2, v1, s13 4607; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v1 4608; GCN-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 4609; GCN-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v2 4610; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 4611; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s13, v2 4612; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 4613; GCN-NEXT: v_add_i32_e32 v3, vcc, 1, v1 4614; GCN-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 4615; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 4616; GCN-NEXT: v_xor_b32_e32 v1, s2, v1 4617; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s2, v1 4618; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4619; GCN-NEXT: s_endpgm 4620 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 4621 %r = sdiv <2 x i32> %x, %shl.y 4622 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 4623 ret void 4624} 4625 4626define amdgpu_kernel void @srem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 4627; CHECK-LABEL: @srem_i32_oddk_denom( 4628; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], 1235195 4629; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 4630; CHECK-NEXT: ret void 4631; 4632; GCN-LABEL: srem_i32_oddk_denom: 4633; GCN: ; %bb.0: 4634; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4635; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 4636; GCN-NEXT: v_mov_b32_e32 v0, 0xd9528441 4637; GCN-NEXT: s_mov_b32 s7, 0xf000 4638; GCN-NEXT: s_mov_b32 s6, -1 4639; GCN-NEXT: s_waitcnt lgkmcnt(0) 4640; GCN-NEXT: v_mul_hi_i32 v0, s0, v0 4641; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v0 4642; GCN-NEXT: v_lshrrev_b32_e32 v1, 31, v0 4643; GCN-NEXT: v_ashrrev_i32_e32 v0, 20, v0 4644; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 4645; GCN-NEXT: v_mul_i32_i24_e32 v0, 0x12d8fb, v0 4646; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 4647; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 4648; GCN-NEXT: s_endpgm 4649 %r = srem i32 %x, 1235195 4650 store i32 %r, i32 addrspace(1)* %out 4651 ret void 4652} 4653 4654define amdgpu_kernel void @srem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 4655; CHECK-LABEL: @srem_i32_pow2k_denom( 4656; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], 4096 4657; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 4658; CHECK-NEXT: ret void 4659; 4660; GCN-LABEL: srem_i32_pow2k_denom: 4661; GCN: ; %bb.0: 4662; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4663; GCN-NEXT: s_load_dword s0, s[0:1], 0xb 4664; GCN-NEXT: s_mov_b32 s7, 0xf000 4665; GCN-NEXT: s_mov_b32 s6, -1 4666; GCN-NEXT: s_waitcnt lgkmcnt(0) 4667; GCN-NEXT: s_ashr_i32 s1, s0, 31 4668; GCN-NEXT: s_lshr_b32 s1, s1, 20 4669; GCN-NEXT: s_add_i32 s1, s0, s1 4670; GCN-NEXT: s_and_b32 s1, s1, 0xfffff000 4671; GCN-NEXT: s_sub_i32 s0, s0, s1 4672; GCN-NEXT: v_mov_b32_e32 v0, s0 4673; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 4674; GCN-NEXT: s_endpgm 4675 %r = srem i32 %x, 4096 4676 store i32 %r, i32 addrspace(1)* %out 4677 ret void 4678} 4679 4680define amdgpu_kernel void @srem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 4681; CHECK-LABEL: @srem_i32_pow2_shl_denom( 4682; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 4683; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], [[SHL_Y]] 4684; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 4685; CHECK-NEXT: ret void 4686; 4687; GCN-LABEL: srem_i32_pow2_shl_denom: 4688; GCN: ; %bb.0: 4689; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 4690; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4691; GCN-NEXT: s_waitcnt lgkmcnt(0) 4692; GCN-NEXT: s_lshl_b32 s3, 0x1000, s3 4693; GCN-NEXT: s_ashr_i32 s4, s3, 31 4694; GCN-NEXT: s_add_i32 s3, s3, s4 4695; GCN-NEXT: s_xor_b32 s4, s3, s4 4696; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 4697; GCN-NEXT: s_sub_i32 s3, 0, s4 4698; GCN-NEXT: s_ashr_i32 s5, s2, 31 4699; GCN-NEXT: s_add_i32 s2, s2, s5 4700; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 4701; GCN-NEXT: s_xor_b32 s6, s2, s5 4702; GCN-NEXT: s_mov_b32 s2, -1 4703; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 4704; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 4705; GCN-NEXT: v_mul_lo_u32 v1, s3, v0 4706; GCN-NEXT: s_mov_b32 s3, 0xf000 4707; GCN-NEXT: v_mul_hi_u32 v1, v0, v1 4708; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0 4709; GCN-NEXT: v_mul_hi_u32 v0, s6, v0 4710; GCN-NEXT: v_mul_lo_u32 v0, v0, s4 4711; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 4712; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0 4713; GCN-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 4714; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 4715; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0 4716; GCN-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 4717; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 4718; GCN-NEXT: v_xor_b32_e32 v0, s5, v0 4719; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s5, v0 4720; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 4721; GCN-NEXT: s_endpgm 4722 %shl.y = shl i32 4096, %y 4723 %r = srem i32 %x, %shl.y 4724 store i32 %r, i32 addrspace(1)* %out 4725 ret void 4726} 4727 4728define amdgpu_kernel void @srem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 4729; CHECK-LABEL: @srem_v2i32_pow2k_denom( 4730; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 4731; CHECK-NEXT: [[TMP2:%.*]] = srem i32 [[TMP1]], 4096 4732; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 4733; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 4734; CHECK-NEXT: [[TMP5:%.*]] = srem i32 [[TMP4]], 4096 4735; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 4736; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 4737; CHECK-NEXT: ret void 4738; 4739; GCN-LABEL: srem_v2i32_pow2k_denom: 4740; GCN: ; %bb.0: 4741; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4742; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4743; GCN-NEXT: s_movk_i32 s2, 0xf000 4744; GCN-NEXT: s_mov_b32 s7, 0xf000 4745; GCN-NEXT: s_mov_b32 s6, -1 4746; GCN-NEXT: s_waitcnt lgkmcnt(0) 4747; GCN-NEXT: s_ashr_i32 s3, s0, 31 4748; GCN-NEXT: s_lshr_b32 s3, s3, 20 4749; GCN-NEXT: s_add_i32 s3, s0, s3 4750; GCN-NEXT: s_and_b32 s3, s3, s2 4751; GCN-NEXT: s_sub_i32 s0, s0, s3 4752; GCN-NEXT: s_ashr_i32 s3, s1, 31 4753; GCN-NEXT: s_lshr_b32 s3, s3, 20 4754; GCN-NEXT: s_add_i32 s3, s1, s3 4755; GCN-NEXT: s_and_b32 s2, s3, s2 4756; GCN-NEXT: s_sub_i32 s1, s1, s2 4757; GCN-NEXT: v_mov_b32_e32 v0, s0 4758; GCN-NEXT: v_mov_b32_e32 v1, s1 4759; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4760; GCN-NEXT: s_endpgm 4761 %r = srem <2 x i32> %x, <i32 4096, i32 4096> 4762 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 4763 ret void 4764} 4765 4766define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 4767; CHECK-LABEL: @srem_v2i32_pow2_shl_denom( 4768; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 4769; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 4770; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 4771; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 4772; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 4773; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]] 4774; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]] 4775; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]] 4776; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]] 4777; CHECK-NEXT: [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float 4778; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 4779; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000 4780; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP11]] to i32 4781; CHECK-NEXT: [[TMP13:%.*]] = sub i32 0, [[TMP8]] 4782; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]] 4783; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP12]] to i64 4784; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 4785; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]] 4786; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 4787; CHECK-NEXT: [[TMP19:%.*]] = lshr i64 [[TMP17]], 32 4788; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 4789; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]] 4790; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP7]] to i64 4791; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP21]] to i64 4792; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]] 4793; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32 4794; CHECK-NEXT: [[TMP26:%.*]] = lshr i64 [[TMP24]], 32 4795; CHECK-NEXT: [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32 4796; CHECK-NEXT: [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]] 4797; CHECK-NEXT: [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]] 4798; CHECK-NEXT: [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]] 4799; CHECK-NEXT: [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]] 4800; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP30]], i32 [[TMP31]], i32 [[TMP29]] 4801; CHECK-NEXT: [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]] 4802; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]] 4803; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]] 4804; CHECK-NEXT: [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]] 4805; CHECK-NEXT: [[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]] 4806; CHECK-NEXT: [[TMP38:%.*]] = insertelement <2 x i32> undef, i32 [[TMP37]], i64 0 4807; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i32> [[X]], i64 1 4808; CHECK-NEXT: [[TMP40:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 4809; CHECK-NEXT: [[TMP41:%.*]] = ashr i32 [[TMP39]], 31 4810; CHECK-NEXT: [[TMP42:%.*]] = ashr i32 [[TMP40]], 31 4811; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]] 4812; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]] 4813; CHECK-NEXT: [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]] 4814; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]] 4815; CHECK-NEXT: [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float 4816; CHECK-NEXT: [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]]) 4817; CHECK-NEXT: [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000 4818; CHECK-NEXT: [[TMP50:%.*]] = fptoui float [[TMP49]] to i32 4819; CHECK-NEXT: [[TMP51:%.*]] = sub i32 0, [[TMP46]] 4820; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]] 4821; CHECK-NEXT: [[TMP53:%.*]] = zext i32 [[TMP50]] to i64 4822; CHECK-NEXT: [[TMP54:%.*]] = zext i32 [[TMP52]] to i64 4823; CHECK-NEXT: [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]] 4824; CHECK-NEXT: [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32 4825; CHECK-NEXT: [[TMP57:%.*]] = lshr i64 [[TMP55]], 32 4826; CHECK-NEXT: [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32 4827; CHECK-NEXT: [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]] 4828; CHECK-NEXT: [[TMP60:%.*]] = zext i32 [[TMP45]] to i64 4829; CHECK-NEXT: [[TMP61:%.*]] = zext i32 [[TMP59]] to i64 4830; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]] 4831; CHECK-NEXT: [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32 4832; CHECK-NEXT: [[TMP64:%.*]] = lshr i64 [[TMP62]], 32 4833; CHECK-NEXT: [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32 4834; CHECK-NEXT: [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]] 4835; CHECK-NEXT: [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]] 4836; CHECK-NEXT: [[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]] 4837; CHECK-NEXT: [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]] 4838; CHECK-NEXT: [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]] 4839; CHECK-NEXT: [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]] 4840; CHECK-NEXT: [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]] 4841; CHECK-NEXT: [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]] 4842; CHECK-NEXT: [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]] 4843; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]] 4844; CHECK-NEXT: [[TMP76:%.*]] = insertelement <2 x i32> [[TMP38]], i32 [[TMP75]], i64 1 4845; CHECK-NEXT: store <2 x i32> [[TMP76]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 4846; CHECK-NEXT: ret void 4847; 4848; GCN-LABEL: srem_v2i32_pow2_shl_denom: 4849; GCN: ; %bb.0: 4850; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 4851; GCN-NEXT: s_movk_i32 s6, 0x1000 4852; GCN-NEXT: s_mov_b32 s7, 0x4f7ffffe 4853; GCN-NEXT: s_waitcnt lgkmcnt(0) 4854; GCN-NEXT: s_lshl_b32 s2, s6, s2 4855; GCN-NEXT: s_ashr_i32 s4, s2, 31 4856; GCN-NEXT: s_add_i32 s2, s2, s4 4857; GCN-NEXT: s_xor_b32 s2, s2, s4 4858; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 4859; GCN-NEXT: s_lshl_b32 s3, s6, s3 4860; GCN-NEXT: s_ashr_i32 s6, s3, 31 4861; GCN-NEXT: s_add_i32 s3, s3, s6 4862; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 4863; GCN-NEXT: s_xor_b32 s3, s3, s6 4864; GCN-NEXT: s_sub_i32 s6, 0, s2 4865; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4866; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4867; GCN-NEXT: v_mul_f32_e32 v0, s7, v0 4868; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 4869; GCN-NEXT: v_cvt_f32_u32_e32 v1, s3 4870; GCN-NEXT: s_waitcnt lgkmcnt(0) 4871; GCN-NEXT: s_ashr_i32 s8, s0, 31 4872; GCN-NEXT: v_mul_lo_u32 v2, s6, v0 4873; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 4874; GCN-NEXT: s_add_i32 s0, s0, s8 4875; GCN-NEXT: s_xor_b32 s0, s0, s8 4876; GCN-NEXT: v_mul_hi_u32 v2, v0, v2 4877; GCN-NEXT: v_mul_f32_e32 v1, s7, v1 4878; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 4879; GCN-NEXT: s_sub_i32 s6, 0, s3 4880; GCN-NEXT: v_add_i32_e32 v0, vcc, v2, v0 4881; GCN-NEXT: v_mul_hi_u32 v0, s0, v0 4882; GCN-NEXT: v_mul_lo_u32 v2, s6, v1 4883; GCN-NEXT: s_ashr_i32 s9, s1, 31 4884; GCN-NEXT: s_add_i32 s1, s1, s9 4885; GCN-NEXT: v_mul_lo_u32 v0, v0, s2 4886; GCN-NEXT: v_mul_hi_u32 v2, v1, v2 4887; GCN-NEXT: s_mov_b32 s7, 0xf000 4888; GCN-NEXT: s_mov_b32 s6, -1 4889; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 4890; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s2, v0 4891; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 4892; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 4893; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s2, v0 4894; GCN-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 4895; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 4896; GCN-NEXT: s_xor_b32 s0, s1, s9 4897; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 4898; GCN-NEXT: v_mul_hi_u32 v1, s0, v1 4899; GCN-NEXT: v_xor_b32_e32 v0, s8, v0 4900; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 4901; GCN-NEXT: v_mul_lo_u32 v1, v1, s3 4902; GCN-NEXT: v_sub_i32_e32 v1, vcc, s0, v1 4903; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 4904; GCN-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 4905; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4906; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 4907; GCN-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 4908; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 4909; GCN-NEXT: v_xor_b32_e32 v1, s9, v1 4910; GCN-NEXT: v_subrev_i32_e32 v1, vcc, s9, v1 4911; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4912; GCN-NEXT: s_endpgm 4913 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 4914 %r = srem <2 x i32> %x, %shl.y 4915 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 4916 ret void 4917} 4918 4919define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 4920; CHECK-LABEL: @udiv_i64_oddk_denom( 4921; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], 1235195949943 4922; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 4923; CHECK-NEXT: ret void 4924; 4925; GCN-LABEL: udiv_i64_oddk_denom: 4926; GCN: ; %bb.0: 4927; GCN-NEXT: v_mov_b32_e32 v0, 0x4f176a73 4928; GCN-NEXT: v_mov_b32_e32 v1, 0x4f800000 4929; GCN-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 4930; GCN-NEXT: v_rcp_f32_e32 v0, v0 4931; GCN-NEXT: s_movk_i32 s2, 0xfee0 4932; GCN-NEXT: s_mov_b32 s3, 0x68958c89 4933; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 4934; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 4935; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 4936; GCN-NEXT: v_trunc_f32_e32 v1, v1 4937; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 4938; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 4939; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 4940; GCN-NEXT: s_waitcnt lgkmcnt(0) 4941; GCN-NEXT: s_mov_b32 s4, s8 4942; GCN-NEXT: s_movk_i32 s8, 0x11f 4943; GCN-NEXT: v_mul_lo_u32 v2, v0, s2 4944; GCN-NEXT: v_mul_hi_u32 v3, v0, s3 4945; GCN-NEXT: v_mul_lo_u32 v4, v1, s3 4946; GCN-NEXT: v_mul_lo_u32 v5, v0, s3 4947; GCN-NEXT: s_mov_b32 s5, s9 4948; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 4949; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 4950; GCN-NEXT: v_mul_lo_u32 v3, v0, v2 4951; GCN-NEXT: v_mul_hi_u32 v4, v0, v5 4952; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 4953; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 4954; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 4955; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 4956; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc 4957; GCN-NEXT: v_mul_lo_u32 v6, v1, v5 4958; GCN-NEXT: v_mul_hi_u32 v5, v1, v5 4959; GCN-NEXT: s_movk_i32 s9, 0x11e 4960; GCN-NEXT: s_mov_b32 s7, 0xf000 4961; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 4962; GCN-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc 4963; GCN-NEXT: v_mov_b32_e32 v4, 0 4964; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc 4965; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 4966; GCN-NEXT: v_mov_b32_e32 v6, 0 4967; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 4968; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc 4969; GCN-NEXT: v_mul_lo_u32 v5, v0, s2 4970; GCN-NEXT: v_mul_hi_u32 v7, v0, s3 4971; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 4972; GCN-NEXT: v_mul_lo_u32 v8, v2, s3 4973; GCN-NEXT: s_mov_b32 s2, 0x976a7377 4974; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 4975; GCN-NEXT: v_mul_lo_u32 v7, v0, s3 4976; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v8 4977; GCN-NEXT: v_mul_lo_u32 v8, v0, v5 4978; GCN-NEXT: v_mul_hi_u32 v10, v0, v5 4979; GCN-NEXT: v_mul_hi_u32 v9, v0, v7 4980; GCN-NEXT: v_mul_hi_u32 v11, v2, v5 4981; GCN-NEXT: s_mov_b32 s6, -1 4982; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 4983; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc 4984; GCN-NEXT: v_mul_lo_u32 v10, v2, v7 4985; GCN-NEXT: v_mul_hi_u32 v7, v2, v7 4986; GCN-NEXT: v_mul_lo_u32 v2, v2, v5 4987; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v10 4988; GCN-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc 4989; GCN-NEXT: v_addc_u32_e32 v5, vcc, v11, v4, vcc 4990; GCN-NEXT: v_add_i32_e32 v2, vcc, v7, v2 4991; GCN-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc 4992; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 4993; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[0:1] 4994; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 4995; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 4996; GCN-NEXT: v_mul_lo_u32 v2, s10, v1 4997; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 4998; GCN-NEXT: v_mul_hi_u32 v5, s10, v1 4999; GCN-NEXT: v_mul_hi_u32 v7, s11, v1 5000; GCN-NEXT: v_mul_lo_u32 v1, s11, v1 5001; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5002; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 5003; GCN-NEXT: v_mul_lo_u32 v5, s11, v0 5004; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 5005; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 5006; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 5007; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc 5008; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 5009; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc 5010; GCN-NEXT: v_mul_lo_u32 v2, v0, s8 5011; GCN-NEXT: v_mul_hi_u32 v3, v0, s2 5012; GCN-NEXT: v_mul_lo_u32 v4, v1, s2 5013; GCN-NEXT: v_mov_b32_e32 v5, s8 5014; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5015; GCN-NEXT: v_mul_lo_u32 v3, v0, s2 5016; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 5017; GCN-NEXT: v_sub_i32_e32 v4, vcc, s11, v2 5018; GCN-NEXT: v_sub_i32_e64 v3, s[0:1], s10, v3 5019; GCN-NEXT: v_subb_u32_e64 v4, vcc, v4, v5, s[0:1] 5020; GCN-NEXT: v_subrev_i32_e32 v5, vcc, s2, v3 5021; GCN-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc 5022; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s9, v4 5023; GCN-NEXT: s_mov_b32 s10, 0x976a7376 5024; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 5025; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s10, v5 5026; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 5027; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s8, v4 5028; GCN-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc 5029; GCN-NEXT: v_add_i32_e32 v5, vcc, 2, v0 5030; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc 5031; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v0 5032; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc 5033; GCN-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, v4 5034; GCN-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[2:3] 5035; GCN-NEXT: v_mov_b32_e32 v6, s11 5036; GCN-NEXT: v_subb_u32_e64 v2, vcc, v6, v2, s[0:1] 5037; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s9, v2 5038; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 5039; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s10, v3 5040; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 5041; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s8, v2 5042; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc 5043; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 5044; GCN-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[2:3] 5045; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 5046; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5047; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 5048; GCN-NEXT: s_endpgm 5049 %r = udiv i64 %x, 1235195949943 5050 store i64 %r, i64 addrspace(1)* %out 5051 ret void 5052} 5053 5054define amdgpu_kernel void @udiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 5055; CHECK-LABEL: @udiv_i64_pow2k_denom( 5056; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], 4096 5057; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 5058; CHECK-NEXT: ret void 5059; 5060; GCN-LABEL: udiv_i64_pow2k_denom: 5061; GCN: ; %bb.0: 5062; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 5063; GCN-NEXT: s_mov_b32 s7, 0xf000 5064; GCN-NEXT: s_mov_b32 s6, -1 5065; GCN-NEXT: s_waitcnt lgkmcnt(0) 5066; GCN-NEXT: s_mov_b32 s4, s0 5067; GCN-NEXT: s_mov_b32 s5, s1 5068; GCN-NEXT: s_lshr_b64 s[0:1], s[2:3], 12 5069; GCN-NEXT: v_mov_b32_e32 v0, s0 5070; GCN-NEXT: v_mov_b32_e32 v1, s1 5071; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 5072; GCN-NEXT: s_endpgm 5073 %r = udiv i64 %x, 4096 5074 store i64 %r, i64 addrspace(1)* %out 5075 ret void 5076} 5077 5078define amdgpu_kernel void @udiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 5079; CHECK-LABEL: @udiv_i64_pow2_shl_denom( 5080; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 5081; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], [[SHL_Y]] 5082; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 5083; CHECK-NEXT: ret void 5084; 5085; GCN-LABEL: udiv_i64_pow2_shl_denom: 5086; GCN: ; %bb.0: 5087; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 5088; GCN-NEXT: s_load_dword s8, s[0:1], 0xd 5089; GCN-NEXT: s_mov_b32 s3, 0xf000 5090; GCN-NEXT: s_mov_b32 s2, -1 5091; GCN-NEXT: s_waitcnt lgkmcnt(0) 5092; GCN-NEXT: s_mov_b32 s0, s4 5093; GCN-NEXT: s_add_i32 s8, s8, 12 5094; GCN-NEXT: s_mov_b32 s1, s5 5095; GCN-NEXT: s_lshr_b64 s[4:5], s[6:7], s8 5096; GCN-NEXT: v_mov_b32_e32 v0, s4 5097; GCN-NEXT: v_mov_b32_e32 v1, s5 5098; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5099; GCN-NEXT: s_endpgm 5100 %shl.y = shl i64 4096, %y 5101 %r = udiv i64 %x, %shl.y 5102 store i64 %r, i64 addrspace(1)* %out 5103 ret void 5104} 5105 5106define amdgpu_kernel void @udiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 5107; CHECK-LABEL: @udiv_v2i64_pow2k_denom( 5108; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 5109; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096 5110; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 5111; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 5112; CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[TMP4]], 4096 5113; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 5114; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 5115; CHECK-NEXT: ret void 5116; 5117; GCN-LABEL: udiv_v2i64_pow2k_denom: 5118; GCN: ; %bb.0: 5119; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5120; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 5121; GCN-NEXT: s_mov_b32 s7, 0xf000 5122; GCN-NEXT: s_mov_b32 s6, -1 5123; GCN-NEXT: s_waitcnt lgkmcnt(0) 5124; GCN-NEXT: s_lshr_b64 s[0:1], s[0:1], 12 5125; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], 12 5126; GCN-NEXT: v_mov_b32_e32 v0, s0 5127; GCN-NEXT: v_mov_b32_e32 v1, s1 5128; GCN-NEXT: v_mov_b32_e32 v2, s2 5129; GCN-NEXT: v_mov_b32_e32 v3, s3 5130; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 5131; GCN-NEXT: s_endpgm 5132 %r = udiv <2 x i64> %x, <i64 4096, i64 4096> 5133 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 5134 ret void 5135} 5136 5137define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 5138; CHECK-LABEL: @udiv_v2i64_mixed_pow2k_denom( 5139; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 5140; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096 5141; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 5142; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 5143; CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[TMP4]], 4095 5144; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 5145; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 5146; CHECK-NEXT: ret void 5147; 5148; GCN-LABEL: udiv_v2i64_mixed_pow2k_denom: 5149; GCN: ; %bb.0: 5150; GCN-NEXT: v_mov_b32_e32 v0, 0x4f800000 5151; GCN-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 5152; GCN-NEXT: v_rcp_f32_e32 v0, v0 5153; GCN-NEXT: s_movk_i32 s6, 0xf001 5154; GCN-NEXT: v_mov_b32_e32 v7, 0 5155; GCN-NEXT: v_mov_b32_e32 v2, 0 5156; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 5157; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 5158; GCN-NEXT: v_trunc_f32_e32 v1, v1 5159; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 5160; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 5161; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 5162; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5163; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 5164; GCN-NEXT: s_movk_i32 s0, 0xfff 5165; GCN-NEXT: v_mul_hi_u32 v3, v0, s6 5166; GCN-NEXT: v_mul_lo_u32 v5, v1, s6 5167; GCN-NEXT: v_mul_lo_u32 v4, v0, s6 5168; GCN-NEXT: s_mov_b32 s7, 0xf000 5169; GCN-NEXT: v_subrev_i32_e32 v3, vcc, v0, v3 5170; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 5171; GCN-NEXT: v_mul_hi_u32 v6, v0, v4 5172; GCN-NEXT: v_mul_lo_u32 v5, v0, v3 5173; GCN-NEXT: v_mul_hi_u32 v8, v0, v3 5174; GCN-NEXT: v_mul_hi_u32 v9, v1, v3 5175; GCN-NEXT: v_mul_lo_u32 v3, v1, v3 5176; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 5177; GCN-NEXT: v_addc_u32_e32 v6, vcc, v7, v8, vcc 5178; GCN-NEXT: v_mul_lo_u32 v8, v1, v4 5179; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 5180; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5 5181; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v4, vcc 5182; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v2, vcc 5183; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 5184; GCN-NEXT: v_add_i32_e64 v0, s[2:3], v0, v3 5185; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v5, vcc 5186; GCN-NEXT: v_mul_hi_u32 v5, v0, s6 5187; GCN-NEXT: v_addc_u32_e64 v3, vcc, v1, v4, s[2:3] 5188; GCN-NEXT: v_mul_lo_u32 v6, v3, s6 5189; GCN-NEXT: v_mul_lo_u32 v8, v0, s6 5190; GCN-NEXT: v_subrev_i32_e32 v5, vcc, v0, v5 5191; GCN-NEXT: s_mov_b32 s6, -1 5192; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v6 5193; GCN-NEXT: v_mul_lo_u32 v6, v0, v5 5194; GCN-NEXT: v_mul_hi_u32 v9, v0, v8 5195; GCN-NEXT: v_mul_hi_u32 v10, v0, v5 5196; GCN-NEXT: v_mul_hi_u32 v11, v3, v5 5197; GCN-NEXT: v_add_i32_e32 v6, vcc, v9, v6 5198; GCN-NEXT: v_addc_u32_e32 v9, vcc, v7, v10, vcc 5199; GCN-NEXT: v_mul_lo_u32 v10, v3, v8 5200; GCN-NEXT: v_mul_hi_u32 v8, v3, v8 5201; GCN-NEXT: v_mul_lo_u32 v3, v3, v5 5202; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v10 5203; GCN-NEXT: v_addc_u32_e32 v6, vcc, v9, v8, vcc 5204; GCN-NEXT: v_addc_u32_e32 v5, vcc, v11, v2, vcc 5205; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 5206; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc 5207; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v4 5208; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[2:3] 5209; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 5210; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 5211; GCN-NEXT: s_waitcnt lgkmcnt(0) 5212; GCN-NEXT: v_mul_lo_u32 v3, s10, v1 5213; GCN-NEXT: v_mul_hi_u32 v4, s10, v0 5214; GCN-NEXT: v_mul_hi_u32 v5, s10, v1 5215; GCN-NEXT: v_mul_hi_u32 v6, s11, v1 5216; GCN-NEXT: v_mul_lo_u32 v1, s11, v1 5217; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 5218; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v5, vcc 5219; GCN-NEXT: v_mul_lo_u32 v5, s11, v0 5220; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 5221; GCN-NEXT: s_lshr_b64 s[2:3], s[8:9], 12 5222; GCN-NEXT: s_movk_i32 s8, 0xffe 5223; GCN-NEXT: v_add_i32_e32 v3, vcc, v5, v3 5224; GCN-NEXT: v_addc_u32_e32 v0, vcc, v4, v0, vcc 5225; GCN-NEXT: v_addc_u32_e32 v2, vcc, v6, v2, vcc 5226; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 5227; GCN-NEXT: v_addc_u32_e32 v1, vcc, v7, v2, vcc 5228; GCN-NEXT: v_mul_lo_u32 v2, v1, s0 5229; GCN-NEXT: v_mul_hi_u32 v3, v0, s0 5230; GCN-NEXT: v_mul_lo_u32 v4, v0, s0 5231; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5232; GCN-NEXT: v_mov_b32_e32 v3, s11 5233; GCN-NEXT: v_sub_i32_e32 v4, vcc, s10, v4 5234; GCN-NEXT: v_subb_u32_e32 v2, vcc, v3, v2, vcc 5235; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s0, v4 5236; GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v2, vcc 5237; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s8, v3 5238; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 5239; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 5240; GCN-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc 5241; GCN-NEXT: v_add_i32_e32 v5, vcc, 2, v0 5242; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc 5243; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v0 5244; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc 5245; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s8, v4 5246; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc 5247; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 5248; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3 5249; GCN-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc 5250; GCN-NEXT: v_cndmask_b32_e64 v3, v8, v6, s[0:1] 5251; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 5252; GCN-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc 5253; GCN-NEXT: v_cndmask_b32_e64 v1, v7, v5, s[0:1] 5254; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc 5255; GCN-NEXT: v_mov_b32_e32 v0, s2 5256; GCN-NEXT: v_mov_b32_e32 v1, s3 5257; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 5258; GCN-NEXT: s_endpgm 5259 %r = udiv <2 x i64> %x, <i64 4096, i64 4095> 5260 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 5261 ret void 5262} 5263 5264define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 5265; CHECK-LABEL: @udiv_v2i64_pow2_shl_denom( 5266; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 5267; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 5268; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 5269; CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[TMP1]], [[TMP2]] 5270; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 5271; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 5272; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 5273; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP5]], [[TMP6]] 5274; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 5275; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 5276; CHECK-NEXT: ret void 5277; 5278; GCN-LABEL: udiv_v2i64_pow2_shl_denom: 5279; GCN: ; %bb.0: 5280; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5281; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 5282; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 5283; GCN-NEXT: s_mov_b32 s7, 0xf000 5284; GCN-NEXT: s_mov_b32 s6, -1 5285; GCN-NEXT: s_waitcnt lgkmcnt(0) 5286; GCN-NEXT: s_add_i32 s0, s0, 12 5287; GCN-NEXT: s_add_i32 s2, s2, 12 5288; GCN-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 5289; GCN-NEXT: s_lshr_b64 s[2:3], s[10:11], s2 5290; GCN-NEXT: v_mov_b32_e32 v0, s0 5291; GCN-NEXT: v_mov_b32_e32 v1, s1 5292; GCN-NEXT: v_mov_b32_e32 v2, s2 5293; GCN-NEXT: v_mov_b32_e32 v3, s3 5294; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 5295; GCN-NEXT: s_endpgm 5296 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 5297 %r = udiv <2 x i64> %x, %shl.y 5298 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 5299 ret void 5300} 5301 5302define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 5303; CHECK-LABEL: @urem_i64_oddk_denom( 5304; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], 1235195393993 5305; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 5306; CHECK-NEXT: ret void 5307; 5308; GCN-LABEL: urem_i64_oddk_denom: 5309; GCN: ; %bb.0: 5310; GCN-NEXT: v_mov_b32_e32 v0, 0x4f1761f8 5311; GCN-NEXT: v_mov_b32_e32 v1, 0x4f800000 5312; GCN-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 5313; GCN-NEXT: v_rcp_f32_e32 v0, v0 5314; GCN-NEXT: s_movk_i32 s2, 0xfee0 5315; GCN-NEXT: s_mov_b32 s3, 0x689e0837 5316; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 5317; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 5318; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 5319; GCN-NEXT: v_trunc_f32_e32 v1, v1 5320; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 5321; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 5322; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 5323; GCN-NEXT: s_waitcnt lgkmcnt(0) 5324; GCN-NEXT: s_mov_b32 s4, s8 5325; GCN-NEXT: s_movk_i32 s8, 0x11f 5326; GCN-NEXT: v_mul_lo_u32 v2, v0, s2 5327; GCN-NEXT: v_mul_hi_u32 v3, v0, s3 5328; GCN-NEXT: v_mul_lo_u32 v4, v1, s3 5329; GCN-NEXT: v_mul_lo_u32 v5, v0, s3 5330; GCN-NEXT: s_mov_b32 s12, 0x9761f7c9 5331; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5332; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 5333; GCN-NEXT: v_mul_lo_u32 v3, v0, v2 5334; GCN-NEXT: v_mul_hi_u32 v4, v0, v5 5335; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 5336; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 5337; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 5338; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 5339; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc 5340; GCN-NEXT: v_mul_lo_u32 v6, v1, v5 5341; GCN-NEXT: v_mul_hi_u32 v5, v1, v5 5342; GCN-NEXT: s_mov_b32 s5, s9 5343; GCN-NEXT: s_movk_i32 s9, 0x11e 5344; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 5345; GCN-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc 5346; GCN-NEXT: v_mov_b32_e32 v4, 0 5347; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc 5348; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5349; GCN-NEXT: v_mov_b32_e32 v6, 0 5350; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 5351; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc 5352; GCN-NEXT: v_mul_lo_u32 v5, v0, s2 5353; GCN-NEXT: v_mul_hi_u32 v7, v0, s3 5354; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 5355; GCN-NEXT: v_mul_lo_u32 v8, v2, s3 5356; GCN-NEXT: s_mov_b32 s7, 0xf000 5357; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 5358; GCN-NEXT: v_mul_lo_u32 v7, v0, s3 5359; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v8 5360; GCN-NEXT: v_mul_lo_u32 v8, v0, v5 5361; GCN-NEXT: v_mul_hi_u32 v10, v0, v5 5362; GCN-NEXT: v_mul_hi_u32 v9, v0, v7 5363; GCN-NEXT: v_mul_hi_u32 v11, v2, v5 5364; GCN-NEXT: s_mov_b32 s6, -1 5365; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 5366; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc 5367; GCN-NEXT: v_mul_lo_u32 v10, v2, v7 5368; GCN-NEXT: v_mul_hi_u32 v7, v2, v7 5369; GCN-NEXT: v_mul_lo_u32 v2, v2, v5 5370; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v10 5371; GCN-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc 5372; GCN-NEXT: v_addc_u32_e32 v5, vcc, v11, v4, vcc 5373; GCN-NEXT: v_add_i32_e32 v2, vcc, v7, v2 5374; GCN-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc 5375; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 5376; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[0:1] 5377; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 5378; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 5379; GCN-NEXT: v_mul_lo_u32 v2, s10, v1 5380; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 5381; GCN-NEXT: v_mul_hi_u32 v5, s10, v1 5382; GCN-NEXT: v_mul_hi_u32 v7, s11, v1 5383; GCN-NEXT: v_mul_lo_u32 v1, s11, v1 5384; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5385; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 5386; GCN-NEXT: v_mul_lo_u32 v5, s11, v0 5387; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 5388; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 5389; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 5390; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc 5391; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 5392; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc 5393; GCN-NEXT: v_mul_lo_u32 v2, v0, s8 5394; GCN-NEXT: v_mul_hi_u32 v3, v0, s12 5395; GCN-NEXT: v_mul_lo_u32 v1, v1, s12 5396; GCN-NEXT: v_mul_lo_u32 v0, v0, s12 5397; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5398; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 5399; GCN-NEXT: v_sub_i32_e64 v0, s[0:1], s10, v0 5400; GCN-NEXT: v_sub_i32_e32 v2, vcc, s11, v1 5401; GCN-NEXT: v_mov_b32_e32 v3, s8 5402; GCN-NEXT: v_subb_u32_e64 v2, vcc, v2, v3, s[0:1] 5403; GCN-NEXT: v_subrev_i32_e64 v4, s[2:3], s12, v0 5404; GCN-NEXT: v_subbrev_u32_e64 v5, vcc, 0, v2, s[2:3] 5405; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s9, v5 5406; GCN-NEXT: s_mov_b32 s10, 0x9761f7c8 5407; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 5408; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s10, v4 5409; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 5410; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s8, v5 5411; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc 5412; GCN-NEXT: v_subb_u32_e64 v2, vcc, v2, v3, s[2:3] 5413; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s12, v4 5414; GCN-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc 5415; GCN-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, v6 5416; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[2:3] 5417; GCN-NEXT: v_mov_b32_e32 v5, s11 5418; GCN-NEXT: v_subb_u32_e64 v1, vcc, v5, v1, s[0:1] 5419; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s9, v1 5420; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 5421; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s10, v0 5422; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 5423; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s8, v1 5424; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 5425; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 5426; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 5427; GCN-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[2:3] 5428; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5429; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 5430; GCN-NEXT: s_endpgm 5431 %r = urem i64 %x, 1235195393993 5432 store i64 %r, i64 addrspace(1)* %out 5433 ret void 5434} 5435 5436define amdgpu_kernel void @urem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 5437; CHECK-LABEL: @urem_i64_pow2k_denom( 5438; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], 4096 5439; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 5440; CHECK-NEXT: ret void 5441; 5442; GCN-LABEL: urem_i64_pow2k_denom: 5443; GCN: ; %bb.0: 5444; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 5445; GCN-NEXT: s_mov_b32 s3, 0xf000 5446; GCN-NEXT: s_mov_b32 s2, -1 5447; GCN-NEXT: v_mov_b32_e32 v1, 0 5448; GCN-NEXT: s_waitcnt lgkmcnt(0) 5449; GCN-NEXT: s_mov_b32 s0, s4 5450; GCN-NEXT: s_and_b32 s4, s6, 0xfff 5451; GCN-NEXT: s_mov_b32 s1, s5 5452; GCN-NEXT: v_mov_b32_e32 v0, s4 5453; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5454; GCN-NEXT: s_endpgm 5455 %r = urem i64 %x, 4096 5456 store i64 %r, i64 addrspace(1)* %out 5457 ret void 5458} 5459 5460define amdgpu_kernel void @urem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 5461; CHECK-LABEL: @urem_i64_pow2_shl_denom( 5462; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 5463; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], [[SHL_Y]] 5464; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 5465; CHECK-NEXT: ret void 5466; 5467; GCN-LABEL: urem_i64_pow2_shl_denom: 5468; GCN: ; %bb.0: 5469; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 5470; GCN-NEXT: s_load_dword s8, s[0:1], 0xd 5471; GCN-NEXT: s_mov_b32 s3, 0xf000 5472; GCN-NEXT: s_mov_b32 s2, -1 5473; GCN-NEXT: s_waitcnt lgkmcnt(0) 5474; GCN-NEXT: s_mov_b32 s0, s4 5475; GCN-NEXT: s_mov_b32 s1, s5 5476; GCN-NEXT: s_mov_b32 s5, 0 5477; GCN-NEXT: s_movk_i32 s4, 0x1000 5478; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], s8 5479; GCN-NEXT: s_add_u32 s4, s4, -1 5480; GCN-NEXT: s_addc_u32 s5, s5, -1 5481; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] 5482; GCN-NEXT: v_mov_b32_e32 v0, s4 5483; GCN-NEXT: v_mov_b32_e32 v1, s5 5484; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5485; GCN-NEXT: s_endpgm 5486 %shl.y = shl i64 4096, %y 5487 %r = urem i64 %x, %shl.y 5488 store i64 %r, i64 addrspace(1)* %out 5489 ret void 5490} 5491 5492define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 5493; CHECK-LABEL: @urem_v2i64_pow2k_denom( 5494; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 5495; CHECK-NEXT: [[TMP2:%.*]] = urem i64 [[TMP1]], 4096 5496; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 5497; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 5498; CHECK-NEXT: [[TMP5:%.*]] = urem i64 [[TMP4]], 4096 5499; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 5500; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 5501; CHECK-NEXT: ret void 5502; 5503; GCN-LABEL: urem_v2i64_pow2k_denom: 5504; GCN: ; %bb.0: 5505; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5506; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 5507; GCN-NEXT: s_movk_i32 s8, 0xfff 5508; GCN-NEXT: v_mov_b32_e32 v1, 0 5509; GCN-NEXT: s_mov_b32 s7, 0xf000 5510; GCN-NEXT: s_mov_b32 s6, -1 5511; GCN-NEXT: s_waitcnt lgkmcnt(0) 5512; GCN-NEXT: s_and_b32 s0, s0, s8 5513; GCN-NEXT: s_and_b32 s1, s2, s8 5514; GCN-NEXT: v_mov_b32_e32 v0, s0 5515; GCN-NEXT: v_mov_b32_e32 v2, s1 5516; GCN-NEXT: v_mov_b32_e32 v3, v1 5517; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 5518; GCN-NEXT: s_endpgm 5519 %r = urem <2 x i64> %x, <i64 4096, i64 4096> 5520 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 5521 ret void 5522} 5523 5524define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 5525; CHECK-LABEL: @urem_v2i64_pow2_shl_denom( 5526; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 5527; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 5528; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 5529; CHECK-NEXT: [[TMP3:%.*]] = urem i64 [[TMP1]], [[TMP2]] 5530; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 5531; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 5532; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 5533; CHECK-NEXT: [[TMP7:%.*]] = urem i64 [[TMP5]], [[TMP6]] 5534; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 5535; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 5536; CHECK-NEXT: ret void 5537; 5538; GCN-LABEL: urem_v2i64_pow2_shl_denom: 5539; GCN: ; %bb.0: 5540; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5541; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 5542; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 5543; GCN-NEXT: s_mov_b32 s13, 0 5544; GCN-NEXT: s_movk_i32 s12, 0x1000 5545; GCN-NEXT: s_mov_b32 s7, 0xf000 5546; GCN-NEXT: s_mov_b32 s6, -1 5547; GCN-NEXT: s_waitcnt lgkmcnt(0) 5548; GCN-NEXT: s_lshl_b64 s[2:3], s[12:13], s2 5549; GCN-NEXT: s_lshl_b64 s[0:1], s[12:13], s0 5550; GCN-NEXT: s_add_u32 s0, s0, -1 5551; GCN-NEXT: s_addc_u32 s1, s1, -1 5552; GCN-NEXT: s_and_b64 s[0:1], s[8:9], s[0:1] 5553; GCN-NEXT: s_add_u32 s2, s2, -1 5554; GCN-NEXT: s_addc_u32 s3, s3, -1 5555; GCN-NEXT: s_and_b64 s[2:3], s[10:11], s[2:3] 5556; GCN-NEXT: v_mov_b32_e32 v0, s0 5557; GCN-NEXT: v_mov_b32_e32 v1, s1 5558; GCN-NEXT: v_mov_b32_e32 v2, s2 5559; GCN-NEXT: v_mov_b32_e32 v3, s3 5560; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 5561; GCN-NEXT: s_endpgm 5562 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 5563 %r = urem <2 x i64> %x, %shl.y 5564 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 5565 ret void 5566} 5567 5568define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 5569; CHECK-LABEL: @sdiv_i64_oddk_denom( 5570; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], 1235195 5571; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 5572; CHECK-NEXT: ret void 5573; 5574; GCN-LABEL: sdiv_i64_oddk_denom: 5575; GCN: ; %bb.0: 5576; GCN-NEXT: v_mov_b32_e32 v0, 0x4f800000 5577; GCN-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 5578; GCN-NEXT: v_rcp_f32_e32 v0, v0 5579; GCN-NEXT: s_mov_b32 s2, 0xffed2705 5580; GCN-NEXT: v_mov_b32_e32 v8, 0 5581; GCN-NEXT: v_mov_b32_e32 v7, 0 5582; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 5583; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 5584; GCN-NEXT: v_trunc_f32_e32 v1, v1 5585; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 5586; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 5587; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 5588; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 5589; GCN-NEXT: s_mov_b32 s7, 0xf000 5590; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 5591; GCN-NEXT: v_mul_lo_u32 v2, v1, s2 5592; GCN-NEXT: v_mul_lo_u32 v4, v0, s2 5593; GCN-NEXT: s_mov_b32 s6, -1 5594; GCN-NEXT: s_waitcnt lgkmcnt(0) 5595; GCN-NEXT: s_mov_b32 s4, s8 5596; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5597; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 5598; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 5599; GCN-NEXT: v_mul_hi_u32 v6, v0, v4 5600; GCN-NEXT: v_mul_hi_u32 v3, v0, v2 5601; GCN-NEXT: v_mul_hi_u32 v9, v1, v2 5602; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 5603; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 5604; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 5605; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 5606; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v3, vcc 5607; GCN-NEXT: s_mov_b32 s5, s9 5608; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 5609; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc 5610; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc 5611; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5612; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 5613; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 5614; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 5615; GCN-NEXT: v_mul_lo_u32 v4, v2, s2 5616; GCN-NEXT: v_mul_hi_u32 v5, s2, v0 5617; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 5618; GCN-NEXT: v_mul_lo_u32 v5, v0, s2 5619; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v0, v4 5620; GCN-NEXT: v_mul_lo_u32 v10, v0, v4 5621; GCN-NEXT: v_mul_hi_u32 v12, v0, v4 5622; GCN-NEXT: v_mul_hi_u32 v11, v0, v5 5623; GCN-NEXT: v_mul_hi_u32 v9, v2, v5 5624; GCN-NEXT: v_mul_lo_u32 v5, v2, v5 5625; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 5626; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 5627; GCN-NEXT: v_addc_u32_e32 v11, vcc, v8, v12, vcc 5628; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 5629; GCN-NEXT: v_add_i32_e32 v5, vcc, v10, v5 5630; GCN-NEXT: v_addc_u32_e32 v5, vcc, v11, v9, vcc 5631; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v7, vcc 5632; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 5633; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc 5634; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 5635; GCN-NEXT: s_ashr_i32 s2, s11, 31 5636; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] 5637; GCN-NEXT: s_add_u32 s0, s10, s2 5638; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 5639; GCN-NEXT: s_mov_b32 s3, s2 5640; GCN-NEXT: s_addc_u32 s1, s11, s2 5641; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] 5642; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 5643; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 5644; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 5645; GCN-NEXT: v_mul_hi_u32 v4, s0, v1 5646; GCN-NEXT: v_mul_hi_u32 v5, s1, v1 5647; GCN-NEXT: v_mul_lo_u32 v1, s1, v1 5648; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5649; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 5650; GCN-NEXT: v_mul_lo_u32 v4, s1, v0 5651; GCN-NEXT: v_mul_hi_u32 v0, s1, v0 5652; GCN-NEXT: s_mov_b32 s3, 0x12d8fb 5653; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 5654; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 5655; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc 5656; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 5657; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc 5658; GCN-NEXT: v_mul_lo_u32 v2, v1, s3 5659; GCN-NEXT: v_mul_hi_u32 v3, s3, v0 5660; GCN-NEXT: v_mul_lo_u32 v4, v0, s3 5661; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5662; GCN-NEXT: v_mov_b32_e32 v3, s1 5663; GCN-NEXT: v_sub_i32_e32 v4, vcc, s0, v4 5664; GCN-NEXT: v_subb_u32_e32 v2, vcc, v3, v2, vcc 5665; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s3, v4 5666; GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v2, vcc 5667; GCN-NEXT: s_mov_b32 s3, 0x12d8fa 5668; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s3, v3 5669; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 5670; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 5671; GCN-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc 5672; GCN-NEXT: v_add_i32_e32 v5, vcc, 2, v0 5673; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc 5674; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v0 5675; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc 5676; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s3, v4 5677; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc 5678; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 5679; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3 5680; GCN-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc 5681; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 5682; GCN-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] 5683; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5684; GCN-NEXT: v_cndmask_b32_e64 v3, v8, v6, s[0:1] 5685; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 5686; GCN-NEXT: v_xor_b32_e32 v0, s2, v0 5687; GCN-NEXT: v_xor_b32_e32 v1, s2, v1 5688; GCN-NEXT: v_mov_b32_e32 v2, s2 5689; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 5690; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 5691; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 5692; GCN-NEXT: s_endpgm 5693 %r = sdiv i64 %x, 1235195 5694 store i64 %r, i64 addrspace(1)* %out 5695 ret void 5696} 5697 5698define amdgpu_kernel void @sdiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 5699; CHECK-LABEL: @sdiv_i64_pow2k_denom( 5700; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], 4096 5701; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 5702; CHECK-NEXT: ret void 5703; 5704; GCN-LABEL: sdiv_i64_pow2k_denom: 5705; GCN: ; %bb.0: 5706; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 5707; GCN-NEXT: s_mov_b32 s7, 0xf000 5708; GCN-NEXT: s_mov_b32 s6, -1 5709; GCN-NEXT: s_waitcnt lgkmcnt(0) 5710; GCN-NEXT: s_mov_b32 s4, s0 5711; GCN-NEXT: s_ashr_i32 s0, s3, 31 5712; GCN-NEXT: s_lshr_b32 s0, s0, 20 5713; GCN-NEXT: s_add_u32 s0, s2, s0 5714; GCN-NEXT: s_mov_b32 s5, s1 5715; GCN-NEXT: s_addc_u32 s1, s3, 0 5716; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 5717; GCN-NEXT: v_mov_b32_e32 v0, s0 5718; GCN-NEXT: v_mov_b32_e32 v1, s1 5719; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 5720; GCN-NEXT: s_endpgm 5721 %r = sdiv i64 %x, 4096 5722 store i64 %r, i64 addrspace(1)* %out 5723 ret void 5724} 5725 5726define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 5727; CHECK-LABEL: @sdiv_i64_pow2_shl_denom( 5728; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 5729; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], [[SHL_Y]] 5730; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 5731; CHECK-NEXT: ret void 5732; 5733; GCN-LABEL: sdiv_i64_pow2_shl_denom: 5734; GCN: ; %bb.0: 5735; GCN-NEXT: s_load_dword s4, s[0:1], 0xd 5736; GCN-NEXT: s_mov_b32 s3, 0 5737; GCN-NEXT: s_movk_i32 s2, 0x1000 5738; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 5739; GCN-NEXT: s_mov_b32 s7, 0xf000 5740; GCN-NEXT: s_waitcnt lgkmcnt(0) 5741; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 5742; GCN-NEXT: s_ashr_i32 s12, s3, 31 5743; GCN-NEXT: s_add_u32 s2, s2, s12 5744; GCN-NEXT: s_mov_b32 s13, s12 5745; GCN-NEXT: s_addc_u32 s3, s3, s12 5746; GCN-NEXT: s_xor_b64 s[14:15], s[2:3], s[12:13] 5747; GCN-NEXT: v_cvt_f32_u32_e32 v0, s14 5748; GCN-NEXT: v_cvt_f32_u32_e32 v1, s15 5749; GCN-NEXT: s_sub_u32 s2, 0, s14 5750; GCN-NEXT: s_subb_u32 s3, 0, s15 5751; GCN-NEXT: s_ashr_i32 s16, s11, 31 5752; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 5753; GCN-NEXT: v_rcp_f32_e32 v0, v0 5754; GCN-NEXT: s_mov_b32 s17, s16 5755; GCN-NEXT: s_mov_b32 s6, -1 5756; GCN-NEXT: s_mov_b32 s4, s8 5757; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 5758; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 5759; GCN-NEXT: v_trunc_f32_e32 v1, v1 5760; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 5761; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 5762; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 5763; GCN-NEXT: s_mov_b32 s5, s9 5764; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 5765; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 5766; GCN-NEXT: v_mul_lo_u32 v5, s3, v0 5767; GCN-NEXT: v_mul_lo_u32 v4, s2, v0 5768; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5769; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 5770; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 5771; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 5772; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 5773; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 5774; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 5775; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 5776; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 5777; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 5778; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 5779; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 5780; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc 5781; GCN-NEXT: v_mov_b32_e32 v4, 0 5782; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc 5783; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5784; GCN-NEXT: v_mov_b32_e32 v6, 0 5785; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 5786; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc 5787; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 5788; GCN-NEXT: v_mul_lo_u32 v5, s2, v2 5789; GCN-NEXT: v_mul_hi_u32 v7, s2, v0 5790; GCN-NEXT: v_mul_lo_u32 v8, s3, v0 5791; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 5792; GCN-NEXT: v_mul_lo_u32 v7, s2, v0 5793; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5 5794; GCN-NEXT: v_mul_lo_u32 v10, v0, v5 5795; GCN-NEXT: v_mul_hi_u32 v12, v0, v5 5796; GCN-NEXT: v_mul_hi_u32 v11, v0, v7 5797; GCN-NEXT: v_mul_hi_u32 v9, v2, v7 5798; GCN-NEXT: v_mul_lo_u32 v7, v2, v7 5799; GCN-NEXT: v_mul_hi_u32 v8, v2, v5 5800; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 5801; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc 5802; GCN-NEXT: v_mul_lo_u32 v2, v2, v5 5803; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 5804; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc 5805; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc 5806; GCN-NEXT: v_add_i32_e32 v2, vcc, v7, v2 5807; GCN-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc 5808; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 5809; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[0:1] 5810; GCN-NEXT: s_add_u32 s0, s10, s16 5811; GCN-NEXT: s_addc_u32 s1, s11, s16 5812; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 5813; GCN-NEXT: s_xor_b64 s[10:11], s[0:1], s[16:17] 5814; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 5815; GCN-NEXT: v_mul_lo_u32 v2, s10, v1 5816; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 5817; GCN-NEXT: v_mul_hi_u32 v5, s10, v1 5818; GCN-NEXT: v_mul_hi_u32 v7, s11, v1 5819; GCN-NEXT: v_mul_lo_u32 v1, s11, v1 5820; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5821; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 5822; GCN-NEXT: v_mul_lo_u32 v5, s11, v0 5823; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 5824; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 5825; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 5826; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc 5827; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 5828; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc 5829; GCN-NEXT: v_mul_lo_u32 v2, s14, v1 5830; GCN-NEXT: v_mul_hi_u32 v3, s14, v0 5831; GCN-NEXT: v_mul_lo_u32 v4, s15, v0 5832; GCN-NEXT: v_mov_b32_e32 v5, s15 5833; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5834; GCN-NEXT: v_mul_lo_u32 v3, s14, v0 5835; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 5836; GCN-NEXT: v_sub_i32_e32 v4, vcc, s11, v2 5837; GCN-NEXT: v_sub_i32_e64 v3, s[0:1], s10, v3 5838; GCN-NEXT: v_subb_u32_e64 v4, vcc, v4, v5, s[0:1] 5839; GCN-NEXT: v_subrev_i32_e32 v5, vcc, s14, v3 5840; GCN-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc 5841; GCN-NEXT: v_cmp_le_u32_e32 vcc, s15, v4 5842; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 5843; GCN-NEXT: v_cmp_le_u32_e32 vcc, s14, v5 5844; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 5845; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s15, v4 5846; GCN-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc 5847; GCN-NEXT: v_add_i32_e32 v5, vcc, 2, v0 5848; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc 5849; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v0 5850; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc 5851; GCN-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, v4 5852; GCN-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[2:3] 5853; GCN-NEXT: v_mov_b32_e32 v6, s11 5854; GCN-NEXT: v_subb_u32_e64 v2, vcc, v6, v2, s[0:1] 5855; GCN-NEXT: v_cmp_le_u32_e32 vcc, s15, v2 5856; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 5857; GCN-NEXT: v_cmp_le_u32_e32 vcc, s14, v3 5858; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 5859; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s15, v2 5860; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc 5861; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 5862; GCN-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[2:3] 5863; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 5864; GCN-NEXT: s_xor_b64 s[0:1], s[16:17], s[12:13] 5865; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 5866; GCN-NEXT: v_xor_b32_e32 v0, s0, v0 5867; GCN-NEXT: v_xor_b32_e32 v1, s1, v1 5868; GCN-NEXT: v_mov_b32_e32 v2, s1 5869; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 5870; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 5871; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 5872; GCN-NEXT: s_endpgm 5873 %shl.y = shl i64 4096, %y 5874 %r = sdiv i64 %x, %shl.y 5875 store i64 %r, i64 addrspace(1)* %out 5876 ret void 5877} 5878 5879define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 5880; CHECK-LABEL: @sdiv_v2i64_pow2k_denom( 5881; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 5882; CHECK-NEXT: [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096 5883; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 5884; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 5885; CHECK-NEXT: [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4096 5886; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 5887; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 5888; CHECK-NEXT: ret void 5889; 5890; GCN-LABEL: sdiv_v2i64_pow2k_denom: 5891; GCN: ; %bb.0: 5892; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5893; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 5894; GCN-NEXT: s_mov_b32 s7, 0xf000 5895; GCN-NEXT: s_mov_b32 s6, -1 5896; GCN-NEXT: s_waitcnt lgkmcnt(0) 5897; GCN-NEXT: s_ashr_i32 s8, s1, 31 5898; GCN-NEXT: s_lshr_b32 s8, s8, 20 5899; GCN-NEXT: s_add_u32 s0, s0, s8 5900; GCN-NEXT: s_addc_u32 s1, s1, 0 5901; GCN-NEXT: s_ashr_i32 s8, s3, 31 5902; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 5903; GCN-NEXT: s_lshr_b32 s8, s8, 20 5904; GCN-NEXT: s_add_u32 s2, s2, s8 5905; GCN-NEXT: s_addc_u32 s3, s3, 0 5906; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 5907; GCN-NEXT: v_mov_b32_e32 v0, s0 5908; GCN-NEXT: v_mov_b32_e32 v1, s1 5909; GCN-NEXT: v_mov_b32_e32 v2, s2 5910; GCN-NEXT: v_mov_b32_e32 v3, s3 5911; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 5912; GCN-NEXT: s_endpgm 5913 %r = sdiv <2 x i64> %x, <i64 4096, i64 4096> 5914 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 5915 ret void 5916} 5917 5918define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 5919; CHECK-LABEL: @ssdiv_v2i64_mixed_pow2k_denom( 5920; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 5921; CHECK-NEXT: [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096 5922; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 5923; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 5924; CHECK-NEXT: [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4095 5925; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 5926; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 5927; CHECK-NEXT: ret void 5928; 5929; GCN-LABEL: ssdiv_v2i64_mixed_pow2k_denom: 5930; GCN: ; %bb.0: 5931; GCN-NEXT: v_mov_b32_e32 v0, 0x457ff000 5932; GCN-NEXT: v_mov_b32_e32 v1, 0x4f800000 5933; GCN-NEXT: v_mac_f32_e32 v0, 0, v1 5934; GCN-NEXT: v_rcp_f32_e32 v0, v0 5935; GCN-NEXT: s_movk_i32 s6, 0xf001 5936; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5937; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 5938; GCN-NEXT: s_mov_b32 s7, 0xf000 5939; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 5940; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 5941; GCN-NEXT: v_trunc_f32_e32 v1, v1 5942; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 5943; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 5944; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 5945; GCN-NEXT: s_waitcnt lgkmcnt(0) 5946; GCN-NEXT: s_ashr_i32 s0, s9, 31 5947; GCN-NEXT: s_lshr_b32 s0, s0, 20 5948; GCN-NEXT: v_mul_hi_u32 v2, s6, v0 5949; GCN-NEXT: v_mul_lo_u32 v3, v1, s6 5950; GCN-NEXT: s_add_u32 s2, s8, s0 5951; GCN-NEXT: s_addc_u32 s3, s9, 0 5952; GCN-NEXT: s_ashr_i32 s8, s11, 31 5953; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 5954; GCN-NEXT: v_mul_lo_u32 v3, v0, s6 5955; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 5956; GCN-NEXT: v_mul_lo_u32 v4, v0, v2 5957; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 5958; GCN-NEXT: v_mul_hi_u32 v5, v0, v3 5959; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 5960; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 5961; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 5962; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 5963; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 5964; GCN-NEXT: v_mul_lo_u32 v6, v1, v3 5965; GCN-NEXT: v_mul_hi_u32 v3, v1, v3 5966; GCN-NEXT: s_mov_b32 s9, s8 5967; GCN-NEXT: v_add_i32_e32 v4, vcc, v6, v4 5968; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc 5969; GCN-NEXT: v_mov_b32_e32 v4, 0 5970; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc 5971; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 5972; GCN-NEXT: v_mov_b32_e32 v6, 0 5973; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 5974; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc 5975; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 5976; GCN-NEXT: v_mul_lo_u32 v5, v2, s6 5977; GCN-NEXT: v_mul_hi_u32 v7, s6, v0 5978; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 5979; GCN-NEXT: v_mul_lo_u32 v7, v0, s6 5980; GCN-NEXT: v_subrev_i32_e32 v5, vcc, v0, v5 5981; GCN-NEXT: v_mul_lo_u32 v10, v0, v5 5982; GCN-NEXT: v_mul_hi_u32 v12, v0, v5 5983; GCN-NEXT: v_mul_hi_u32 v11, v0, v7 5984; GCN-NEXT: v_mul_hi_u32 v9, v2, v7 5985; GCN-NEXT: v_mul_lo_u32 v7, v2, v7 5986; GCN-NEXT: v_mul_hi_u32 v8, v2, v5 5987; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 5988; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc 5989; GCN-NEXT: v_mul_lo_u32 v2, v2, v5 5990; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 5991; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc 5992; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc 5993; GCN-NEXT: v_add_i32_e32 v2, vcc, v7, v2 5994; GCN-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc 5995; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 5996; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[0:1] 5997; GCN-NEXT: s_add_u32 s0, s10, s8 5998; GCN-NEXT: s_addc_u32 s1, s11, s8 5999; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 6000; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[8:9] 6001; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 6002; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 6003; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 6004; GCN-NEXT: v_mul_hi_u32 v5, s0, v1 6005; GCN-NEXT: v_mul_hi_u32 v7, s1, v1 6006; GCN-NEXT: v_mul_lo_u32 v1, s1, v1 6007; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6008; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 6009; GCN-NEXT: v_mul_lo_u32 v5, s1, v0 6010; GCN-NEXT: v_mul_hi_u32 v0, s1, v0 6011; GCN-NEXT: s_movk_i32 s9, 0xfff 6012; GCN-NEXT: s_mov_b32 s6, -1 6013; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 6014; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 6015; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc 6016; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 6017; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc 6018; GCN-NEXT: v_mul_lo_u32 v2, v1, s9 6019; GCN-NEXT: v_mul_hi_u32 v3, s9, v0 6020; GCN-NEXT: v_mul_lo_u32 v4, v0, s9 6021; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6022; GCN-NEXT: v_mov_b32_e32 v3, s1 6023; GCN-NEXT: v_sub_i32_e32 v4, vcc, s0, v4 6024; GCN-NEXT: v_subb_u32_e32 v2, vcc, v3, v2, vcc 6025; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s9, v4 6026; GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v2, vcc 6027; GCN-NEXT: s_movk_i32 s9, 0xffe 6028; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s9, v3 6029; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 6030; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 6031; GCN-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc 6032; GCN-NEXT: v_add_i32_e32 v5, vcc, 2, v0 6033; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc 6034; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v0 6035; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc 6036; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s9, v4 6037; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc 6038; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 6039; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3 6040; GCN-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc 6041; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 6042; GCN-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] 6043; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6044; GCN-NEXT: v_cndmask_b32_e64 v3, v8, v6, s[0:1] 6045; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 6046; GCN-NEXT: v_xor_b32_e32 v0, s8, v0 6047; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s8, v0 6048; GCN-NEXT: v_xor_b32_e32 v1, s8, v1 6049; GCN-NEXT: v_mov_b32_e32 v3, s8 6050; GCN-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc 6051; GCN-NEXT: v_mov_b32_e32 v0, s2 6052; GCN-NEXT: v_mov_b32_e32 v1, s3 6053; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 6054; GCN-NEXT: s_endpgm 6055 %r = sdiv <2 x i64> %x, <i64 4096, i64 4095> 6056 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 6057 ret void 6058} 6059 6060define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 6061; CHECK-LABEL: @sdiv_v2i64_pow2_shl_denom( 6062; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 6063; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 6064; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 6065; CHECK-NEXT: [[TMP3:%.*]] = sdiv i64 [[TMP1]], [[TMP2]] 6066; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 6067; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 6068; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 6069; CHECK-NEXT: [[TMP7:%.*]] = sdiv i64 [[TMP5]], [[TMP6]] 6070; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 6071; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 6072; CHECK-NEXT: ret void 6073; 6074; GCN-LABEL: sdiv_v2i64_pow2_shl_denom: 6075; GCN: ; %bb.0: 6076; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11 6077; GCN-NEXT: s_mov_b32 s3, 0 6078; GCN-NEXT: s_movk_i32 s2, 0x1000 6079; GCN-NEXT: s_mov_b32 s20, 0x4f800000 6080; GCN-NEXT: s_mov_b32 s21, 0x5f7ffffc 6081; GCN-NEXT: s_waitcnt lgkmcnt(0) 6082; GCN-NEXT: s_lshl_b64 s[12:13], s[2:3], s6 6083; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 6084; GCN-NEXT: s_ashr_i32 s16, s3, 31 6085; GCN-NEXT: s_add_u32 s2, s2, s16 6086; GCN-NEXT: s_mov_b32 s17, s16 6087; GCN-NEXT: s_addc_u32 s3, s3, s16 6088; GCN-NEXT: s_xor_b64 s[14:15], s[2:3], s[16:17] 6089; GCN-NEXT: v_cvt_f32_u32_e32 v0, s14 6090; GCN-NEXT: v_cvt_f32_u32_e32 v1, s15 6091; GCN-NEXT: s_mov_b32 s22, 0x2f800000 6092; GCN-NEXT: s_mov_b32 s23, 0xcf800000 6093; GCN-NEXT: s_sub_u32 s6, 0, s14 6094; GCN-NEXT: v_mac_f32_e32 v0, s20, v1 6095; GCN-NEXT: v_rcp_f32_e32 v0, v0 6096; GCN-NEXT: s_subb_u32 s7, 0, s15 6097; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 6098; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 6099; GCN-NEXT: v_mul_f32_e32 v0, s21, v0 6100; GCN-NEXT: v_mul_f32_e32 v1, s22, v0 6101; GCN-NEXT: v_trunc_f32_e32 v1, v1 6102; GCN-NEXT: v_mac_f32_e32 v0, s23, v1 6103; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 6104; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 6105; GCN-NEXT: s_waitcnt lgkmcnt(0) 6106; GCN-NEXT: s_ashr_i32 s18, s9, 31 6107; GCN-NEXT: s_add_u32 s0, s8, s18 6108; GCN-NEXT: v_mul_hi_u32 v3, s6, v0 6109; GCN-NEXT: v_mul_lo_u32 v2, s6, v1 6110; GCN-NEXT: v_mul_lo_u32 v4, s7, v0 6111; GCN-NEXT: v_mul_lo_u32 v5, s6, v0 6112; GCN-NEXT: s_mov_b32 s19, s18 6113; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6114; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 6115; GCN-NEXT: v_mul_lo_u32 v3, v0, v2 6116; GCN-NEXT: v_mul_hi_u32 v4, v0, v5 6117; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 6118; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 6119; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 6120; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 6121; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc 6122; GCN-NEXT: v_mul_lo_u32 v6, v1, v5 6123; GCN-NEXT: v_mul_hi_u32 v5, v1, v5 6124; GCN-NEXT: s_addc_u32 s1, s9, s18 6125; GCN-NEXT: s_xor_b64 s[8:9], s[0:1], s[18:19] 6126; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 6127; GCN-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc 6128; GCN-NEXT: v_mov_b32_e32 v4, 0 6129; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc 6130; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6131; GCN-NEXT: v_mov_b32_e32 v6, 0 6132; GCN-NEXT: v_add_i32_e64 v0, s[2:3], v0, v2 6133; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc 6134; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[2:3] 6135; GCN-NEXT: v_mul_lo_u32 v5, s6, v2 6136; GCN-NEXT: v_mul_hi_u32 v7, s6, v0 6137; GCN-NEXT: v_mul_lo_u32 v8, s7, v0 6138; GCN-NEXT: s_mov_b32 s7, 0xf000 6139; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 6140; GCN-NEXT: v_mul_lo_u32 v7, s6, v0 6141; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5 6142; GCN-NEXT: v_mul_lo_u32 v10, v0, v5 6143; GCN-NEXT: v_mul_hi_u32 v12, v0, v5 6144; GCN-NEXT: v_mul_hi_u32 v11, v0, v7 6145; GCN-NEXT: v_mul_hi_u32 v9, v2, v7 6146; GCN-NEXT: v_mul_lo_u32 v7, v2, v7 6147; GCN-NEXT: v_mul_hi_u32 v8, v2, v5 6148; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 6149; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc 6150; GCN-NEXT: v_mul_lo_u32 v2, v2, v5 6151; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 6152; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc 6153; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc 6154; GCN-NEXT: v_add_i32_e32 v2, vcc, v7, v2 6155; GCN-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc 6156; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 6157; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[2:3] 6158; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 6159; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 6160; GCN-NEXT: v_mul_lo_u32 v2, s8, v1 6161; GCN-NEXT: v_mul_hi_u32 v3, s8, v0 6162; GCN-NEXT: v_mul_hi_u32 v5, s8, v1 6163; GCN-NEXT: v_mul_hi_u32 v7, s9, v1 6164; GCN-NEXT: v_mul_lo_u32 v1, s9, v1 6165; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6166; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 6167; GCN-NEXT: v_mul_lo_u32 v5, s9, v0 6168; GCN-NEXT: v_mul_hi_u32 v0, s9, v0 6169; GCN-NEXT: s_mov_b32 s6, -1 6170; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 6171; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 6172; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc 6173; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 6174; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc 6175; GCN-NEXT: v_mul_lo_u32 v2, s14, v1 6176; GCN-NEXT: v_mul_hi_u32 v3, s14, v0 6177; GCN-NEXT: v_mul_lo_u32 v5, s15, v0 6178; GCN-NEXT: v_mov_b32_e32 v7, s15 6179; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6180; GCN-NEXT: v_mul_lo_u32 v3, s14, v0 6181; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 6182; GCN-NEXT: v_sub_i32_e32 v5, vcc, s9, v2 6183; GCN-NEXT: v_sub_i32_e64 v3, s[0:1], s8, v3 6184; GCN-NEXT: v_subb_u32_e64 v5, vcc, v5, v7, s[0:1] 6185; GCN-NEXT: v_subrev_i32_e32 v7, vcc, s14, v3 6186; GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc 6187; GCN-NEXT: v_cmp_le_u32_e32 vcc, s15, v5 6188; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 6189; GCN-NEXT: v_cmp_le_u32_e32 vcc, s14, v7 6190; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 6191; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s15, v5 6192; GCN-NEXT: v_cndmask_b32_e32 v5, v8, v7, vcc 6193; GCN-NEXT: v_add_i32_e32 v7, vcc, 2, v0 6194; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc 6195; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v0 6196; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v1, vcc 6197; GCN-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, v5 6198; GCN-NEXT: v_cndmask_b32_e64 v5, v10, v8, s[2:3] 6199; GCN-NEXT: v_mov_b32_e32 v8, s9 6200; GCN-NEXT: s_xor_b64 s[8:9], s[18:19], s[16:17] 6201; GCN-NEXT: s_ashr_i32 s16, s13, 31 6202; GCN-NEXT: v_subb_u32_e64 v2, vcc, v8, v2, s[0:1] 6203; GCN-NEXT: s_add_u32 s0, s12, s16 6204; GCN-NEXT: s_mov_b32 s17, s16 6205; GCN-NEXT: s_addc_u32 s1, s13, s16 6206; GCN-NEXT: s_xor_b64 s[12:13], s[0:1], s[16:17] 6207; GCN-NEXT: v_cvt_f32_u32_e32 v10, s12 6208; GCN-NEXT: v_cvt_f32_u32_e32 v11, s13 6209; GCN-NEXT: v_cmp_le_u32_e32 vcc, s15, v2 6210; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 6211; GCN-NEXT: v_cmp_le_u32_e32 vcc, s14, v3 6212; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 6213; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s15, v2 6214; GCN-NEXT: v_mac_f32_e32 v10, s20, v11 6215; GCN-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc 6216; GCN-NEXT: v_rcp_f32_e32 v3, v10 6217; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 6218; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 6219; GCN-NEXT: v_cndmask_b32_e64 v2, v9, v7, s[2:3] 6220; GCN-NEXT: v_mul_f32_e32 v3, s21, v3 6221; GCN-NEXT: v_mul_f32_e32 v5, s22, v3 6222; GCN-NEXT: v_trunc_f32_e32 v5, v5 6223; GCN-NEXT: v_mac_f32_e32 v3, s23, v5 6224; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 6225; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5 6226; GCN-NEXT: s_sub_u32 s2, 0, s12 6227; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6228; GCN-NEXT: v_mul_hi_u32 v2, s2, v3 6229; GCN-NEXT: v_mul_lo_u32 v7, s2, v5 6230; GCN-NEXT: s_subb_u32 s3, 0, s13 6231; GCN-NEXT: v_mul_lo_u32 v8, s3, v3 6232; GCN-NEXT: s_ashr_i32 s14, s11, 31 6233; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v7 6234; GCN-NEXT: v_mul_lo_u32 v7, s2, v3 6235; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v8 6236; GCN-NEXT: v_mul_lo_u32 v8, v3, v2 6237; GCN-NEXT: v_mul_hi_u32 v10, v3, v2 6238; GCN-NEXT: v_mul_hi_u32 v9, v3, v7 6239; GCN-NEXT: v_mul_hi_u32 v11, v5, v2 6240; GCN-NEXT: v_mul_lo_u32 v2, v5, v2 6241; GCN-NEXT: s_mov_b32 s15, s14 6242; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 6243; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc 6244; GCN-NEXT: v_mul_lo_u32 v10, v5, v7 6245; GCN-NEXT: v_mul_hi_u32 v7, v5, v7 6246; GCN-NEXT: v_xor_b32_e32 v0, s8, v0 6247; GCN-NEXT: v_xor_b32_e32 v1, s9, v1 6248; GCN-NEXT: v_add_i32_e32 v8, vcc, v10, v8 6249; GCN-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc 6250; GCN-NEXT: v_addc_u32_e32 v8, vcc, v11, v4, vcc 6251; GCN-NEXT: v_add_i32_e32 v2, vcc, v7, v2 6252; GCN-NEXT: v_add_i32_e64 v2, s[0:1], v3, v2 6253; GCN-NEXT: v_addc_u32_e32 v7, vcc, v6, v8, vcc 6254; GCN-NEXT: v_addc_u32_e64 v3, vcc, v5, v7, s[0:1] 6255; GCN-NEXT: v_mul_lo_u32 v8, s2, v3 6256; GCN-NEXT: v_mul_hi_u32 v9, s2, v2 6257; GCN-NEXT: v_mul_lo_u32 v10, s3, v2 6258; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 6259; GCN-NEXT: v_mul_lo_u32 v9, s2, v2 6260; GCN-NEXT: v_add_i32_e32 v8, vcc, v10, v8 6261; GCN-NEXT: v_mul_lo_u32 v12, v2, v8 6262; GCN-NEXT: v_mul_hi_u32 v14, v2, v8 6263; GCN-NEXT: v_mul_hi_u32 v13, v2, v9 6264; GCN-NEXT: v_mul_hi_u32 v11, v3, v9 6265; GCN-NEXT: v_mul_lo_u32 v9, v3, v9 6266; GCN-NEXT: v_mul_hi_u32 v10, v3, v8 6267; GCN-NEXT: v_add_i32_e32 v12, vcc, v13, v12 6268; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v14, vcc 6269; GCN-NEXT: v_mul_lo_u32 v3, v3, v8 6270; GCN-NEXT: v_add_i32_e32 v9, vcc, v12, v9 6271; GCN-NEXT: v_addc_u32_e32 v9, vcc, v13, v11, vcc 6272; GCN-NEXT: v_addc_u32_e32 v8, vcc, v10, v4, vcc 6273; GCN-NEXT: v_add_i32_e32 v3, vcc, v9, v3 6274; GCN-NEXT: v_addc_u32_e32 v8, vcc, v6, v8, vcc 6275; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7 6276; GCN-NEXT: v_addc_u32_e64 v5, vcc, v5, v8, s[0:1] 6277; GCN-NEXT: s_add_u32 s0, s10, s14 6278; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 6279; GCN-NEXT: s_addc_u32 s1, s11, s14 6280; GCN-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] 6281; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 6282; GCN-NEXT: v_mul_lo_u32 v5, s10, v3 6283; GCN-NEXT: v_mul_hi_u32 v7, s10, v2 6284; GCN-NEXT: v_mul_hi_u32 v9, s10, v3 6285; GCN-NEXT: v_mul_hi_u32 v10, s11, v3 6286; GCN-NEXT: v_mul_lo_u32 v3, s11, v3 6287; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 6288; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc 6289; GCN-NEXT: v_mul_lo_u32 v9, s11, v2 6290; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 6291; GCN-NEXT: v_mov_b32_e32 v8, s9 6292; GCN-NEXT: v_add_i32_e32 v5, vcc, v9, v5 6293; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v2, vcc 6294; GCN-NEXT: v_addc_u32_e32 v4, vcc, v10, v4, vcc 6295; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 6296; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v4, vcc 6297; GCN-NEXT: v_mul_lo_u32 v4, s12, v3 6298; GCN-NEXT: v_mul_hi_u32 v5, s12, v2 6299; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 6300; GCN-NEXT: v_mul_lo_u32 v6, s13, v2 6301; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc 6302; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 6303; GCN-NEXT: v_mul_lo_u32 v5, s12, v2 6304; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 6305; GCN-NEXT: v_sub_i32_e32 v6, vcc, s11, v4 6306; GCN-NEXT: v_mov_b32_e32 v7, s13 6307; GCN-NEXT: v_sub_i32_e64 v5, s[0:1], s10, v5 6308; GCN-NEXT: v_subb_u32_e64 v6, vcc, v6, v7, s[0:1] 6309; GCN-NEXT: v_subrev_i32_e32 v7, vcc, s12, v5 6310; GCN-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc 6311; GCN-NEXT: v_cmp_le_u32_e32 vcc, s13, v6 6312; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 6313; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v7 6314; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 6315; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s13, v6 6316; GCN-NEXT: v_cndmask_b32_e32 v6, v8, v7, vcc 6317; GCN-NEXT: v_add_i32_e32 v7, vcc, 2, v2 6318; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v3, vcc 6319; GCN-NEXT: v_add_i32_e32 v9, vcc, 1, v2 6320; GCN-NEXT: v_addc_u32_e32 v10, vcc, 0, v3, vcc 6321; GCN-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, v6 6322; GCN-NEXT: v_cndmask_b32_e64 v6, v10, v8, s[2:3] 6323; GCN-NEXT: v_mov_b32_e32 v8, s11 6324; GCN-NEXT: v_subb_u32_e64 v4, vcc, v8, v4, s[0:1] 6325; GCN-NEXT: v_cmp_le_u32_e32 vcc, s13, v4 6326; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 6327; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v5 6328; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 6329; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s13, v4 6330; GCN-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc 6331; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 6332; GCN-NEXT: v_cndmask_b32_e64 v4, v9, v7, s[2:3] 6333; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 6334; GCN-NEXT: s_xor_b64 s[0:1], s[14:15], s[16:17] 6335; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 6336; GCN-NEXT: v_xor_b32_e32 v2, s0, v2 6337; GCN-NEXT: v_xor_b32_e32 v3, s1, v3 6338; GCN-NEXT: v_mov_b32_e32 v4, s1 6339; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s0, v2 6340; GCN-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc 6341; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 6342; GCN-NEXT: s_endpgm 6343 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 6344 %r = sdiv <2 x i64> %x, %shl.y 6345 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 6346 ret void 6347} 6348 6349define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 6350; CHECK-LABEL: @srem_i64_oddk_denom( 6351; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], 1235195 6352; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 6353; CHECK-NEXT: ret void 6354; 6355; GCN-LABEL: srem_i64_oddk_denom: 6356; GCN: ; %bb.0: 6357; GCN-NEXT: v_mov_b32_e32 v0, 0x4f800000 6358; GCN-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 6359; GCN-NEXT: v_rcp_f32_e32 v0, v0 6360; GCN-NEXT: s_mov_b32 s2, 0xffed2705 6361; GCN-NEXT: v_mov_b32_e32 v8, 0 6362; GCN-NEXT: v_mov_b32_e32 v7, 0 6363; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 6364; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 6365; GCN-NEXT: v_trunc_f32_e32 v1, v1 6366; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 6367; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 6368; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 6369; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 6370; GCN-NEXT: s_mov_b32 s7, 0xf000 6371; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 6372; GCN-NEXT: v_mul_lo_u32 v2, v1, s2 6373; GCN-NEXT: v_mul_lo_u32 v4, v0, s2 6374; GCN-NEXT: s_mov_b32 s6, -1 6375; GCN-NEXT: s_waitcnt lgkmcnt(0) 6376; GCN-NEXT: s_mov_b32 s4, s8 6377; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6378; GCN-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 6379; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 6380; GCN-NEXT: v_mul_hi_u32 v6, v0, v4 6381; GCN-NEXT: v_mul_hi_u32 v3, v0, v2 6382; GCN-NEXT: v_mul_hi_u32 v9, v1, v2 6383; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 6384; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 6385; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 6386; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 6387; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v3, vcc 6388; GCN-NEXT: s_mov_b32 s5, s9 6389; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 6390; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc 6391; GCN-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc 6392; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6393; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 6394; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 6395; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 6396; GCN-NEXT: v_mul_lo_u32 v4, v2, s2 6397; GCN-NEXT: v_mul_hi_u32 v5, s2, v0 6398; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 6399; GCN-NEXT: v_mul_lo_u32 v5, v0, s2 6400; GCN-NEXT: v_subrev_i32_e32 v4, vcc, v0, v4 6401; GCN-NEXT: v_mul_lo_u32 v10, v0, v4 6402; GCN-NEXT: v_mul_hi_u32 v12, v0, v4 6403; GCN-NEXT: v_mul_hi_u32 v11, v0, v5 6404; GCN-NEXT: v_mul_hi_u32 v9, v2, v5 6405; GCN-NEXT: v_mul_lo_u32 v5, v2, v5 6406; GCN-NEXT: v_mul_hi_u32 v6, v2, v4 6407; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 6408; GCN-NEXT: v_addc_u32_e32 v11, vcc, v8, v12, vcc 6409; GCN-NEXT: v_mul_lo_u32 v2, v2, v4 6410; GCN-NEXT: v_add_i32_e32 v5, vcc, v10, v5 6411; GCN-NEXT: v_addc_u32_e32 v5, vcc, v11, v9, vcc 6412; GCN-NEXT: v_addc_u32_e32 v4, vcc, v6, v7, vcc 6413; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 6414; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc 6415; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 6416; GCN-NEXT: s_ashr_i32 s2, s11, 31 6417; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] 6418; GCN-NEXT: s_add_u32 s0, s10, s2 6419; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 6420; GCN-NEXT: s_mov_b32 s3, s2 6421; GCN-NEXT: s_addc_u32 s1, s11, s2 6422; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] 6423; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 6424; GCN-NEXT: v_mul_lo_u32 v2, s0, v1 6425; GCN-NEXT: v_mul_hi_u32 v3, s0, v0 6426; GCN-NEXT: v_mul_hi_u32 v4, s0, v1 6427; GCN-NEXT: v_mul_hi_u32 v5, s1, v1 6428; GCN-NEXT: v_mul_lo_u32 v1, s1, v1 6429; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6430; GCN-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 6431; GCN-NEXT: v_mul_lo_u32 v4, s1, v0 6432; GCN-NEXT: v_mul_hi_u32 v0, s1, v0 6433; GCN-NEXT: s_mov_b32 s3, 0x12d8fb 6434; GCN-NEXT: v_add_i32_e32 v2, vcc, v4, v2 6435; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 6436; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc 6437; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 6438; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc 6439; GCN-NEXT: v_mul_hi_u32 v2, s3, v0 6440; GCN-NEXT: v_mul_lo_u32 v1, v1, s3 6441; GCN-NEXT: v_mul_lo_u32 v0, v0, s3 6442; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 6443; GCN-NEXT: v_mov_b32_e32 v2, s1 6444; GCN-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 6445; GCN-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 6446; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s3, v0 6447; GCN-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v1, vcc 6448; GCN-NEXT: v_subrev_i32_e32 v4, vcc, s3, v2 6449; GCN-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v3, vcc 6450; GCN-NEXT: s_mov_b32 s3, 0x12d8fa 6451; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s3, v2 6452; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 6453; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 6454; GCN-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc 6455; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 6456; GCN-NEXT: v_cmp_lt_u32_e32 vcc, s3, v0 6457; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 6458; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 6459; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 6460; GCN-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc 6461; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 6462; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] 6463; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6464; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 6465; GCN-NEXT: v_xor_b32_e32 v0, s2, v0 6466; GCN-NEXT: v_xor_b32_e32 v1, s2, v1 6467; GCN-NEXT: v_mov_b32_e32 v2, s2 6468; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 6469; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 6470; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 6471; GCN-NEXT: s_endpgm 6472 %r = srem i64 %x, 1235195 6473 store i64 %r, i64 addrspace(1)* %out 6474 ret void 6475} 6476 6477define amdgpu_kernel void @srem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 6478; CHECK-LABEL: @srem_i64_pow2k_denom( 6479; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], 4096 6480; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 6481; CHECK-NEXT: ret void 6482; 6483; GCN-LABEL: srem_i64_pow2k_denom: 6484; GCN: ; %bb.0: 6485; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 6486; GCN-NEXT: s_mov_b32 s3, 0xf000 6487; GCN-NEXT: s_mov_b32 s2, -1 6488; GCN-NEXT: s_waitcnt lgkmcnt(0) 6489; GCN-NEXT: s_mov_b32 s0, s4 6490; GCN-NEXT: s_ashr_i32 s4, s7, 31 6491; GCN-NEXT: s_lshr_b32 s4, s4, 20 6492; GCN-NEXT: s_add_u32 s4, s6, s4 6493; GCN-NEXT: s_mov_b32 s1, s5 6494; GCN-NEXT: s_addc_u32 s5, s7, 0 6495; GCN-NEXT: s_and_b32 s4, s4, 0xfffff000 6496; GCN-NEXT: s_sub_u32 s4, s6, s4 6497; GCN-NEXT: s_subb_u32 s5, s7, s5 6498; GCN-NEXT: v_mov_b32_e32 v0, s4 6499; GCN-NEXT: v_mov_b32_e32 v1, s5 6500; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6501; GCN-NEXT: s_endpgm 6502 %r = srem i64 %x, 4096 6503 store i64 %r, i64 addrspace(1)* %out 6504 ret void 6505} 6506 6507define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 6508; CHECK-LABEL: @srem_i64_pow2_shl_denom( 6509; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 6510; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], [[SHL_Y]] 6511; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 6512; CHECK-NEXT: ret void 6513; 6514; GCN-LABEL: srem_i64_pow2_shl_denom: 6515; GCN: ; %bb.0: 6516; GCN-NEXT: s_load_dword s4, s[0:1], 0xd 6517; GCN-NEXT: s_mov_b32 s3, 0 6518; GCN-NEXT: s_movk_i32 s2, 0x1000 6519; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 6520; GCN-NEXT: s_mov_b32 s7, 0xf000 6521; GCN-NEXT: s_waitcnt lgkmcnt(0) 6522; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 6523; GCN-NEXT: s_ashr_i32 s4, s3, 31 6524; GCN-NEXT: s_add_u32 s2, s2, s4 6525; GCN-NEXT: s_mov_b32 s5, s4 6526; GCN-NEXT: s_addc_u32 s3, s3, s4 6527; GCN-NEXT: s_xor_b64 s[12:13], s[2:3], s[4:5] 6528; GCN-NEXT: v_cvt_f32_u32_e32 v0, s12 6529; GCN-NEXT: v_cvt_f32_u32_e32 v1, s13 6530; GCN-NEXT: s_sub_u32 s2, 0, s12 6531; GCN-NEXT: s_subb_u32 s3, 0, s13 6532; GCN-NEXT: s_ashr_i32 s14, s11, 31 6533; GCN-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 6534; GCN-NEXT: v_rcp_f32_e32 v0, v0 6535; GCN-NEXT: s_mov_b32 s15, s14 6536; GCN-NEXT: s_mov_b32 s6, -1 6537; GCN-NEXT: s_mov_b32 s4, s8 6538; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 6539; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 6540; GCN-NEXT: v_trunc_f32_e32 v1, v1 6541; GCN-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 6542; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 6543; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 6544; GCN-NEXT: s_mov_b32 s5, s9 6545; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 6546; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 6547; GCN-NEXT: v_mul_lo_u32 v5, s3, v0 6548; GCN-NEXT: v_mul_lo_u32 v4, s2, v0 6549; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6550; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 6551; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 6552; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 6553; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 6554; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 6555; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 6556; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 6557; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 6558; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 6559; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 6560; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 6561; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc 6562; GCN-NEXT: v_mov_b32_e32 v4, 0 6563; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc 6564; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6565; GCN-NEXT: v_mov_b32_e32 v6, 0 6566; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 6567; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc 6568; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 6569; GCN-NEXT: v_mul_lo_u32 v5, s2, v2 6570; GCN-NEXT: v_mul_hi_u32 v7, s2, v0 6571; GCN-NEXT: v_mul_lo_u32 v8, s3, v0 6572; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 6573; GCN-NEXT: v_mul_lo_u32 v7, s2, v0 6574; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5 6575; GCN-NEXT: v_mul_lo_u32 v10, v0, v5 6576; GCN-NEXT: v_mul_hi_u32 v12, v0, v5 6577; GCN-NEXT: v_mul_hi_u32 v11, v0, v7 6578; GCN-NEXT: v_mul_hi_u32 v9, v2, v7 6579; GCN-NEXT: v_mul_lo_u32 v7, v2, v7 6580; GCN-NEXT: v_mul_hi_u32 v8, v2, v5 6581; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 6582; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc 6583; GCN-NEXT: v_mul_lo_u32 v2, v2, v5 6584; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 6585; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc 6586; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc 6587; GCN-NEXT: v_add_i32_e32 v2, vcc, v7, v2 6588; GCN-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc 6589; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 6590; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[0:1] 6591; GCN-NEXT: s_add_u32 s0, s10, s14 6592; GCN-NEXT: s_addc_u32 s1, s11, s14 6593; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 6594; GCN-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] 6595; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 6596; GCN-NEXT: v_mul_lo_u32 v2, s10, v1 6597; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 6598; GCN-NEXT: v_mul_hi_u32 v5, s10, v1 6599; GCN-NEXT: v_mul_hi_u32 v7, s11, v1 6600; GCN-NEXT: v_mul_lo_u32 v1, s11, v1 6601; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6602; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 6603; GCN-NEXT: v_mul_lo_u32 v5, s11, v0 6604; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 6605; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 6606; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 6607; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc 6608; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 6609; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc 6610; GCN-NEXT: v_mul_lo_u32 v1, s12, v1 6611; GCN-NEXT: v_mul_hi_u32 v2, s12, v0 6612; GCN-NEXT: v_mul_lo_u32 v3, s13, v0 6613; GCN-NEXT: v_mul_lo_u32 v0, s12, v0 6614; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 6615; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 6616; GCN-NEXT: v_sub_i32_e64 v0, s[0:1], s10, v0 6617; GCN-NEXT: v_sub_i32_e32 v2, vcc, s11, v1 6618; GCN-NEXT: v_mov_b32_e32 v3, s13 6619; GCN-NEXT: v_subb_u32_e64 v2, vcc, v2, v3, s[0:1] 6620; GCN-NEXT: v_subrev_i32_e64 v4, s[2:3], s12, v0 6621; GCN-NEXT: v_subbrev_u32_e64 v5, vcc, 0, v2, s[2:3] 6622; GCN-NEXT: v_cmp_le_u32_e32 vcc, s13, v5 6623; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 6624; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v4 6625; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 6626; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s13, v5 6627; GCN-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc 6628; GCN-NEXT: v_subb_u32_e64 v2, vcc, v2, v3, s[2:3] 6629; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s12, v4 6630; GCN-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc 6631; GCN-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, v6 6632; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[2:3] 6633; GCN-NEXT: v_mov_b32_e32 v5, s11 6634; GCN-NEXT: v_subb_u32_e64 v1, vcc, v5, v1, s[0:1] 6635; GCN-NEXT: v_cmp_le_u32_e32 vcc, s13, v1 6636; GCN-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 6637; GCN-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 6638; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 6639; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s13, v1 6640; GCN-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 6641; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 6642; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 6643; GCN-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[2:3] 6644; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6645; GCN-NEXT: v_xor_b32_e32 v0, s14, v0 6646; GCN-NEXT: v_xor_b32_e32 v1, s14, v1 6647; GCN-NEXT: v_mov_b32_e32 v2, s14 6648; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s14, v0 6649; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 6650; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 6651; GCN-NEXT: s_endpgm 6652 %shl.y = shl i64 4096, %y 6653 %r = srem i64 %x, %shl.y 6654 store i64 %r, i64 addrspace(1)* %out 6655 ret void 6656} 6657 6658define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 6659; CHECK-LABEL: @srem_v2i64_pow2k_denom( 6660; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 6661; CHECK-NEXT: [[TMP2:%.*]] = srem i64 [[TMP1]], 4096 6662; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 6663; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 6664; CHECK-NEXT: [[TMP5:%.*]] = srem i64 [[TMP4]], 4096 6665; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 6666; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 6667; CHECK-NEXT: ret void 6668; 6669; GCN-LABEL: srem_v2i64_pow2k_denom: 6670; GCN: ; %bb.0: 6671; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 6672; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 6673; GCN-NEXT: s_movk_i32 s8, 0xf000 6674; GCN-NEXT: s_mov_b32 s7, 0xf000 6675; GCN-NEXT: s_mov_b32 s6, -1 6676; GCN-NEXT: s_waitcnt lgkmcnt(0) 6677; GCN-NEXT: s_ashr_i32 s9, s1, 31 6678; GCN-NEXT: s_lshr_b32 s9, s9, 20 6679; GCN-NEXT: s_add_u32 s9, s0, s9 6680; GCN-NEXT: s_addc_u32 s10, s1, 0 6681; GCN-NEXT: s_and_b32 s9, s9, s8 6682; GCN-NEXT: s_sub_u32 s0, s0, s9 6683; GCN-NEXT: s_subb_u32 s1, s1, s10 6684; GCN-NEXT: s_ashr_i32 s9, s3, 31 6685; GCN-NEXT: s_lshr_b32 s9, s9, 20 6686; GCN-NEXT: s_add_u32 s9, s2, s9 6687; GCN-NEXT: s_addc_u32 s10, s3, 0 6688; GCN-NEXT: s_and_b32 s8, s9, s8 6689; GCN-NEXT: s_sub_u32 s2, s2, s8 6690; GCN-NEXT: s_subb_u32 s3, s3, s10 6691; GCN-NEXT: v_mov_b32_e32 v0, s0 6692; GCN-NEXT: v_mov_b32_e32 v1, s1 6693; GCN-NEXT: v_mov_b32_e32 v2, s2 6694; GCN-NEXT: v_mov_b32_e32 v3, s3 6695; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 6696; GCN-NEXT: s_endpgm 6697 %r = srem <2 x i64> %x, <i64 4096, i64 4096> 6698 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 6699 ret void 6700} 6701 6702define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 6703; CHECK-LABEL: @srem_v2i64_pow2_shl_denom( 6704; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 6705; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 6706; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 6707; CHECK-NEXT: [[TMP3:%.*]] = srem i64 [[TMP1]], [[TMP2]] 6708; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 6709; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 6710; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 6711; CHECK-NEXT: [[TMP7:%.*]] = srem i64 [[TMP5]], [[TMP6]] 6712; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 6713; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 6714; CHECK-NEXT: ret void 6715; 6716; GCN-LABEL: srem_v2i64_pow2_shl_denom: 6717; GCN: ; %bb.0: 6718; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11 6719; GCN-NEXT: s_mov_b32 s3, 0 6720; GCN-NEXT: s_movk_i32 s2, 0x1000 6721; GCN-NEXT: s_mov_b32 s18, 0x4f800000 6722; GCN-NEXT: s_mov_b32 s19, 0x5f7ffffc 6723; GCN-NEXT: s_waitcnt lgkmcnt(0) 6724; GCN-NEXT: s_lshl_b64 s[14:15], s[2:3], s6 6725; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 6726; GCN-NEXT: s_ashr_i32 s4, s3, 31 6727; GCN-NEXT: s_add_u32 s2, s2, s4 6728; GCN-NEXT: s_mov_b32 s5, s4 6729; GCN-NEXT: s_addc_u32 s3, s3, s4 6730; GCN-NEXT: s_xor_b64 s[16:17], s[2:3], s[4:5] 6731; GCN-NEXT: v_cvt_f32_u32_e32 v0, s16 6732; GCN-NEXT: v_cvt_f32_u32_e32 v1, s17 6733; GCN-NEXT: s_mov_b32 s20, 0x2f800000 6734; GCN-NEXT: s_mov_b32 s21, 0xcf800000 6735; GCN-NEXT: s_sub_u32 s6, 0, s16 6736; GCN-NEXT: v_mac_f32_e32 v0, s18, v1 6737; GCN-NEXT: v_rcp_f32_e32 v0, v0 6738; GCN-NEXT: s_subb_u32 s7, 0, s17 6739; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 6740; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 6741; GCN-NEXT: v_mul_f32_e32 v0, s19, v0 6742; GCN-NEXT: v_mul_f32_e32 v1, s20, v0 6743; GCN-NEXT: v_trunc_f32_e32 v1, v1 6744; GCN-NEXT: v_mac_f32_e32 v0, s21, v1 6745; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 6746; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 6747; GCN-NEXT: s_waitcnt lgkmcnt(0) 6748; GCN-NEXT: s_ashr_i32 s12, s9, 31 6749; GCN-NEXT: s_add_u32 s0, s8, s12 6750; GCN-NEXT: v_mul_hi_u32 v3, s6, v0 6751; GCN-NEXT: v_mul_lo_u32 v2, s6, v1 6752; GCN-NEXT: v_mul_lo_u32 v4, s7, v0 6753; GCN-NEXT: v_mul_lo_u32 v5, s6, v0 6754; GCN-NEXT: s_mov_b32 s13, s12 6755; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6756; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 6757; GCN-NEXT: v_mul_lo_u32 v3, v0, v2 6758; GCN-NEXT: v_mul_hi_u32 v4, v0, v5 6759; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 6760; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 6761; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 6762; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 6763; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc 6764; GCN-NEXT: v_mul_lo_u32 v6, v1, v5 6765; GCN-NEXT: v_mul_hi_u32 v5, v1, v5 6766; GCN-NEXT: s_addc_u32 s1, s9, s12 6767; GCN-NEXT: s_xor_b64 s[8:9], s[0:1], s[12:13] 6768; GCN-NEXT: v_add_i32_e32 v3, vcc, v6, v3 6769; GCN-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc 6770; GCN-NEXT: v_mov_b32_e32 v4, 0 6771; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc 6772; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6773; GCN-NEXT: v_mov_b32_e32 v6, 0 6774; GCN-NEXT: v_add_i32_e64 v0, s[2:3], v0, v2 6775; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc 6776; GCN-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[2:3] 6777; GCN-NEXT: v_mul_lo_u32 v5, s6, v2 6778; GCN-NEXT: v_mul_hi_u32 v7, s6, v0 6779; GCN-NEXT: v_mul_lo_u32 v8, s7, v0 6780; GCN-NEXT: s_mov_b32 s7, 0xf000 6781; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 6782; GCN-NEXT: v_mul_lo_u32 v7, s6, v0 6783; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5 6784; GCN-NEXT: v_mul_lo_u32 v10, v0, v5 6785; GCN-NEXT: v_mul_hi_u32 v12, v0, v5 6786; GCN-NEXT: v_mul_hi_u32 v11, v0, v7 6787; GCN-NEXT: v_mul_hi_u32 v9, v2, v7 6788; GCN-NEXT: v_mul_lo_u32 v7, v2, v7 6789; GCN-NEXT: v_mul_hi_u32 v8, v2, v5 6790; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 6791; GCN-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc 6792; GCN-NEXT: v_mul_lo_u32 v2, v2, v5 6793; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 6794; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc 6795; GCN-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc 6796; GCN-NEXT: v_add_i32_e32 v2, vcc, v7, v2 6797; GCN-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc 6798; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 6799; GCN-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[2:3] 6800; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 6801; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 6802; GCN-NEXT: v_mul_lo_u32 v2, s8, v1 6803; GCN-NEXT: v_mul_hi_u32 v3, s8, v0 6804; GCN-NEXT: v_mul_hi_u32 v5, s8, v1 6805; GCN-NEXT: v_mul_hi_u32 v7, s9, v1 6806; GCN-NEXT: v_mul_lo_u32 v1, s9, v1 6807; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 6808; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 6809; GCN-NEXT: v_mul_lo_u32 v5, s9, v0 6810; GCN-NEXT: v_mul_hi_u32 v0, s9, v0 6811; GCN-NEXT: s_mov_b32 s6, -1 6812; GCN-NEXT: v_add_i32_e32 v2, vcc, v5, v2 6813; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 6814; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc 6815; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v1 6816; GCN-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc 6817; GCN-NEXT: v_mul_lo_u32 v1, s16, v1 6818; GCN-NEXT: v_mul_hi_u32 v2, s16, v0 6819; GCN-NEXT: v_mul_lo_u32 v3, s17, v0 6820; GCN-NEXT: v_mul_lo_u32 v0, s16, v0 6821; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 6822; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 6823; GCN-NEXT: v_sub_i32_e64 v0, s[0:1], s8, v0 6824; GCN-NEXT: v_sub_i32_e32 v2, vcc, s9, v1 6825; GCN-NEXT: v_mov_b32_e32 v3, s17 6826; GCN-NEXT: v_subb_u32_e64 v2, vcc, v2, v3, s[0:1] 6827; GCN-NEXT: v_subrev_i32_e64 v5, s[2:3], s16, v0 6828; GCN-NEXT: v_subbrev_u32_e64 v7, vcc, 0, v2, s[2:3] 6829; GCN-NEXT: v_cmp_le_u32_e32 vcc, s17, v7 6830; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 6831; GCN-NEXT: v_cmp_le_u32_e32 vcc, s16, v5 6832; GCN-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc 6833; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s17, v7 6834; GCN-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc 6835; GCN-NEXT: v_subb_u32_e64 v2, vcc, v2, v3, s[2:3] 6836; GCN-NEXT: v_subrev_i32_e32 v3, vcc, s16, v5 6837; GCN-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc 6838; GCN-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, v8 6839; GCN-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[2:3] 6840; GCN-NEXT: v_mov_b32_e32 v7, s9 6841; GCN-NEXT: v_subb_u32_e64 v1, vcc, v7, v1, s[0:1] 6842; GCN-NEXT: s_ashr_i32 s0, s15, 31 6843; GCN-NEXT: s_add_u32 s8, s14, s0 6844; GCN-NEXT: s_mov_b32 s1, s0 6845; GCN-NEXT: s_addc_u32 s9, s15, s0 6846; GCN-NEXT: s_xor_b64 s[8:9], s[8:9], s[0:1] 6847; GCN-NEXT: v_cvt_f32_u32_e32 v9, s8 6848; GCN-NEXT: v_cvt_f32_u32_e32 v10, s9 6849; GCN-NEXT: v_cmp_le_u32_e32 vcc, s17, v1 6850; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 6851; GCN-NEXT: v_cmp_le_u32_e32 vcc, s16, v0 6852; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 6853; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s17, v1 6854; GCN-NEXT: v_mac_f32_e32 v9, s18, v10 6855; GCN-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc 6856; GCN-NEXT: v_rcp_f32_e32 v8, v9 6857; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 6858; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 6859; GCN-NEXT: v_cndmask_b32_e64 v2, v5, v3, s[2:3] 6860; GCN-NEXT: v_mul_f32_e32 v3, s19, v8 6861; GCN-NEXT: v_mul_f32_e32 v5, s20, v3 6862; GCN-NEXT: v_trunc_f32_e32 v5, v5 6863; GCN-NEXT: v_mac_f32_e32 v3, s21, v5 6864; GCN-NEXT: v_cvt_u32_f32_e32 v3, v3 6865; GCN-NEXT: v_cvt_u32_f32_e32 v5, v5 6866; GCN-NEXT: s_sub_u32 s2, 0, s8 6867; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6868; GCN-NEXT: v_mul_hi_u32 v2, s2, v3 6869; GCN-NEXT: v_mul_lo_u32 v7, s2, v5 6870; GCN-NEXT: s_subb_u32 s3, 0, s9 6871; GCN-NEXT: v_mul_lo_u32 v8, s3, v3 6872; GCN-NEXT: s_ashr_i32 s14, s11, 31 6873; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v7 6874; GCN-NEXT: v_mul_lo_u32 v7, s2, v3 6875; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v8 6876; GCN-NEXT: v_mul_lo_u32 v8, v3, v2 6877; GCN-NEXT: v_mul_hi_u32 v10, v3, v2 6878; GCN-NEXT: v_mul_hi_u32 v9, v3, v7 6879; GCN-NEXT: v_mul_hi_u32 v11, v5, v2 6880; GCN-NEXT: v_mul_lo_u32 v2, v5, v2 6881; GCN-NEXT: s_mov_b32 s15, s14 6882; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 6883; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc 6884; GCN-NEXT: v_mul_lo_u32 v10, v5, v7 6885; GCN-NEXT: v_mul_hi_u32 v7, v5, v7 6886; GCN-NEXT: v_xor_b32_e32 v0, s12, v0 6887; GCN-NEXT: v_xor_b32_e32 v1, s12, v1 6888; GCN-NEXT: v_add_i32_e32 v8, vcc, v10, v8 6889; GCN-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc 6890; GCN-NEXT: v_addc_u32_e32 v8, vcc, v11, v4, vcc 6891; GCN-NEXT: v_add_i32_e32 v2, vcc, v7, v2 6892; GCN-NEXT: v_add_i32_e64 v2, s[0:1], v3, v2 6893; GCN-NEXT: v_addc_u32_e32 v7, vcc, v6, v8, vcc 6894; GCN-NEXT: v_addc_u32_e64 v3, vcc, v5, v7, s[0:1] 6895; GCN-NEXT: v_mul_lo_u32 v8, s2, v3 6896; GCN-NEXT: v_mul_hi_u32 v9, s2, v2 6897; GCN-NEXT: v_mul_lo_u32 v10, s3, v2 6898; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 6899; GCN-NEXT: v_mul_lo_u32 v9, s2, v2 6900; GCN-NEXT: v_add_i32_e32 v8, vcc, v10, v8 6901; GCN-NEXT: v_mul_lo_u32 v12, v2, v8 6902; GCN-NEXT: v_mul_hi_u32 v14, v2, v8 6903; GCN-NEXT: v_mul_hi_u32 v13, v2, v9 6904; GCN-NEXT: v_mul_hi_u32 v11, v3, v9 6905; GCN-NEXT: v_mul_lo_u32 v9, v3, v9 6906; GCN-NEXT: v_mul_hi_u32 v10, v3, v8 6907; GCN-NEXT: v_add_i32_e32 v12, vcc, v13, v12 6908; GCN-NEXT: v_addc_u32_e32 v13, vcc, 0, v14, vcc 6909; GCN-NEXT: v_mul_lo_u32 v3, v3, v8 6910; GCN-NEXT: v_add_i32_e32 v9, vcc, v12, v9 6911; GCN-NEXT: v_addc_u32_e32 v9, vcc, v13, v11, vcc 6912; GCN-NEXT: v_addc_u32_e32 v8, vcc, v10, v4, vcc 6913; GCN-NEXT: v_add_i32_e32 v3, vcc, v9, v3 6914; GCN-NEXT: v_addc_u32_e32 v8, vcc, v6, v8, vcc 6915; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7 6916; GCN-NEXT: v_addc_u32_e64 v5, vcc, v5, v8, s[0:1] 6917; GCN-NEXT: s_add_u32 s0, s10, s14 6918; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 6919; GCN-NEXT: s_addc_u32 s1, s11, s14 6920; GCN-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] 6921; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 6922; GCN-NEXT: v_mul_lo_u32 v5, s10, v3 6923; GCN-NEXT: v_mul_hi_u32 v7, s10, v2 6924; GCN-NEXT: v_mul_hi_u32 v9, s10, v3 6925; GCN-NEXT: v_mul_hi_u32 v10, s11, v3 6926; GCN-NEXT: v_mul_lo_u32 v3, s11, v3 6927; GCN-NEXT: v_add_i32_e32 v5, vcc, v7, v5 6928; GCN-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc 6929; GCN-NEXT: v_mul_lo_u32 v9, s11, v2 6930; GCN-NEXT: v_mul_hi_u32 v2, s11, v2 6931; GCN-NEXT: v_mov_b32_e32 v8, s12 6932; GCN-NEXT: v_add_i32_e32 v5, vcc, v9, v5 6933; GCN-NEXT: v_addc_u32_e32 v2, vcc, v7, v2, vcc 6934; GCN-NEXT: v_addc_u32_e32 v4, vcc, v10, v4, vcc 6935; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 6936; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v4, vcc 6937; GCN-NEXT: v_mul_lo_u32 v3, s8, v3 6938; GCN-NEXT: v_mul_hi_u32 v4, s8, v2 6939; GCN-NEXT: v_mul_lo_u32 v5, s9, v2 6940; GCN-NEXT: v_mul_lo_u32 v2, s8, v2 6941; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 6942; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc 6943; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 6944; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 6945; GCN-NEXT: v_sub_i32_e64 v2, s[0:1], s10, v2 6946; GCN-NEXT: v_sub_i32_e32 v4, vcc, s11, v3 6947; GCN-NEXT: v_mov_b32_e32 v5, s9 6948; GCN-NEXT: v_subb_u32_e64 v4, vcc, v4, v5, s[0:1] 6949; GCN-NEXT: v_subrev_i32_e64 v6, s[2:3], s8, v2 6950; GCN-NEXT: v_subbrev_u32_e64 v7, vcc, 0, v4, s[2:3] 6951; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v7 6952; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 6953; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v6 6954; GCN-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc 6955; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s9, v7 6956; GCN-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc 6957; GCN-NEXT: v_subb_u32_e64 v4, vcc, v4, v5, s[2:3] 6958; GCN-NEXT: v_subrev_i32_e32 v5, vcc, s8, v6 6959; GCN-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc 6960; GCN-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, v8 6961; GCN-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[2:3] 6962; GCN-NEXT: v_mov_b32_e32 v7, s11 6963; GCN-NEXT: v_subb_u32_e64 v3, vcc, v7, v3, s[0:1] 6964; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 6965; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 6966; GCN-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 6967; GCN-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 6968; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s9, v3 6969; GCN-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc 6970; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 6971; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 6972; GCN-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[2:3] 6973; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 6974; GCN-NEXT: v_xor_b32_e32 v2, s14, v2 6975; GCN-NEXT: v_xor_b32_e32 v3, s14, v3 6976; GCN-NEXT: v_mov_b32_e32 v4, s14 6977; GCN-NEXT: v_subrev_i32_e32 v2, vcc, s14, v2 6978; GCN-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc 6979; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 6980; GCN-NEXT: s_endpgm 6981 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 6982 %r = srem <2 x i64> %x, %shl.y 6983 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 6984 ret void 6985} 6986