; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update
; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare -amdgpu-bypass-slow-div=0 %s | FileCheck %s
; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX6 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX9 %s

; Each kernel below applies one integer division or remainder instruction to
; its two scalar arguments and stores the result through %out. The first RUN
; line checks the IR that opt emits after -amdgpu-codegenprepare; the two llc
; RUN lines check the final ISA for tahiti and gfx900 respectively. All
; assertion lines are generated: regenerate them with the script named in the
; NOTE line rather than editing them by hand.

; Unsigned 32-bit division: stores (udiv %x, %y) to %out.
define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
; CHECK-LABEL: @udiv_i32(
; CHECK-NEXT:    [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float
; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000
; CHECK-NEXT:    [[TMP4:%.*]] = fptoui float [[TMP3]] to i32
; CHECK-NEXT:    [[TMP5:%.*]] = sub i32 0, [[Y]]
; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]]
; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP4]] to i64
; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]]
; CHECK-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
; CHECK-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP9]], 32
; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]]
; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64
; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
; CHECK-NEXT:    [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]]
; CHECK-NEXT:    [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]]
; CHECK-NEXT:    [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]]
; CHECK-NEXT:    [[TMP23:%.*]] = add i32 [[TMP19]], 1
; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP19]]
; CHECK-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP21]], [[Y]]
; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP22]], i32 [[TMP25]], i32 [[TMP21]]
; CHECK-NEXT:    [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[Y]]
; CHECK-NEXT:    [[TMP28:%.*]] = add i32 [[TMP24]], 1
; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP24]]
; CHECK-NEXT:    store i32 [[TMP29]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
; GFX6-LABEL: udiv_i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
; GFX6-NEXT:    s_sub_i32 s4, 0, s3
; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT:    v_mul_lo_u32 v1, s4, v0
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
; GFX6-NEXT:    v_mul_lo_u32 v1, v0, s3
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s2, v1
; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX9-LABEL: udiv_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
; GFX9-NEXT:    s_sub_i32 s4, 0, s3
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT:    v_mul_lo_u32 v1, s4, v0
; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s3
; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
; GFX9-NEXT:    v_sub_u32_e32 v1, s2, v1
; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v1
; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
; GFX9-NEXT:    s_endpgm
  %r = udiv i32 %x, %y
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; Unsigned 32-bit remainder: stores (urem %x, %y) to %out.
define amdgpu_kernel void @urem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
; CHECK-LABEL: @urem_i32(
; CHECK-NEXT:    [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float
; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000
; CHECK-NEXT:    [[TMP4:%.*]] = fptoui float [[TMP3]] to i32
; CHECK-NEXT:    [[TMP5:%.*]] = sub i32 0, [[Y]]
; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]]
; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP4]] to i64
; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]]
; CHECK-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
; CHECK-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP9]], 32
; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]]
; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64
; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
; CHECK-NEXT:    [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]]
; CHECK-NEXT:    [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]]
; CHECK-NEXT:    [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]]
; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP21]], [[Y]]
; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP21]]
; CHECK-NEXT:    [[TMP25:%.*]] = icmp uge i32 [[TMP24]], [[Y]]
; CHECK-NEXT:    [[TMP26:%.*]] = sub i32 [[TMP24]], [[Y]]
; CHECK-NEXT:    [[TMP27:%.*]] = select i1 [[TMP25]], i32 [[TMP26]], i32 [[TMP24]]
; CHECK-NEXT:    store i32 [[TMP27]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
; GFX6-LABEL: urem_i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s5
; GFX6-NEXT:    s_sub_i32 s2, 0, s5
; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT:    v_mul_lo_u32 v1, s2, v0
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s5
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s5, v0
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s5, v0
; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s5, v0
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s5, v0
; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX9-LABEL: urem_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
; GFX9-NEXT:    s_sub_i32 s4, 0, s3
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT:    v_readfirstlane_b32 s5, v0
; GFX9-NEXT:    s_mul_i32 s4, s4, s5
; GFX9-NEXT:    s_mul_hi_u32 s4, s5, s4
; GFX9-NEXT:    s_add_i32 s5, s5, s4
; GFX9-NEXT:    s_mul_hi_u32 s4, s2, s5
; GFX9-NEXT:    s_mul_i32 s4, s4, s3
; GFX9-NEXT:    s_sub_i32 s2, s2, s4
; GFX9-NEXT:    s_sub_i32 s4, s2, s3
; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
; GFX9-NEXT:    s_cselect_b32 s2, s4, s2
; GFX9-NEXT:    s_sub_i32 s4, s2, s3
; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
; GFX9-NEXT:    s_cselect_b32 s2, s4, s2
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX9-NEXT:    s_endpgm
  %r = urem i32 %x, %y
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; Signed 32-bit division: stores (sdiv %x, %y) to %out.
define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
; CHECK-LABEL: @sdiv_i32(
; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[X]], [[TMP1]]
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[Y]], [[TMP2]]
; CHECK-NEXT:    [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP1]]
; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP2]]
; CHECK-NEXT:    [[TMP8:%.*]] = uitofp i32 [[TMP7]] to float
; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP8]])
; CHECK-NEXT:    [[TMP10:%.*]] = fmul fast float [[TMP9]], 0x41EFFFFFC0000000
; CHECK-NEXT:    [[TMP11:%.*]] = fptoui float [[TMP10]] to i32
; CHECK-NEXT:    [[TMP12:%.*]] = sub i32 0, [[TMP7]]
; CHECK-NEXT:    [[TMP13:%.*]] = mul i32 [[TMP12]], [[TMP11]]
; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP11]] to i64
; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP11]], [[TMP19]]
; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP6]] to i64
; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP20]] to i64
; CHECK-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP21]], [[TMP22]]
; CHECK-NEXT:    [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
; CHECK-NEXT:    [[TMP25:%.*]] = lshr i64 [[TMP23]], 32
; CHECK-NEXT:    [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
; CHECK-NEXT:    [[TMP27:%.*]] = mul i32 [[TMP26]], [[TMP7]]
; CHECK-NEXT:    [[TMP28:%.*]] = sub i32 [[TMP6]], [[TMP27]]
; CHECK-NEXT:    [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP7]]
; CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[TMP26]], 1
; CHECK-NEXT:    [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]]
; CHECK-NEXT:    [[TMP32:%.*]] = sub i32 [[TMP28]], [[TMP7]]
; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP29]], i32 [[TMP32]], i32 [[TMP28]]
; CHECK-NEXT:    [[TMP34:%.*]] = icmp uge i32 [[TMP33]], [[TMP7]]
; CHECK-NEXT:    [[TMP35:%.*]] = add i32 [[TMP31]], 1
; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP34]], i32 [[TMP35]], i32 [[TMP31]]
; CHECK-NEXT:    [[TMP37:%.*]] = xor i32 [[TMP36]], [[TMP3]]
; CHECK-NEXT:    [[TMP38:%.*]] = sub i32 [[TMP37]], [[TMP3]]
; CHECK-NEXT:    store i32 [[TMP38]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
; GFX6-LABEL: sdiv_i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
; GFX6-NEXT:    s_add_i32 s3, s3, s8
; GFX6-NEXT:    s_xor_b32 s3, s3, s8
; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
; GFX6-NEXT:    s_sub_i32 s4, 0, s3
; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT:    v_mul_lo_u32 v1, s4, v0
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    s_ashr_i32 s0, s2, 31
; GFX6-NEXT:    s_add_i32 s1, s2, s0
; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
; GFX6-NEXT:    s_xor_b32 s1, s1, s0
; GFX6-NEXT:    s_xor_b32 s2, s0, s8
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
; GFX6-NEXT:    v_mul_lo_u32 v1, v0, s3
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX6-NEXT:    v_xor_b32_e32 v0, s2, v0
; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX9-LABEL: sdiv_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
; GFX9-NEXT:    s_add_i32 s3, s3, s4
; GFX9-NEXT:    s_xor_b32 s3, s3, s4
; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
; GFX9-NEXT:    s_sub_i32 s5, 0, s3
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT:    v_mul_lo_u32 v1, s5, v0
; GFX9-NEXT:    s_ashr_i32 s5, s2, 31
; GFX9-NEXT:    s_add_i32 s2, s2, s5
; GFX9-NEXT:    s_xor_b32 s2, s2, s5
; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
; GFX9-NEXT:    s_xor_b32 s4, s5, s4
; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s3
; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
; GFX9-NEXT:    v_sub_u32_e32 v1, s2, v1
; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v1
; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
; GFX9-NEXT:    s_endpgm
  %r = sdiv i32 %x, %y
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; Signed 32-bit remainder: stores (srem %x, %y) to %out.
define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
; CHECK-LABEL: @srem_i32(
; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[X]], [[TMP1]]
; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[Y]], [[TMP2]]
; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP1]]
; CHECK-NEXT:    [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP2]]
; CHECK-NEXT:    [[TMP7:%.*]] = uitofp i32 [[TMP6]] to float
; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP8]], 0x41EFFFFFC0000000
; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP9]] to i32
; CHECK-NEXT:    [[TMP11:%.*]] = sub i32 0, [[TMP6]]
; CHECK-NEXT:    [[TMP12:%.*]] = mul i32 [[TMP11]], [[TMP10]]
; CHECK-NEXT:    [[TMP13:%.*]] = zext i32 [[TMP10]] to i64
; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP12]] to i64
; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP13]], [[TMP14]]
; CHECK-NEXT:    [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
; CHECK-NEXT:    [[TMP17:%.*]] = lshr i64 [[TMP15]], 32
; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
; CHECK-NEXT:    [[TMP19:%.*]] = add i32 [[TMP10]], [[TMP18]]
; CHECK-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP19]] to i64
; CHECK-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP20]], [[TMP21]]
; CHECK-NEXT:    [[TMP23:%.*]] = trunc i64 [[TMP22]] to i32
; CHECK-NEXT:    [[TMP24:%.*]] = lshr i64 [[TMP22]], 32
; CHECK-NEXT:    [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
; CHECK-NEXT:    [[TMP26:%.*]] = mul i32 [[TMP25]], [[TMP6]]
; CHECK-NEXT:    [[TMP27:%.*]] = sub i32 [[TMP5]], [[TMP26]]
; CHECK-NEXT:    [[TMP28:%.*]] = icmp uge i32 [[TMP27]], [[TMP6]]
; CHECK-NEXT:    [[TMP29:%.*]] = sub i32 [[TMP27]], [[TMP6]]
; CHECK-NEXT:    [[TMP30:%.*]] = select i1 [[TMP28]], i32 [[TMP29]], i32 [[TMP27]]
; CHECK-NEXT:    [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP6]]
; CHECK-NEXT:    [[TMP32:%.*]] = sub i32 [[TMP30]], [[TMP6]]
; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP30]]
; CHECK-NEXT:    [[TMP34:%.*]] = xor i32 [[TMP33]], [[TMP1]]
; CHECK-NEXT:    [[TMP35:%.*]] = sub i32 [[TMP34]], [[TMP1]]
; CHECK-NEXT:    store i32 [[TMP35]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
; GFX6-LABEL: srem_i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
; GFX6-NEXT:    s_add_i32 s3, s3, s4
; GFX6-NEXT:    s_xor_b32 s4, s3, s4
; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s4
; GFX6-NEXT:    s_sub_i32 s3, 0, s4
; GFX6-NEXT:    s_ashr_i32 s5, s2, 31
; GFX6-NEXT:    s_add_i32 s2, s2, s5
; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX6-NEXT:    s_xor_b32 s6, s2, s5
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT:    v_mul_hi_u32 v0, s6, v0
; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s4
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX6-NEXT:    v_xor_b32_e32 v0, s5, v0
; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s5, v0
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX9-LABEL: srem_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
; GFX9-NEXT:    s_add_i32 s3, s3, s4
; GFX9-NEXT:    s_xor_b32 s3, s3, s4
; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
; GFX9-NEXT:    s_sub_i32 s5, 0, s3
; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
; GFX9-NEXT:    s_add_i32 s2, s2, s4
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT:    s_xor_b32 s2, s2, s4
; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
; GFX9-NEXT:    s_mul_i32 s5, s5, s6
; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s5
; GFX9-NEXT:    s_add_i32 s6, s6, s5
; GFX9-NEXT:    s_mul_hi_u32 s5, s2, s6
; GFX9-NEXT:    s_mul_i32 s5, s5, s3
; GFX9-NEXT:    s_sub_i32 s2, s2, s5
; GFX9-NEXT:    s_sub_i32 s5, s2, s3
; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
; GFX9-NEXT:    s_sub_i32 s5, s2, s3
; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
; GFX9-NEXT:    s_xor_b32 s2, s2, s4
; GFX9-NEXT:    s_sub_i32 s2, s2, s4
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX9-NEXT:    s_endpgm
  %r = srem i32 %x, %y
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; Unsigned 16-bit division: stores (udiv %x, %y) to %out.
define amdgpu_kernel void @udiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
; CHECK-LABEL: @udiv_i16(
; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 65535
; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
; CHECK-NEXT:    store i16 [[TMP17]], i16 addrspace(1)* [[OUT:%.*]], align 2
; CHECK-NEXT:    ret void
;
; GFX6-LABEL: udiv_i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s2, s[0:1], 0xb
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_lshr_b32 s3, s2, 16
; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
; GFX6-NEXT:    s_and_b32 s2, s2, 0xffff
; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s2
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX9-LABEL: udiv_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT:    v_mov_b32_e32 v3, 0
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_lshr_b32 s3, s2, 16
; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
; GFX9-NEXT:    s_and_b32 s2, s2, 0xffff
; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s2
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v2
; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
; GFX9-NEXT:    global_store_short v3, v0, s[0:1]
; GFX9-NEXT:    s_endpgm
  %r = udiv i16 %x, %y
  store i16 %r, i16 addrspace(1)* %out
  ret void
}

; Unsigned 16-bit remainder: stores (urem %x, %y) to %out.
define amdgpu_kernel void @urem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
; CHECK-LABEL: @urem_i16(
; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 65535
; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
; CHECK-NEXT:    store i16 [[TMP19]], i16 addrspace(1)* [[OUT:%.*]], align 2
; CHECK-NEXT:    ret void
;
; GFX6-LABEL: urem_i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_lshr_b32 s2, s4, 16
; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
; GFX6-NEXT:    s_and_b32 s3, s4, 0xffff
; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s3
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX9-LABEL: urem_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_lshr_b32 s3, s2, 16
; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
; GFX9-NEXT:    s_and_b32 s4, s2, 0xffff
; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s4
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v2
; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
; GFX9-NEXT:    s_endpgm
  %r = urem i16 %x, %y
  store i16 %r, i16 addrspace(1)* %out
  ret void
}

; Signed 16-bit division: stores (sdiv %x, %y) to %out.
define amdgpu_kernel void @sdiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
; CHECK-LABEL: @sdiv_i16(
; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 16
; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 16
; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
; CHECK-NEXT:    store i16 [[TMP21]], i16 addrspace(1)* [[OUT:%.*]], align 2
; CHECK-NEXT:    ret void
;
; GFX6-LABEL: sdiv_i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_ashr_i32 s5, s4, 16
; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s5
; GFX6-NEXT:    s_sext_i32_i16 s4, s4
; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s4
; GFX6-NEXT:    s_xor_b32 s4, s4, s5
; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
; GFX6-NEXT:    s_or_b32 s4, s4, 1
; GFX6-NEXT:    v_mov_b32_e32 v3, s4
; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX9-LABEL: sdiv_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_ashr_i32 s0, s4, 16
; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
; GFX9-NEXT:    s_sext_i32_i16 s1, s4
; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s1
; GFX9-NEXT:    s_xor_b32 s0, s1, s0
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
; GFX9-NEXT:    s_or_b32 s4, s0, 1
; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
; GFX9-NEXT:    v_add_u32_e32 v0, s0, v3
; GFX9-NEXT:    global_store_short v1, v0, s[2:3]
; GFX9-NEXT:    s_endpgm
  %r = sdiv i16 %x, %y
  store i16 %r, i16 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @srem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
; CHECK-LABEL: @srem_i16(
; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 16
; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
; CHECK-NEXT:    store i16 [[TMP23]], i16 addrspace(1)* [[OUT:%.*]], align 2
; CHECK-NEXT:    ret void
;
; GFX6-LABEL: srem_i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:
s_ashr_i32 s2, s4, 16 687; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s2 688; GFX6-NEXT: s_sext_i32_i16 s3, s4 689; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s3 690; GFX6-NEXT: s_xor_b32 s3, s3, s2 691; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 692; GFX6-NEXT: s_ashr_i32 s3, s3, 30 693; GFX6-NEXT: s_or_b32 s3, s3, 1 694; GFX6-NEXT: v_mov_b32_e32 v3, s3 695; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 696; GFX6-NEXT: v_trunc_f32_e32 v2, v2 697; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 698; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 699; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 700; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 701; GFX6-NEXT: s_mov_b32 s3, 0xf000 702; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 703; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 704; GFX6-NEXT: s_mov_b32 s2, -1 705; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 706; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 707; GFX6-NEXT: s_endpgm 708; 709; GFX9-LABEL: srem_i16: 710; GFX9: ; %bb.0: 711; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 712; GFX9-NEXT: s_waitcnt lgkmcnt(0) 713; GFX9-NEXT: s_ashr_i32 s5, s4, 16 714; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s5 715; GFX9-NEXT: s_sext_i32_i16 s2, s4 716; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s2 717; GFX9-NEXT: s_xor_b32 s2, s2, s5 718; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 719; GFX9-NEXT: s_ashr_i32 s2, s2, 30 720; GFX9-NEXT: s_or_b32 s6, s2, 1 721; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 722; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 723; GFX9-NEXT: v_trunc_f32_e32 v2, v2 724; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 725; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 726; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| 727; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec 728; GFX9-NEXT: s_cselect_b32 s2, s6, 0 729; GFX9-NEXT: v_add_u32_e32 v0, s2, v2 730; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 731; GFX9-NEXT: v_mov_b32_e32 v1, 0 732; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 733; GFX9-NEXT: s_waitcnt lgkmcnt(0) 734; GFX9-NEXT: global_store_short v1, v0, s[0:1] 735; GFX9-NEXT: s_endpgm 736 %r = srem i16 %x, %y 737 store i16 %r, i16 
addrspace(1)* %out 738 ret void 739} 740 741define amdgpu_kernel void @udiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) { 742; CHECK-LABEL: @udiv_i8( 743; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32 744; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32 745; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 746; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 747; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 748; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 749; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 750; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 751; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 752; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 753; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 754; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 755; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 756; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 757; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 758; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 255 759; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 760; CHECK-NEXT: store i8 [[TMP17]], i8 addrspace(1)* [[OUT:%.*]], align 1 761; CHECK-NEXT: ret void 762; 763; GFX6-LABEL: udiv_i8: 764; GFX6: ; %bb.0: 765; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 766; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 767; GFX6-NEXT: s_mov_b32 s3, 0xf000 768; GFX6-NEXT: s_mov_b32 s2, -1 769; GFX6-NEXT: s_waitcnt lgkmcnt(0) 770; GFX6-NEXT: v_cvt_f32_ubyte1_e32 v0, s4 771; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 772; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 773; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 774; GFX6-NEXT: v_trunc_f32_e32 v1, v1 775; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 776; GFX6-NEXT: v_mad_f32 v1, -v1, v0, v2 777; GFX6-NEXT: 
v_cmp_ge_f32_e64 vcc, |v1|, v0 778; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 779; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 780; GFX6-NEXT: s_endpgm 781; 782; GFX9-LABEL: udiv_i8: 783; GFX9: ; %bb.0: 784; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 785; GFX9-NEXT: v_mov_b32_e32 v2, 0 786; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 787; GFX9-NEXT: s_waitcnt lgkmcnt(0) 788; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s2 789; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 790; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 791; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 792; GFX9-NEXT: v_trunc_f32_e32 v1, v1 793; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v1 794; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v3 795; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 796; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc 797; GFX9-NEXT: global_store_byte v2, v0, s[0:1] 798; GFX9-NEXT: s_endpgm 799 %r = udiv i8 %x, %y 800 store i8 %r, i8 addrspace(1)* %out 801 ret void 802} 803 804define amdgpu_kernel void @urem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) { 805; CHECK-LABEL: @urem_i8( 806; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32 807; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32 808; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 809; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 810; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 811; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 812; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 813; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 814; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 815; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 816; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 817; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 818; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 819; 
CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 820; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 821; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]] 822; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]] 823; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 255 824; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i8 825; CHECK-NEXT: store i8 [[TMP19]], i8 addrspace(1)* [[OUT:%.*]], align 1 826; CHECK-NEXT: ret void 827; 828; GFX6-LABEL: urem_i8: 829; GFX6: ; %bb.0: 830; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 831; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 832; GFX6-NEXT: s_mov_b32 s3, 0xf000 833; GFX6-NEXT: s_waitcnt lgkmcnt(0) 834; GFX6-NEXT: v_cvt_f32_ubyte1_e32 v0, s4 835; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 836; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 837; GFX6-NEXT: s_lshr_b32 s2, s4, 8 838; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 839; GFX6-NEXT: v_trunc_f32_e32 v1, v1 840; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 841; GFX6-NEXT: v_mad_f32 v1, -v1, v0, v2 842; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 843; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 844; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 845; GFX6-NEXT: s_mov_b32 s2, -1 846; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 847; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 848; GFX6-NEXT: s_endpgm 849; 850; GFX9-LABEL: urem_i8: 851; GFX9: ; %bb.0: 852; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 853; GFX9-NEXT: s_waitcnt lgkmcnt(0) 854; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s2 855; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 856; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 857; GFX9-NEXT: s_lshr_b32 s3, s2, 8 858; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 859; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1 860; GFX9-NEXT: v_trunc_f32_e32 v1, v1 861; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v1 862; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v2 863; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 864; GFX9-NEXT: v_mov_b32_e32 v1, 0 865; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 866; GFX9-NEXT: v_mul_lo_u32 v0, v0, 
s3 867; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 868; GFX9-NEXT: s_waitcnt lgkmcnt(0) 869; GFX9-NEXT: global_store_byte v1, v0, s[0:1] 870; GFX9-NEXT: s_endpgm 871 %r = urem i8 %x, %y 872 store i8 %r, i8 addrspace(1)* %out 873 ret void 874} 875 876define amdgpu_kernel void @sdiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) { 877; CHECK-LABEL: @sdiv_i8( 878; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32 879; CHECK-NEXT: [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32 880; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 881; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 882; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 883; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 884; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 885; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 886; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 887; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 888; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 889; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 890; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 891; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 892; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 893; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 894; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 895; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 896; CHECK-NEXT: [[TMP19:%.*]] = shl i32 [[TMP18]], 24 897; CHECK-NEXT: [[TMP20:%.*]] = ashr i32 [[TMP19]], 24 898; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i8 899; CHECK-NEXT: store i8 [[TMP21]], i8 addrspace(1)* [[OUT:%.*]], align 1 900; CHECK-NEXT: ret void 901; 902; GFX6-LABEL: sdiv_i8: 903; GFX6: ; %bb.0: 904; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 905; GFX6-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x9 906; GFX6-NEXT: s_mov_b32 s3, 0xf000 907; GFX6-NEXT: s_mov_b32 s2, -1 908; GFX6-NEXT: s_waitcnt lgkmcnt(0) 909; GFX6-NEXT: s_bfe_i32 s5, s4, 0x80008 910; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 911; GFX6-NEXT: s_sext_i32_i8 s4, s4 912; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 913; GFX6-NEXT: s_xor_b32 s4, s4, s5 914; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 915; GFX6-NEXT: s_ashr_i32 s4, s4, 30 916; GFX6-NEXT: s_or_b32 s4, s4, 1 917; GFX6-NEXT: v_mov_b32_e32 v3, s4 918; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 919; GFX6-NEXT: v_trunc_f32_e32 v2, v2 920; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 921; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 922; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 923; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 924; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 925; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 926; GFX6-NEXT: s_endpgm 927; 928; GFX9-LABEL: sdiv_i8: 929; GFX9: ; %bb.0: 930; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 931; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 932; GFX9-NEXT: v_mov_b32_e32 v1, 0 933; GFX9-NEXT: s_waitcnt lgkmcnt(0) 934; GFX9-NEXT: s_bfe_i32 s0, s4, 0x80008 935; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 936; GFX9-NEXT: s_sext_i32_i8 s1, s4 937; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 938; GFX9-NEXT: s_xor_b32 s0, s1, s0 939; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 940; GFX9-NEXT: s_ashr_i32 s0, s0, 30 941; GFX9-NEXT: s_or_b32 s4, s0, 1 942; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 943; GFX9-NEXT: v_trunc_f32_e32 v3, v3 944; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 945; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 946; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 947; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 948; GFX9-NEXT: s_cselect_b32 s0, s4, 0 949; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 950; GFX9-NEXT: global_store_byte v1, v0, s[2:3] 951; GFX9-NEXT: s_endpgm 952 %r = sdiv i8 %x, %y 953 store i8 %r, i8 addrspace(1)* %out 954 ret void 955} 956 957define amdgpu_kernel void @srem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) { 958; CHECK-LABEL: 
@srem_i8( 959; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32 960; CHECK-NEXT: [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32 961; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 962; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 963; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 964; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 965; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 966; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 967; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 968; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 969; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 970; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 971; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 972; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 973; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 974; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 975; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 976; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 977; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]] 978; CHECK-NEXT: [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]] 979; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 24 980; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 24 981; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i8 982; CHECK-NEXT: store i8 [[TMP23]], i8 addrspace(1)* [[OUT:%.*]], align 1 983; CHECK-NEXT: ret void 984; 985; GFX6-LABEL: srem_i8: 986; GFX6: ; %bb.0: 987; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 988; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 989; GFX6-NEXT: s_waitcnt lgkmcnt(0) 990; GFX6-NEXT: s_bfe_i32 s2, s4, 0x80008 991; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s2 992; GFX6-NEXT: s_sext_i32_i8 s5, s4 993; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 
994; GFX6-NEXT: s_xor_b32 s2, s5, s2 995; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 996; GFX6-NEXT: s_ashr_i32 s2, s2, 30 997; GFX6-NEXT: s_or_b32 s2, s2, 1 998; GFX6-NEXT: v_mov_b32_e32 v3, s2 999; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 1000; GFX6-NEXT: v_trunc_f32_e32 v2, v2 1001; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 1002; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 1003; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 1004; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 1005; GFX6-NEXT: s_lshr_b32 s3, s4, 8 1006; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 1007; GFX6-NEXT: v_mul_lo_u32 v0, v0, s3 1008; GFX6-NEXT: s_mov_b32 s3, 0xf000 1009; GFX6-NEXT: s_mov_b32 s2, -1 1010; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 1011; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 1012; GFX6-NEXT: s_endpgm 1013; 1014; GFX9-LABEL: srem_i8: 1015; GFX9: ; %bb.0: 1016; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 1017; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1018; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1019; GFX9-NEXT: s_bfe_i32 s0, s4, 0x80008 1020; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 1021; GFX9-NEXT: s_sext_i32_i8 s1, s4 1022; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s1 1023; GFX9-NEXT: s_xor_b32 s0, s1, s0 1024; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 1025; GFX9-NEXT: s_ashr_i32 s0, s0, 30 1026; GFX9-NEXT: s_lshr_b32 s5, s4, 8 1027; GFX9-NEXT: s_or_b32 s6, s0, 1 1028; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 1029; GFX9-NEXT: v_trunc_f32_e32 v2, v2 1030; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 1031; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 1032; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 1033; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 1034; GFX9-NEXT: s_cselect_b32 s0, s6, 0 1035; GFX9-NEXT: v_add_u32_e32 v0, s0, v2 1036; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 1037; GFX9-NEXT: v_mov_b32_e32 v1, 0 1038; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 1039; GFX9-NEXT: global_store_byte v1, v0, s[2:3] 1040; GFX9-NEXT: s_endpgm 1041 %r = srem i8 %x, %y 1042 store i8 %r, i8 addrspace(1)* %out 1043 ret void 1044} 1045 1046define amdgpu_kernel void 
@udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 1047; CHECK-LABEL: @udiv_v4i32( 1048; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 1049; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 1050; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 1051; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 1052; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 1053; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 1054; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 1055; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 1056; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 1057; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 1058; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 1059; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 1060; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 1061; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 1062; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 1063; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 1064; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 1065; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 1066; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 1067; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 1068; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 1069; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 1070; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 1071; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 1072; CHECK-NEXT: [[TMP25:%.*]] = add i32 [[TMP21]], 1 1073; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]] 1074; CHECK-NEXT: [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]] 1075; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]] 1076; CHECK-NEXT: [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]] 1077; CHECK-NEXT: 
[[TMP30:%.*]] = add i32 [[TMP26]], 1 1078; CHECK-NEXT: [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]] 1079; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> undef, i32 [[TMP31]], i64 0 1080; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i32> [[X]], i64 1 1081; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[Y]], i64 1 1082; CHECK-NEXT: [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float 1083; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 1084; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000 1085; CHECK-NEXT: [[TMP38:%.*]] = fptoui float [[TMP37]] to i32 1086; CHECK-NEXT: [[TMP39:%.*]] = sub i32 0, [[TMP34]] 1087; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]] 1088; CHECK-NEXT: [[TMP41:%.*]] = zext i32 [[TMP38]] to i64 1089; CHECK-NEXT: [[TMP42:%.*]] = zext i32 [[TMP40]] to i64 1090; CHECK-NEXT: [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]] 1091; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 1092; CHECK-NEXT: [[TMP45:%.*]] = lshr i64 [[TMP43]], 32 1093; CHECK-NEXT: [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32 1094; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]] 1095; CHECK-NEXT: [[TMP48:%.*]] = zext i32 [[TMP33]] to i64 1096; CHECK-NEXT: [[TMP49:%.*]] = zext i32 [[TMP47]] to i64 1097; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]] 1098; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 1099; CHECK-NEXT: [[TMP52:%.*]] = lshr i64 [[TMP50]], 32 1100; CHECK-NEXT: [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32 1101; CHECK-NEXT: [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]] 1102; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]] 1103; CHECK-NEXT: [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]] 1104; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP53]], 1 1105; CHECK-NEXT: [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]] 1106; CHECK-NEXT: [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]] 1107; CHECK-NEXT: [[TMP60:%.*]] = select i1 
[[TMP56]], i32 [[TMP59]], i32 [[TMP55]] 1108; CHECK-NEXT: [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]] 1109; CHECK-NEXT: [[TMP62:%.*]] = add i32 [[TMP58]], 1 1110; CHECK-NEXT: [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]] 1111; CHECK-NEXT: [[TMP64:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP63]], i64 1 1112; CHECK-NEXT: [[TMP65:%.*]] = extractelement <4 x i32> [[X]], i64 2 1113; CHECK-NEXT: [[TMP66:%.*]] = extractelement <4 x i32> [[Y]], i64 2 1114; CHECK-NEXT: [[TMP67:%.*]] = uitofp i32 [[TMP66]] to float 1115; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP67]]) 1116; CHECK-NEXT: [[TMP69:%.*]] = fmul fast float [[TMP68]], 0x41EFFFFFC0000000 1117; CHECK-NEXT: [[TMP70:%.*]] = fptoui float [[TMP69]] to i32 1118; CHECK-NEXT: [[TMP71:%.*]] = sub i32 0, [[TMP66]] 1119; CHECK-NEXT: [[TMP72:%.*]] = mul i32 [[TMP71]], [[TMP70]] 1120; CHECK-NEXT: [[TMP73:%.*]] = zext i32 [[TMP70]] to i64 1121; CHECK-NEXT: [[TMP74:%.*]] = zext i32 [[TMP72]] to i64 1122; CHECK-NEXT: [[TMP75:%.*]] = mul i64 [[TMP73]], [[TMP74]] 1123; CHECK-NEXT: [[TMP76:%.*]] = trunc i64 [[TMP75]] to i32 1124; CHECK-NEXT: [[TMP77:%.*]] = lshr i64 [[TMP75]], 32 1125; CHECK-NEXT: [[TMP78:%.*]] = trunc i64 [[TMP77]] to i32 1126; CHECK-NEXT: [[TMP79:%.*]] = add i32 [[TMP70]], [[TMP78]] 1127; CHECK-NEXT: [[TMP80:%.*]] = zext i32 [[TMP65]] to i64 1128; CHECK-NEXT: [[TMP81:%.*]] = zext i32 [[TMP79]] to i64 1129; CHECK-NEXT: [[TMP82:%.*]] = mul i64 [[TMP80]], [[TMP81]] 1130; CHECK-NEXT: [[TMP83:%.*]] = trunc i64 [[TMP82]] to i32 1131; CHECK-NEXT: [[TMP84:%.*]] = lshr i64 [[TMP82]], 32 1132; CHECK-NEXT: [[TMP85:%.*]] = trunc i64 [[TMP84]] to i32 1133; CHECK-NEXT: [[TMP86:%.*]] = mul i32 [[TMP85]], [[TMP66]] 1134; CHECK-NEXT: [[TMP87:%.*]] = sub i32 [[TMP65]], [[TMP86]] 1135; CHECK-NEXT: [[TMP88:%.*]] = icmp uge i32 [[TMP87]], [[TMP66]] 1136; CHECK-NEXT: [[TMP89:%.*]] = add i32 [[TMP85]], 1 1137; CHECK-NEXT: [[TMP90:%.*]] = select i1 [[TMP88]], i32 
[[TMP89]], i32 [[TMP85]] 1138; CHECK-NEXT: [[TMP91:%.*]] = sub i32 [[TMP87]], [[TMP66]] 1139; CHECK-NEXT: [[TMP92:%.*]] = select i1 [[TMP88]], i32 [[TMP91]], i32 [[TMP87]] 1140; CHECK-NEXT: [[TMP93:%.*]] = icmp uge i32 [[TMP92]], [[TMP66]] 1141; CHECK-NEXT: [[TMP94:%.*]] = add i32 [[TMP90]], 1 1142; CHECK-NEXT: [[TMP95:%.*]] = select i1 [[TMP93]], i32 [[TMP94]], i32 [[TMP90]] 1143; CHECK-NEXT: [[TMP96:%.*]] = insertelement <4 x i32> [[TMP64]], i32 [[TMP95]], i64 2 1144; CHECK-NEXT: [[TMP97:%.*]] = extractelement <4 x i32> [[X]], i64 3 1145; CHECK-NEXT: [[TMP98:%.*]] = extractelement <4 x i32> [[Y]], i64 3 1146; CHECK-NEXT: [[TMP99:%.*]] = uitofp i32 [[TMP98]] to float 1147; CHECK-NEXT: [[TMP100:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP99]]) 1148; CHECK-NEXT: [[TMP101:%.*]] = fmul fast float [[TMP100]], 0x41EFFFFFC0000000 1149; CHECK-NEXT: [[TMP102:%.*]] = fptoui float [[TMP101]] to i32 1150; CHECK-NEXT: [[TMP103:%.*]] = sub i32 0, [[TMP98]] 1151; CHECK-NEXT: [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP102]] 1152; CHECK-NEXT: [[TMP105:%.*]] = zext i32 [[TMP102]] to i64 1153; CHECK-NEXT: [[TMP106:%.*]] = zext i32 [[TMP104]] to i64 1154; CHECK-NEXT: [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]] 1155; CHECK-NEXT: [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32 1156; CHECK-NEXT: [[TMP109:%.*]] = lshr i64 [[TMP107]], 32 1157; CHECK-NEXT: [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32 1158; CHECK-NEXT: [[TMP111:%.*]] = add i32 [[TMP102]], [[TMP110]] 1159; CHECK-NEXT: [[TMP112:%.*]] = zext i32 [[TMP97]] to i64 1160; CHECK-NEXT: [[TMP113:%.*]] = zext i32 [[TMP111]] to i64 1161; CHECK-NEXT: [[TMP114:%.*]] = mul i64 [[TMP112]], [[TMP113]] 1162; CHECK-NEXT: [[TMP115:%.*]] = trunc i64 [[TMP114]] to i32 1163; CHECK-NEXT: [[TMP116:%.*]] = lshr i64 [[TMP114]], 32 1164; CHECK-NEXT: [[TMP117:%.*]] = trunc i64 [[TMP116]] to i32 1165; CHECK-NEXT: [[TMP118:%.*]] = mul i32 [[TMP117]], [[TMP98]] 1166; CHECK-NEXT: [[TMP119:%.*]] = sub i32 [[TMP97]], [[TMP118]] 1167; CHECK-NEXT: 
[[TMP120:%.*]] = icmp uge i32 [[TMP119]], [[TMP98]] 1168; CHECK-NEXT: [[TMP121:%.*]] = add i32 [[TMP117]], 1 1169; CHECK-NEXT: [[TMP122:%.*]] = select i1 [[TMP120]], i32 [[TMP121]], i32 [[TMP117]] 1170; CHECK-NEXT: [[TMP123:%.*]] = sub i32 [[TMP119]], [[TMP98]] 1171; CHECK-NEXT: [[TMP124:%.*]] = select i1 [[TMP120]], i32 [[TMP123]], i32 [[TMP119]] 1172; CHECK-NEXT: [[TMP125:%.*]] = icmp uge i32 [[TMP124]], [[TMP98]] 1173; CHECK-NEXT: [[TMP126:%.*]] = add i32 [[TMP122]], 1 1174; CHECK-NEXT: [[TMP127:%.*]] = select i1 [[TMP125]], i32 [[TMP126]], i32 [[TMP122]] 1175; CHECK-NEXT: [[TMP128:%.*]] = insertelement <4 x i32> [[TMP96]], i32 [[TMP127]], i64 3 1176; CHECK-NEXT: store <4 x i32> [[TMP128]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 1177; CHECK-NEXT: ret void 1178; 1179; GFX6-LABEL: udiv_v4i32: 1180; GFX6: ; %bb.0: 1181; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 1182; GFX6-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 1183; GFX6-NEXT: s_mov_b32 s15, 0xf000 1184; GFX6-NEXT: s_mov_b32 s14, -1 1185; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1186; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 1187; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 1188; GFX6-NEXT: s_sub_i32 s2, 0, s8 1189; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s10 1190; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 1191; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 1192; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s11 1193; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 1194; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 1195; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 1196; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 1197; GFX6-NEXT: v_mul_lo_u32 v2, s2, v0 1198; GFX6-NEXT: s_sub_i32 s2, 0, s9 1199; GFX6-NEXT: v_mul_lo_u32 v3, s2, v1 1200; GFX6-NEXT: s_sub_i32 s2, 0, s10 1201; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 1202; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 1203; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 1204; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 1205; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 1206; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 1207; GFX6-NEXT: v_mul_lo_u32 v2, v0, s8 1208; 
GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 1209; GFX6-NEXT: v_mul_lo_u32 v5, v1, s9 1210; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s4, v2 1211; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v2 1212; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] 1213; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v2 1214; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 1215; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 1216; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 1217; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v4 1218; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1219; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s5, v5 1220; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 1221; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 1222; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v1 1223; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v3 1224; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] 1225; GFX6-NEXT: v_mul_lo_u32 v4, s2, v2 1226; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s9, v3 1227; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 1228; GFX6-NEXT: v_mul_hi_u32 v4, v2, v4 1229; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v1 1230; GFX6-NEXT: s_sub_i32 s0, 0, s11 1231; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 1232; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v6 1233; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 1234; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 1235; GFX6-NEXT: v_mul_hi_u32 v2, s6, v2 1236; GFX6-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 1237; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 1238; GFX6-NEXT: v_mul_lo_u32 v3, v2, s10 1239; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v2 1240; GFX6-NEXT: v_mul_lo_u32 v5, s0, v4 1241; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s6, v3 1242; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v3 1243; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5 1244; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] 1245; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, s10, v3 1246; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 1247; GFX6-NEXT: v_mul_hi_u32 v4, s7, v4 1248; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] 1249; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v2 1250; GFX6-NEXT: v_mul_lo_u32 v6, v4, 
s11 1251; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 1252; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 1253; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 1254; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s7, v6 1255; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v3 1256; GFX6-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] 1257; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s11, v3 1258; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 1259; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 1260; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1261; GFX6-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 1262; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 1263; GFX6-NEXT: s_endpgm 1264; 1265; GFX9-LABEL: udiv_v4i32: 1266; GFX9: ; %bb.0: 1267; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 1268; GFX9-NEXT: v_mov_b32_e32 v4, 0 1269; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1270; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1271; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 1272; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 1273; GFX9-NEXT: s_sub_i32 s2, 0, s8 1274; GFX9-NEXT: s_sub_i32 s3, 0, s9 1275; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 1276; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 1277; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s10 1278; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s11 1279; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 1280; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 1281; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 1282; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 1283; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v5 1284; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 1285; GFX9-NEXT: s_sub_i32 s2, 0, s10 1286; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1 1287; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v6 1288; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 1289; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 1290; GFX9-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 1291; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 1292; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 1293; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 1294; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v5 1295; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 1296; GFX9-NEXT: v_mul_lo_u32 v3, v0, s8 1297; 
GFX9-NEXT: v_add_u32_e32 v7, 1, v0 1298; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6 1299; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 1300; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 1301; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 1302; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc 1303; GFX9-NEXT: v_subrev_u32_e32 v7, s8, v3 1304; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc 1305; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 1306; GFX9-NEXT: v_mul_lo_u32 v3, s2, v2 1307; GFX9-NEXT: s_sub_i32 s2, 0, s11 1308; GFX9-NEXT: v_mul_lo_u32 v5, v1, s9 1309; GFX9-NEXT: v_add_u32_e32 v7, 1, v0 1310; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 1311; GFX9-NEXT: v_add_u32_e32 v8, 1, v1 1312; GFX9-NEXT: v_sub_u32_e32 v5, s5, v5 1313; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc 1314; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 1315; GFX9-NEXT: v_mul_lo_u32 v3, s2, v6 1316; GFX9-NEXT: v_mul_hi_u32 v2, s6, v2 1317; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v5 1318; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc 1319; GFX9-NEXT: v_mul_hi_u32 v3, v6, v3 1320; GFX9-NEXT: v_mul_lo_u32 v8, v2, s10 1321; GFX9-NEXT: v_subrev_u32_e32 v7, s9, v5 1322; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc 1323; GFX9-NEXT: v_add_u32_e32 v3, v6, v3 1324; GFX9-NEXT: v_mul_hi_u32 v3, s7, v3 1325; GFX9-NEXT: v_add_u32_e32 v7, 1, v1 1326; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v5 1327; GFX9-NEXT: v_sub_u32_e32 v5, s6, v8 1328; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc 1329; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v5 1330; GFX9-NEXT: v_subrev_u32_e32 v6, s10, v5 1331; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 1332; GFX9-NEXT: v_mul_lo_u32 v6, v3, s11 1333; GFX9-NEXT: v_add_u32_e32 v7, 1, v2 1334; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc 1335; GFX9-NEXT: v_add_u32_e32 v7, 1, v2 1336; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v5 1337; GFX9-NEXT: v_sub_u32_e32 v5, s7, v6 1338; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc 1339; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 1340; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v5 1341; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 
1342; GFX9-NEXT: v_subrev_u32_e32 v6, s11, v5 1343; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 1344; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 1345; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v5 1346; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 1347; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 1348; GFX9-NEXT: s_endpgm 1349 %r = udiv <4 x i32> %x, %y 1350 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 1351 ret void 1352} 1353 ; urem_v4i32: unsigned remainder on a 4 x i32 vector. The CHECK lines expect one scalar expansion per lane (extractelement, rcp-based sequence, two icmp/sub/select steps, insertelement); the GFX6 and GFX9 lines check the llc output for the same kernel. 1354define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 1355; CHECK-LABEL: @urem_v4i32( 1356; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 1357; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 1358; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 1359; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 1360; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 1361; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 1362; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 1363; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 1364; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 1365; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 1366; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 1367; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 1368; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 1369; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 1370; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 1371; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 1372; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 1373; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 1374; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 1375; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 1376; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 1377; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 1378; CHECK-NEXT: [[TMP23:%.*]] = sub i32
[[TMP1]], [[TMP22]] 1379; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 1380; CHECK-NEXT: [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]] 1381; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]] 1382; CHECK-NEXT: [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]] 1383; CHECK-NEXT: [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]] 1384; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]] 1385; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x i32> undef, i32 [[TMP29]], i64 0 1386; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[X]], i64 1 1387; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[Y]], i64 1 1388; CHECK-NEXT: [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float 1389; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 1390; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000 1391; CHECK-NEXT: [[TMP36:%.*]] = fptoui float [[TMP35]] to i32 1392; CHECK-NEXT: [[TMP37:%.*]] = sub i32 0, [[TMP32]] 1393; CHECK-NEXT: [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]] 1394; CHECK-NEXT: [[TMP39:%.*]] = zext i32 [[TMP36]] to i64 1395; CHECK-NEXT: [[TMP40:%.*]] = zext i32 [[TMP38]] to i64 1396; CHECK-NEXT: [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]] 1397; CHECK-NEXT: [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32 1398; CHECK-NEXT: [[TMP43:%.*]] = lshr i64 [[TMP41]], 32 1399; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 1400; CHECK-NEXT: [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]] 1401; CHECK-NEXT: [[TMP46:%.*]] = zext i32 [[TMP31]] to i64 1402; CHECK-NEXT: [[TMP47:%.*]] = zext i32 [[TMP45]] to i64 1403; CHECK-NEXT: [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]] 1404; CHECK-NEXT: [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32 1405; CHECK-NEXT: [[TMP50:%.*]] = lshr i64 [[TMP48]], 32 1406; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 1407; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]] 1408; CHECK-NEXT: [[TMP53:%.*]] = sub i32 [[TMP31]],
[[TMP52]] 1409; CHECK-NEXT: [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]] 1410; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]] 1411; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]] 1412; CHECK-NEXT: [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]] 1413; CHECK-NEXT: [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]] 1414; CHECK-NEXT: [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]] 1415; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP59]], i64 1 1416; CHECK-NEXT: [[TMP61:%.*]] = extractelement <4 x i32> [[X]], i64 2 1417; CHECK-NEXT: [[TMP62:%.*]] = extractelement <4 x i32> [[Y]], i64 2 1418; CHECK-NEXT: [[TMP63:%.*]] = uitofp i32 [[TMP62]] to float 1419; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP63]]) 1420; CHECK-NEXT: [[TMP65:%.*]] = fmul fast float [[TMP64]], 0x41EFFFFFC0000000 1421; CHECK-NEXT: [[TMP66:%.*]] = fptoui float [[TMP65]] to i32 1422; CHECK-NEXT: [[TMP67:%.*]] = sub i32 0, [[TMP62]] 1423; CHECK-NEXT: [[TMP68:%.*]] = mul i32 [[TMP67]], [[TMP66]] 1424; CHECK-NEXT: [[TMP69:%.*]] = zext i32 [[TMP66]] to i64 1425; CHECK-NEXT: [[TMP70:%.*]] = zext i32 [[TMP68]] to i64 1426; CHECK-NEXT: [[TMP71:%.*]] = mul i64 [[TMP69]], [[TMP70]] 1427; CHECK-NEXT: [[TMP72:%.*]] = trunc i64 [[TMP71]] to i32 1428; CHECK-NEXT: [[TMP73:%.*]] = lshr i64 [[TMP71]], 32 1429; CHECK-NEXT: [[TMP74:%.*]] = trunc i64 [[TMP73]] to i32 1430; CHECK-NEXT: [[TMP75:%.*]] = add i32 [[TMP66]], [[TMP74]] 1431; CHECK-NEXT: [[TMP76:%.*]] = zext i32 [[TMP61]] to i64 1432; CHECK-NEXT: [[TMP77:%.*]] = zext i32 [[TMP75]] to i64 1433; CHECK-NEXT: [[TMP78:%.*]] = mul i64 [[TMP76]], [[TMP77]] 1434; CHECK-NEXT: [[TMP79:%.*]] = trunc i64 [[TMP78]] to i32 1435; CHECK-NEXT: [[TMP80:%.*]] = lshr i64 [[TMP78]], 32 1436; CHECK-NEXT: [[TMP81:%.*]] = trunc i64 [[TMP80]] to i32 1437; CHECK-NEXT: [[TMP82:%.*]] = mul i32 [[TMP81]], [[TMP62]] 1438; CHECK-NEXT: [[TMP83:%.*]] = sub i32 [[TMP61]],
[[TMP82]] 1439; CHECK-NEXT: [[TMP84:%.*]] = icmp uge i32 [[TMP83]], [[TMP62]] 1440; CHECK-NEXT: [[TMP85:%.*]] = sub i32 [[TMP83]], [[TMP62]] 1441; CHECK-NEXT: [[TMP86:%.*]] = select i1 [[TMP84]], i32 [[TMP85]], i32 [[TMP83]] 1442; CHECK-NEXT: [[TMP87:%.*]] = icmp uge i32 [[TMP86]], [[TMP62]] 1443; CHECK-NEXT: [[TMP88:%.*]] = sub i32 [[TMP86]], [[TMP62]] 1444; CHECK-NEXT: [[TMP89:%.*]] = select i1 [[TMP87]], i32 [[TMP88]], i32 [[TMP86]] 1445; CHECK-NEXT: [[TMP90:%.*]] = insertelement <4 x i32> [[TMP60]], i32 [[TMP89]], i64 2 1446; CHECK-NEXT: [[TMP91:%.*]] = extractelement <4 x i32> [[X]], i64 3 1447; CHECK-NEXT: [[TMP92:%.*]] = extractelement <4 x i32> [[Y]], i64 3 1448; CHECK-NEXT: [[TMP93:%.*]] = uitofp i32 [[TMP92]] to float 1449; CHECK-NEXT: [[TMP94:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP93]]) 1450; CHECK-NEXT: [[TMP95:%.*]] = fmul fast float [[TMP94]], 0x41EFFFFFC0000000 1451; CHECK-NEXT: [[TMP96:%.*]] = fptoui float [[TMP95]] to i32 1452; CHECK-NEXT: [[TMP97:%.*]] = sub i32 0, [[TMP92]] 1453; CHECK-NEXT: [[TMP98:%.*]] = mul i32 [[TMP97]], [[TMP96]] 1454; CHECK-NEXT: [[TMP99:%.*]] = zext i32 [[TMP96]] to i64 1455; CHECK-NEXT: [[TMP100:%.*]] = zext i32 [[TMP98]] to i64 1456; CHECK-NEXT: [[TMP101:%.*]] = mul i64 [[TMP99]], [[TMP100]] 1457; CHECK-NEXT: [[TMP102:%.*]] = trunc i64 [[TMP101]] to i32 1458; CHECK-NEXT: [[TMP103:%.*]] = lshr i64 [[TMP101]], 32 1459; CHECK-NEXT: [[TMP104:%.*]] = trunc i64 [[TMP103]] to i32 1460; CHECK-NEXT: [[TMP105:%.*]] = add i32 [[TMP96]], [[TMP104]] 1461; CHECK-NEXT: [[TMP106:%.*]] = zext i32 [[TMP91]] to i64 1462; CHECK-NEXT: [[TMP107:%.*]] = zext i32 [[TMP105]] to i64 1463; CHECK-NEXT: [[TMP108:%.*]] = mul i64 [[TMP106]], [[TMP107]] 1464; CHECK-NEXT: [[TMP109:%.*]] = trunc i64 [[TMP108]] to i32 1465; CHECK-NEXT: [[TMP110:%.*]] = lshr i64 [[TMP108]], 32 1466; CHECK-NEXT: [[TMP111:%.*]] = trunc i64 [[TMP110]] to i32 1467; CHECK-NEXT: [[TMP112:%.*]] = mul i32 [[TMP111]], [[TMP92]] 1468; CHECK-NEXT: [[TMP113:%.*]] =
sub i32 [[TMP91]], [[TMP112]] 1469; CHECK-NEXT: [[TMP114:%.*]] = icmp uge i32 [[TMP113]], [[TMP92]] 1470; CHECK-NEXT: [[TMP115:%.*]] = sub i32 [[TMP113]], [[TMP92]] 1471; CHECK-NEXT: [[TMP116:%.*]] = select i1 [[TMP114]], i32 [[TMP115]], i32 [[TMP113]] 1472; CHECK-NEXT: [[TMP117:%.*]] = icmp uge i32 [[TMP116]], [[TMP92]] 1473; CHECK-NEXT: [[TMP118:%.*]] = sub i32 [[TMP116]], [[TMP92]] 1474; CHECK-NEXT: [[TMP119:%.*]] = select i1 [[TMP117]], i32 [[TMP118]], i32 [[TMP116]] 1475; CHECK-NEXT: [[TMP120:%.*]] = insertelement <4 x i32> [[TMP90]], i32 [[TMP119]], i64 3 1476; CHECK-NEXT: store <4 x i32> [[TMP120]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 1477; CHECK-NEXT: ret void 1478; 1479; GFX6-LABEL: urem_v4i32: 1480; GFX6: ; %bb.0: 1481; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 1482; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1483; GFX6-NEXT: s_mov_b32 s3, 0xf000 1484; GFX6-NEXT: s_mov_b32 s2, -1 1485; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1486; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 1487; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 1488; GFX6-NEXT: s_sub_i32 s12, 0, s8 1489; GFX6-NEXT: s_sub_i32 s13, 0, s9 1490; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 1491; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 1492; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s10 1493; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s11 1494; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 1495; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 1496; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 1497; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 1498; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 1499; GFX6-NEXT: v_mul_lo_u32 v2, s12, v0 1500; GFX6-NEXT: v_mul_lo_u32 v4, s13, v1 1501; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 1502; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 1503; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 1504; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 1505; GFX6-NEXT: v_add_i32_e32 v1, vcc, v4, v1 1506; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 1507; GFX6-NEXT: v_mul_lo_u32 v0, v0, s8 1508; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v3 1509; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2
1510; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9 1511; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 1512; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 1513; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 1514; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1515; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 1516; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 1517; GFX6-NEXT: s_sub_i32 s4, 0, s10 1518; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1519; GFX6-NEXT: v_mul_lo_u32 v3, s4, v2 1520; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 1521; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s9, v1 1522; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 1523; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 1524; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 1525; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 1526; GFX6-NEXT: s_sub_i32 s4, 0, s11 1527; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 1528; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v4 1529; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 1530; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s9, v1 1531; GFX6-NEXT: v_mul_hi_u32 v2, s6, v2 1532; GFX6-NEXT: v_mul_lo_u32 v5, s4, v3 1533; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 1534; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 1535; GFX6-NEXT: v_mul_lo_u32 v2, v2, s10 1536; GFX6-NEXT: v_mul_hi_u32 v4, v3, v5 1537; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 1538; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 1539; GFX6-NEXT: v_mul_hi_u32 v3, s7, v3 1540; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s10, v2 1541; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 1542; GFX6-NEXT: v_mul_lo_u32 v3, v3, s11 1543; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 1544; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s10, v2 1545; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 1546; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 1547; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s7, v3 1548; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s11, v3 1549; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1550; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 1551; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s11, v3 1552; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v3
1553; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 1554; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1555; GFX6-NEXT: s_endpgm 1556; 1557; GFX9-LABEL: urem_v4i32: 1558; GFX9: ; %bb.0: 1559; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 1560; GFX9-NEXT: v_mov_b32_e32 v4, 0 1561; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1562; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1563; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 1564; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 1565; GFX9-NEXT: s_sub_i32 s2, 0, s8 1566; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s10 1567; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 1568; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 1569; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 1570; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 1571; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 1572; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 1573; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 1574; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 1575; GFX9-NEXT: v_readfirstlane_b32 s3, v0 1576; GFX9-NEXT: s_mul_i32 s2, s2, s3 1577; GFX9-NEXT: s_mul_hi_u32 s2, s3, s2 1578; GFX9-NEXT: s_add_i32 s3, s3, s2 1579; GFX9-NEXT: s_mul_hi_u32 s2, s4, s3 1580; GFX9-NEXT: s_mul_i32 s2, s2, s8 1581; GFX9-NEXT: s_sub_i32 s2, s4, s2 1582; GFX9-NEXT: s_sub_i32 s3, s2, s8 1583; GFX9-NEXT: s_cmp_ge_u32 s2, s8 1584; GFX9-NEXT: s_cselect_b32 s2, s3, s2 1585; GFX9-NEXT: s_sub_i32 s3, s2, s8 1586; GFX9-NEXT: s_cmp_ge_u32 s2, s8 1587; GFX9-NEXT: v_readfirstlane_b32 s12, v1 1588; GFX9-NEXT: s_cselect_b32 s2, s3, s2 1589; GFX9-NEXT: s_sub_i32 s3, 0, s9 1590; GFX9-NEXT: s_mul_i32 s3, s3, s12 1591; GFX9-NEXT: s_mul_hi_u32 s3, s12, s3 1592; GFX9-NEXT: s_add_i32 s12, s12, s3 1593; GFX9-NEXT: s_mul_hi_u32 s3, s5, s12 1594; GFX9-NEXT: s_mul_i32 s3, s3, s9 1595; GFX9-NEXT: s_sub_i32 s3, s5, s3 1596; GFX9-NEXT: s_sub_i32 s4, s3, s9 1597; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 1598; GFX9-NEXT: s_cmp_ge_u32 s3, s9 1599; GFX9-NEXT: s_cselect_b32 s3, s4, s3 1600; GFX9-NEXT: s_sub_i32 s4, s3, s9 1601; GFX9-NEXT: s_cmp_ge_u32 s3, s9 1602; GFX9-NEXT:
v_cvt_f32_u32_e32 v0, s11 1603; GFX9-NEXT: s_cselect_b32 s3, s4, s3 1604; GFX9-NEXT: s_sub_i32 s4, 0, s10 1605; GFX9-NEXT: v_readfirstlane_b32 s5, v2 1606; GFX9-NEXT: s_mul_i32 s4, s4, s5 1607; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 1608; GFX9-NEXT: s_add_i32 s5, s5, s4 1609; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 1610; GFX9-NEXT: s_mul_hi_u32 s4, s6, s5 1611; GFX9-NEXT: s_mul_i32 s4, s4, s10 1612; GFX9-NEXT: s_sub_i32 s4, s6, s4 1613; GFX9-NEXT: s_sub_i32 s5, s4, s10 1614; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 1615; GFX9-NEXT: s_cmp_ge_u32 s4, s10 1616; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 1617; GFX9-NEXT: s_cselect_b32 s4, s5, s4 1618; GFX9-NEXT: s_sub_i32 s5, s4, s10 1619; GFX9-NEXT: s_cmp_ge_u32 s4, s10 1620; GFX9-NEXT: s_cselect_b32 s4, s5, s4 1621; GFX9-NEXT: s_sub_i32 s5, 0, s11 1622; GFX9-NEXT: v_readfirstlane_b32 s6, v0 1623; GFX9-NEXT: s_mul_i32 s5, s5, s6 1624; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5 1625; GFX9-NEXT: s_add_i32 s6, s6, s5 1626; GFX9-NEXT: s_mul_hi_u32 s5, s7, s6 1627; GFX9-NEXT: s_mul_i32 s5, s5, s11 1628; GFX9-NEXT: s_sub_i32 s5, s7, s5 1629; GFX9-NEXT: s_sub_i32 s6, s5, s11 1630; GFX9-NEXT: s_cmp_ge_u32 s5, s11 1631; GFX9-NEXT: s_cselect_b32 s5, s6, s5 1632; GFX9-NEXT: s_sub_i32 s6, s5, s11 1633; GFX9-NEXT: s_cmp_ge_u32 s5, s11 1634; GFX9-NEXT: s_cselect_b32 s5, s6, s5 1635; GFX9-NEXT: v_mov_b32_e32 v0, s2 1636; GFX9-NEXT: v_mov_b32_e32 v1, s3 1637; GFX9-NEXT: v_mov_b32_e32 v2, s4 1638; GFX9-NEXT: v_mov_b32_e32 v3, s5 1639; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 1640; GFX9-NEXT: s_endpgm 1641 %r = urem <4 x i32> %x, %y 1642 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 1643 ret void 1644} 1645 1646define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 1647; CHECK-LABEL: @sdiv_v4i32( 1648; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 1649; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 1650; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31
1651; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 1652; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 1653; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]] 1654; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]] 1655; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]] 1656; CHECK-NEXT: [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]] 1657; CHECK-NEXT: [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float 1658; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]]) 1659; CHECK-NEXT: [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000 1660; CHECK-NEXT: [[TMP13:%.*]] = fptoui float [[TMP12]] to i32 1661; CHECK-NEXT: [[TMP14:%.*]] = sub i32 0, [[TMP9]] 1662; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]] 1663; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP13]] to i64 1664; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 1665; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 1666; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 1667; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 1668; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 1669; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]] 1670; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP8]] to i64 1671; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[TMP22]] to i64 1672; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]] 1673; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32 1674; CHECK-NEXT: [[TMP27:%.*]] = lshr i64 [[TMP25]], 32 1675; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 1676; CHECK-NEXT: [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]] 1677; CHECK-NEXT: [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]] 1678; CHECK-NEXT: [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]] 1679; CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP28]], 1 1680; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]] 1681; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]] 1682; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP31]], 
i32 [[TMP34]], i32 [[TMP30]] 1683; CHECK-NEXT: [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]] 1684; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP33]], 1 1685; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]] 1686; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]] 1687; CHECK-NEXT: [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]] 1688; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x i32> undef, i32 [[TMP40]], i64 0 1689; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i32> [[X]], i64 1 1690; CHECK-NEXT: [[TMP43:%.*]] = extractelement <4 x i32> [[Y]], i64 1 1691; CHECK-NEXT: [[TMP44:%.*]] = ashr i32 [[TMP42]], 31 1692; CHECK-NEXT: [[TMP45:%.*]] = ashr i32 [[TMP43]], 31 1693; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]] 1694; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]] 1695; CHECK-NEXT: [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]] 1696; CHECK-NEXT: [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]] 1697; CHECK-NEXT: [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]] 1698; CHECK-NEXT: [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float 1699; CHECK-NEXT: [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]]) 1700; CHECK-NEXT: [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000 1701; CHECK-NEXT: [[TMP54:%.*]] = fptoui float [[TMP53]] to i32 1702; CHECK-NEXT: [[TMP55:%.*]] = sub i32 0, [[TMP50]] 1703; CHECK-NEXT: [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]] 1704; CHECK-NEXT: [[TMP57:%.*]] = zext i32 [[TMP54]] to i64 1705; CHECK-NEXT: [[TMP58:%.*]] = zext i32 [[TMP56]] to i64 1706; CHECK-NEXT: [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]] 1707; CHECK-NEXT: [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32 1708; CHECK-NEXT: [[TMP61:%.*]] = lshr i64 [[TMP59]], 32 1709; CHECK-NEXT: [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32 1710; CHECK-NEXT: [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]] 1711; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[TMP49]] to i64 1712; CHECK-NEXT: [[TMP65:%.*]] = zext i32 [[TMP63]] to i64 1713; CHECK-NEXT: 
[[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]] 1714; CHECK-NEXT: [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32 1715; CHECK-NEXT: [[TMP68:%.*]] = lshr i64 [[TMP66]], 32 1716; CHECK-NEXT: [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32 1717; CHECK-NEXT: [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]] 1718; CHECK-NEXT: [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]] 1719; CHECK-NEXT: [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]] 1720; CHECK-NEXT: [[TMP73:%.*]] = add i32 [[TMP69]], 1 1721; CHECK-NEXT: [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]] 1722; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]] 1723; CHECK-NEXT: [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]] 1724; CHECK-NEXT: [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]] 1725; CHECK-NEXT: [[TMP78:%.*]] = add i32 [[TMP74]], 1 1726; CHECK-NEXT: [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]] 1727; CHECK-NEXT: [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]] 1728; CHECK-NEXT: [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]] 1729; CHECK-NEXT: [[TMP82:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[TMP81]], i64 1 1730; CHECK-NEXT: [[TMP83:%.*]] = extractelement <4 x i32> [[X]], i64 2 1731; CHECK-NEXT: [[TMP84:%.*]] = extractelement <4 x i32> [[Y]], i64 2 1732; CHECK-NEXT: [[TMP85:%.*]] = ashr i32 [[TMP83]], 31 1733; CHECK-NEXT: [[TMP86:%.*]] = ashr i32 [[TMP84]], 31 1734; CHECK-NEXT: [[TMP87:%.*]] = xor i32 [[TMP85]], [[TMP86]] 1735; CHECK-NEXT: [[TMP88:%.*]] = add i32 [[TMP83]], [[TMP85]] 1736; CHECK-NEXT: [[TMP89:%.*]] = add i32 [[TMP84]], [[TMP86]] 1737; CHECK-NEXT: [[TMP90:%.*]] = xor i32 [[TMP88]], [[TMP85]] 1738; CHECK-NEXT: [[TMP91:%.*]] = xor i32 [[TMP89]], [[TMP86]] 1739; CHECK-NEXT: [[TMP92:%.*]] = uitofp i32 [[TMP91]] to float 1740; CHECK-NEXT: [[TMP93:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP92]]) 1741; CHECK-NEXT: [[TMP94:%.*]] = fmul fast float [[TMP93]], 0x41EFFFFFC0000000 1742; CHECK-NEXT: [[TMP95:%.*]] = fptoui float [[TMP94]] to i32 
1743; CHECK-NEXT: [[TMP96:%.*]] = sub i32 0, [[TMP91]] 1744; CHECK-NEXT: [[TMP97:%.*]] = mul i32 [[TMP96]], [[TMP95]] 1745; CHECK-NEXT: [[TMP98:%.*]] = zext i32 [[TMP95]] to i64 1746; CHECK-NEXT: [[TMP99:%.*]] = zext i32 [[TMP97]] to i64 1747; CHECK-NEXT: [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]] 1748; CHECK-NEXT: [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32 1749; CHECK-NEXT: [[TMP102:%.*]] = lshr i64 [[TMP100]], 32 1750; CHECK-NEXT: [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32 1751; CHECK-NEXT: [[TMP104:%.*]] = add i32 [[TMP95]], [[TMP103]] 1752; CHECK-NEXT: [[TMP105:%.*]] = zext i32 [[TMP90]] to i64 1753; CHECK-NEXT: [[TMP106:%.*]] = zext i32 [[TMP104]] to i64 1754; CHECK-NEXT: [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]] 1755; CHECK-NEXT: [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32 1756; CHECK-NEXT: [[TMP109:%.*]] = lshr i64 [[TMP107]], 32 1757; CHECK-NEXT: [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32 1758; CHECK-NEXT: [[TMP111:%.*]] = mul i32 [[TMP110]], [[TMP91]] 1759; CHECK-NEXT: [[TMP112:%.*]] = sub i32 [[TMP90]], [[TMP111]] 1760; CHECK-NEXT: [[TMP113:%.*]] = icmp uge i32 [[TMP112]], [[TMP91]] 1761; CHECK-NEXT: [[TMP114:%.*]] = add i32 [[TMP110]], 1 1762; CHECK-NEXT: [[TMP115:%.*]] = select i1 [[TMP113]], i32 [[TMP114]], i32 [[TMP110]] 1763; CHECK-NEXT: [[TMP116:%.*]] = sub i32 [[TMP112]], [[TMP91]] 1764; CHECK-NEXT: [[TMP117:%.*]] = select i1 [[TMP113]], i32 [[TMP116]], i32 [[TMP112]] 1765; CHECK-NEXT: [[TMP118:%.*]] = icmp uge i32 [[TMP117]], [[TMP91]] 1766; CHECK-NEXT: [[TMP119:%.*]] = add i32 [[TMP115]], 1 1767; CHECK-NEXT: [[TMP120:%.*]] = select i1 [[TMP118]], i32 [[TMP119]], i32 [[TMP115]] 1768; CHECK-NEXT: [[TMP121:%.*]] = xor i32 [[TMP120]], [[TMP87]] 1769; CHECK-NEXT: [[TMP122:%.*]] = sub i32 [[TMP121]], [[TMP87]] 1770; CHECK-NEXT: [[TMP123:%.*]] = insertelement <4 x i32> [[TMP82]], i32 [[TMP122]], i64 2 1771; CHECK-NEXT: [[TMP124:%.*]] = extractelement <4 x i32> [[X]], i64 3 1772; CHECK-NEXT: [[TMP125:%.*]] = extractelement <4 x i32> 
[[Y]], i64 3 1773; CHECK-NEXT: [[TMP126:%.*]] = ashr i32 [[TMP124]], 31 1774; CHECK-NEXT: [[TMP127:%.*]] = ashr i32 [[TMP125]], 31 1775; CHECK-NEXT: [[TMP128:%.*]] = xor i32 [[TMP126]], [[TMP127]] 1776; CHECK-NEXT: [[TMP129:%.*]] = add i32 [[TMP124]], [[TMP126]] 1777; CHECK-NEXT: [[TMP130:%.*]] = add i32 [[TMP125]], [[TMP127]] 1778; CHECK-NEXT: [[TMP131:%.*]] = xor i32 [[TMP129]], [[TMP126]] 1779; CHECK-NEXT: [[TMP132:%.*]] = xor i32 [[TMP130]], [[TMP127]] 1780; CHECK-NEXT: [[TMP133:%.*]] = uitofp i32 [[TMP132]] to float 1781; CHECK-NEXT: [[TMP134:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP133]]) 1782; CHECK-NEXT: [[TMP135:%.*]] = fmul fast float [[TMP134]], 0x41EFFFFFC0000000 1783; CHECK-NEXT: [[TMP136:%.*]] = fptoui float [[TMP135]] to i32 1784; CHECK-NEXT: [[TMP137:%.*]] = sub i32 0, [[TMP132]] 1785; CHECK-NEXT: [[TMP138:%.*]] = mul i32 [[TMP137]], [[TMP136]] 1786; CHECK-NEXT: [[TMP139:%.*]] = zext i32 [[TMP136]] to i64 1787; CHECK-NEXT: [[TMP140:%.*]] = zext i32 [[TMP138]] to i64 1788; CHECK-NEXT: [[TMP141:%.*]] = mul i64 [[TMP139]], [[TMP140]] 1789; CHECK-NEXT: [[TMP142:%.*]] = trunc i64 [[TMP141]] to i32 1790; CHECK-NEXT: [[TMP143:%.*]] = lshr i64 [[TMP141]], 32 1791; CHECK-NEXT: [[TMP144:%.*]] = trunc i64 [[TMP143]] to i32 1792; CHECK-NEXT: [[TMP145:%.*]] = add i32 [[TMP136]], [[TMP144]] 1793; CHECK-NEXT: [[TMP146:%.*]] = zext i32 [[TMP131]] to i64 1794; CHECK-NEXT: [[TMP147:%.*]] = zext i32 [[TMP145]] to i64 1795; CHECK-NEXT: [[TMP148:%.*]] = mul i64 [[TMP146]], [[TMP147]] 1796; CHECK-NEXT: [[TMP149:%.*]] = trunc i64 [[TMP148]] to i32 1797; CHECK-NEXT: [[TMP150:%.*]] = lshr i64 [[TMP148]], 32 1798; CHECK-NEXT: [[TMP151:%.*]] = trunc i64 [[TMP150]] to i32 1799; CHECK-NEXT: [[TMP152:%.*]] = mul i32 [[TMP151]], [[TMP132]] 1800; CHECK-NEXT: [[TMP153:%.*]] = sub i32 [[TMP131]], [[TMP152]] 1801; CHECK-NEXT: [[TMP154:%.*]] = icmp uge i32 [[TMP153]], [[TMP132]] 1802; CHECK-NEXT: [[TMP155:%.*]] = add i32 [[TMP151]], 1 1803; CHECK-NEXT: [[TMP156:%.*]] = 
select i1 [[TMP154]], i32 [[TMP155]], i32 [[TMP151]] 1804; CHECK-NEXT: [[TMP157:%.*]] = sub i32 [[TMP153]], [[TMP132]] 1805; CHECK-NEXT: [[TMP158:%.*]] = select i1 [[TMP154]], i32 [[TMP157]], i32 [[TMP153]] 1806; CHECK-NEXT: [[TMP159:%.*]] = icmp uge i32 [[TMP158]], [[TMP132]] 1807; CHECK-NEXT: [[TMP160:%.*]] = add i32 [[TMP156]], 1 1808; CHECK-NEXT: [[TMP161:%.*]] = select i1 [[TMP159]], i32 [[TMP160]], i32 [[TMP156]] 1809; CHECK-NEXT: [[TMP162:%.*]] = xor i32 [[TMP161]], [[TMP128]] 1810; CHECK-NEXT: [[TMP163:%.*]] = sub i32 [[TMP162]], [[TMP128]] 1811; CHECK-NEXT: [[TMP164:%.*]] = insertelement <4 x i32> [[TMP123]], i32 [[TMP163]], i64 3 1812; CHECK-NEXT: store <4 x i32> [[TMP164]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 1813; CHECK-NEXT: ret void 1814; 1815; GFX6-LABEL: sdiv_v4i32: 1816; GFX6: ; %bb.0: 1817; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 1818; GFX6-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 1819; GFX6-NEXT: s_mov_b32 s15, 0xf000 1820; GFX6-NEXT: s_mov_b32 s14, -1 1821; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1822; GFX6-NEXT: s_ashr_i32 s2, s8, 31 1823; GFX6-NEXT: s_add_i32 s3, s8, s2 1824; GFX6-NEXT: s_xor_b32 s3, s3, s2 1825; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 1826; GFX6-NEXT: s_ashr_i32 s8, s9, 31 1827; GFX6-NEXT: s_add_i32 s0, s9, s8 1828; GFX6-NEXT: s_xor_b32 s9, s0, s8 1829; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 1830; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 1831; GFX6-NEXT: s_sub_i32 s1, 0, s3 1832; GFX6-NEXT: s_ashr_i32 s0, s4, 31 1833; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 1834; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 1835; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 1836; GFX6-NEXT: s_xor_b32 s2, s0, s2 1837; GFX6-NEXT: v_mul_lo_u32 v2, s1, v0 1838; GFX6-NEXT: s_add_i32 s1, s4, s0 1839; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 1840; GFX6-NEXT: s_xor_b32 s1, s1, s0 1841; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 1842; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 1843; GFX6-NEXT: s_sub_i32 s0, 0, s9 1844; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 1845; 
GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 1846; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 1847; GFX6-NEXT: v_mul_lo_u32 v3, v0, s3 1848; GFX6-NEXT: v_mul_hi_u32 v2, v1, v2 1849; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v0 1850; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s1, v3 1851; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v3 1852; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] 1853; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s3, v3 1854; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] 1855; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v0 1856; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 1857; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 1858; GFX6-NEXT: s_ashr_i32 s0, s5, 31 1859; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 1860; GFX6-NEXT: s_add_i32 s1, s5, s0 1861; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 1862; GFX6-NEXT: s_ashr_i32 s3, s10, 31 1863; GFX6-NEXT: s_xor_b32 s1, s1, s0 1864; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 1865; GFX6-NEXT: s_xor_b32 s2, s0, s8 1866; GFX6-NEXT: s_add_i32 s0, s10, s3 1867; GFX6-NEXT: s_xor_b32 s4, s0, s3 1868; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s4 1869; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 1870; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 1871; GFX6-NEXT: v_mul_lo_u32 v2, v1, s9 1872; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v1 1873; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 1874; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 1875; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 1876; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2 1877; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] 1878; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s9, v2 1879; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] 1880; GFX6-NEXT: s_sub_i32 s0, 0, s4 1881; GFX6-NEXT: v_mul_lo_u32 v5, s0, v3 1882; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v1 1883; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 1884; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 1885; GFX6-NEXT: v_mul_hi_u32 v2, v3, v5 1886; GFX6-NEXT: v_xor_b32_e32 v1, s2, v1 1887; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s2, v1 1888; GFX6-NEXT: s_ashr_i32 s2, s11, 31 1889; GFX6-NEXT: s_ashr_i32 s0, s6, 
31 1890; GFX6-NEXT: s_add_i32 s5, s11, s2 1891; GFX6-NEXT: s_add_i32 s1, s6, s0 1892; GFX6-NEXT: s_xor_b32 s5, s5, s2 1893; GFX6-NEXT: s_xor_b32 s1, s1, s0 1894; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 1895; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s5 1896; GFX6-NEXT: v_mul_hi_u32 v2, s1, v2 1897; GFX6-NEXT: s_xor_b32 s3, s0, s3 1898; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v4 1899; GFX6-NEXT: v_mul_lo_u32 v3, v2, s4 1900; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v2 1901; GFX6-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 1902; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s1, v3 1903; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 1904; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v3 1905; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] 1906; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s4, v3 1907; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 1908; GFX6-NEXT: s_sub_i32 s0, 0, s5 1909; GFX6-NEXT: v_mul_lo_u32 v5, s0, v4 1910; GFX6-NEXT: s_ashr_i32 s0, s7, 31 1911; GFX6-NEXT: s_add_i32 s1, s7, s0 1912; GFX6-NEXT: s_xor_b32 s1, s1, s0 1913; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5 1914; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v2 1915; GFX6-NEXT: s_xor_b32 s2, s0, s2 1916; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 1917; GFX6-NEXT: v_mul_hi_u32 v4, s1, v4 1918; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 1919; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 1920; GFX6-NEXT: v_xor_b32_e32 v2, s3, v2 1921; GFX6-NEXT: v_mul_lo_u32 v3, v4, s5 1922; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 1923; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v2 1924; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s1, v3 1925; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v3 1926; GFX6-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] 1927; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v3 1928; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 1929; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 1930; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 1931; GFX6-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 1932; GFX6-NEXT: v_xor_b32_e32 v3, s2, v3 1933; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v3 1934; GFX6-NEXT: 
buffer_store_dwordx4 v[0:3], off, s[12:15], 0 1935; GFX6-NEXT: s_endpgm 1936; 1937; GFX9-LABEL: sdiv_v4i32: 1938; GFX9: ; %bb.0: 1939; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 1940; GFX9-NEXT: v_mov_b32_e32 v4, 0 1941; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1942; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1943; GFX9-NEXT: s_ashr_i32 s2, s8, 31 1944; GFX9-NEXT: s_add_i32 s3, s8, s2 1945; GFX9-NEXT: s_xor_b32 s3, s3, s2 1946; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 1947; GFX9-NEXT: s_ashr_i32 s12, s9, 31 1948; GFX9-NEXT: s_add_i32 s9, s9, s12 1949; GFX9-NEXT: s_xor_b32 s9, s9, s12 1950; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 1951; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 1952; GFX9-NEXT: s_sub_i32 s14, 0, s3 1953; GFX9-NEXT: s_ashr_i32 s8, s4, 31 1954; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 1955; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 1956; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 1957; GFX9-NEXT: s_add_i32 s4, s4, s8 1958; GFX9-NEXT: s_xor_b32 s4, s4, s8 1959; GFX9-NEXT: v_mul_lo_u32 v2, s14, v0 1960; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 1961; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 1962; GFX9-NEXT: s_sub_i32 s14, 0, s9 1963; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 1964; GFX9-NEXT: s_ashr_i32 s13, s5, 31 1965; GFX9-NEXT: v_mul_lo_u32 v3, s14, v1 1966; GFX9-NEXT: s_add_i32 s5, s5, s13 1967; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 1968; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 1969; GFX9-NEXT: v_mul_hi_u32 v2, v1, v3 1970; GFX9-NEXT: s_xor_b32 s5, s5, s13 1971; GFX9-NEXT: s_xor_b32 s2, s8, s2 1972; GFX9-NEXT: v_mul_lo_u32 v3, v0, s3 1973; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 1974; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 1975; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 1976; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 1977; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 1978; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 1979; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v3 1980; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc 1981; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 1982; GFX9-NEXT: s_ashr_i32 s3, s10, 31 1983; GFX9-NEXT: 
s_add_i32 s4, s10, s3 1984; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 1985; GFX9-NEXT: s_xor_b32 s4, s4, s3 1986; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1987; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 1988; GFX9-NEXT: v_mul_lo_u32 v2, v1, s9 1989; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 1990; GFX9-NEXT: s_ashr_i32 s8, s11, 31 1991; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 1992; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 1993; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 1994; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 1995; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 1996; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 1997; GFX9-NEXT: v_subrev_u32_e32 v5, s9, v2 1998; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 1999; GFX9-NEXT: s_sub_i32 s5, 0, s4 2000; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 2001; GFX9-NEXT: v_mul_lo_u32 v2, s5, v3 2002; GFX9-NEXT: s_add_i32 s9, s11, s8 2003; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 2004; GFX9-NEXT: s_xor_b32 s9, s9, s8 2005; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 2006; GFX9-NEXT: v_mul_hi_u32 v2, v3, v2 2007; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s9 2008; GFX9-NEXT: s_ashr_i32 s5, s6, 31 2009; GFX9-NEXT: s_add_i32 s6, s6, s5 2010; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 2011; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v5 2012; GFX9-NEXT: s_xor_b32 s6, s6, s5 2013; GFX9-NEXT: v_mul_hi_u32 v2, s6, v2 2014; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 2015; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 2016; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 2017; GFX9-NEXT: v_subrev_u32_e32 v0, s2, v0 2018; GFX9-NEXT: s_xor_b32 s2, s13, s12 2019; GFX9-NEXT: v_mul_lo_u32 v5, v2, s4 2020; GFX9-NEXT: v_xor_b32_e32 v1, s2, v1 2021; GFX9-NEXT: v_subrev_u32_e32 v1, s2, v1 2022; GFX9-NEXT: s_xor_b32 s2, s5, s3 2023; GFX9-NEXT: s_sub_i32 s3, 0, s9 2024; GFX9-NEXT: v_mul_lo_u32 v7, s3, v3 2025; GFX9-NEXT: v_sub_u32_e32 v5, s6, v5 2026; GFX9-NEXT: v_add_u32_e32 v6, 1, v2 2027; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v5 2028; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 2029; GFX9-NEXT: v_subrev_u32_e32 v6, s4, v5 2030; 
GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 2031; GFX9-NEXT: v_mul_hi_u32 v6, v3, v7 2032; GFX9-NEXT: s_ashr_i32 s3, s7, 31 2033; GFX9-NEXT: s_add_i32 s5, s7, s3 2034; GFX9-NEXT: s_xor_b32 s5, s5, s3 2035; GFX9-NEXT: v_add_u32_e32 v3, v3, v6 2036; GFX9-NEXT: v_mul_hi_u32 v3, s5, v3 2037; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v5 2038; GFX9-NEXT: v_add_u32_e32 v6, 1, v2 2039; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 2040; GFX9-NEXT: v_mul_lo_u32 v5, v3, s9 2041; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 2042; GFX9-NEXT: v_xor_b32_e32 v2, s2, v2 2043; GFX9-NEXT: v_subrev_u32_e32 v2, s2, v2 2044; GFX9-NEXT: v_sub_u32_e32 v5, s5, v5 2045; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v5 2046; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 2047; GFX9-NEXT: v_subrev_u32_e32 v6, s9, v5 2048; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 2049; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 2050; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v5 2051; GFX9-NEXT: s_xor_b32 s2, s3, s8 2052; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 2053; GFX9-NEXT: v_xor_b32_e32 v3, s2, v3 2054; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v3 2055; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 2056; GFX9-NEXT: s_endpgm 2057 %r = sdiv <4 x i32> %x, %y 2058 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 2059 ret void 2060} 2061 2062define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 2063; CHECK-LABEL: @srem_v4i32( 2064; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 2065; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 2066; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 2067; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 2068; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]] 2069; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]] 2070; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]] 2071; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]] 2072; CHECK-NEXT: [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float 2073; CHECK-NEXT: 
; ---------------------------------------------------------------------------
; Reviewer documentation for @srem_v4i32 (comment only; no directive changed).
; Test input: a kernel that stores `srem <4 x i32> %x, %y` to %out.
; Expected IR (CHECK prefix, matched against the opt -amdgpu-codegenprepare
; output named in the file header): the vector srem is expanded one lane at a
; time (extractelement ... insertelement, lanes 0-3). Each lane:
;   * builds sign masks with `ashr ..., 31` and takes magnitudes via add + xor;
;   * estimates a reciprocal of the divisor magnitude (uitofp,
;     llvm.amdgcn.rcp.f32, fmul, fptoui) and refines it once using the high
;     half of a 64-bit multiply;
;   * forms quotient = high half of (|x| * estimate) and
;     remainder = |x| - quotient * |y|;
;   * applies two `icmp uge` / `sub` / `select` corrections to the remainder;
;   * restores the dividend's sign with xor + sub against its sign mask.
; Expected ISA (llc output for tahiti and gfx900 per the file header): the GFX6
; prefix matches a vector-instruction sequence (v_mul_hi_u32, v_cndmask_b32),
; while the GFX9 prefix matches a mostly scalar one (v_readfirstlane_b32,
; s_mul_hi_u32, s_cselect_b32).
; These assertions are autogenerated by utils/update_llc_test_checks.py (see
; the first line of the file); regenerate them with that script instead of
; editing them by hand.
; ---------------------------------------------------------------------------
[[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 2074; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000 2075; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP11]] to i32 2076; CHECK-NEXT: [[TMP13:%.*]] = sub i32 0, [[TMP8]] 2077; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]] 2078; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP12]] to i64 2079; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 2080; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]] 2081; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 2082; CHECK-NEXT: [[TMP19:%.*]] = lshr i64 [[TMP17]], 32 2083; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 2084; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]] 2085; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP7]] to i64 2086; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP21]] to i64 2087; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]] 2088; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32 2089; CHECK-NEXT: [[TMP26:%.*]] = lshr i64 [[TMP24]], 32 2090; CHECK-NEXT: [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32 2091; CHECK-NEXT: [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]] 2092; CHECK-NEXT: [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]] 2093; CHECK-NEXT: [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]] 2094; CHECK-NEXT: [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]] 2095; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP30]], i32 [[TMP31]], i32 [[TMP29]] 2096; CHECK-NEXT: [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]] 2097; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]] 2098; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]] 2099; CHECK-NEXT: [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]] 2100; CHECK-NEXT: [[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]] 2101; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i32> undef, i32 [[TMP37]], i64 0 2102; CHECK-NEXT: [[TMP39:%.*]] = extractelement <4 x i32> [[X]], i64 1 2103; CHECK-NEXT: [[TMP40:%.*]] = extractelement <4 
x i32> [[Y]], i64 1 2104; CHECK-NEXT: [[TMP41:%.*]] = ashr i32 [[TMP39]], 31 2105; CHECK-NEXT: [[TMP42:%.*]] = ashr i32 [[TMP40]], 31 2106; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]] 2107; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]] 2108; CHECK-NEXT: [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]] 2109; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]] 2110; CHECK-NEXT: [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float 2111; CHECK-NEXT: [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]]) 2112; CHECK-NEXT: [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000 2113; CHECK-NEXT: [[TMP50:%.*]] = fptoui float [[TMP49]] to i32 2114; CHECK-NEXT: [[TMP51:%.*]] = sub i32 0, [[TMP46]] 2115; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]] 2116; CHECK-NEXT: [[TMP53:%.*]] = zext i32 [[TMP50]] to i64 2117; CHECK-NEXT: [[TMP54:%.*]] = zext i32 [[TMP52]] to i64 2118; CHECK-NEXT: [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]] 2119; CHECK-NEXT: [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32 2120; CHECK-NEXT: [[TMP57:%.*]] = lshr i64 [[TMP55]], 32 2121; CHECK-NEXT: [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32 2122; CHECK-NEXT: [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]] 2123; CHECK-NEXT: [[TMP60:%.*]] = zext i32 [[TMP45]] to i64 2124; CHECK-NEXT: [[TMP61:%.*]] = zext i32 [[TMP59]] to i64 2125; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]] 2126; CHECK-NEXT: [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32 2127; CHECK-NEXT: [[TMP64:%.*]] = lshr i64 [[TMP62]], 32 2128; CHECK-NEXT: [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32 2129; CHECK-NEXT: [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]] 2130; CHECK-NEXT: [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]] 2131; CHECK-NEXT: [[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]] 2132; CHECK-NEXT: [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]] 2133; CHECK-NEXT: [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]] 2134; CHECK-NEXT: [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]] 
2135; CHECK-NEXT: [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]] 2136; CHECK-NEXT: [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]] 2137; CHECK-NEXT: [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]] 2138; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]] 2139; CHECK-NEXT: [[TMP76:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP75]], i64 1 2140; CHECK-NEXT: [[TMP77:%.*]] = extractelement <4 x i32> [[X]], i64 2 2141; CHECK-NEXT: [[TMP78:%.*]] = extractelement <4 x i32> [[Y]], i64 2 2142; CHECK-NEXT: [[TMP79:%.*]] = ashr i32 [[TMP77]], 31 2143; CHECK-NEXT: [[TMP80:%.*]] = ashr i32 [[TMP78]], 31 2144; CHECK-NEXT: [[TMP81:%.*]] = add i32 [[TMP77]], [[TMP79]] 2145; CHECK-NEXT: [[TMP82:%.*]] = add i32 [[TMP78]], [[TMP80]] 2146; CHECK-NEXT: [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP79]] 2147; CHECK-NEXT: [[TMP84:%.*]] = xor i32 [[TMP82]], [[TMP80]] 2148; CHECK-NEXT: [[TMP85:%.*]] = uitofp i32 [[TMP84]] to float 2149; CHECK-NEXT: [[TMP86:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP85]]) 2150; CHECK-NEXT: [[TMP87:%.*]] = fmul fast float [[TMP86]], 0x41EFFFFFC0000000 2151; CHECK-NEXT: [[TMP88:%.*]] = fptoui float [[TMP87]] to i32 2152; CHECK-NEXT: [[TMP89:%.*]] = sub i32 0, [[TMP84]] 2153; CHECK-NEXT: [[TMP90:%.*]] = mul i32 [[TMP89]], [[TMP88]] 2154; CHECK-NEXT: [[TMP91:%.*]] = zext i32 [[TMP88]] to i64 2155; CHECK-NEXT: [[TMP92:%.*]] = zext i32 [[TMP90]] to i64 2156; CHECK-NEXT: [[TMP93:%.*]] = mul i64 [[TMP91]], [[TMP92]] 2157; CHECK-NEXT: [[TMP94:%.*]] = trunc i64 [[TMP93]] to i32 2158; CHECK-NEXT: [[TMP95:%.*]] = lshr i64 [[TMP93]], 32 2159; CHECK-NEXT: [[TMP96:%.*]] = trunc i64 [[TMP95]] to i32 2160; CHECK-NEXT: [[TMP97:%.*]] = add i32 [[TMP88]], [[TMP96]] 2161; CHECK-NEXT: [[TMP98:%.*]] = zext i32 [[TMP83]] to i64 2162; CHECK-NEXT: [[TMP99:%.*]] = zext i32 [[TMP97]] to i64 2163; CHECK-NEXT: [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]] 2164; CHECK-NEXT: [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32 2165; CHECK-NEXT: [[TMP102:%.*]] = lshr 
i64 [[TMP100]], 32 2166; CHECK-NEXT: [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32 2167; CHECK-NEXT: [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP84]] 2168; CHECK-NEXT: [[TMP105:%.*]] = sub i32 [[TMP83]], [[TMP104]] 2169; CHECK-NEXT: [[TMP106:%.*]] = icmp uge i32 [[TMP105]], [[TMP84]] 2170; CHECK-NEXT: [[TMP107:%.*]] = sub i32 [[TMP105]], [[TMP84]] 2171; CHECK-NEXT: [[TMP108:%.*]] = select i1 [[TMP106]], i32 [[TMP107]], i32 [[TMP105]] 2172; CHECK-NEXT: [[TMP109:%.*]] = icmp uge i32 [[TMP108]], [[TMP84]] 2173; CHECK-NEXT: [[TMP110:%.*]] = sub i32 [[TMP108]], [[TMP84]] 2174; CHECK-NEXT: [[TMP111:%.*]] = select i1 [[TMP109]], i32 [[TMP110]], i32 [[TMP108]] 2175; CHECK-NEXT: [[TMP112:%.*]] = xor i32 [[TMP111]], [[TMP79]] 2176; CHECK-NEXT: [[TMP113:%.*]] = sub i32 [[TMP112]], [[TMP79]] 2177; CHECK-NEXT: [[TMP114:%.*]] = insertelement <4 x i32> [[TMP76]], i32 [[TMP113]], i64 2 2178; CHECK-NEXT: [[TMP115:%.*]] = extractelement <4 x i32> [[X]], i64 3 2179; CHECK-NEXT: [[TMP116:%.*]] = extractelement <4 x i32> [[Y]], i64 3 2180; CHECK-NEXT: [[TMP117:%.*]] = ashr i32 [[TMP115]], 31 2181; CHECK-NEXT: [[TMP118:%.*]] = ashr i32 [[TMP116]], 31 2182; CHECK-NEXT: [[TMP119:%.*]] = add i32 [[TMP115]], [[TMP117]] 2183; CHECK-NEXT: [[TMP120:%.*]] = add i32 [[TMP116]], [[TMP118]] 2184; CHECK-NEXT: [[TMP121:%.*]] = xor i32 [[TMP119]], [[TMP117]] 2185; CHECK-NEXT: [[TMP122:%.*]] = xor i32 [[TMP120]], [[TMP118]] 2186; CHECK-NEXT: [[TMP123:%.*]] = uitofp i32 [[TMP122]] to float 2187; CHECK-NEXT: [[TMP124:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP123]]) 2188; CHECK-NEXT: [[TMP125:%.*]] = fmul fast float [[TMP124]], 0x41EFFFFFC0000000 2189; CHECK-NEXT: [[TMP126:%.*]] = fptoui float [[TMP125]] to i32 2190; CHECK-NEXT: [[TMP127:%.*]] = sub i32 0, [[TMP122]] 2191; CHECK-NEXT: [[TMP128:%.*]] = mul i32 [[TMP127]], [[TMP126]] 2192; CHECK-NEXT: [[TMP129:%.*]] = zext i32 [[TMP126]] to i64 2193; CHECK-NEXT: [[TMP130:%.*]] = zext i32 [[TMP128]] to i64 2194; CHECK-NEXT: [[TMP131:%.*]] = mul 
i64 [[TMP129]], [[TMP130]] 2195; CHECK-NEXT: [[TMP132:%.*]] = trunc i64 [[TMP131]] to i32 2196; CHECK-NEXT: [[TMP133:%.*]] = lshr i64 [[TMP131]], 32 2197; CHECK-NEXT: [[TMP134:%.*]] = trunc i64 [[TMP133]] to i32 2198; CHECK-NEXT: [[TMP135:%.*]] = add i32 [[TMP126]], [[TMP134]] 2199; CHECK-NEXT: [[TMP136:%.*]] = zext i32 [[TMP121]] to i64 2200; CHECK-NEXT: [[TMP137:%.*]] = zext i32 [[TMP135]] to i64 2201; CHECK-NEXT: [[TMP138:%.*]] = mul i64 [[TMP136]], [[TMP137]] 2202; CHECK-NEXT: [[TMP139:%.*]] = trunc i64 [[TMP138]] to i32 2203; CHECK-NEXT: [[TMP140:%.*]] = lshr i64 [[TMP138]], 32 2204; CHECK-NEXT: [[TMP141:%.*]] = trunc i64 [[TMP140]] to i32 2205; CHECK-NEXT: [[TMP142:%.*]] = mul i32 [[TMP141]], [[TMP122]] 2206; CHECK-NEXT: [[TMP143:%.*]] = sub i32 [[TMP121]], [[TMP142]] 2207; CHECK-NEXT: [[TMP144:%.*]] = icmp uge i32 [[TMP143]], [[TMP122]] 2208; CHECK-NEXT: [[TMP145:%.*]] = sub i32 [[TMP143]], [[TMP122]] 2209; CHECK-NEXT: [[TMP146:%.*]] = select i1 [[TMP144]], i32 [[TMP145]], i32 [[TMP143]] 2210; CHECK-NEXT: [[TMP147:%.*]] = icmp uge i32 [[TMP146]], [[TMP122]] 2211; CHECK-NEXT: [[TMP148:%.*]] = sub i32 [[TMP146]], [[TMP122]] 2212; CHECK-NEXT: [[TMP149:%.*]] = select i1 [[TMP147]], i32 [[TMP148]], i32 [[TMP146]] 2213; CHECK-NEXT: [[TMP150:%.*]] = xor i32 [[TMP149]], [[TMP117]] 2214; CHECK-NEXT: [[TMP151:%.*]] = sub i32 [[TMP150]], [[TMP117]] 2215; CHECK-NEXT: [[TMP152:%.*]] = insertelement <4 x i32> [[TMP114]], i32 [[TMP151]], i64 3 2216; CHECK-NEXT: store <4 x i32> [[TMP152]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 2217; CHECK-NEXT: ret void 2218; 2219; GFX6-LABEL: srem_v4i32: 2220; GFX6: ; %bb.0: 2221; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 2222; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2223; GFX6-NEXT: s_mov_b32 s3, 0xf000 2224; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2225; GFX6-NEXT: s_ashr_i32 s2, s8, 31 2226; GFX6-NEXT: s_add_i32 s8, s8, s2 2227; GFX6-NEXT: s_xor_b32 s8, s8, s2 2228; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 2229; GFX6-NEXT: 
s_ashr_i32 s13, s9, 31 2230; GFX6-NEXT: s_add_i32 s9, s9, s13 2231; GFX6-NEXT: s_xor_b32 s9, s9, s13 2232; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 2233; GFX6-NEXT: s_sub_i32 s14, 0, s8 2234; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 2235; GFX6-NEXT: s_ashr_i32 s12, s4, 31 2236; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 2237; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 2238; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 2239; GFX6-NEXT: s_add_i32 s4, s4, s12 2240; GFX6-NEXT: s_xor_b32 s4, s4, s12 2241; GFX6-NEXT: v_mul_lo_u32 v2, s14, v0 2242; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 2243; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 2244; GFX6-NEXT: s_sub_i32 s14, 0, s9 2245; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 2246; GFX6-NEXT: s_ashr_i32 s13, s5, 31 2247; GFX6-NEXT: s_add_i32 s5, s5, s13 2248; GFX6-NEXT: s_xor_b32 s5, s5, s13 2249; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 2250; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 2251; GFX6-NEXT: v_mul_lo_u32 v2, s14, v1 2252; GFX6-NEXT: s_mov_b32 s2, -1 2253; GFX6-NEXT: v_mul_lo_u32 v0, v0, s8 2254; GFX6-NEXT: v_mul_hi_u32 v2, v1, v2 2255; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 2256; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 2257; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 2258; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 2259; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 2260; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 2261; GFX6-NEXT: s_ashr_i32 s4, s10, 31 2262; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 2263; GFX6-NEXT: s_add_i32 s8, s10, s4 2264; GFX6-NEXT: s_xor_b32 s4, s8, s4 2265; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s4 2266; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 2267; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 2268; GFX6-NEXT: v_xor_b32_e32 v0, s12, v0 2269; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 2270; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9 2271; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 2272; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 2273; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 2274; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 2275; GFX6-NEXT: v_subrev_i32_e32 v3, 
vcc, s9, v1 2276; GFX6-NEXT: s_sub_i32 s5, 0, s4 2277; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 2278; GFX6-NEXT: v_mul_lo_u32 v4, s5, v2 2279; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 2280; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s9, v1 2281; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 2282; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 2283; GFX6-NEXT: v_mul_hi_u32 v3, v2, v4 2284; GFX6-NEXT: s_ashr_i32 s8, s11, 31 2285; GFX6-NEXT: s_add_i32 s9, s11, s8 2286; GFX6-NEXT: s_ashr_i32 s5, s6, 31 2287; GFX6-NEXT: s_xor_b32 s8, s9, s8 2288; GFX6-NEXT: s_add_i32 s6, s6, s5 2289; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 2290; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s8 2291; GFX6-NEXT: s_xor_b32 s6, s6, s5 2292; GFX6-NEXT: v_mul_hi_u32 v2, s6, v2 2293; GFX6-NEXT: v_xor_b32_e32 v1, s13, v1 2294; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 2295; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s13, v1 2296; GFX6-NEXT: v_mul_lo_u32 v2, v2, s4 2297; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 2298; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 2299; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 2300; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v2 2301; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 2302; GFX6-NEXT: s_sub_i32 s6, 0, s8 2303; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 2304; GFX6-NEXT: v_mul_lo_u32 v4, s6, v3 2305; GFX6-NEXT: s_ashr_i32 s6, s7, 31 2306; GFX6-NEXT: s_add_i32 s7, s7, s6 2307; GFX6-NEXT: s_xor_b32 s7, s7, s6 2308; GFX6-NEXT: v_mul_hi_u32 v4, v3, v4 2309; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s4, v2 2310; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 2311; GFX6-NEXT: v_mul_hi_u32 v3, s7, v3 2312; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 2313; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 2314; GFX6-NEXT: v_xor_b32_e32 v2, s5, v2 2315; GFX6-NEXT: v_mul_lo_u32 v3, v3, s8 2316; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s5, v2 2317; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s7, v3 2318; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s8, v3 2319; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 2320; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 
2321; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s8, v3 2322; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 2323; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 2324; GFX6-NEXT: v_xor_b32_e32 v3, s6, v3 2325; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s6, v3 2326; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 2327; GFX6-NEXT: s_endpgm 2328; 2329; GFX9-LABEL: srem_v4i32: 2330; GFX9: ; %bb.0: 2331; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 2332; GFX9-NEXT: v_mov_b32_e32 v4, 0 2333; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2334; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2335; GFX9-NEXT: s_ashr_i32 s2, s8, 31 2336; GFX9-NEXT: s_add_i32 s3, s8, s2 2337; GFX9-NEXT: s_xor_b32 s2, s3, s2 2338; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 2339; GFX9-NEXT: s_sub_i32 s8, 0, s2 2340; GFX9-NEXT: s_ashr_i32 s3, s4, 31 2341; GFX9-NEXT: s_add_i32 s4, s4, s3 2342; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 2343; GFX9-NEXT: s_xor_b32 s4, s4, s3 2344; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 2345; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 2346; GFX9-NEXT: v_readfirstlane_b32 s12, v0 2347; GFX9-NEXT: s_mul_i32 s8, s8, s12 2348; GFX9-NEXT: s_mul_hi_u32 s8, s12, s8 2349; GFX9-NEXT: s_add_i32 s12, s12, s8 2350; GFX9-NEXT: s_mul_hi_u32 s8, s4, s12 2351; GFX9-NEXT: s_mul_i32 s8, s8, s2 2352; GFX9-NEXT: s_sub_i32 s4, s4, s8 2353; GFX9-NEXT: s_sub_i32 s8, s4, s2 2354; GFX9-NEXT: s_cmp_ge_u32 s4, s2 2355; GFX9-NEXT: s_cselect_b32 s4, s8, s4 2356; GFX9-NEXT: s_sub_i32 s8, s4, s2 2357; GFX9-NEXT: s_cmp_ge_u32 s4, s2 2358; GFX9-NEXT: s_cselect_b32 s2, s8, s4 2359; GFX9-NEXT: s_ashr_i32 s4, s9, 31 2360; GFX9-NEXT: s_add_i32 s8, s9, s4 2361; GFX9-NEXT: s_xor_b32 s4, s8, s4 2362; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 2363; GFX9-NEXT: s_ashr_i32 s8, s5, 31 2364; GFX9-NEXT: s_xor_b32 s2, s2, s3 2365; GFX9-NEXT: s_add_i32 s5, s5, s8 2366; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 2367; GFX9-NEXT: s_sub_i32 s2, s2, s3 2368; GFX9-NEXT: s_xor_b32 s3, s5, s8 2369; GFX9-NEXT: s_sub_i32 s5, 0, s4 2370; GFX9-NEXT: v_mul_f32_e32 v0, 
0x4f7ffffe, v0 2371; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 2372; GFX9-NEXT: v_readfirstlane_b32 s9, v0 2373; GFX9-NEXT: s_mul_i32 s5, s5, s9 2374; GFX9-NEXT: s_mul_hi_u32 s5, s9, s5 2375; GFX9-NEXT: s_add_i32 s9, s9, s5 2376; GFX9-NEXT: s_mul_hi_u32 s5, s3, s9 2377; GFX9-NEXT: s_mul_i32 s5, s5, s4 2378; GFX9-NEXT: s_sub_i32 s3, s3, s5 2379; GFX9-NEXT: s_sub_i32 s5, s3, s4 2380; GFX9-NEXT: s_cmp_ge_u32 s3, s4 2381; GFX9-NEXT: s_cselect_b32 s3, s5, s3 2382; GFX9-NEXT: s_sub_i32 s5, s3, s4 2383; GFX9-NEXT: s_cmp_ge_u32 s3, s4 2384; GFX9-NEXT: s_cselect_b32 s3, s5, s3 2385; GFX9-NEXT: s_ashr_i32 s4, s10, 31 2386; GFX9-NEXT: s_add_i32 s5, s10, s4 2387; GFX9-NEXT: s_xor_b32 s4, s5, s4 2388; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 2389; GFX9-NEXT: s_xor_b32 s3, s3, s8 2390; GFX9-NEXT: s_sub_i32 s3, s3, s8 2391; GFX9-NEXT: s_sub_i32 s8, 0, s4 2392; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 2393; GFX9-NEXT: s_ashr_i32 s5, s6, 31 2394; GFX9-NEXT: s_add_i32 s6, s6, s5 2395; GFX9-NEXT: s_xor_b32 s6, s6, s5 2396; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 2397; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 2398; GFX9-NEXT: v_mov_b32_e32 v1, s3 2399; GFX9-NEXT: v_readfirstlane_b32 s9, v0 2400; GFX9-NEXT: s_mul_i32 s8, s8, s9 2401; GFX9-NEXT: s_mul_hi_u32 s8, s9, s8 2402; GFX9-NEXT: s_add_i32 s9, s9, s8 2403; GFX9-NEXT: s_mul_hi_u32 s8, s6, s9 2404; GFX9-NEXT: s_mul_i32 s8, s8, s4 2405; GFX9-NEXT: s_sub_i32 s6, s6, s8 2406; GFX9-NEXT: s_sub_i32 s8, s6, s4 2407; GFX9-NEXT: s_cmp_ge_u32 s6, s4 2408; GFX9-NEXT: s_cselect_b32 s6, s8, s6 2409; GFX9-NEXT: s_sub_i32 s8, s6, s4 2410; GFX9-NEXT: s_cmp_ge_u32 s6, s4 2411; GFX9-NEXT: s_cselect_b32 s4, s8, s6 2412; GFX9-NEXT: s_ashr_i32 s6, s11, 31 2413; GFX9-NEXT: s_add_i32 s8, s11, s6 2414; GFX9-NEXT: s_xor_b32 s6, s8, s6 2415; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 2416; GFX9-NEXT: v_mov_b32_e32 v0, s2 2417; GFX9-NEXT: s_ashr_i32 s2, s7, 31 2418; GFX9-NEXT: s_xor_b32 s3, s4, s5 2419; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 2420; GFX9-NEXT: s_add_i32 s4, s7, s2 
2421; GFX9-NEXT: s_sub_i32 s3, s3, s5 2422; GFX9-NEXT: s_sub_i32 s5, 0, s6 2423; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 2424; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 2425; GFX9-NEXT: s_xor_b32 s4, s4, s2 2426; GFX9-NEXT: v_readfirstlane_b32 s7, v2 2427; GFX9-NEXT: s_mul_i32 s5, s5, s7 2428; GFX9-NEXT: s_mul_hi_u32 s5, s7, s5 2429; GFX9-NEXT: s_add_i32 s7, s7, s5 2430; GFX9-NEXT: s_mul_hi_u32 s5, s4, s7 2431; GFX9-NEXT: s_mul_i32 s5, s5, s6 2432; GFX9-NEXT: s_sub_i32 s4, s4, s5 2433; GFX9-NEXT: s_sub_i32 s5, s4, s6 2434; GFX9-NEXT: s_cmp_ge_u32 s4, s6 2435; GFX9-NEXT: s_cselect_b32 s4, s5, s4 2436; GFX9-NEXT: s_sub_i32 s5, s4, s6 2437; GFX9-NEXT: s_cmp_ge_u32 s4, s6 2438; GFX9-NEXT: s_cselect_b32 s4, s5, s4 2439; GFX9-NEXT: s_xor_b32 s4, s4, s2 2440; GFX9-NEXT: s_sub_i32 s2, s4, s2 2441; GFX9-NEXT: v_mov_b32_e32 v2, s3 2442; GFX9-NEXT: v_mov_b32_e32 v3, s2 2443; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 2444; GFX9-NEXT: s_endpgm 2445 %r = srem <4 x i32> %x, %y 2446 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 2447 ret void 2448} 2449 2450define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 2451; CHECK-LABEL: @udiv_v4i16( 2452; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 2453; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 2454; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 2455; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 2456; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 2457; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 2458; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 2459; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 2460; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 2461; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 2462; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float 
[[TMP5]]) 2463; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 2464; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 2465; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 2466; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 2467; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 2468; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 2469; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 65535 2470; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16 2471; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i16> undef, i16 [[TMP19]], i64 0 2472; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i16> [[X]], i64 1 2473; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i16> [[Y]], i64 1 2474; CHECK-NEXT: [[TMP23:%.*]] = zext i16 [[TMP21]] to i32 2475; CHECK-NEXT: [[TMP24:%.*]] = zext i16 [[TMP22]] to i32 2476; CHECK-NEXT: [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float 2477; CHECK-NEXT: [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float 2478; CHECK-NEXT: [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]]) 2479; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]] 2480; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]]) 2481; CHECK-NEXT: [[TMP30:%.*]] = fneg fast float [[TMP29]] 2482; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]]) 2483; CHECK-NEXT: [[TMP32:%.*]] = fptoui float [[TMP29]] to i32 2484; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]]) 2485; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]]) 2486; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]] 2487; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0 2488; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]] 2489; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 65535 2490; CHECK-NEXT: [[TMP39:%.*]] = 
trunc i32 [[TMP38]] to i16 2491; CHECK-NEXT: [[TMP40:%.*]] = insertelement <4 x i16> [[TMP20]], i16 [[TMP39]], i64 1 2492; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i16> [[X]], i64 2 2493; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i16> [[Y]], i64 2 2494; CHECK-NEXT: [[TMP43:%.*]] = zext i16 [[TMP41]] to i32 2495; CHECK-NEXT: [[TMP44:%.*]] = zext i16 [[TMP42]] to i32 2496; CHECK-NEXT: [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float 2497; CHECK-NEXT: [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float 2498; CHECK-NEXT: [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]]) 2499; CHECK-NEXT: [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]] 2500; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]]) 2501; CHECK-NEXT: [[TMP50:%.*]] = fneg fast float [[TMP49]] 2502; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]]) 2503; CHECK-NEXT: [[TMP52:%.*]] = fptoui float [[TMP49]] to i32 2504; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]]) 2505; CHECK-NEXT: [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]]) 2506; CHECK-NEXT: [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]] 2507; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0 2508; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]] 2509; CHECK-NEXT: [[TMP58:%.*]] = and i32 [[TMP57]], 65535 2510; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16 2511; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i16> [[TMP40]], i16 [[TMP59]], i64 2 2512; CHECK-NEXT: [[TMP61:%.*]] = extractelement <4 x i16> [[X]], i64 3 2513; CHECK-NEXT: [[TMP62:%.*]] = extractelement <4 x i16> [[Y]], i64 3 2514; CHECK-NEXT: [[TMP63:%.*]] = zext i16 [[TMP61]] to i32 2515; CHECK-NEXT: [[TMP64:%.*]] = zext i16 [[TMP62]] to i32 2516; CHECK-NEXT: [[TMP65:%.*]] = uitofp i32 [[TMP63]] to float 2517; CHECK-NEXT: [[TMP66:%.*]] = uitofp i32 [[TMP64]] to float 2518; CHECK-NEXT: 
[[TMP67:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP66]]) 2519; CHECK-NEXT: [[TMP68:%.*]] = fmul fast float [[TMP65]], [[TMP67]] 2520; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.trunc.f32(float [[TMP68]]) 2521; CHECK-NEXT: [[TMP70:%.*]] = fneg fast float [[TMP69]] 2522; CHECK-NEXT: [[TMP71:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP70]], float [[TMP66]], float [[TMP65]]) 2523; CHECK-NEXT: [[TMP72:%.*]] = fptoui float [[TMP69]] to i32 2524; CHECK-NEXT: [[TMP73:%.*]] = call fast float @llvm.fabs.f32(float [[TMP71]]) 2525; CHECK-NEXT: [[TMP74:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 2526; CHECK-NEXT: [[TMP75:%.*]] = fcmp fast oge float [[TMP73]], [[TMP74]] 2527; CHECK-NEXT: [[TMP76:%.*]] = select i1 [[TMP75]], i32 1, i32 0 2528; CHECK-NEXT: [[TMP77:%.*]] = add i32 [[TMP72]], [[TMP76]] 2529; CHECK-NEXT: [[TMP78:%.*]] = and i32 [[TMP77]], 65535 2530; CHECK-NEXT: [[TMP79:%.*]] = trunc i32 [[TMP78]] to i16 2531; CHECK-NEXT: [[TMP80:%.*]] = insertelement <4 x i16> [[TMP60]], i16 [[TMP79]], i64 3 2532; CHECK-NEXT: store <4 x i16> [[TMP80]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 2533; CHECK-NEXT: ret void 2534; 2535; GFX6-LABEL: udiv_v4i16: 2536; GFX6: ; %bb.0: 2537; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb 2538; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2539; GFX6-NEXT: s_mov_b32 s3, 0xf000 2540; GFX6-NEXT: s_mov_b32 s2, -1 2541; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2542; GFX6-NEXT: s_and_b32 s9, s6, 0xffff 2543; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 2544; GFX6-NEXT: s_lshr_b32 s6, s6, 16 2545; GFX6-NEXT: s_and_b32 s8, s4, 0xffff 2546; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 2547; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s8 2548; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 2549; GFX6-NEXT: s_lshr_b32 s4, s4, 16 2550; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 2551; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 2552; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 2553; GFX6-NEXT: v_trunc_f32_e32 v3, v3 2554; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 2555; 
GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 2556; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 2557; GFX6-NEXT: v_trunc_f32_e32 v1, v1 2558; GFX6-NEXT: s_and_b32 s4, s7, 0xffff 2559; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 2560; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v4 2561; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 2562; GFX6-NEXT: s_and_b32 s4, s5, 0xffff 2563; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc 2564; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 2565; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 2566; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 2567; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 2568; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v1, vcc 2569; GFX6-NEXT: v_mul_f32_e32 v1, v5, v6 2570; GFX6-NEXT: v_trunc_f32_e32 v1, v1 2571; GFX6-NEXT: s_lshr_b32 s4, s7, 16 2572; GFX6-NEXT: v_mad_f32 v3, -v1, v4, v5 2573; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 2574; GFX6-NEXT: s_lshr_b32 s4, s5, 16 2575; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s4 2576; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 2577; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 2578; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 2579; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 2580; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2581; GFX6-NEXT: v_mul_f32_e32 v3, v6, v7 2582; GFX6-NEXT: v_trunc_f32_e32 v3, v3 2583; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v3 2584; GFX6-NEXT: v_mad_f32 v3, -v3, v5, v6 2585; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 2586; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 2587; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 2588; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 2589; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 2590; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 2591; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 2592; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2593; GFX6-NEXT: s_endpgm 2594; 2595; GFX9-LABEL: udiv_v4i16: 2596; GFX9: ; %bb.0: 2597; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 2598; GFX9-NEXT: v_mov_b32_e32 v6, 0 2599; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2600; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2601; GFX9-NEXT: s_and_b32 s3, s6, 0xffff 2602; 
GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 2603; GFX9-NEXT: s_and_b32 s2, s4, 0xffff 2604; GFX9-NEXT: s_lshr_b32 s6, s6, 16 2605; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 2606; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 2607; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 2608; GFX9-NEXT: s_lshr_b32 s4, s4, 16 2609; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 2610; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 2611; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 2612; GFX9-NEXT: v_trunc_f32_e32 v4, v4 2613; GFX9-NEXT: s_and_b32 s2, s7, 0xffff 2614; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 2615; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 2616; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2 2617; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 2618; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 2619; GFX9-NEXT: s_and_b32 s2, s5, 0xffff 2620; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc 2621; GFX9-NEXT: v_trunc_f32_e32 v2, v5 2622; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2 2623; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4 2624; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 2625; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 2626; GFX9-NEXT: s_lshr_b32 s2, s7, 16 2627; GFX9-NEXT: v_mul_f32_e32 v1, v5, v7 2628; GFX9-NEXT: v_trunc_f32_e32 v1, v1 2629; GFX9-NEXT: v_mad_f32 v3, -v1, v4, v5 2630; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2 2631; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 2632; GFX9-NEXT: s_lshr_b32 s2, s5, 16 2633; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s2 2634; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5 2635; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc 2636; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 2637; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 2638; GFX9-NEXT: v_mul_f32_e32 v3, v7, v8 2639; GFX9-NEXT: v_trunc_f32_e32 v3, v3 2640; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3 2641; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 2642; GFX9-NEXT: v_mad_f32 v3, -v3, v5, v7 2643; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 2644; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 2645; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 2646; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 2647; GFX9-NEXT: v_lshl_or_b32 v1, v3, 
16, v1 2648; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 2649; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] 2650; GFX9-NEXT: s_endpgm 2651 %r = udiv <4 x i16> %x, %y 2652 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 2653 ret void 2654} 2655 2656define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 2657; CHECK-LABEL: @urem_v4i16( 2658; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 2659; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 2660; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 2661; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 2662; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 2663; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 2664; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 2665; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 2666; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 2667; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 2668; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 2669; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 2670; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 2671; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 2672; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 2673; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 2674; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 2675; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]] 2676; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]] 2677; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 65535 2678; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16 2679; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i16> undef, i16 [[TMP21]], i64 0 2680; CHECK-NEXT: [[TMP23:%.*]] = 
extractelement <4 x i16> [[X]], i64 1 2681; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i16> [[Y]], i64 1 2682; CHECK-NEXT: [[TMP25:%.*]] = zext i16 [[TMP23]] to i32 2683; CHECK-NEXT: [[TMP26:%.*]] = zext i16 [[TMP24]] to i32 2684; CHECK-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float 2685; CHECK-NEXT: [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float 2686; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]]) 2687; CHECK-NEXT: [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]] 2688; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]]) 2689; CHECK-NEXT: [[TMP32:%.*]] = fneg fast float [[TMP31]] 2690; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]]) 2691; CHECK-NEXT: [[TMP34:%.*]] = fptoui float [[TMP31]] to i32 2692; CHECK-NEXT: [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 2693; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]]) 2694; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]] 2695; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0 2696; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]] 2697; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]] 2698; CHECK-NEXT: [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]] 2699; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], 65535 2700; CHECK-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16 2701; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i16> [[TMP22]], i16 [[TMP43]], i64 1 2702; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i16> [[X]], i64 2 2703; CHECK-NEXT: [[TMP46:%.*]] = extractelement <4 x i16> [[Y]], i64 2 2704; CHECK-NEXT: [[TMP47:%.*]] = zext i16 [[TMP45]] to i32 2705; CHECK-NEXT: [[TMP48:%.*]] = zext i16 [[TMP46]] to i32 2706; CHECK-NEXT: [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float 2707; CHECK-NEXT: [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float 2708; CHECK-NEXT: [[TMP51:%.*]] = call fast float 
@llvm.amdgcn.rcp.f32(float [[TMP50]]) 2709; CHECK-NEXT: [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]] 2710; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]]) 2711; CHECK-NEXT: [[TMP54:%.*]] = fneg fast float [[TMP53]] 2712; CHECK-NEXT: [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]]) 2713; CHECK-NEXT: [[TMP56:%.*]] = fptoui float [[TMP53]] to i32 2714; CHECK-NEXT: [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]]) 2715; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]]) 2716; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]] 2717; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0 2718; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]] 2719; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]] 2720; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]] 2721; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 65535 2722; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16 2723; CHECK-NEXT: [[TMP66:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[TMP65]], i64 2 2724; CHECK-NEXT: [[TMP67:%.*]] = extractelement <4 x i16> [[X]], i64 3 2725; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x i16> [[Y]], i64 3 2726; CHECK-NEXT: [[TMP69:%.*]] = zext i16 [[TMP67]] to i32 2727; CHECK-NEXT: [[TMP70:%.*]] = zext i16 [[TMP68]] to i32 2728; CHECK-NEXT: [[TMP71:%.*]] = uitofp i32 [[TMP69]] to float 2729; CHECK-NEXT: [[TMP72:%.*]] = uitofp i32 [[TMP70]] to float 2730; CHECK-NEXT: [[TMP73:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP72]]) 2731; CHECK-NEXT: [[TMP74:%.*]] = fmul fast float [[TMP71]], [[TMP73]] 2732; CHECK-NEXT: [[TMP75:%.*]] = call fast float @llvm.trunc.f32(float [[TMP74]]) 2733; CHECK-NEXT: [[TMP76:%.*]] = fneg fast float [[TMP75]] 2734; CHECK-NEXT: [[TMP77:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP76]], float [[TMP72]], float [[TMP71]]) 2735; CHECK-NEXT: 
[[TMP78:%.*]] = fptoui float [[TMP75]] to i32 2736; CHECK-NEXT: [[TMP79:%.*]] = call fast float @llvm.fabs.f32(float [[TMP77]]) 2737; CHECK-NEXT: [[TMP80:%.*]] = call fast float @llvm.fabs.f32(float [[TMP72]]) 2738; CHECK-NEXT: [[TMP81:%.*]] = fcmp fast oge float [[TMP79]], [[TMP80]] 2739; CHECK-NEXT: [[TMP82:%.*]] = select i1 [[TMP81]], i32 1, i32 0 2740; CHECK-NEXT: [[TMP83:%.*]] = add i32 [[TMP78]], [[TMP82]] 2741; CHECK-NEXT: [[TMP84:%.*]] = mul i32 [[TMP83]], [[TMP70]] 2742; CHECK-NEXT: [[TMP85:%.*]] = sub i32 [[TMP69]], [[TMP84]] 2743; CHECK-NEXT: [[TMP86:%.*]] = and i32 [[TMP85]], 65535 2744; CHECK-NEXT: [[TMP87:%.*]] = trunc i32 [[TMP86]] to i16 2745; CHECK-NEXT: [[TMP88:%.*]] = insertelement <4 x i16> [[TMP66]], i16 [[TMP87]], i64 3 2746; CHECK-NEXT: store <4 x i16> [[TMP88]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 2747; CHECK-NEXT: ret void 2748; 2749; GFX6-LABEL: urem_v4i16: 2750; GFX6: ; %bb.0: 2751; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb 2752; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2753; GFX6-NEXT: s_mov_b32 s3, 0xf000 2754; GFX6-NEXT: s_mov_b32 s2, -1 2755; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2756; GFX6-NEXT: s_and_b32 s8, s6, 0xffff 2757; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 2758; GFX6-NEXT: v_mov_b32_e32 v4, s6 2759; GFX6-NEXT: v_alignbit_b32 v4, s7, v4, 16 2760; GFX6-NEXT: s_and_b32 s8, s4, 0xffff 2761; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v4 2762; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s8 2763; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 2764; GFX6-NEXT: v_cvt_f32_u32_e32 v5, v5 2765; GFX6-NEXT: v_mov_b32_e32 v1, s4 2766; GFX6-NEXT: v_alignbit_b32 v1, s5, v1, 16 2767; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v1 2768; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 2769; GFX6-NEXT: v_cvt_f32_u32_e32 v6, v6 2770; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 2771; GFX6-NEXT: v_trunc_f32_e32 v3, v3 2772; GFX6-NEXT: v_mad_f32 v2, -v3, v0, v2 2773; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 2774; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 2775; GFX6-NEXT: v_mul_f32_e32 v2, v6, v7 
2776; GFX6-NEXT: v_trunc_f32_e32 v2, v2 2777; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 2778; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 2779; GFX6-NEXT: v_mad_f32 v2, -v2, v5, v6 2780; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 2781; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v5 2782; GFX6-NEXT: s_and_b32 s6, s7, 0xffff 2783; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc 2784; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s6 2785; GFX6-NEXT: s_and_b32 s6, s5, 0xffff 2786; GFX6-NEXT: v_mul_lo_u32 v2, v2, v4 2787; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s6 2788; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 2789; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 2790; GFX6-NEXT: s_lshr_b32 s4, s7, 16 2791; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v2, v1 2792; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 2793; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 2794; GFX6-NEXT: s_lshr_b32 s6, s5, 16 2795; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s6 2796; GFX6-NEXT: v_trunc_f32_e32 v1, v1 2797; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 2798; GFX6-NEXT: v_mad_f32 v4, -v1, v3, v4 2799; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 2800; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v3 2801; GFX6-NEXT: v_mul_f32_e32 v3, v6, v7 2802; GFX6-NEXT: v_trunc_f32_e32 v3, v3 2803; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v3 2804; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2805; GFX6-NEXT: v_mad_f32 v3, -v3, v5, v6 2806; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 2807; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 2808; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 2809; GFX6-NEXT: v_mul_lo_u32 v3, v3, s4 2810; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 2811; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 2812; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 2813; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s6, v3 2814; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 2815; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 2816; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 2817; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 2818; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2819; GFX6-NEXT: s_endpgm 2820; 2821; GFX9-LABEL: urem_v4i16: 2822; GFX9: ; %bb.0: 2823; 
GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 2824; GFX9-NEXT: v_mov_b32_e32 v6, 0 2825; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2826; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2827; GFX9-NEXT: s_and_b32 s3, s6, 0xffff 2828; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 2829; GFX9-NEXT: s_and_b32 s2, s4, 0xffff 2830; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 2831; GFX9-NEXT: s_lshr_b32 s6, s6, 16 2832; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 2833; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 2834; GFX9-NEXT: s_lshr_b32 s4, s4, 16 2835; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 2836; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 2837; GFX9-NEXT: v_trunc_f32_e32 v4, v4 2838; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 2839; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 2840; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 2841; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 2842; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc 2843; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 2844; GFX9-NEXT: s_and_b32 s3, s7, 0xffff 2845; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s3 2846; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 2847; GFX9-NEXT: v_trunc_f32_e32 v2, v5 2848; GFX9-NEXT: s_and_b32 s8, s5, 0xffff 2849; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 2850; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 2851; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s8 2852; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4 2853; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 2854; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc 2855; GFX9-NEXT: v_mul_f32_e32 v2, v5, v7 2856; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 2857; GFX9-NEXT: v_trunc_f32_e32 v2, v2 2858; GFX9-NEXT: s_lshr_b32 s6, s7, 16 2859; GFX9-NEXT: v_mad_f32 v3, -v2, v4, v5 2860; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s6 2861; GFX9-NEXT: s_lshr_b32 s5, s5, 16 2862; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s5 2863; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 2864; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5 2865; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 2866; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 2867; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc 2868; GFX9-NEXT: v_mul_f32_e32 v3, v7, v8 2869; 
GFX9-NEXT: v_trunc_f32_e32 v3, v3 2870; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3 2871; GFX9-NEXT: v_mad_f32 v3, -v3, v5, v7 2872; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 2873; GFX9-NEXT: v_mul_lo_u32 v2, v2, s3 2874; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 2875; GFX9-NEXT: v_mul_lo_u32 v3, v3, s6 2876; GFX9-NEXT: v_sub_u32_e32 v4, s4, v1 2877; GFX9-NEXT: v_sub_u32_e32 v1, s8, v2 2878; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 2879; GFX9-NEXT: v_sub_u32_e32 v2, s5, v3 2880; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 2881; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 2882; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 2883; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] 2884; GFX9-NEXT: s_endpgm 2885 %r = urem <4 x i16> %x, %y 2886 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 2887 ret void 2888} 2889 2890define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 2891; CHECK-LABEL: @sdiv_v4i16( 2892; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 2893; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 2894; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 2895; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 2896; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 2897; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 2898; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 2899; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 2900; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 2901; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 2902; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 2903; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 2904; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 2905; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 2906; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 2907; 
CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 2908; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 2909; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 2910; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 2911; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 2912; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 16 2913; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 16 2914; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16 2915; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x i16> undef, i16 [[TMP23]], i64 0 2916; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[X]], i64 1 2917; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[Y]], i64 1 2918; CHECK-NEXT: [[TMP27:%.*]] = sext i16 [[TMP25]] to i32 2919; CHECK-NEXT: [[TMP28:%.*]] = sext i16 [[TMP26]] to i32 2920; CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]] 2921; CHECK-NEXT: [[TMP30:%.*]] = ashr i32 [[TMP29]], 30 2922; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 1 2923; CHECK-NEXT: [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float 2924; CHECK-NEXT: [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float 2925; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 2926; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]] 2927; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]]) 2928; CHECK-NEXT: [[TMP37:%.*]] = fneg fast float [[TMP36]] 2929; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]]) 2930; CHECK-NEXT: [[TMP39:%.*]] = fptosi float [[TMP36]] to i32 2931; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]]) 2932; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 2933; CHECK-NEXT: [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]] 2934; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], 
i32 0 2935; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]] 2936; CHECK-NEXT: [[TMP45:%.*]] = shl i32 [[TMP44]], 16 2937; CHECK-NEXT: [[TMP46:%.*]] = ashr i32 [[TMP45]], 16 2938; CHECK-NEXT: [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16 2939; CHECK-NEXT: [[TMP48:%.*]] = insertelement <4 x i16> [[TMP24]], i16 [[TMP47]], i64 1 2940; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i16> [[X]], i64 2 2941; CHECK-NEXT: [[TMP50:%.*]] = extractelement <4 x i16> [[Y]], i64 2 2942; CHECK-NEXT: [[TMP51:%.*]] = sext i16 [[TMP49]] to i32 2943; CHECK-NEXT: [[TMP52:%.*]] = sext i16 [[TMP50]] to i32 2944; CHECK-NEXT: [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]] 2945; CHECK-NEXT: [[TMP54:%.*]] = ashr i32 [[TMP53]], 30 2946; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 1 2947; CHECK-NEXT: [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float 2948; CHECK-NEXT: [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float 2949; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]]) 2950; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]] 2951; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]]) 2952; CHECK-NEXT: [[TMP61:%.*]] = fneg fast float [[TMP60]] 2953; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]]) 2954; CHECK-NEXT: [[TMP63:%.*]] = fptosi float [[TMP60]] to i32 2955; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]]) 2956; CHECK-NEXT: [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]]) 2957; CHECK-NEXT: [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]] 2958; CHECK-NEXT: [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0 2959; CHECK-NEXT: [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]] 2960; CHECK-NEXT: [[TMP69:%.*]] = shl i32 [[TMP68]], 16 2961; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 16 2962; CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16 2963; CHECK-NEXT: [[TMP72:%.*]] = insertelement <4 x 
i16> [[TMP48]], i16 [[TMP71]], i64 2 2964; CHECK-NEXT: [[TMP73:%.*]] = extractelement <4 x i16> [[X]], i64 3 2965; CHECK-NEXT: [[TMP74:%.*]] = extractelement <4 x i16> [[Y]], i64 3 2966; CHECK-NEXT: [[TMP75:%.*]] = sext i16 [[TMP73]] to i32 2967; CHECK-NEXT: [[TMP76:%.*]] = sext i16 [[TMP74]] to i32 2968; CHECK-NEXT: [[TMP77:%.*]] = xor i32 [[TMP75]], [[TMP76]] 2969; CHECK-NEXT: [[TMP78:%.*]] = ashr i32 [[TMP77]], 30 2970; CHECK-NEXT: [[TMP79:%.*]] = or i32 [[TMP78]], 1 2971; CHECK-NEXT: [[TMP80:%.*]] = sitofp i32 [[TMP75]] to float 2972; CHECK-NEXT: [[TMP81:%.*]] = sitofp i32 [[TMP76]] to float 2973; CHECK-NEXT: [[TMP82:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP81]]) 2974; CHECK-NEXT: [[TMP83:%.*]] = fmul fast float [[TMP80]], [[TMP82]] 2975; CHECK-NEXT: [[TMP84:%.*]] = call fast float @llvm.trunc.f32(float [[TMP83]]) 2976; CHECK-NEXT: [[TMP85:%.*]] = fneg fast float [[TMP84]] 2977; CHECK-NEXT: [[TMP86:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP85]], float [[TMP81]], float [[TMP80]]) 2978; CHECK-NEXT: [[TMP87:%.*]] = fptosi float [[TMP84]] to i32 2979; CHECK-NEXT: [[TMP88:%.*]] = call fast float @llvm.fabs.f32(float [[TMP86]]) 2980; CHECK-NEXT: [[TMP89:%.*]] = call fast float @llvm.fabs.f32(float [[TMP81]]) 2981; CHECK-NEXT: [[TMP90:%.*]] = fcmp fast oge float [[TMP88]], [[TMP89]] 2982; CHECK-NEXT: [[TMP91:%.*]] = select i1 [[TMP90]], i32 [[TMP79]], i32 0 2983; CHECK-NEXT: [[TMP92:%.*]] = add i32 [[TMP87]], [[TMP91]] 2984; CHECK-NEXT: [[TMP93:%.*]] = shl i32 [[TMP92]], 16 2985; CHECK-NEXT: [[TMP94:%.*]] = ashr i32 [[TMP93]], 16 2986; CHECK-NEXT: [[TMP95:%.*]] = trunc i32 [[TMP94]] to i16 2987; CHECK-NEXT: [[TMP96:%.*]] = insertelement <4 x i16> [[TMP72]], i16 [[TMP95]], i64 3 2988; CHECK-NEXT: store <4 x i16> [[TMP96]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 2989; CHECK-NEXT: ret void 2990; 2991; GFX6-LABEL: sdiv_v4i16: 2992; GFX6: ; %bb.0: 2993; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb 2994; GFX6-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x9 2995; GFX6-NEXT: s_mov_b32 s3, 0xf000 2996; GFX6-NEXT: s_mov_b32 s2, -1 2997; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2998; GFX6-NEXT: s_sext_i32_i16 s8, s6 2999; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s8 3000; GFX6-NEXT: s_sext_i32_i16 s9, s4 3001; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9 3002; GFX6-NEXT: s_xor_b32 s8, s9, s8 3003; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 3004; GFX6-NEXT: s_ashr_i32 s6, s6, 16 3005; GFX6-NEXT: s_ashr_i32 s8, s8, 30 3006; GFX6-NEXT: s_or_b32 s8, s8, 1 3007; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 3008; GFX6-NEXT: v_trunc_f32_e32 v2, v2 3009; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 3010; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 3011; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 3012; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s6 3013; GFX6-NEXT: v_mov_b32_e32 v3, s8 3014; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 3015; GFX6-NEXT: s_ashr_i32 s4, s4, 16 3016; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 3017; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 3018; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 3019; GFX6-NEXT: s_xor_b32 s4, s4, s6 3020; GFX6-NEXT: s_ashr_i32 s4, s4, 30 3021; GFX6-NEXT: s_or_b32 s4, s4, 1 3022; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 3023; GFX6-NEXT: v_trunc_f32_e32 v3, v3 3024; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 3025; GFX6-NEXT: v_mov_b32_e32 v4, s4 3026; GFX6-NEXT: s_sext_i32_i16 s4, s7 3027; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 3028; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| 3029; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 3030; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc 3031; GFX6-NEXT: s_sext_i32_i16 s6, s5 3032; GFX6-NEXT: v_add_i32_e32 v3, vcc, v1, v3 3033; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s6 3034; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 3035; GFX6-NEXT: s_xor_b32 s4, s6, s4 3036; GFX6-NEXT: s_ashr_i32 s4, s4, 30 3037; GFX6-NEXT: s_or_b32 s4, s4, 1 3038; GFX6-NEXT: v_mul_f32_e32 v4, v1, v4 3039; GFX6-NEXT: v_trunc_f32_e32 v4, v4 3040; GFX6-NEXT: v_mad_f32 v1, -v4, v2, v1 3041; GFX6-NEXT: v_mov_b32_e32 v5, s4 3042; GFX6-NEXT: s_ashr_i32 s4, s7, 16 3043; 
GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 3044; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2| 3045; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 3046; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc 3047; GFX6-NEXT: s_ashr_i32 s5, s5, 16 3048; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 3049; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s5 3050; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 3051; GFX6-NEXT: s_xor_b32 s4, s5, s4 3052; GFX6-NEXT: s_ashr_i32 s4, s4, 30 3053; GFX6-NEXT: s_or_b32 s4, s4, 1 3054; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 3055; GFX6-NEXT: v_trunc_f32_e32 v5, v5 3056; GFX6-NEXT: v_mad_f32 v4, -v5, v2, v4 3057; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 3058; GFX6-NEXT: v_mov_b32_e32 v6, s4 3059; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| 3060; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc 3061; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 3062; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 3063; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 3064; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 3065; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 3066; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 3067; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 3068; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3069; GFX6-NEXT: s_endpgm 3070; 3071; GFX9-LABEL: sdiv_v4i16: 3072; GFX9: ; %bb.0: 3073; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 3074; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 3075; GFX9-NEXT: v_mov_b32_e32 v2, 0 3076; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3077; GFX9-NEXT: s_sext_i32_i16 s0, s6 3078; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 3079; GFX9-NEXT: s_sext_i32_i16 s1, s4 3080; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s1 3081; GFX9-NEXT: s_xor_b32 s0, s1, s0 3082; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 3083; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3084; GFX9-NEXT: s_or_b32 s8, s0, 1 3085; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 3086; GFX9-NEXT: v_trunc_f32_e32 v3, v3 3087; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 3088; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 3089; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 3090; GFX9-NEXT: s_cselect_b32 s0, s8, 0 3091; 
GFX9-NEXT: s_ashr_i32 s1, s6, 16 3092; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 3093; GFX9-NEXT: s_ashr_i32 s4, s4, 16 3094; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s4 3095; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 3096; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 3097; GFX9-NEXT: v_add_u32_e32 v3, s0, v3 3098; GFX9-NEXT: v_mul_f32_e32 v4, v1, v4 3099; GFX9-NEXT: s_xor_b32 s0, s4, s1 3100; GFX9-NEXT: v_trunc_f32_e32 v4, v4 3101; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3102; GFX9-NEXT: v_mad_f32 v1, -v4, v0, v1 3103; GFX9-NEXT: s_or_b32 s4, s0, 1 3104; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 3105; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 3106; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 3107; GFX9-NEXT: s_sext_i32_i16 s1, s7 3108; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 3109; GFX9-NEXT: s_cselect_b32 s0, s4, 0 3110; GFX9-NEXT: v_add_u32_e32 v4, s0, v4 3111; GFX9-NEXT: s_sext_i32_i16 s0, s5 3112; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s0 3113; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 3114; GFX9-NEXT: s_xor_b32 s0, s0, s1 3115; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3116; GFX9-NEXT: s_or_b32 s4, s0, 1 3117; GFX9-NEXT: v_mul_f32_e32 v5, v1, v5 3118; GFX9-NEXT: v_trunc_f32_e32 v5, v5 3119; GFX9-NEXT: v_mad_f32 v1, -v5, v0, v1 3120; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 3121; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 3122; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 3123; GFX9-NEXT: s_cselect_b32 s0, s4, 0 3124; GFX9-NEXT: s_ashr_i32 s1, s7, 16 3125; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 3126; GFX9-NEXT: v_add_u32_e32 v1, s0, v5 3127; GFX9-NEXT: s_ashr_i32 s0, s5, 16 3128; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 3129; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v0 3130; GFX9-NEXT: s_xor_b32 s0, s0, s1 3131; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3132; GFX9-NEXT: s_or_b32 s4, s0, 1 3133; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 3134; GFX9-NEXT: v_trunc_f32_e32 v6, v6 3135; GFX9-NEXT: v_mad_f32 v5, -v6, v0, v5 3136; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 3137; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v0| 3138; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], 
exec 3139; GFX9-NEXT: s_cselect_b32 s0, s4, 0 3140; GFX9-NEXT: v_add_u32_e32 v0, s0, v6 3141; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 3142; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1 3143; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 3144; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 3145; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 3146; GFX9-NEXT: s_endpgm 3147 %r = sdiv <4 x i16> %x, %y 3148 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 3149 ret void 3150} 3151 3152define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 3153; CHECK-LABEL: @srem_v4i16( 3154; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 3155; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 3156; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 3157; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 3158; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 3159; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 3160; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 3161; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 3162; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 3163; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 3164; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 3165; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 3166; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 3167; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 3168; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 3169; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 3170; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 3171; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 3172; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 3173; CHECK-NEXT: [[TMP20:%.*]] = add 
i32 [[TMP15]], [[TMP19]] 3174; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]] 3175; CHECK-NEXT: [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]] 3176; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 16 3177; CHECK-NEXT: [[TMP24:%.*]] = ashr i32 [[TMP23]], 16 3178; CHECK-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16 3179; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i16> undef, i16 [[TMP25]], i64 0 3180; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[X]], i64 1 3181; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i16> [[Y]], i64 1 3182; CHECK-NEXT: [[TMP29:%.*]] = sext i16 [[TMP27]] to i32 3183; CHECK-NEXT: [[TMP30:%.*]] = sext i16 [[TMP28]] to i32 3184; CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]] 3185; CHECK-NEXT: [[TMP32:%.*]] = ashr i32 [[TMP31]], 30 3186; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP32]], 1 3187; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float 3188; CHECK-NEXT: [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float 3189; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 3190; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]] 3191; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]]) 3192; CHECK-NEXT: [[TMP39:%.*]] = fneg fast float [[TMP38]] 3193; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]]) 3194; CHECK-NEXT: [[TMP41:%.*]] = fptosi float [[TMP38]] to i32 3195; CHECK-NEXT: [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]]) 3196; CHECK-NEXT: [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]]) 3197; CHECK-NEXT: [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]] 3198; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0 3199; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]] 3200; CHECK-NEXT: [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]] 3201; CHECK-NEXT: [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]] 3202; CHECK-NEXT: 
[[TMP49:%.*]] = shl i32 [[TMP48]], 16 3203; CHECK-NEXT: [[TMP50:%.*]] = ashr i32 [[TMP49]], 16 3204; CHECK-NEXT: [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16 3205; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i16> [[TMP26]], i16 [[TMP51]], i64 1 3206; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i16> [[X]], i64 2 3207; CHECK-NEXT: [[TMP54:%.*]] = extractelement <4 x i16> [[Y]], i64 2 3208; CHECK-NEXT: [[TMP55:%.*]] = sext i16 [[TMP53]] to i32 3209; CHECK-NEXT: [[TMP56:%.*]] = sext i16 [[TMP54]] to i32 3210; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]] 3211; CHECK-NEXT: [[TMP58:%.*]] = ashr i32 [[TMP57]], 30 3212; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 1 3213; CHECK-NEXT: [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float 3214; CHECK-NEXT: [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float 3215; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]]) 3216; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]] 3217; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]]) 3218; CHECK-NEXT: [[TMP65:%.*]] = fneg fast float [[TMP64]] 3219; CHECK-NEXT: [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]]) 3220; CHECK-NEXT: [[TMP67:%.*]] = fptosi float [[TMP64]] to i32 3221; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 3222; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]]) 3223; CHECK-NEXT: [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]] 3224; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0 3225; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]] 3226; CHECK-NEXT: [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]] 3227; CHECK-NEXT: [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]] 3228; CHECK-NEXT: [[TMP75:%.*]] = shl i32 [[TMP74]], 16 3229; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 [[TMP75]], 16 3230; CHECK-NEXT: [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16 3231; 
CHECK-NEXT: [[TMP78:%.*]] = insertelement <4 x i16> [[TMP52]], i16 [[TMP77]], i64 2 3232; CHECK-NEXT: [[TMP79:%.*]] = extractelement <4 x i16> [[X]], i64 3 3233; CHECK-NEXT: [[TMP80:%.*]] = extractelement <4 x i16> [[Y]], i64 3 3234; CHECK-NEXT: [[TMP81:%.*]] = sext i16 [[TMP79]] to i32 3235; CHECK-NEXT: [[TMP82:%.*]] = sext i16 [[TMP80]] to i32 3236; CHECK-NEXT: [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP82]] 3237; CHECK-NEXT: [[TMP84:%.*]] = ashr i32 [[TMP83]], 30 3238; CHECK-NEXT: [[TMP85:%.*]] = or i32 [[TMP84]], 1 3239; CHECK-NEXT: [[TMP86:%.*]] = sitofp i32 [[TMP81]] to float 3240; CHECK-NEXT: [[TMP87:%.*]] = sitofp i32 [[TMP82]] to float 3241; CHECK-NEXT: [[TMP88:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP87]]) 3242; CHECK-NEXT: [[TMP89:%.*]] = fmul fast float [[TMP86]], [[TMP88]] 3243; CHECK-NEXT: [[TMP90:%.*]] = call fast float @llvm.trunc.f32(float [[TMP89]]) 3244; CHECK-NEXT: [[TMP91:%.*]] = fneg fast float [[TMP90]] 3245; CHECK-NEXT: [[TMP92:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP91]], float [[TMP87]], float [[TMP86]]) 3246; CHECK-NEXT: [[TMP93:%.*]] = fptosi float [[TMP90]] to i32 3247; CHECK-NEXT: [[TMP94:%.*]] = call fast float @llvm.fabs.f32(float [[TMP92]]) 3248; CHECK-NEXT: [[TMP95:%.*]] = call fast float @llvm.fabs.f32(float [[TMP87]]) 3249; CHECK-NEXT: [[TMP96:%.*]] = fcmp fast oge float [[TMP94]], [[TMP95]] 3250; CHECK-NEXT: [[TMP97:%.*]] = select i1 [[TMP96]], i32 [[TMP85]], i32 0 3251; CHECK-NEXT: [[TMP98:%.*]] = add i32 [[TMP93]], [[TMP97]] 3252; CHECK-NEXT: [[TMP99:%.*]] = mul i32 [[TMP98]], [[TMP82]] 3253; CHECK-NEXT: [[TMP100:%.*]] = sub i32 [[TMP81]], [[TMP99]] 3254; CHECK-NEXT: [[TMP101:%.*]] = shl i32 [[TMP100]], 16 3255; CHECK-NEXT: [[TMP102:%.*]] = ashr i32 [[TMP101]], 16 3256; CHECK-NEXT: [[TMP103:%.*]] = trunc i32 [[TMP102]] to i16 3257; CHECK-NEXT: [[TMP104:%.*]] = insertelement <4 x i16> [[TMP78]], i16 [[TMP103]], i64 3 3258; CHECK-NEXT: store <4 x i16> [[TMP104]], <4 x i16> addrspace(1)* 
[[OUT:%.*]], align 8 3259; CHECK-NEXT: ret void 3260; 3261; GFX6-LABEL: srem_v4i16: 3262; GFX6: ; %bb.0: 3263; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb 3264; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3265; GFX6-NEXT: s_mov_b32 s3, 0xf000 3266; GFX6-NEXT: s_mov_b32 s2, -1 3267; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3268; GFX6-NEXT: s_sext_i32_i16 s8, s6 3269; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s8 3270; GFX6-NEXT: s_sext_i32_i16 s9, s4 3271; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9 3272; GFX6-NEXT: s_xor_b32 s8, s9, s8 3273; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 3274; GFX6-NEXT: s_ashr_i32 s8, s8, 30 3275; GFX6-NEXT: s_or_b32 s8, s8, 1 3276; GFX6-NEXT: v_mov_b32_e32 v3, s8 3277; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 3278; GFX6-NEXT: v_trunc_f32_e32 v2, v2 3279; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 3280; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 3281; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 3282; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 3283; GFX6-NEXT: v_mov_b32_e32 v1, s4 3284; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 3285; GFX6-NEXT: v_mov_b32_e32 v2, s6 3286; GFX6-NEXT: v_alignbit_b32 v2, s7, v2, 16 3287; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16 3288; GFX6-NEXT: v_cvt_f32_i32_e32 v4, v3 3289; GFX6-NEXT: v_alignbit_b32 v1, s5, v1, 16 3290; GFX6-NEXT: v_bfe_i32 v5, v1, 0, 16 3291; GFX6-NEXT: v_cvt_f32_i32_e32 v6, v5 3292; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v4 3293; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 3294; GFX6-NEXT: v_xor_b32_e32 v3, v5, v3 3295; GFX6-NEXT: v_ashrrev_i32_e32 v3, 30, v3 3296; GFX6-NEXT: v_mul_f32_e32 v5, v6, v7 3297; GFX6-NEXT: v_trunc_f32_e32 v5, v5 3298; GFX6-NEXT: v_mad_f32 v6, -v5, v4, v6 3299; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 3300; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 3301; GFX6-NEXT: v_or_b32_e32 v3, 1, v3 3302; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, |v4| 3303; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc 3304; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 3305; GFX6-NEXT: s_sext_i32_i16 s4, s7 3306; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 3307; GFX6-NEXT: 
v_cvt_f32_i32_e32 v3, s4 3308; GFX6-NEXT: s_sext_i32_i16 s6, s5 3309; GFX6-NEXT: s_xor_b32 s4, s6, s4 3310; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 3311; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s6 3312; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v3 3313; GFX6-NEXT: s_ashr_i32 s4, s4, 30 3314; GFX6-NEXT: s_or_b32 s4, s4, 1 3315; GFX6-NEXT: v_mov_b32_e32 v5, s4 3316; GFX6-NEXT: v_mul_f32_e32 v4, v2, v4 3317; GFX6-NEXT: v_trunc_f32_e32 v4, v4 3318; GFX6-NEXT: v_mad_f32 v2, -v4, v3, v2 3319; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 3320; GFX6-NEXT: s_ashr_i32 s4, s7, 16 3321; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v3| 3322; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s4 3323; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc 3324; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 3325; GFX6-NEXT: v_mul_lo_u32 v2, v2, s7 3326; GFX6-NEXT: s_lshr_b32 s6, s7, 16 3327; GFX6-NEXT: s_ashr_i32 s7, s5, 16 3328; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s7 3329; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 3330; GFX6-NEXT: s_xor_b32 s4, s7, s4 3331; GFX6-NEXT: s_ashr_i32 s4, s4, 30 3332; GFX6-NEXT: s_or_b32 s4, s4, 1 3333; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 3334; GFX6-NEXT: v_trunc_f32_e32 v5, v5 3335; GFX6-NEXT: v_mad_f32 v4, -v5, v3, v4 3336; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 3337; GFX6-NEXT: v_mov_b32_e32 v6, s4 3338; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v3| 3339; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc 3340; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 3341; GFX6-NEXT: v_mul_lo_u32 v3, v3, s6 3342; GFX6-NEXT: s_lshr_b32 s4, s5, 16 3343; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v2 3344; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3 3345; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 3346; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 3347; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 3348; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 3349; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 3350; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 3351; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3352; GFX6-NEXT: s_endpgm 3353; 3354; GFX9-LABEL: srem_v4i16: 3355; GFX9: ; %bb.0: 
3356; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 3357; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 3358; GFX9-NEXT: v_mov_b32_e32 v2, 0 3359; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3360; GFX9-NEXT: s_sext_i32_i16 s8, s6 3361; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s8 3362; GFX9-NEXT: s_sext_i32_i16 s9, s4 3363; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s9 3364; GFX9-NEXT: s_xor_b32 s0, s9, s8 3365; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 3366; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3367; GFX9-NEXT: s_or_b32 s10, s0, 1 3368; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 3369; GFX9-NEXT: v_trunc_f32_e32 v3, v3 3370; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 3371; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 3372; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 3373; GFX9-NEXT: s_cselect_b32 s0, s10, 0 3374; GFX9-NEXT: s_ashr_i32 s6, s6, 16 3375; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 3376; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s6 3377; GFX9-NEXT: s_ashr_i32 s4, s4, 16 3378; GFX9-NEXT: v_add_u32_e32 v1, s0, v3 3379; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s4 3380; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 3381; GFX9-NEXT: s_xor_b32 s0, s4, s6 3382; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3383; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8 3384; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 3385; GFX9-NEXT: v_trunc_f32_e32 v4, v4 3386; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 3387; GFX9-NEXT: s_or_b32 s8, s0, 1 3388; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v0| 3389; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 3390; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 3391; GFX9-NEXT: s_cselect_b32 s0, s8, 0 3392; GFX9-NEXT: s_sext_i32_i16 s8, s7 3393; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s8 3394; GFX9-NEXT: v_add_u32_e32 v0, s0, v4 3395; GFX9-NEXT: v_mul_lo_u32 v0, v0, s6 3396; GFX9-NEXT: s_sext_i32_i16 s6, s5 3397; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s6 3398; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 3399; GFX9-NEXT: s_xor_b32 s0, s6, s8 3400; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3401; GFX9-NEXT: s_or_b32 s10, s0, 1 3402; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 3403; GFX9-NEXT: v_trunc_f32_e32 
v5, v5 3404; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 3405; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| 3406; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 3407; GFX9-NEXT: s_cselect_b32 s0, s10, 0 3408; GFX9-NEXT: s_ashr_i32 s7, s7, 16 3409; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 3410; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s7 3411; GFX9-NEXT: s_ashr_i32 s5, s5, 16 3412; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 3413; GFX9-NEXT: v_add_u32_e32 v3, s0, v5 3414; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s5 3415; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 3416; GFX9-NEXT: s_xor_b32 s0, s5, s7 3417; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3418; GFX9-NEXT: v_mul_lo_u32 v3, v3, s8 3419; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 3420; GFX9-NEXT: v_trunc_f32_e32 v6, v6 3421; GFX9-NEXT: v_mad_f32 v5, -v6, v4, v5 3422; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 3423; GFX9-NEXT: s_or_b32 s8, s0, 1 3424; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4| 3425; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 3426; GFX9-NEXT: s_cselect_b32 s0, s8, 0 3427; GFX9-NEXT: v_add_u32_e32 v4, s0, v6 3428; GFX9-NEXT: v_mul_lo_u32 v4, v4, s7 3429; GFX9-NEXT: v_sub_u32_e32 v5, s9, v1 3430; GFX9-NEXT: v_sub_u32_e32 v1, s6, v3 3431; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 3432; GFX9-NEXT: v_sub_u32_e32 v3, s5, v4 3433; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 3434; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v5 3435; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v3 3436; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 3437; GFX9-NEXT: s_endpgm 3438 %r = srem <4 x i16> %x, %y 3439 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 3440 ret void 3441} 3442 ; udiv_i3 - unsigned division on the odd-width type i3: divides kernel argument %x by %y and stores the quotient to %out. ; Expected IR (the CHECK prefix): both operands are zero-extended to i32 and converted with uitofp; the quotient is estimated as x * llvm.amdgcn.rcp.f32(y) and truncated; llvm.amdgcn.fmad.ftz.f32 recomputes the remainder and 1 is added to the quotient when |remainder| >= |y|; the result is masked with 7 and truncated back to i3. ; The GFX6 and GFX9 prefixes hold the expected llc output for the tahiti and gfx900 RUN lines at the top of the file. ; All assertion lines are autogenerated by utils/update_llc_test_checks.py - regenerate them rather than editing by hand. 3443define amdgpu_kernel void @udiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 3444; CHECK-LABEL: @udiv_i3( 3445; CHECK-NEXT: [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32 3446; CHECK-NEXT: [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32 3447; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 3448; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 3449; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float
[[TMP4]]) 3450; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 3451; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 3452; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 3453; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 3454; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 3455; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 3456; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 3457; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 3458; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 3459; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 3460; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 7 3461; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i3 3462; CHECK-NEXT: store i3 [[TMP17]], i3 addrspace(1)* [[OUT:%.*]], align 1 3463; CHECK-NEXT: ret void 3464; 3465; GFX6-LABEL: udiv_i3: 3466; GFX6: ; %bb.0: 3467; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 3468; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3469; GFX6-NEXT: s_mov_b32 s3, 0xf000 3470; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3471; GFX6-NEXT: s_bfe_u32 s2, s4, 0x30008 3472; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 3473; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 3474; GFX6-NEXT: s_and_b32 s4, s4, 7 3475; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 3476; GFX6-NEXT: s_mov_b32 s2, -1 3477; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 3478; GFX6-NEXT: v_trunc_f32_e32 v1, v1 3479; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 3480; GFX6-NEXT: v_mad_f32 v1, -v1, v0, v2 3481; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3482; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 3483; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 3484; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 3485; GFX6-NEXT: s_endpgm 3486; 3487; GFX9-LABEL: udiv_i3: 3488; GFX9: ; %bb.0: 3489; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 3490; GFX9-NEXT: s_load_dwordx2 s[2:3],
s[0:1], 0x24 3491; GFX9-NEXT: v_mov_b32_e32 v2, 0 3492; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3493; GFX9-NEXT: s_bfe_u32 s0, s4, 0x30008 3494; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 3495; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 3496; GFX9-NEXT: s_and_b32 s0, s4, 7 3497; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 3498; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 3499; GFX9-NEXT: v_trunc_f32_e32 v1, v1 3500; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v1 3501; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v3 3502; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3503; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc 3504; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 3505; GFX9-NEXT: global_store_byte v2, v0, s[2:3] 3506; GFX9-NEXT: s_endpgm 3507 %r = udiv i3 %x, %y 3508 store i3 %r, i3 addrspace(1)* %out 3509 ret void 3510} 3511 ; urem_i3 - unsigned remainder on the odd-width type i3: computes kernel argument %x urem %y and stores the result to %out. ; Expected IR (the CHECK prefix): both operands are zero-extended to i32 and converted with uitofp; a quotient is estimated as x * llvm.amdgcn.rcp.f32(y), truncated, and incremented by 1 when the llvm.amdgcn.fmad.ftz.f32 remainder satisfies |remainder| >= |y|; the quotient is multiplied by the divisor and subtracted from the dividend; the result is masked with 7 and truncated back to i3. ; The GFX6 and GFX9 prefixes hold the expected llc output for the tahiti and gfx900 RUN lines at the top of the file. ; All assertion lines are autogenerated by utils/update_llc_test_checks.py - regenerate them rather than editing by hand. 3512define amdgpu_kernel void @urem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 3513; CHECK-LABEL: @urem_i3( 3514; CHECK-NEXT: [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32 3515; CHECK-NEXT: [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32 3516; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 3517; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 3518; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 3519; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 3520; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 3521; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 3522; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 3523; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 3524; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 3525; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 3526; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 3527; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 3528; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 3529; CHECK-NEXT:
[[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]] 3530; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]] 3531; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 7 3532; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i3 3533; CHECK-NEXT: store i3 [[TMP19]], i3 addrspace(1)* [[OUT:%.*]], align 1 3534; CHECK-NEXT: ret void 3535; 3536; GFX6-LABEL: urem_i3: 3537; GFX6: ; %bb.0: 3538; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 3539; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3540; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3541; GFX6-NEXT: s_bfe_u32 s2, s4, 0x30008 3542; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 3543; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 3544; GFX6-NEXT: s_and_b32 s3, s4, 7 3545; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s3 3546; GFX6-NEXT: s_lshr_b32 s2, s4, 8 3547; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 3548; GFX6-NEXT: v_trunc_f32_e32 v1, v1 3549; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 3550; GFX6-NEXT: v_mad_f32 v1, -v1, v0, v2 3551; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3552; GFX6-NEXT: s_mov_b32 s3, 0xf000 3553; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 3554; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 3555; GFX6-NEXT: s_mov_b32 s2, -1 3556; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 3557; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 3558; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 3559; GFX6-NEXT: s_endpgm 3560; 3561; GFX9-LABEL: urem_i3: 3562; GFX9: ; %bb.0: 3563; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 3564; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3565; GFX9-NEXT: s_bfe_u32 s3, s2, 0x30008 3566; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 3567; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 3568; GFX9-NEXT: s_and_b32 s4, s2, 7 3569; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 3570; GFX9-NEXT: s_lshr_b32 s3, s2, 8 3571; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1 3572; GFX9-NEXT: v_trunc_f32_e32 v1, v1 3573; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v1 3574; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v2 3575; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3576; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3577; GFX9-NEXT:
v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 3578; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 3579; GFX9-NEXT: v_mov_b32_e32 v1, 0 3580; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 3581; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 3582; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3583; GFX9-NEXT: global_store_byte v1, v0, s[0:1] 3584; GFX9-NEXT: s_endpgm 3585 %r = urem i3 %x, %y 3586 store i3 %r, i3 addrspace(1)* %out 3587 ret void 3588} 3589 ; sdiv_i3 - signed division on the odd-width type i3: divides kernel argument %x by %y and stores the quotient to %out. ; Expected IR (the CHECK prefix): both operands are sign-extended to i32; a sign-dependent adjustment value is formed as ((x xor y) ashr 30) or 1; the quotient is estimated with sitofp, x * llvm.amdgcn.rcp.f32(y) and trunc, converted with fptosi, and the adjustment is added when the llvm.amdgcn.fmad.ftz.f32 remainder satisfies |remainder| >= |y|; the result is re-sign-extended from 3 bits with shl 29 / ashr 29 and truncated back to i3. ; The GFX6 and GFX9 prefixes hold the expected llc output for the tahiti and gfx900 RUN lines at the top of the file. ; All assertion lines are autogenerated by utils/update_llc_test_checks.py - regenerate them rather than editing by hand. 3590define amdgpu_kernel void @sdiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 3591; CHECK-LABEL: @sdiv_i3( 3592; CHECK-NEXT: [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32 3593; CHECK-NEXT: [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32 3594; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 3595; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 3596; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 3597; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 3598; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 3599; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 3600; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 3601; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 3602; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 3603; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 3604; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 3605; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 3606; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 3607; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 3608; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 3609; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 3610; CHECK-NEXT: [[TMP19:%.*]] = shl i32 [[TMP18]], 29 3611; CHECK-NEXT: [[TMP20:%.*]] = ashr i32 [[TMP19]], 29 3612; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i3 3613; CHECK-NEXT: store i3 [[TMP21]], i3
addrspace(1)* [[OUT:%.*]], align 1 3614; CHECK-NEXT: ret void 3615; 3616; GFX6-LABEL: sdiv_i3: 3617; GFX6: ; %bb.0: 3618; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 3619; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3620; GFX6-NEXT: s_mov_b32 s3, 0xf000 3621; GFX6-NEXT: s_mov_b32 s2, -1 3622; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3623; GFX6-NEXT: s_bfe_i32 s5, s4, 0x30008 3624; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 3625; GFX6-NEXT: s_bfe_i32 s4, s4, 0x30000 3626; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 3627; GFX6-NEXT: s_xor_b32 s4, s4, s5 3628; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 3629; GFX6-NEXT: s_ashr_i32 s4, s4, 30 3630; GFX6-NEXT: s_or_b32 s4, s4, 1 3631; GFX6-NEXT: v_mov_b32_e32 v3, s4 3632; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 3633; GFX6-NEXT: v_trunc_f32_e32 v2, v2 3634; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 3635; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 3636; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 3637; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 3638; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 3639; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 3640; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 3641; GFX6-NEXT: s_endpgm 3642; 3643; GFX9-LABEL: sdiv_i3: 3644; GFX9: ; %bb.0: 3645; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 3646; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 3647; GFX9-NEXT: v_mov_b32_e32 v1, 0 3648; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3649; GFX9-NEXT: s_bfe_i32 s0, s4, 0x30008 3650; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 3651; GFX9-NEXT: s_bfe_i32 s1, s4, 0x30000 3652; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 3653; GFX9-NEXT: s_xor_b32 s0, s1, s0 3654; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 3655; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3656; GFX9-NEXT: s_or_b32 s4, s0, 1 3657; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 3658; GFX9-NEXT: v_trunc_f32_e32 v3, v3 3659; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 3660; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 3661; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 3662; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 3663; GFX9-NEXT: s_cselect_b32 s0, s4, 0 3664;
GFX9-NEXT: v_add_u32_e32 v0, s0, v3 3665; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 3666; GFX9-NEXT: global_store_byte v1, v0, s[2:3] 3667; GFX9-NEXT: s_endpgm 3668 %r = sdiv i3 %x, %y 3669 store i3 %r, i3 addrspace(1)* %out 3670 ret void 3671} 3672 ; srem_i3 - signed remainder on the odd-width type i3: computes kernel argument %x srem %y and stores the result to %out. ; Expected IR (the CHECK prefix): both operands are sign-extended to i32; a sign-dependent adjustment value is formed as ((x xor y) ashr 30) or 1; a quotient is estimated with sitofp, x * llvm.amdgcn.rcp.f32(y) and trunc, converted with fptosi, and the adjustment is added when the llvm.amdgcn.fmad.ftz.f32 remainder satisfies |remainder| >= |y|; the quotient is multiplied by the divisor and subtracted from the dividend; the result is re-sign-extended from 3 bits with shl 29 / ashr 29 and truncated back to i3. ; The GFX6 and GFX9 prefixes hold the expected llc output for the tahiti and gfx900 RUN lines at the top of the file. ; All assertion lines are autogenerated by utils/update_llc_test_checks.py - regenerate them rather than editing by hand. 3673define amdgpu_kernel void @srem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 3674; CHECK-LABEL: @srem_i3( 3675; CHECK-NEXT: [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32 3676; CHECK-NEXT: [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32 3677; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 3678; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 3679; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 3680; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 3681; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 3682; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 3683; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 3684; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 3685; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 3686; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 3687; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 3688; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 3689; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 3690; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 3691; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 3692; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 3693; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]] 3694; CHECK-NEXT: [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]] 3695; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 29 3696; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 29 3697; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i3 3698; CHECK-NEXT: store i3 [[TMP23]], i3 addrspace(1)* [[OUT:%.*]], align 1 3699;
CHECK-NEXT: ret void 3700; 3701; GFX6-LABEL: srem_i3: 3702; GFX6: ; %bb.0: 3703; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 3704; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3705; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3706; GFX6-NEXT: s_bfe_i32 s2, s4, 0x30008 3707; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s2 3708; GFX6-NEXT: s_bfe_i32 s5, s4, 0x30000 3709; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 3710; GFX6-NEXT: s_xor_b32 s2, s5, s2 3711; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 3712; GFX6-NEXT: s_ashr_i32 s2, s2, 30 3713; GFX6-NEXT: s_or_b32 s2, s2, 1 3714; GFX6-NEXT: v_mov_b32_e32 v3, s2 3715; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 3716; GFX6-NEXT: v_trunc_f32_e32 v2, v2 3717; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 3718; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 3719; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 3720; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 3721; GFX6-NEXT: s_lshr_b32 s3, s4, 8 3722; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 3723; GFX6-NEXT: v_mul_lo_u32 v0, v0, s3 3724; GFX6-NEXT: s_mov_b32 s3, 0xf000 3725; GFX6-NEXT: s_mov_b32 s2, -1 3726; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 3727; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 3728; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 3729; GFX6-NEXT: s_endpgm 3730; 3731; GFX9-LABEL: srem_i3: 3732; GFX9: ; %bb.0: 3733; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 3734; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3735; GFX9-NEXT: s_bfe_i32 s2, s4, 0x30008 3736; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 3737; GFX9-NEXT: s_bfe_i32 s3, s4, 0x30000 3738; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s3 3739; GFX9-NEXT: s_xor_b32 s2, s3, s2 3740; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 3741; GFX9-NEXT: s_ashr_i32 s2, s2, 30 3742; GFX9-NEXT: s_lshr_b32 s5, s4, 8 3743; GFX9-NEXT: s_or_b32 s6, s2, 1 3744; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 3745; GFX9-NEXT: v_trunc_f32_e32 v2, v2 3746; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 3747; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 3748; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| 3749; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec 3750; GFX9-NEXT:
s_cselect_b32 s2, s6, 0 3751; GFX9-NEXT: v_add_u32_e32 v0, s2, v2 3752; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 3753; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3754; GFX9-NEXT: v_mov_b32_e32 v1, 0 3755; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 3756; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 3757; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3758; GFX9-NEXT: global_store_byte v1, v0, s[0:1] 3759; GFX9-NEXT: s_endpgm 3760 %r = srem i3 %x, %y 3761 store i3 %r, i3 addrspace(1)* %out 3762 ret void 3763} 3764 3765define amdgpu_kernel void @udiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 3766; CHECK-LABEL: @udiv_v3i16( 3767; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 3768; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 3769; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 3770; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 3771; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 3772; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 3773; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 3774; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 3775; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 3776; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 3777; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 3778; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 3779; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 3780; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 3781; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 3782; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 3783; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 3784; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 65535 3785; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16 3786; CHECK-NEXT: 
[[TMP20:%.*]] = insertelement <3 x i16> undef, i16 [[TMP19]], i64 0 3787; CHECK-NEXT: [[TMP21:%.*]] = extractelement <3 x i16> [[X]], i64 1 3788; CHECK-NEXT: [[TMP22:%.*]] = extractelement <3 x i16> [[Y]], i64 1 3789; CHECK-NEXT: [[TMP23:%.*]] = zext i16 [[TMP21]] to i32 3790; CHECK-NEXT: [[TMP24:%.*]] = zext i16 [[TMP22]] to i32 3791; CHECK-NEXT: [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float 3792; CHECK-NEXT: [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float 3793; CHECK-NEXT: [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]]) 3794; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]] 3795; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]]) 3796; CHECK-NEXT: [[TMP30:%.*]] = fneg fast float [[TMP29]] 3797; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]]) 3798; CHECK-NEXT: [[TMP32:%.*]] = fptoui float [[TMP29]] to i32 3799; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]]) 3800; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]]) 3801; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]] 3802; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0 3803; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]] 3804; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 65535 3805; CHECK-NEXT: [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16 3806; CHECK-NEXT: [[TMP40:%.*]] = insertelement <3 x i16> [[TMP20]], i16 [[TMP39]], i64 1 3807; CHECK-NEXT: [[TMP41:%.*]] = extractelement <3 x i16> [[X]], i64 2 3808; CHECK-NEXT: [[TMP42:%.*]] = extractelement <3 x i16> [[Y]], i64 2 3809; CHECK-NEXT: [[TMP43:%.*]] = zext i16 [[TMP41]] to i32 3810; CHECK-NEXT: [[TMP44:%.*]] = zext i16 [[TMP42]] to i32 3811; CHECK-NEXT: [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float 3812; CHECK-NEXT: [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float 3813; CHECK-NEXT: [[TMP47:%.*]] = call fast float 
@llvm.amdgcn.rcp.f32(float [[TMP46]]) 3814; CHECK-NEXT: [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]] 3815; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]]) 3816; CHECK-NEXT: [[TMP50:%.*]] = fneg fast float [[TMP49]] 3817; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]]) 3818; CHECK-NEXT: [[TMP52:%.*]] = fptoui float [[TMP49]] to i32 3819; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]]) 3820; CHECK-NEXT: [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]]) 3821; CHECK-NEXT: [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]] 3822; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0 3823; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]] 3824; CHECK-NEXT: [[TMP58:%.*]] = and i32 [[TMP57]], 65535 3825; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16 3826; CHECK-NEXT: [[TMP60:%.*]] = insertelement <3 x i16> [[TMP40]], i16 [[TMP59]], i64 2 3827; CHECK-NEXT: store <3 x i16> [[TMP60]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 3828; CHECK-NEXT: ret void 3829; 3830; GFX6-LABEL: udiv_v3i16: 3831; GFX6: ; %bb.0: 3832; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb 3833; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3834; GFX6-NEXT: s_mov_b32 s3, 0xf000 3835; GFX6-NEXT: s_mov_b32 s2, -1 3836; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3837; GFX6-NEXT: s_and_b32 s9, s6, 0xffff 3838; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 3839; GFX6-NEXT: s_lshr_b32 s6, s6, 16 3840; GFX6-NEXT: s_and_b32 s8, s4, 0xffff 3841; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 3842; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s8 3843; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 3844; GFX6-NEXT: s_lshr_b32 s4, s4, 16 3845; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 3846; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 3847; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 3848; GFX6-NEXT: v_trunc_f32_e32 v3, v3 3849; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 3850; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, 
v0 3851; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 3852; GFX6-NEXT: v_trunc_f32_e32 v1, v1 3853; GFX6-NEXT: s_and_b32 s4, s7, 0xffff 3854; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 3855; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v4 3856; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 3857; GFX6-NEXT: s_and_b32 s4, s5, 0xffff 3858; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc 3859; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 3860; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 3861; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 3862; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 3863; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 3864; GFX6-NEXT: v_mul_f32_e32 v2, v5, v6 3865; GFX6-NEXT: v_trunc_f32_e32 v2, v2 3866; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 3867; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3868; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v5 3869; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 3870; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc 3871; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 3872; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 3873; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 3874; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 3875; GFX6-NEXT: s_endpgm 3876; 3877; GFX9-LABEL: udiv_v3i16: 3878; GFX9: ; %bb.0: 3879; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 3880; GFX9-NEXT: v_mov_b32_e32 v6, 0 3881; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3882; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3883; GFX9-NEXT: s_and_b32 s3, s6, 0xffff 3884; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 3885; GFX9-NEXT: s_and_b32 s2, s4, 0xffff 3886; GFX9-NEXT: s_lshr_b32 s6, s6, 16 3887; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 3888; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 3889; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 3890; GFX9-NEXT: s_lshr_b32 s4, s4, 16 3891; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 3892; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 3893; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 3894; GFX9-NEXT: v_trunc_f32_e32 v4, v4 3895; GFX9-NEXT: s_and_b32 s2, s7, 0xffff 3896; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 3897; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 3898; 
GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2 3899; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 3900; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 3901; GFX9-NEXT: v_trunc_f32_e32 v2, v5 3902; GFX9-NEXT: s_and_b32 s2, s5, 0xffff 3903; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc 3904; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 3905; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 3906; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2 3907; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4 3908; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 3909; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc 3910; GFX9-NEXT: v_mul_f32_e32 v2, v5, v7 3911; GFX9-NEXT: v_trunc_f32_e32 v2, v2 3912; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v2 3913; GFX9-NEXT: v_mad_f32 v2, -v2, v4, v5 3914; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 3915; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 3916; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc 3917; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 3918; GFX9-NEXT: global_store_short v6, v2, s[0:1] offset:4 3919; GFX9-NEXT: global_store_dword v6, v0, s[0:1] 3920; GFX9-NEXT: s_endpgm 3921 %r = udiv <3 x i16> %x, %y 3922 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 3923 ret void 3924} 3925 3926define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 3927; CHECK-LABEL: @urem_v3i16( 3928; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 3929; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 3930; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 3931; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 3932; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 3933; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 3934; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 3935; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 3936; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 3937; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 3938; CHECK-NEXT: [[TMP11:%.*]] = 
call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 3939; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 3940; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 3941; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 3942; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 3943; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 3944; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 3945; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]] 3946; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]] 3947; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 65535 3948; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16 3949; CHECK-NEXT: [[TMP22:%.*]] = insertelement <3 x i16> undef, i16 [[TMP21]], i64 0 3950; CHECK-NEXT: [[TMP23:%.*]] = extractelement <3 x i16> [[X]], i64 1 3951; CHECK-NEXT: [[TMP24:%.*]] = extractelement <3 x i16> [[Y]], i64 1 3952; CHECK-NEXT: [[TMP25:%.*]] = zext i16 [[TMP23]] to i32 3953; CHECK-NEXT: [[TMP26:%.*]] = zext i16 [[TMP24]] to i32 3954; CHECK-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float 3955; CHECK-NEXT: [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float 3956; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]]) 3957; CHECK-NEXT: [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]] 3958; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]]) 3959; CHECK-NEXT: [[TMP32:%.*]] = fneg fast float [[TMP31]] 3960; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]]) 3961; CHECK-NEXT: [[TMP34:%.*]] = fptoui float [[TMP31]] to i32 3962; CHECK-NEXT: [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 3963; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]]) 3964; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]] 3965; CHECK-NEXT: 
[[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0 3966; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]] 3967; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]] 3968; CHECK-NEXT: [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]] 3969; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], 65535 3970; CHECK-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16 3971; CHECK-NEXT: [[TMP44:%.*]] = insertelement <3 x i16> [[TMP22]], i16 [[TMP43]], i64 1 3972; CHECK-NEXT: [[TMP45:%.*]] = extractelement <3 x i16> [[X]], i64 2 3973; CHECK-NEXT: [[TMP46:%.*]] = extractelement <3 x i16> [[Y]], i64 2 3974; CHECK-NEXT: [[TMP47:%.*]] = zext i16 [[TMP45]] to i32 3975; CHECK-NEXT: [[TMP48:%.*]] = zext i16 [[TMP46]] to i32 3976; CHECK-NEXT: [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float 3977; CHECK-NEXT: [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float 3978; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]]) 3979; CHECK-NEXT: [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]] 3980; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]]) 3981; CHECK-NEXT: [[TMP54:%.*]] = fneg fast float [[TMP53]] 3982; CHECK-NEXT: [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]]) 3983; CHECK-NEXT: [[TMP56:%.*]] = fptoui float [[TMP53]] to i32 3984; CHECK-NEXT: [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]]) 3985; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]]) 3986; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]] 3987; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0 3988; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]] 3989; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]] 3990; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]] 3991; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 65535 3992; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16 3993; CHECK-NEXT: [[TMP66:%.*]] = insertelement 
<3 x i16> [[TMP44]], i16 [[TMP65]], i64 2 3994; CHECK-NEXT: store <3 x i16> [[TMP66]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 3995; CHECK-NEXT: ret void 3996; 3997; GFX6-LABEL: urem_v3i16: 3998; GFX6: ; %bb.0: 3999; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb 4000; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4001; GFX6-NEXT: s_mov_b32 s3, 0xf000 4002; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4003; GFX6-NEXT: s_and_b32 s8, s6, 0xffff 4004; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s8 4005; GFX6-NEXT: s_and_b32 s2, s4, 0xffff 4006; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s2 4007; GFX6-NEXT: v_mov_b32_e32 v2, s6 4008; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 4009; GFX6-NEXT: v_alignbit_b32 v2, s7, v2, 16 4010; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v2 4011; GFX6-NEXT: v_cvt_f32_u32_e32 v5, v5 4012; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 4013; GFX6-NEXT: v_trunc_f32_e32 v4, v4 4014; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v4 4015; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 4016; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 4017; GFX6-NEXT: v_mov_b32_e32 v0, s4 4018; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v6, vcc 4019; GFX6-NEXT: v_alignbit_b32 v0, s5, v0, 16 4020; GFX6-NEXT: v_mul_lo_u32 v1, v1, s6 4021; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v0 4022; GFX6-NEXT: v_cvt_f32_u32_e32 v3, v3 4023; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 4024; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v1 4025; GFX6-NEXT: s_and_b32 s4, s7, 0xffff 4026; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s4 4027; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 4028; GFX6-NEXT: v_trunc_f32_e32 v4, v4 4029; GFX6-NEXT: v_mad_f32 v3, -v4, v5, v3 4030; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 4031; GFX6-NEXT: s_and_b32 s4, s5, 0xffff 4032; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s4 4033; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v6 4034; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 4035; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 4036; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 4037; GFX6-NEXT: v_mul_f32_e32 v3, v7, v8 4038; GFX6-NEXT: v_trunc_f32_e32 v3, v3 4039; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v3 
4040; GFX6-NEXT: v_mad_f32 v3, -v3, v6, v7 4041; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v6 4042; GFX6-NEXT: s_mov_b32 s2, -1 4043; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 4044; GFX6-NEXT: v_mul_lo_u32 v3, v3, s7 4045; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 4046; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 4047; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v3 4048; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 4049; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 4050; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 4051; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 4052; GFX6-NEXT: s_endpgm 4053; 4054; GFX9-LABEL: urem_v3i16: 4055; GFX9: ; %bb.0: 4056; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 4057; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4058; GFX9-NEXT: s_and_b32 s3, s6, 0xffff 4059; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 4060; GFX9-NEXT: s_and_b32 s2, s4, 0xffff 4061; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 4062; GFX9-NEXT: s_lshr_b32 s6, s6, 16 4063; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 4064; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 4065; GFX9-NEXT: s_lshr_b32 s4, s4, 16 4066; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 4067; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 4068; GFX9-NEXT: v_trunc_f32_e32 v4, v4 4069; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 4070; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v4 4071; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 4072; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 4073; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 4074; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc 4075; GFX9-NEXT: v_trunc_f32_e32 v5, v5 4076; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 4077; GFX9-NEXT: s_and_b32 s3, s7, 0xffff 4078; GFX9-NEXT: v_mad_f32 v2, -v5, v1, v3 4079; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s3 4080; GFX9-NEXT: s_and_b32 s5, s5, 0xffff 4081; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v5 4082; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s5 4083; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 4084; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v1 4085; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc 4086; GFX9-NEXT: v_mul_f32_e32 v2, v5, v6 4087; 
GFX9-NEXT: v_trunc_f32_e32 v2, v2 4088; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 4089; GFX9-NEXT: v_mad_f32 v2, -v2, v3, v5 4090; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 4091; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4092; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v4, vcc 4093; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 4094; GFX9-NEXT: v_mul_lo_u32 v2, v2, s3 4095; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 4096; GFX9-NEXT: v_mov_b32_e32 v3, 0 4097; GFX9-NEXT: v_sub_u32_e32 v1, s4, v1 4098; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 4099; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 4100; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 4101; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4102; GFX9-NEXT: global_store_short v3, v2, s[0:1] offset:4 4103; GFX9-NEXT: global_store_dword v3, v0, s[0:1] 4104; GFX9-NEXT: s_endpgm 4105 %r = urem <3 x i16> %x, %y 4106 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 4107 ret void 4108} 4109 4110define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 4111; CHECK-LABEL: @sdiv_v3i16( 4112; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 4113; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 4114; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 4115; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 4116; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 4117; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 4118; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 4119; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 4120; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 4121; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 4122; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 4123; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 4124; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 4125; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], 
float [[TMP8]]) 4126; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 4127; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 4128; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 4129; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 4130; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 4131; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 4132; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 16 4133; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 16 4134; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16 4135; CHECK-NEXT: [[TMP24:%.*]] = insertelement <3 x i16> undef, i16 [[TMP23]], i64 0 4136; CHECK-NEXT: [[TMP25:%.*]] = extractelement <3 x i16> [[X]], i64 1 4137; CHECK-NEXT: [[TMP26:%.*]] = extractelement <3 x i16> [[Y]], i64 1 4138; CHECK-NEXT: [[TMP27:%.*]] = sext i16 [[TMP25]] to i32 4139; CHECK-NEXT: [[TMP28:%.*]] = sext i16 [[TMP26]] to i32 4140; CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]] 4141; CHECK-NEXT: [[TMP30:%.*]] = ashr i32 [[TMP29]], 30 4142; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 1 4143; CHECK-NEXT: [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float 4144; CHECK-NEXT: [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float 4145; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 4146; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]] 4147; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]]) 4148; CHECK-NEXT: [[TMP37:%.*]] = fneg fast float [[TMP36]] 4149; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]]) 4150; CHECK-NEXT: [[TMP39:%.*]] = fptosi float [[TMP36]] to i32 4151; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]]) 4152; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 4153; CHECK-NEXT: [[TMP42:%.*]] = fcmp fast oge float 
[[TMP40]], [[TMP41]] 4154; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0 4155; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]] 4156; CHECK-NEXT: [[TMP45:%.*]] = shl i32 [[TMP44]], 16 4157; CHECK-NEXT: [[TMP46:%.*]] = ashr i32 [[TMP45]], 16 4158; CHECK-NEXT: [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16 4159; CHECK-NEXT: [[TMP48:%.*]] = insertelement <3 x i16> [[TMP24]], i16 [[TMP47]], i64 1 4160; CHECK-NEXT: [[TMP49:%.*]] = extractelement <3 x i16> [[X]], i64 2 4161; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x i16> [[Y]], i64 2 4162; CHECK-NEXT: [[TMP51:%.*]] = sext i16 [[TMP49]] to i32 4163; CHECK-NEXT: [[TMP52:%.*]] = sext i16 [[TMP50]] to i32 4164; CHECK-NEXT: [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]] 4165; CHECK-NEXT: [[TMP54:%.*]] = ashr i32 [[TMP53]], 30 4166; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 1 4167; CHECK-NEXT: [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float 4168; CHECK-NEXT: [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float 4169; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]]) 4170; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]] 4171; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]]) 4172; CHECK-NEXT: [[TMP61:%.*]] = fneg fast float [[TMP60]] 4173; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]]) 4174; CHECK-NEXT: [[TMP63:%.*]] = fptosi float [[TMP60]] to i32 4175; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]]) 4176; CHECK-NEXT: [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]]) 4177; CHECK-NEXT: [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]] 4178; CHECK-NEXT: [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0 4179; CHECK-NEXT: [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]] 4180; CHECK-NEXT: [[TMP69:%.*]] = shl i32 [[TMP68]], 16 4181; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 16 4182; CHECK-NEXT: 
[[TMP71:%.*]] = trunc i32 [[TMP70]] to i16 4183; CHECK-NEXT: [[TMP72:%.*]] = insertelement <3 x i16> [[TMP48]], i16 [[TMP71]], i64 2 4184; CHECK-NEXT: store <3 x i16> [[TMP72]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 4185; CHECK-NEXT: ret void 4186; 4187; GFX6-LABEL: sdiv_v3i16: 4188; GFX6: ; %bb.0: 4189; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb 4190; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4191; GFX6-NEXT: s_mov_b32 s3, 0xf000 4192; GFX6-NEXT: s_mov_b32 s2, -1 4193; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4194; GFX6-NEXT: s_sext_i32_i16 s8, s6 4195; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s8 4196; GFX6-NEXT: s_sext_i32_i16 s9, s4 4197; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9 4198; GFX6-NEXT: s_xor_b32 s8, s9, s8 4199; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 4200; GFX6-NEXT: s_ashr_i32 s6, s6, 16 4201; GFX6-NEXT: s_ashr_i32 s8, s8, 30 4202; GFX6-NEXT: s_or_b32 s8, s8, 1 4203; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 4204; GFX6-NEXT: v_trunc_f32_e32 v2, v2 4205; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 4206; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 4207; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 4208; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s6 4209; GFX6-NEXT: v_mov_b32_e32 v3, s8 4210; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 4211; GFX6-NEXT: s_ashr_i32 s4, s4, 16 4212; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 4213; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 4214; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 4215; GFX6-NEXT: s_xor_b32 s4, s4, s6 4216; GFX6-NEXT: s_ashr_i32 s4, s4, 30 4217; GFX6-NEXT: s_or_b32 s4, s4, 1 4218; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 4219; GFX6-NEXT: v_trunc_f32_e32 v3, v3 4220; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 4221; GFX6-NEXT: v_mov_b32_e32 v4, s4 4222; GFX6-NEXT: s_sext_i32_i16 s4, s7 4223; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 4224; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| 4225; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 4226; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc 4227; GFX6-NEXT: s_sext_i32_i16 s5, s5 4228; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 4229; GFX6-NEXT: 
v_cvt_f32_i32_e32 v3, s5 4230; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 4231; GFX6-NEXT: s_xor_b32 s4, s5, s4 4232; GFX6-NEXT: s_ashr_i32 s4, s4, 30 4233; GFX6-NEXT: s_or_b32 s4, s4, 1 4234; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 4235; GFX6-NEXT: v_trunc_f32_e32 v4, v4 4236; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 4237; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 4238; GFX6-NEXT: v_mov_b32_e32 v5, s4 4239; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| 4240; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc 4241; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 4242; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 4243; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 4244; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 4245; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 4246; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 4247; GFX6-NEXT: s_endpgm 4248; 4249; GFX9-LABEL: sdiv_v3i16: 4250; GFX9: ; %bb.0: 4251; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 4252; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4253; GFX9-NEXT: v_mov_b32_e32 v1, 0 4254; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4255; GFX9-NEXT: s_sext_i32_i16 s0, s6 4256; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 4257; GFX9-NEXT: s_sext_i32_i16 s1, s4 4258; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 4259; GFX9-NEXT: s_xor_b32 s0, s1, s0 4260; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 4261; GFX9-NEXT: s_ashr_i32 s0, s0, 30 4262; GFX9-NEXT: s_or_b32 s8, s0, 1 4263; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 4264; GFX9-NEXT: v_trunc_f32_e32 v3, v3 4265; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 4266; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 4267; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 4268; GFX9-NEXT: s_cselect_b32 s0, s8, 0 4269; GFX9-NEXT: s_ashr_i32 s1, s6, 16 4270; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 4271; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 4272; GFX9-NEXT: s_ashr_i32 s4, s4, 16 4273; GFX9-NEXT: v_add_u32_e32 v2, s0, v3 4274; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s4 4275; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 4276; GFX9-NEXT: s_xor_b32 s0, s4, s1 4277; GFX9-NEXT: s_ashr_i32 s0, s0, 
30 4278; GFX9-NEXT: s_or_b32 s4, s0, 1 4279; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 4280; GFX9-NEXT: v_trunc_f32_e32 v4, v4 4281; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 4282; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v0| 4283; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 4284; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 4285; GFX9-NEXT: s_sext_i32_i16 s1, s7 4286; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 4287; GFX9-NEXT: s_cselect_b32 s0, s4, 0 4288; GFX9-NEXT: v_add_u32_e32 v3, s0, v4 4289; GFX9-NEXT: s_sext_i32_i16 s0, s5 4290; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s0 4291; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 4292; GFX9-NEXT: s_xor_b32 s0, s0, s1 4293; GFX9-NEXT: s_ashr_i32 s0, s0, 30 4294; GFX9-NEXT: s_or_b32 s4, s0, 1 4295; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 4296; GFX9-NEXT: v_trunc_f32_e32 v5, v5 4297; GFX9-NEXT: v_mad_f32 v4, -v5, v0, v4 4298; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 4299; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v0| 4300; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 4301; GFX9-NEXT: s_cselect_b32 s0, s4, 0 4302; GFX9-NEXT: v_add_u32_e32 v0, s0, v5 4303; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 4304; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 4305; GFX9-NEXT: global_store_short v1, v0, s[2:3] offset:4 4306; GFX9-NEXT: global_store_dword v1, v2, s[2:3] 4307; GFX9-NEXT: s_endpgm 4308 %r = sdiv <3 x i16> %x, %y 4309 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 4310 ret void 4311} 4312 4313define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 4314; CHECK-LABEL: @srem_v3i16( 4315; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 4316; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 4317; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 4318; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 4319; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 4320; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 4321; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 4322; CHECK-NEXT: [[TMP8:%.*]] = 
sitofp i32 [[TMP3]] to float 4323; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 4324; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 4325; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 4326; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 4327; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 4328; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 4329; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 4330; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 4331; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 4332; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 4333; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 4334; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 4335; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]] 4336; CHECK-NEXT: [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]] 4337; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 16 4338; CHECK-NEXT: [[TMP24:%.*]] = ashr i32 [[TMP23]], 16 4339; CHECK-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16 4340; CHECK-NEXT: [[TMP26:%.*]] = insertelement <3 x i16> undef, i16 [[TMP25]], i64 0 4341; CHECK-NEXT: [[TMP27:%.*]] = extractelement <3 x i16> [[X]], i64 1 4342; CHECK-NEXT: [[TMP28:%.*]] = extractelement <3 x i16> [[Y]], i64 1 4343; CHECK-NEXT: [[TMP29:%.*]] = sext i16 [[TMP27]] to i32 4344; CHECK-NEXT: [[TMP30:%.*]] = sext i16 [[TMP28]] to i32 4345; CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]] 4346; CHECK-NEXT: [[TMP32:%.*]] = ashr i32 [[TMP31]], 30 4347; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP32]], 1 4348; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float 4349; CHECK-NEXT: [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float 4350; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 4351; 
CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]] 4352; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]]) 4353; CHECK-NEXT: [[TMP39:%.*]] = fneg fast float [[TMP38]] 4354; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]]) 4355; CHECK-NEXT: [[TMP41:%.*]] = fptosi float [[TMP38]] to i32 4356; CHECK-NEXT: [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]]) 4357; CHECK-NEXT: [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]]) 4358; CHECK-NEXT: [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]] 4359; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0 4360; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]] 4361; CHECK-NEXT: [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]] 4362; CHECK-NEXT: [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]] 4363; CHECK-NEXT: [[TMP49:%.*]] = shl i32 [[TMP48]], 16 4364; CHECK-NEXT: [[TMP50:%.*]] = ashr i32 [[TMP49]], 16 4365; CHECK-NEXT: [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16 4366; CHECK-NEXT: [[TMP52:%.*]] = insertelement <3 x i16> [[TMP26]], i16 [[TMP51]], i64 1 4367; CHECK-NEXT: [[TMP53:%.*]] = extractelement <3 x i16> [[X]], i64 2 4368; CHECK-NEXT: [[TMP54:%.*]] = extractelement <3 x i16> [[Y]], i64 2 4369; CHECK-NEXT: [[TMP55:%.*]] = sext i16 [[TMP53]] to i32 4370; CHECK-NEXT: [[TMP56:%.*]] = sext i16 [[TMP54]] to i32 4371; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]] 4372; CHECK-NEXT: [[TMP58:%.*]] = ashr i32 [[TMP57]], 30 4373; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 1 4374; CHECK-NEXT: [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float 4375; CHECK-NEXT: [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float 4376; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]]) 4377; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]] 4378; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]]) 4379; CHECK-NEXT: 
[[TMP65:%.*]] = fneg fast float [[TMP64]] 4380; CHECK-NEXT: [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]]) 4381; CHECK-NEXT: [[TMP67:%.*]] = fptosi float [[TMP64]] to i32 4382; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 4383; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]]) 4384; CHECK-NEXT: [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]] 4385; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0 4386; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]] 4387; CHECK-NEXT: [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]] 4388; CHECK-NEXT: [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]] 4389; CHECK-NEXT: [[TMP75:%.*]] = shl i32 [[TMP74]], 16 4390; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 [[TMP75]], 16 4391; CHECK-NEXT: [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16 4392; CHECK-NEXT: [[TMP78:%.*]] = insertelement <3 x i16> [[TMP52]], i16 [[TMP77]], i64 2 4393; CHECK-NEXT: store <3 x i16> [[TMP78]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 4394; CHECK-NEXT: ret void 4395; 4396; GFX6-LABEL: srem_v3i16: 4397; GFX6: ; %bb.0: 4398; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb 4399; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4400; GFX6-NEXT: s_mov_b32 s3, 0xf000 4401; GFX6-NEXT: s_mov_b32 s2, -1 4402; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4403; GFX6-NEXT: s_sext_i32_i16 s8, s6 4404; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s8 4405; GFX6-NEXT: s_sext_i32_i16 s9, s4 4406; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9 4407; GFX6-NEXT: s_xor_b32 s8, s9, s8 4408; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 4409; GFX6-NEXT: s_ashr_i32 s8, s8, 30 4410; GFX6-NEXT: s_or_b32 s8, s8, 1 4411; GFX6-NEXT: v_mov_b32_e32 v3, s8 4412; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 4413; GFX6-NEXT: v_trunc_f32_e32 v2, v2 4414; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 4415; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 4416; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 4417; GFX6-NEXT: v_cndmask_b32_e32 
v0, 0, v3, vcc 4418; GFX6-NEXT: v_mov_b32_e32 v1, s4 4419; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 4420; GFX6-NEXT: v_mov_b32_e32 v2, s6 4421; GFX6-NEXT: v_alignbit_b32 v2, s7, v2, 16 4422; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16 4423; GFX6-NEXT: v_cvt_f32_i32_e32 v4, v3 4424; GFX6-NEXT: v_alignbit_b32 v1, s5, v1, 16 4425; GFX6-NEXT: v_bfe_i32 v5, v1, 0, 16 4426; GFX6-NEXT: v_cvt_f32_i32_e32 v6, v5 4427; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v4 4428; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 4429; GFX6-NEXT: v_xor_b32_e32 v3, v5, v3 4430; GFX6-NEXT: v_ashrrev_i32_e32 v3, 30, v3 4431; GFX6-NEXT: v_mul_f32_e32 v5, v6, v7 4432; GFX6-NEXT: v_trunc_f32_e32 v5, v5 4433; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 4434; GFX6-NEXT: v_mad_f32 v6, -v5, v4, v6 4435; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 4436; GFX6-NEXT: s_sext_i32_i16 s4, s7 4437; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, |v4| 4438; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s4 4439; GFX6-NEXT: v_or_b32_e32 v3, 1, v3 4440; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc 4441; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 4442; GFX6-NEXT: s_sext_i32_i16 s6, s5 4443; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 4444; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s6 4445; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v4 4446; GFX6-NEXT: s_xor_b32 s4, s6, s4 4447; GFX6-NEXT: s_ashr_i32 s4, s4, 30 4448; GFX6-NEXT: s_or_b32 s4, s4, 1 4449; GFX6-NEXT: v_mul_f32_e32 v5, v3, v5 4450; GFX6-NEXT: v_trunc_f32_e32 v5, v5 4451; GFX6-NEXT: v_mad_f32 v3, -v5, v4, v3 4452; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 4453; GFX6-NEXT: v_mov_b32_e32 v6, s4 4454; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v4| 4455; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc 4456; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 4457; GFX6-NEXT: v_mul_lo_u32 v3, v3, s7 4458; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 4459; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 4460; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v3 4461; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 4462; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 4463; GFX6-NEXT: buffer_store_short v2, off, 
s[0:3], 0 offset:4 4464; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 4465; GFX6-NEXT: s_endpgm 4466; 4467; GFX9-LABEL: srem_v3i16: 4468; GFX9: ; %bb.0: 4469; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 4470; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4471; GFX9-NEXT: s_sext_i32_i16 s8, s6 4472; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s8 4473; GFX9-NEXT: s_sext_i32_i16 s9, s4 4474; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s9 4475; GFX9-NEXT: s_xor_b32 s2, s9, s8 4476; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 4477; GFX9-NEXT: s_ashr_i32 s2, s2, 30 4478; GFX9-NEXT: s_or_b32 s10, s2, 1 4479; GFX9-NEXT: s_sext_i32_i16 s7, s7 4480; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 4481; GFX9-NEXT: v_trunc_f32_e32 v2, v2 4482; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 4483; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| 4484; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec 4485; GFX9-NEXT: s_cselect_b32 s2, s10, 0 4486; GFX9-NEXT: s_ashr_i32 s6, s6, 16 4487; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 4488; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s6 4489; GFX9-NEXT: s_ashr_i32 s4, s4, 16 4490; GFX9-NEXT: s_sext_i32_i16 s5, s5 4491; GFX9-NEXT: v_add_u32_e32 v1, s2, v2 4492; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s4 4493; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 4494; GFX9-NEXT: s_xor_b32 s2, s4, s6 4495; GFX9-NEXT: s_ashr_i32 s2, s2, 30 4496; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8 4497; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 4498; GFX9-NEXT: v_trunc_f32_e32 v3, v3 4499; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 4500; GFX9-NEXT: s_or_b32 s8, s2, 1 4501; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 4502; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v0| 4503; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s7 4504; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec 4505; GFX9-NEXT: s_cselect_b32 s2, s8, 0 4506; GFX9-NEXT: v_add_u32_e32 v0, s2, v3 4507; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s5 4508; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2 4509; GFX9-NEXT: s_xor_b32 s2, s5, s7 4510; GFX9-NEXT: s_ashr_i32 s2, s2, 30 4511; GFX9-NEXT: v_mul_lo_u32 v0, v0, s6 4512; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 
4513; GFX9-NEXT: v_trunc_f32_e32 v4, v4 4514; GFX9-NEXT: v_mad_f32 v3, -v4, v2, v3 4515; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 4516; GFX9-NEXT: s_or_b32 s6, s2, 1 4517; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v3|, |v2| 4518; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec 4519; GFX9-NEXT: s_cselect_b32 s2, s6, 0 4520; GFX9-NEXT: v_add_u32_e32 v2, s2, v4 4521; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4522; GFX9-NEXT: v_mul_lo_u32 v2, v2, s7 4523; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 4524; GFX9-NEXT: v_mov_b32_e32 v3, 0 4525; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 4526; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 4527; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 4528; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 4529; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4530; GFX9-NEXT: global_store_short v3, v2, s[0:1] offset:4 4531; GFX9-NEXT: global_store_dword v3, v0, s[0:1] 4532; GFX9-NEXT: s_endpgm 4533 %r = srem <3 x i16> %x, %y 4534 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 4535 ret void 4536} 4537 4538define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 4539; CHECK-LABEL: @udiv_v3i15( 4540; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 4541; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 4542; CHECK-NEXT: [[TMP3:%.*]] = zext i15 [[TMP1]] to i32 4543; CHECK-NEXT: [[TMP4:%.*]] = zext i15 [[TMP2]] to i32 4544; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 4545; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 4546; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 4547; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 4548; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 4549; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 4550; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 4551; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 4552; 
CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 4553; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 4554; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 4555; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 4556; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 4557; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 32767 4558; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i15 4559; CHECK-NEXT: [[TMP20:%.*]] = insertelement <3 x i15> undef, i15 [[TMP19]], i64 0 4560; CHECK-NEXT: [[TMP21:%.*]] = extractelement <3 x i15> [[X]], i64 1 4561; CHECK-NEXT: [[TMP22:%.*]] = extractelement <3 x i15> [[Y]], i64 1 4562; CHECK-NEXT: [[TMP23:%.*]] = zext i15 [[TMP21]] to i32 4563; CHECK-NEXT: [[TMP24:%.*]] = zext i15 [[TMP22]] to i32 4564; CHECK-NEXT: [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float 4565; CHECK-NEXT: [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float 4566; CHECK-NEXT: [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]]) 4567; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]] 4568; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]]) 4569; CHECK-NEXT: [[TMP30:%.*]] = fneg fast float [[TMP29]] 4570; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]]) 4571; CHECK-NEXT: [[TMP32:%.*]] = fptoui float [[TMP29]] to i32 4572; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]]) 4573; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]]) 4574; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]] 4575; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0 4576; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]] 4577; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 32767 4578; CHECK-NEXT: [[TMP39:%.*]] = trunc i32 [[TMP38]] to i15 4579; CHECK-NEXT: [[TMP40:%.*]] = insertelement <3 x 
i15> [[TMP20]], i15 [[TMP39]], i64 1 4580; CHECK-NEXT: [[TMP41:%.*]] = extractelement <3 x i15> [[X]], i64 2 4581; CHECK-NEXT: [[TMP42:%.*]] = extractelement <3 x i15> [[Y]], i64 2 4582; CHECK-NEXT: [[TMP43:%.*]] = zext i15 [[TMP41]] to i32 4583; CHECK-NEXT: [[TMP44:%.*]] = zext i15 [[TMP42]] to i32 4584; CHECK-NEXT: [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float 4585; CHECK-NEXT: [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float 4586; CHECK-NEXT: [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]]) 4587; CHECK-NEXT: [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]] 4588; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]]) 4589; CHECK-NEXT: [[TMP50:%.*]] = fneg fast float [[TMP49]] 4590; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]]) 4591; CHECK-NEXT: [[TMP52:%.*]] = fptoui float [[TMP49]] to i32 4592; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]]) 4593; CHECK-NEXT: [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]]) 4594; CHECK-NEXT: [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]] 4595; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0 4596; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]] 4597; CHECK-NEXT: [[TMP58:%.*]] = and i32 [[TMP57]], 32767 4598; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i15 4599; CHECK-NEXT: [[TMP60:%.*]] = insertelement <3 x i15> [[TMP40]], i15 [[TMP59]], i64 2 4600; CHECK-NEXT: store <3 x i15> [[TMP60]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 4601; CHECK-NEXT: ret void 4602; 4603; GFX6-LABEL: udiv_v3i15: 4604; GFX6: ; %bb.0: 4605; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 4606; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4607; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 4608; GFX6-NEXT: s_mov_b32 s7, 0xf000 4609; GFX6-NEXT: s_mov_b32 s6, -1 4610; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4611; GFX6-NEXT: v_mov_b32_e32 v0, s2 4612; GFX6-NEXT: 
v_alignbit_b32 v0, s3, v0, 30 4613; GFX6-NEXT: s_and_b32 s8, s0, 0x7fff 4614; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s8 4615; GFX6-NEXT: s_and_b32 s3, s2, 0x7fff 4616; GFX6-NEXT: v_mov_b32_e32 v2, s0 4617; GFX6-NEXT: s_bfe_u32 s0, s0, 0xf000f 4618; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s3 4619; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 4620; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s0 4621; GFX6-NEXT: s_bfe_u32 s2, s2, 0xf000f 4622; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 4623; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 4624; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s2 4625; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 4626; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 4627; GFX6-NEXT: v_trunc_f32_e32 v4, v4 4628; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 4629; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 4630; GFX6-NEXT: v_cvt_f32_u32_e32 v2, v2 4631; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 4632; GFX6-NEXT: v_mul_f32_e32 v1, v6, v7 4633; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0 4634; GFX6-NEXT: v_trunc_f32_e32 v1, v1 4635; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 4636; GFX6-NEXT: v_mad_f32 v4, -v1, v5, v6 4637; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 4638; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0 4639; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v2 4640; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v5 4641; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 4642; GFX6-NEXT: v_mul_f32_e32 v1, v0, v6 4643; GFX6-NEXT: v_trunc_f32_e32 v1, v1 4644; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v1 4645; GFX6-NEXT: v_mad_f32 v0, -v1, v2, v0 4646; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v2 4647; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v3 4648; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc 4649; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v4 4650; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 4651; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 4652; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 4653; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 4654; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 4655; GFX6-NEXT: s_waitcnt expcnt(0) 4656; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 4657; GFX6-NEXT: 
buffer_store_short v0, off, s[4:7], 0 offset:4 4658; GFX6-NEXT: s_endpgm 4659; 4660; GFX9-LABEL: udiv_v3i15: 4661; GFX9: ; %bb.0: 4662; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 4663; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 4664; GFX9-NEXT: v_mov_b32_e32 v2, 0 4665; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 4666; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4667; GFX9-NEXT: v_mov_b32_e32 v0, s2 4668; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 4669; GFX9-NEXT: s_and_b32 s6, s2, 0x7fff 4670; GFX9-NEXT: s_and_b32 s3, s0, 0x7fff 4671; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 4672; GFX9-NEXT: v_mov_b32_e32 v3, s0 4673; GFX9-NEXT: s_bfe_u32 s0, s0, 0xf000f 4674; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s6 4675; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 4676; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 4677; GFX9-NEXT: s_bfe_u32 s2, s2, 0xf000f 4678; GFX9-NEXT: v_alignbit_b32 v3, s1, v3, 30 4679; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 4680; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s2 4681; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 4682; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 4683; GFX9-NEXT: v_trunc_f32_e32 v5, v5 4684; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4 4685; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 4686; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v3 4687; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 4688; GFX9-NEXT: v_mul_f32_e32 v1, v7, v8 4689; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 4690; GFX9-NEXT: v_trunc_f32_e32 v1, v1 4691; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc 4692; GFX9-NEXT: v_mad_f32 v5, -v1, v6, v7 4693; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 4694; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 4695; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v3 4696; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v6 4697; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc 4698; GFX9-NEXT: v_mul_f32_e32 v1, v0, v7 4699; GFX9-NEXT: v_trunc_f32_e32 v1, v1 4700; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v1 4701; GFX9-NEXT: v_mad_f32 v0, -v1, v3, v0 4702; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v3 4703; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v4 4704; GFX9-NEXT: 
v_addc_co_u32_e32 v0, vcc, 0, v6, vcc 4705; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v5 4706; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 4707; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 4708; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 4709; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 4710; GFX9-NEXT: global_store_dword v2, v0, s[4:5] 4711; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 4712; GFX9-NEXT: global_store_short v2, v0, s[4:5] offset:4 4713; GFX9-NEXT: s_endpgm 4714 %r = udiv <3 x i15> %x, %y 4715 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 4716 ret void 4717} 4718 4719define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 4720; CHECK-LABEL: @urem_v3i15( 4721; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 4722; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 4723; CHECK-NEXT: [[TMP3:%.*]] = zext i15 [[TMP1]] to i32 4724; CHECK-NEXT: [[TMP4:%.*]] = zext i15 [[TMP2]] to i32 4725; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 4726; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 4727; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 4728; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 4729; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 4730; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 4731; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 4732; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 4733; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 4734; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 4735; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 4736; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 4737; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 4738; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]] 4739; 
CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]] 4740; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 32767 4741; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i15 4742; CHECK-NEXT: [[TMP22:%.*]] = insertelement <3 x i15> undef, i15 [[TMP21]], i64 0 4743; CHECK-NEXT: [[TMP23:%.*]] = extractelement <3 x i15> [[X]], i64 1 4744; CHECK-NEXT: [[TMP24:%.*]] = extractelement <3 x i15> [[Y]], i64 1 4745; CHECK-NEXT: [[TMP25:%.*]] = zext i15 [[TMP23]] to i32 4746; CHECK-NEXT: [[TMP26:%.*]] = zext i15 [[TMP24]] to i32 4747; CHECK-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float 4748; CHECK-NEXT: [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float 4749; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]]) 4750; CHECK-NEXT: [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]] 4751; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]]) 4752; CHECK-NEXT: [[TMP32:%.*]] = fneg fast float [[TMP31]] 4753; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]]) 4754; CHECK-NEXT: [[TMP34:%.*]] = fptoui float [[TMP31]] to i32 4755; CHECK-NEXT: [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 4756; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]]) 4757; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]] 4758; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0 4759; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]] 4760; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]] 4761; CHECK-NEXT: [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]] 4762; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], 32767 4763; CHECK-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i15 4764; CHECK-NEXT: [[TMP44:%.*]] = insertelement <3 x i15> [[TMP22]], i15 [[TMP43]], i64 1 4765; CHECK-NEXT: [[TMP45:%.*]] = extractelement <3 x i15> [[X]], i64 2 4766; CHECK-NEXT: [[TMP46:%.*]] = extractelement <3 x i15> [[Y]], i64 2 
4767; CHECK-NEXT: [[TMP47:%.*]] = zext i15 [[TMP45]] to i32 4768; CHECK-NEXT: [[TMP48:%.*]] = zext i15 [[TMP46]] to i32 4769; CHECK-NEXT: [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float 4770; CHECK-NEXT: [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float 4771; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]]) 4772; CHECK-NEXT: [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]] 4773; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]]) 4774; CHECK-NEXT: [[TMP54:%.*]] = fneg fast float [[TMP53]] 4775; CHECK-NEXT: [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]]) 4776; CHECK-NEXT: [[TMP56:%.*]] = fptoui float [[TMP53]] to i32 4777; CHECK-NEXT: [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]]) 4778; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]]) 4779; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]] 4780; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0 4781; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]] 4782; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]] 4783; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]] 4784; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 32767 4785; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i15 4786; CHECK-NEXT: [[TMP66:%.*]] = insertelement <3 x i15> [[TMP44]], i15 [[TMP65]], i64 2 4787; CHECK-NEXT: store <3 x i15> [[TMP66]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 4788; CHECK-NEXT: ret void 4789; 4790; GFX6-LABEL: urem_v3i15: 4791; GFX6: ; %bb.0: 4792; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 4793; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4794; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 4795; GFX6-NEXT: s_mov_b32 s7, 0xf000 4796; GFX6-NEXT: s_mov_b32 s6, -1 4797; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4798; GFX6-NEXT: s_and_b32 s8, s2, 0x7fff 4799; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s8 4800; GFX6-NEXT: s_and_b32 
s9, s0, 0x7fff 4801; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 4802; GFX6-NEXT: v_mov_b32_e32 v2, s0 4803; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 4804; GFX6-NEXT: s_bfe_u32 s1, s0, 0xf000f 4805; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 4806; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s1 4807; GFX6-NEXT: s_bfe_u32 s9, s2, 0xf000f 4808; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 4809; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 4810; GFX6-NEXT: v_trunc_f32_e32 v4, v4 4811; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 4812; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 4813; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 4814; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s9 4815; GFX6-NEXT: v_mov_b32_e32 v0, s2 4816; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc 4817; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 4818; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 4819; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 4820; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0 4821; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s2, v1 4822; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 4823; GFX6-NEXT: v_cvt_f32_u32_e32 v4, v2 4824; GFX6-NEXT: v_cvt_f32_u32_e32 v7, v0 4825; GFX6-NEXT: v_trunc_f32_e32 v1, v1 4826; GFX6-NEXT: v_mad_f32 v3, -v1, v5, v3 4827; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v4 4828; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 4829; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 4830; GFX6-NEXT: s_lshr_b32 s0, s0, 15 4831; GFX6-NEXT: v_mul_f32_e32 v3, v7, v8 4832; GFX6-NEXT: v_trunc_f32_e32 v3, v3 4833; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v3 4834; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 4835; GFX6-NEXT: v_mad_f32 v3, -v3, v4, v7 4836; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 4837; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 4838; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 4839; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 4840; GFX6-NEXT: s_lshr_b32 s3, s2, 15 4841; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s3, v1 4842; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 4843; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v3 4844; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 4845; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v6 4846; 
GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 4847; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 4848; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 4849; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 4850; GFX6-NEXT: s_waitcnt expcnt(0) 4851; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 4852; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 4853; GFX6-NEXT: s_endpgm 4854; 4855; GFX9-LABEL: urem_v3i15: 4856; GFX9: ; %bb.0: 4857; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 4858; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 4859; GFX9-NEXT: v_mov_b32_e32 v2, 0 4860; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 4861; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4862; GFX9-NEXT: s_and_b32 s6, s2, 0x7fff 4863; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s6 4864; GFX9-NEXT: v_mov_b32_e32 v0, s2 4865; GFX9-NEXT: s_and_b32 s7, s0, 0x7fff 4866; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 4867; GFX9-NEXT: s_bfe_u32 s6, s0, 0xf000f 4868; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s6 4869; GFX9-NEXT: v_mov_b32_e32 v3, s0 4870; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 4871; GFX9-NEXT: v_alignbit_b32 v3, s1, v3, 30 4872; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 4873; GFX9-NEXT: s_bfe_u32 s3, s2, 0xf000f 4874; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 4875; GFX9-NEXT: v_trunc_f32_e32 v5, v5 4876; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4 4877; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 4878; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 4879; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 4880; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s3 4881; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 4882; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc 4883; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v3 4884; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 4885; GFX9-NEXT: v_mul_f32_e32 v4, v7, v8 4886; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v0 4887; GFX9-NEXT: v_rcp_iflag_f32_e32 v9, v5 4888; GFX9-NEXT: v_trunc_f32_e32 v4, v4 4889; GFX9-NEXT: v_mad_f32 v7, -v4, v6, v7 4890; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 4891; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, v6 4892; GFX9-NEXT: v_mul_f32_e32 v6, v8, v9 4893; 
GFX9-NEXT: v_trunc_f32_e32 v6, v6 4894; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v6 4895; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc 4896; GFX9-NEXT: v_mad_f32 v6, -v6, v5, v8 4897; GFX9-NEXT: s_lshr_b32 s1, s0, 15 4898; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 4899; GFX9-NEXT: v_mul_lo_u32 v4, v4, s1 4900; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc 4901; GFX9-NEXT: v_mul_lo_u32 v1, v1, s0 4902; GFX9-NEXT: v_mul_lo_u32 v3, v5, v3 4903; GFX9-NEXT: s_lshr_b32 s0, s2, 15 4904; GFX9-NEXT: v_sub_u32_e32 v4, s0, v4 4905; GFX9-NEXT: v_sub_u32_e32 v5, s2, v1 4906; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 4907; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v4 4908; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 4909; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v5 4910; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 4911; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 4912; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 4913; GFX9-NEXT: global_store_dword v2, v0, s[4:5] 4914; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 4915; GFX9-NEXT: global_store_short v2, v0, s[4:5] offset:4 4916; GFX9-NEXT: s_endpgm 4917 %r = urem <3 x i15> %x, %y 4918 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 4919 ret void 4920} 4921 4922define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 4923; CHECK-LABEL: @sdiv_v3i15( 4924; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 4925; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 4926; CHECK-NEXT: [[TMP3:%.*]] = sext i15 [[TMP1]] to i32 4927; CHECK-NEXT: [[TMP4:%.*]] = sext i15 [[TMP2]] to i32 4928; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 4929; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 4930; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 4931; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 4932; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 4933; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 4934; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float 
[[TMP8]], [[TMP10]] 4935; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 4936; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 4937; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 4938; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 4939; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 4940; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 4941; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 4942; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 4943; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 4944; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 17 4945; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 17 4946; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i15 4947; CHECK-NEXT: [[TMP24:%.*]] = insertelement <3 x i15> undef, i15 [[TMP23]], i64 0 4948; CHECK-NEXT: [[TMP25:%.*]] = extractelement <3 x i15> [[X]], i64 1 4949; CHECK-NEXT: [[TMP26:%.*]] = extractelement <3 x i15> [[Y]], i64 1 4950; CHECK-NEXT: [[TMP27:%.*]] = sext i15 [[TMP25]] to i32 4951; CHECK-NEXT: [[TMP28:%.*]] = sext i15 [[TMP26]] to i32 4952; CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]] 4953; CHECK-NEXT: [[TMP30:%.*]] = ashr i32 [[TMP29]], 30 4954; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 1 4955; CHECK-NEXT: [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float 4956; CHECK-NEXT: [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float 4957; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 4958; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]] 4959; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]]) 4960; CHECK-NEXT: [[TMP37:%.*]] = fneg fast float [[TMP36]] 4961; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]]) 4962; 
CHECK-NEXT: [[TMP39:%.*]] = fptosi float [[TMP36]] to i32 4963; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]]) 4964; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 4965; CHECK-NEXT: [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]] 4966; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0 4967; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]] 4968; CHECK-NEXT: [[TMP45:%.*]] = shl i32 [[TMP44]], 17 4969; CHECK-NEXT: [[TMP46:%.*]] = ashr i32 [[TMP45]], 17 4970; CHECK-NEXT: [[TMP47:%.*]] = trunc i32 [[TMP46]] to i15 4971; CHECK-NEXT: [[TMP48:%.*]] = insertelement <3 x i15> [[TMP24]], i15 [[TMP47]], i64 1 4972; CHECK-NEXT: [[TMP49:%.*]] = extractelement <3 x i15> [[X]], i64 2 4973; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x i15> [[Y]], i64 2 4974; CHECK-NEXT: [[TMP51:%.*]] = sext i15 [[TMP49]] to i32 4975; CHECK-NEXT: [[TMP52:%.*]] = sext i15 [[TMP50]] to i32 4976; CHECK-NEXT: [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]] 4977; CHECK-NEXT: [[TMP54:%.*]] = ashr i32 [[TMP53]], 30 4978; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 1 4979; CHECK-NEXT: [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float 4980; CHECK-NEXT: [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float 4981; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]]) 4982; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]] 4983; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]]) 4984; CHECK-NEXT: [[TMP61:%.*]] = fneg fast float [[TMP60]] 4985; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]]) 4986; CHECK-NEXT: [[TMP63:%.*]] = fptosi float [[TMP60]] to i32 4987; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]]) 4988; CHECK-NEXT: [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]]) 4989; CHECK-NEXT: [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]] 
4990; CHECK-NEXT: [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0 4991; CHECK-NEXT: [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]] 4992; CHECK-NEXT: [[TMP69:%.*]] = shl i32 [[TMP68]], 17 4993; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 17 4994; CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i15 4995; CHECK-NEXT: [[TMP72:%.*]] = insertelement <3 x i15> [[TMP48]], i15 [[TMP71]], i64 2 4996; CHECK-NEXT: store <3 x i15> [[TMP72]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 4997; CHECK-NEXT: ret void 4998; 4999; GFX6-LABEL: sdiv_v3i15: 5000; GFX6: ; %bb.0: 5001; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 5002; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5003; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 5004; GFX6-NEXT: s_mov_b32 s7, 0xf000 5005; GFX6-NEXT: s_mov_b32 s6, -1 5006; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5007; GFX6-NEXT: v_mov_b32_e32 v0, s2 5008; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 5009; GFX6-NEXT: s_bfe_i32 s3, s0, 0xf0000 5010; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s3 5011; GFX6-NEXT: v_mov_b32_e32 v1, s0 5012; GFX6-NEXT: v_alignbit_b32 v1, s1, v1, 30 5013; GFX6-NEXT: s_bfe_i32 s1, s2, 0xf0000 5014; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s1 5015; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 5016; GFX6-NEXT: s_xor_b32 s1, s1, s3 5017; GFX6-NEXT: s_bfe_i32 s0, s0, 0xf000f 5018; GFX6-NEXT: s_ashr_i32 s1, s1, 30 5019; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 5020; GFX6-NEXT: v_trunc_f32_e32 v4, v4 5021; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 5022; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 5023; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| 5024; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s0 5025; GFX6-NEXT: s_or_b32 s1, s1, 1 5026; GFX6-NEXT: v_mov_b32_e32 v5, s1 5027; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc 5028; GFX6-NEXT: s_bfe_i32 s1, s2, 0xf000f 5029; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 5030; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s1 5031; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 5032; GFX6-NEXT: s_xor_b32 s0, s1, s0 5033; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 15 5034; 
GFX6-NEXT: s_ashr_i32 s0, s0, 30 5035; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 5036; GFX6-NEXT: v_trunc_f32_e32 v5, v5 5037; GFX6-NEXT: v_mad_f32 v4, -v5, v3, v4 5038; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 5039; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v3| 5040; GFX6-NEXT: v_cvt_f32_i32_e32 v4, v1 5041; GFX6-NEXT: s_or_b32 s0, s0, 1 5042; GFX6-NEXT: v_mov_b32_e32 v6, s0 5043; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc 5044; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15 5045; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 5046; GFX6-NEXT: v_cvt_f32_i32_e32 v5, v0 5047; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 5048; GFX6-NEXT: v_xor_b32_e32 v0, v0, v1 5049; GFX6-NEXT: v_ashrrev_i32_e32 v0, 30, v0 5050; GFX6-NEXT: v_or_b32_e32 v0, 1, v0 5051; GFX6-NEXT: v_mul_f32_e32 v1, v5, v6 5052; GFX6-NEXT: v_trunc_f32_e32 v1, v1 5053; GFX6-NEXT: v_mad_f32 v5, -v1, v4, v5 5054; GFX6-NEXT: v_cvt_i32_f32_e32 v1, v1 5055; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v4| 5056; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 5057; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v3 5058; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 5059; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 5060; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 5061; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 5062; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 5063; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 5064; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 5065; GFX6-NEXT: s_waitcnt expcnt(0) 5066; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 5067; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 5068; GFX6-NEXT: s_endpgm 5069; 5070; GFX9-LABEL: sdiv_v3i15: 5071; GFX9: ; %bb.0: 5072; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 5073; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 5074; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 5075; GFX9-NEXT: v_mov_b32_e32 v2, 0 5076; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5077; GFX9-NEXT: s_bfe_i32 s1, s2, 0xf0000 5078; GFX9-NEXT: s_bfe_i32 s0, s4, 0xf0000 5079; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 5080; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s1 5081; 
GFX9-NEXT: s_xor_b32 s0, s1, s0 5082; GFX9-NEXT: v_mov_b32_e32 v0, s2 5083; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 5084; GFX9-NEXT: s_ashr_i32 s0, s0, 30 5085; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 5086; GFX9-NEXT: s_or_b32 s3, s0, 1 5087; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 5088; GFX9-NEXT: v_trunc_f32_e32 v5, v5 5089; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 5090; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| 5091; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 5092; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 5093; GFX9-NEXT: s_cselect_b32 s0, s3, 0 5094; GFX9-NEXT: s_bfe_i32 s1, s4, 0xf000f 5095; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s1 5096; GFX9-NEXT: v_add_u32_e32 v4, s0, v5 5097; GFX9-NEXT: s_bfe_i32 s0, s2, 0xf000f 5098; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 5099; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 5100; GFX9-NEXT: v_mov_b32_e32 v1, s4 5101; GFX9-NEXT: v_alignbit_b32 v1, s5, v1, 30 5102; GFX9-NEXT: s_xor_b32 s0, s0, s1 5103; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 5104; GFX9-NEXT: v_trunc_f32_e32 v6, v6 5105; GFX9-NEXT: s_ashr_i32 s0, s0, 30 5106; GFX9-NEXT: v_mad_f32 v5, -v6, v3, v5 5107; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 5108; GFX9-NEXT: s_or_b32 s2, s0, 1 5109; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 5110; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v3| 5111; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v1 5112; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 5113; GFX9-NEXT: s_cselect_b32 s0, s2, 0 5114; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 15 5115; GFX9-NEXT: v_add_u32_e32 v5, s0, v6 5116; GFX9-NEXT: v_cvt_f32_i32_e32 v6, v0 5117; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v3 5118; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1 5119; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v0 5120; GFX9-NEXT: v_or_b32_e32 v0, 1, v0 5121; GFX9-NEXT: v_mul_f32_e32 v1, v6, v7 5122; GFX9-NEXT: v_trunc_f32_e32 v1, v1 5123; GFX9-NEXT: v_cvt_i32_f32_e32 v7, v1 5124; GFX9-NEXT: v_mad_f32 v1, -v1, v3, v6 5125; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v3| 5126; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 5127; GFX9-NEXT: v_add_u32_e32 v0, v7, v0 
; Doc for @srem_v3i15 (its definition begins further below): signed remainder
; of two <3 x i15> kernel arguments, stored to %out. The opt assertions expect
; each of the three lanes to be expanded separately: both operands are
; sign-extended to i32, a truncated float quotient is formed with sitofp /
; llvm.amdgcn.rcp.f32 / fmul / llvm.trunc.f32, a fabs compare selects either 0
; or a sign-derived +/-1 correction, and the corrected quotient is multiplied
; back and subtracted from the numerator. The shl/ashr-by-17 pair then
; re-sign-extends the low 15 bits before the trunc to i15.
5128; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v4 5129; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v5 5130; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 5131; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 5132; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 5133; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 5134; GFX9-NEXT: global_store_dword v2, v0, s[6:7] 5135; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 5136; GFX9-NEXT: global_store_short v2, v0, s[6:7] offset:4 5137; GFX9-NEXT: s_endpgm 5138 %r = sdiv <3 x i15> %x, %y 5139 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 5140 ret void 5141} 5142 5143define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 5144; CHECK-LABEL: @srem_v3i15( 5145; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 5146; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 5147; CHECK-NEXT: [[TMP3:%.*]] = sext i15 [[TMP1]] to i32 5148; CHECK-NEXT: [[TMP4:%.*]] = sext i15 [[TMP2]] to i32 5149; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 5150; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 5151; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 5152; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 5153; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 5154; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 5155; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 5156; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 5157; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 5158; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 5159; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 5160; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 5161; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 5162; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 5163; CHECK-NEXT:
[[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 5164; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 5165; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]] 5166; CHECK-NEXT: [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]] 5167; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 17 5168; CHECK-NEXT: [[TMP24:%.*]] = ashr i32 [[TMP23]], 17 5169; CHECK-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i15 5170; CHECK-NEXT: [[TMP26:%.*]] = insertelement <3 x i15> undef, i15 [[TMP25]], i64 0 5171; CHECK-NEXT: [[TMP27:%.*]] = extractelement <3 x i15> [[X]], i64 1 5172; CHECK-NEXT: [[TMP28:%.*]] = extractelement <3 x i15> [[Y]], i64 1 5173; CHECK-NEXT: [[TMP29:%.*]] = sext i15 [[TMP27]] to i32 5174; CHECK-NEXT: [[TMP30:%.*]] = sext i15 [[TMP28]] to i32 5175; CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]] 5176; CHECK-NEXT: [[TMP32:%.*]] = ashr i32 [[TMP31]], 30 5177; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP32]], 1 5178; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float 5179; CHECK-NEXT: [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float 5180; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 5181; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]] 5182; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]]) 5183; CHECK-NEXT: [[TMP39:%.*]] = fneg fast float [[TMP38]] 5184; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]]) 5185; CHECK-NEXT: [[TMP41:%.*]] = fptosi float [[TMP38]] to i32 5186; CHECK-NEXT: [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]]) 5187; CHECK-NEXT: [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]]) 5188; CHECK-NEXT: [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]] 5189; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0 5190; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]] 5191; CHECK-NEXT: [[TMP47:%.*]] = mul i32 [[TMP46]], 
[[TMP30]] 5192; CHECK-NEXT: [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]] 5193; CHECK-NEXT: [[TMP49:%.*]] = shl i32 [[TMP48]], 17 5194; CHECK-NEXT: [[TMP50:%.*]] = ashr i32 [[TMP49]], 17 5195; CHECK-NEXT: [[TMP51:%.*]] = trunc i32 [[TMP50]] to i15 5196; CHECK-NEXT: [[TMP52:%.*]] = insertelement <3 x i15> [[TMP26]], i15 [[TMP51]], i64 1 5197; CHECK-NEXT: [[TMP53:%.*]] = extractelement <3 x i15> [[X]], i64 2 5198; CHECK-NEXT: [[TMP54:%.*]] = extractelement <3 x i15> [[Y]], i64 2 5199; CHECK-NEXT: [[TMP55:%.*]] = sext i15 [[TMP53]] to i32 5200; CHECK-NEXT: [[TMP56:%.*]] = sext i15 [[TMP54]] to i32 5201; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]] 5202; CHECK-NEXT: [[TMP58:%.*]] = ashr i32 [[TMP57]], 30 5203; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 1 5204; CHECK-NEXT: [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float 5205; CHECK-NEXT: [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float 5206; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]]) 5207; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]] 5208; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]]) 5209; CHECK-NEXT: [[TMP65:%.*]] = fneg fast float [[TMP64]] 5210; CHECK-NEXT: [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]]) 5211; CHECK-NEXT: [[TMP67:%.*]] = fptosi float [[TMP64]] to i32 5212; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 5213; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]]) 5214; CHECK-NEXT: [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]] 5215; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0 5216; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]] 5217; CHECK-NEXT: [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]] 5218; CHECK-NEXT: [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]] 5219; CHECK-NEXT: [[TMP75:%.*]] = shl i32 [[TMP74]], 17 5220; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 
[[TMP75]], 17 5221; CHECK-NEXT: [[TMP77:%.*]] = trunc i32 [[TMP76]] to i15 5222; CHECK-NEXT: [[TMP78:%.*]] = insertelement <3 x i15> [[TMP52]], i15 [[TMP77]], i64 2 5223; CHECK-NEXT: store <3 x i15> [[TMP78]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 5224; CHECK-NEXT: ret void 5225; 5226; GFX6-LABEL: srem_v3i15: 5227; GFX6: ; %bb.0: 5228; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 5229; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5230; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 5231; GFX6-NEXT: s_mov_b32 s7, 0xf000 5232; GFX6-NEXT: s_mov_b32 s6, -1 5233; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5234; GFX6-NEXT: s_bfe_i32 s9, s2, 0xf0000 5235; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s9 5236; GFX6-NEXT: v_mov_b32_e32 v2, s0 5237; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 5238; GFX6-NEXT: s_bfe_i32 s1, s0, 0xf0000 5239; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s1 5240; GFX6-NEXT: s_xor_b32 s1, s9, s1 5241; GFX6-NEXT: s_ashr_i32 s1, s1, 30 5242; GFX6-NEXT: s_or_b32 s1, s1, 1 5243; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 5244; GFX6-NEXT: v_mov_b32_e32 v7, s1 5245; GFX6-NEXT: s_lshr_b32 s8, s0, 15 5246; GFX6-NEXT: s_bfe_i32 s1, s2, 0xf000f 5247; GFX6-NEXT: v_mul_f32_e32 v6, v5, v6 5248; GFX6-NEXT: v_trunc_f32_e32 v6, v6 5249; GFX6-NEXT: v_mad_f32 v5, -v6, v4, v5 5250; GFX6-NEXT: v_cvt_i32_f32_e32 v6, v6 5251; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v4| 5252; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v7, vcc 5253; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v2 5254; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 5255; GFX6-NEXT: v_mul_lo_u32 v4, v4, s0 5256; GFX6-NEXT: s_bfe_i32 s0, s0, 0xf000f 5257; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s0 5258; GFX6-NEXT: v_cvt_f32_i32_e32 v6, s1 5259; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s2, v4 5260; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 5261; GFX6-NEXT: s_xor_b32 s0, s1, s0 5262; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 15 5263; GFX6-NEXT: s_ashr_i32 s0, s0, 30 5264; GFX6-NEXT: v_mul_f32_e32 v7, v6, v7 5265; GFX6-NEXT: v_trunc_f32_e32 v7, v7 5266; GFX6-NEXT: v_mad_f32 v6, -v7, v5, v6 
5267; GFX6-NEXT: v_cvt_i32_f32_e32 v7, v7 5268; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, |v5| 5269; GFX6-NEXT: v_cvt_f32_i32_e32 v6, v2 5270; GFX6-NEXT: v_mov_b32_e32 v0, s2 5271; GFX6-NEXT: s_or_b32 s0, s0, 1 5272; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 5273; GFX6-NEXT: v_mov_b32_e32 v8, s0 5274; GFX6-NEXT: v_and_b32_e32 v1, 0x7fff, v0 5275; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v8, vcc 5276; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15 5277; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5 5278; GFX6-NEXT: v_cvt_f32_i32_e32 v7, v0 5279; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v6 5280; GFX6-NEXT: v_xor_b32_e32 v0, v0, v2 5281; GFX6-NEXT: v_ashrrev_i32_e32 v0, 30, v0 5282; GFX6-NEXT: v_or_b32_e32 v0, 1, v0 5283; GFX6-NEXT: v_mul_f32_e32 v2, v7, v8 5284; GFX6-NEXT: v_trunc_f32_e32 v2, v2 5285; GFX6-NEXT: v_mad_f32 v7, -v2, v6, v7 5286; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 5287; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6| 5288; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 5289; GFX6-NEXT: v_mul_lo_u32 v5, v5, s8 5290; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 5291; GFX6-NEXT: v_mul_lo_u32 v0, v0, v3 5292; GFX6-NEXT: s_lshr_b32 s3, s2, 15 5293; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s3, v5 5294; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, v0, v1 5295; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 5296; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 5297; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v4 5298; GFX6-NEXT: v_lshlrev_b32_e32 v2, 15, v2 5299; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 5300; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 5301; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 5302; GFX6-NEXT: s_waitcnt expcnt(0) 5303; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 5304; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 5305; GFX6-NEXT: s_endpgm 5306; 5307; GFX9-LABEL: srem_v3i15: 5308; GFX9: ; %bb.0: 5309; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 5310; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 5311; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 5312; GFX9-NEXT: v_mov_b32_e32 v2, 0 5313; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) 5314; GFX9-NEXT: s_bfe_i32 s1, s2, 0xf0000 5315; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s1 5316; GFX9-NEXT: s_bfe_i32 s0, s6, 0xf0000 5317; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s0 5318; GFX9-NEXT: s_xor_b32 s0, s1, s0 5319; GFX9-NEXT: v_mov_b32_e32 v0, s2 5320; GFX9-NEXT: v_mov_b32_e32 v1, s6 5321; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 5322; GFX9-NEXT: s_ashr_i32 s0, s0, 30 5323; GFX9-NEXT: s_lshr_b32 s8, s2, 15 5324; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 5325; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 5326; GFX9-NEXT: v_trunc_f32_e32 v6, v6 5327; GFX9-NEXT: v_mad_f32 v5, -v6, v4, v5 5328; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 5329; GFX9-NEXT: v_alignbit_b32 v1, s7, v1, 30 5330; GFX9-NEXT: s_lshr_b32 s3, s6, 15 5331; GFX9-NEXT: s_or_b32 s7, s0, 1 5332; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4| 5333; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 5334; GFX9-NEXT: s_cselect_b32 s0, s7, 0 5335; GFX9-NEXT: v_add_u32_e32 v4, s0, v6 5336; GFX9-NEXT: s_bfe_i32 s0, s6, 0xf000f 5337; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 5338; GFX9-NEXT: s_bfe_i32 s1, s2, 0xf000f 5339; GFX9-NEXT: v_cvt_f32_i32_e32 v6, s1 5340; GFX9-NEXT: s_xor_b32 s0, s1, s0 5341; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 5342; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v1 5343; GFX9-NEXT: s_ashr_i32 s0, s0, 30 5344; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 5345; GFX9-NEXT: v_mul_f32_e32 v7, v6, v7 5346; GFX9-NEXT: v_trunc_f32_e32 v7, v7 5347; GFX9-NEXT: v_mad_f32 v6, -v7, v5, v6 5348; GFX9-NEXT: v_cvt_i32_f32_e32 v7, v7 5349; GFX9-NEXT: v_mul_lo_u32 v4, v4, s6 5350; GFX9-NEXT: s_or_b32 s6, s0, 1 5351; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v6|, |v5| 5352; GFX9-NEXT: v_cvt_f32_i32_e32 v6, v1 5353; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 5354; GFX9-NEXT: s_cselect_b32 s0, s6, 0 5355; GFX9-NEXT: v_add_u32_e32 v5, s0, v7 5356; GFX9-NEXT: v_bfe_i32 v7, v0, 0, 15 5357; GFX9-NEXT: v_cvt_f32_i32_e32 v8, v7 5358; GFX9-NEXT: v_rcp_iflag_f32_e32 v9, v6 5359; GFX9-NEXT: v_xor_b32_e32 v1, v7, v1 5360; GFX9-NEXT: v_ashrrev_i32_e32 v1, 30, v1 
; Doc for @udiv_i32_oddk_denom (its definition begins further below): udiv of
; an i32 kernel argument by the constant 1235195. The opt assertions expect
; the udiv to be left as-is. The llc assertions for tahiti and gfx900 expect a
; mul_hi by 0xb2a50881 followed by a sub / shift-right-1 / add /
; shift-right-20 fix-up.
5361; GFX9-NEXT: v_or_b32_e32 v1, 1, v1 5362; GFX9-NEXT: v_mul_f32_e32 v7, v8, v9 5363; GFX9-NEXT: v_trunc_f32_e32 v7, v7 5364; GFX9-NEXT: v_cvt_i32_f32_e32 v9, v7 5365; GFX9-NEXT: v_mad_f32 v7, -v7, v6, v8 5366; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6| 5367; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 5368; GFX9-NEXT: v_mul_lo_u32 v5, v5, s3 5369; GFX9-NEXT: v_add_u32_e32 v1, v9, v1 5370; GFX9-NEXT: v_mul_lo_u32 v1, v1, v3 5371; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 5372; GFX9-NEXT: v_sub_u32_e32 v3, s2, v4 5373; GFX9-NEXT: v_sub_u32_e32 v4, s8, v5 5374; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1 5375; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v4 5376; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 5377; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 5378; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 5379; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 5380; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 5381; GFX9-NEXT: global_store_dword v2, v0, s[4:5] 5382; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 5383; GFX9-NEXT: global_store_short v2, v0, s[4:5] offset:4 5384; GFX9-NEXT: s_endpgm 5385 %r = srem <3 x i15> %x, %y 5386 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 5387 ret void 5388} 5389 5390define amdgpu_kernel void @udiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 5391; CHECK-LABEL: @udiv_i32_oddk_denom( 5392; CHECK-NEXT: [[R:%.*]] = udiv i32 [[X:%.*]], 1235195 5393; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 5394; CHECK-NEXT: ret void 5395; 5396; GFX6-LABEL: udiv_i32_oddk_denom: 5397; GFX6: ; %bb.0: 5398; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 5399; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5400; GFX6-NEXT: v_mov_b32_e32 v0, 0xb2a50881 5401; GFX6-NEXT: s_mov_b32 s3, 0xf000 5402; GFX6-NEXT: s_mov_b32 s2, -1 5403; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5404; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 5405; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v0 5406; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 5407; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 5408; GFX6-NEXT: v_lshrrev_b32_e32 v0, 20, v0
; Doc for @udiv_i32_pow2k_denom (its definition begins further below): udiv of
; an i32 kernel argument by the constant 4096. The opt assertions expect the
; udiv to be left as-is. The llc assertions for tahiti and gfx900 expect a
; single s_lshr_b32 by 12.
5409; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 5410; GFX6-NEXT: s_endpgm 5411; 5412; GFX9-LABEL: udiv_i32_oddk_denom: 5413; GFX9: ; %bb.0: 5414; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 5415; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5416; GFX9-NEXT: v_mov_b32_e32 v0, 0 5417; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5418; GFX9-NEXT: s_mul_hi_u32 s0, s4, 0xb2a50881 5419; GFX9-NEXT: s_sub_i32 s1, s4, s0 5420; GFX9-NEXT: s_lshr_b32 s1, s1, 1 5421; GFX9-NEXT: s_add_i32 s1, s1, s0 5422; GFX9-NEXT: s_lshr_b32 s0, s1, 20 5423; GFX9-NEXT: v_mov_b32_e32 v1, s0 5424; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 5425; GFX9-NEXT: s_endpgm 5426 %r = udiv i32 %x, 1235195 5427 store i32 %r, i32 addrspace(1)* %out 5428 ret void 5429} 5430 5431define amdgpu_kernel void @udiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 5432; CHECK-LABEL: @udiv_i32_pow2k_denom( 5433; CHECK-NEXT: [[R:%.*]] = udiv i32 [[X:%.*]], 4096 5434; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 5435; CHECK-NEXT: ret void 5436; 5437; GFX6-LABEL: udiv_i32_pow2k_denom: 5438; GFX6: ; %bb.0: 5439; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 5440; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5441; GFX6-NEXT: s_mov_b32 s3, 0xf000 5442; GFX6-NEXT: s_mov_b32 s2, -1 5443; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5444; GFX6-NEXT: s_lshr_b32 s4, s4, 12 5445; GFX6-NEXT: v_mov_b32_e32 v0, s4 5446; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 5447; GFX6-NEXT: s_endpgm 5448; 5449; GFX9-LABEL: udiv_i32_pow2k_denom: 5450; GFX9: ; %bb.0: 5451; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 5452; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5453; GFX9-NEXT: v_mov_b32_e32 v0, 0 5454; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5455; GFX9-NEXT: s_lshr_b32 s0, s4, 12 5456; GFX9-NEXT: v_mov_b32_e32 v1, s0 5457; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 5458; GFX9-NEXT: s_endpgm 5459 %r = udiv i32 %x, 4096 5460 store i32 %r, i32 addrspace(1)* %out 5461 ret void 5462} 5463 5464define amdgpu_kernel void
; Doc for @udiv_i32_pow2_shl_denom (the definition continuing here): udiv of
; %x by (4096 << %y). The opt assertions keep the shl + udiv pair unchanged.
; The llc assertions for tahiti and gfx900 expect an s_add_i32 of 12 to the
; shift amount followed by one s_lshr_b32.
; Doc for @udiv_v2i32_pow2k_denom (its definition begins further below):
; <2 x i32> udiv by a splat of 4096. The opt assertions expect per-element
; extractelement / udiv / insertelement. The llc assertions expect two
; s_lshr_b32 by 12.
@udiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 5465; CHECK-LABEL: @udiv_i32_pow2_shl_denom( 5466; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 5467; CHECK-NEXT: [[R:%.*]] = udiv i32 [[X:%.*]], [[SHL_Y]] 5468; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 5469; CHECK-NEXT: ret void 5470; 5471; GFX6-LABEL: udiv_i32_pow2_shl_denom: 5472; GFX6: ; %bb.0: 5473; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 5474; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5475; GFX6-NEXT: s_mov_b32 s3, 0xf000 5476; GFX6-NEXT: s_mov_b32 s2, -1 5477; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5478; GFX6-NEXT: s_add_i32 s5, s5, 12 5479; GFX6-NEXT: s_lshr_b32 s4, s4, s5 5480; GFX6-NEXT: v_mov_b32_e32 v0, s4 5481; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 5482; GFX6-NEXT: s_endpgm 5483; 5484; GFX9-LABEL: udiv_i32_pow2_shl_denom: 5485; GFX9: ; %bb.0: 5486; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 5487; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 5488; GFX9-NEXT: v_mov_b32_e32 v0, 0 5489; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5490; GFX9-NEXT: s_add_i32 s0, s3, 12 5491; GFX9-NEXT: s_lshr_b32 s0, s2, s0 5492; GFX9-NEXT: v_mov_b32_e32 v1, s0 5493; GFX9-NEXT: global_store_dword v0, v1, s[4:5] 5494; GFX9-NEXT: s_endpgm 5495 %shl.y = shl i32 4096, %y 5496 %r = udiv i32 %x, %shl.y 5497 store i32 %r, i32 addrspace(1)* %out 5498 ret void 5499} 5500 5501define amdgpu_kernel void @udiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 5502; CHECK-LABEL: @udiv_v2i32_pow2k_denom( 5503; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 5504; CHECK-NEXT: [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096 5505; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 5506; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 5507; CHECK-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP4]], 4096 5508; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 5509; CHECK-NEXT: store <2 x i32>
; Doc for @udiv_v2i32_mixed_pow2k_denom (its definition begins further below):
; <2 x i32> udiv by the constant vector <4096, 4095>. The opt assertions expect
; one scalar udiv per element. The llc assertions for tahiti and gfx900 expect
; a shift-right by 12 for element 0, and for element 1 a mul_hi by 0x100101
; with a sub / shift-right-1 / add / shift-right-11 fix-up.
[[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 5510; CHECK-NEXT: ret void 5511; 5512; GFX6-LABEL: udiv_v2i32_pow2k_denom: 5513; GFX6: ; %bb.0: 5514; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 5515; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5516; GFX6-NEXT: s_mov_b32 s3, 0xf000 5517; GFX6-NEXT: s_mov_b32 s2, -1 5518; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5519; GFX6-NEXT: s_lshr_b32 s4, s4, 12 5520; GFX6-NEXT: s_lshr_b32 s5, s5, 12 5521; GFX6-NEXT: v_mov_b32_e32 v0, s4 5522; GFX6-NEXT: v_mov_b32_e32 v1, s5 5523; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5524; GFX6-NEXT: s_endpgm 5525; 5526; GFX9-LABEL: udiv_v2i32_pow2k_denom: 5527; GFX9: ; %bb.0: 5528; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 5529; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 5530; GFX9-NEXT: v_mov_b32_e32 v2, 0 5531; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5532; GFX9-NEXT: s_lshr_b32 s0, s2, 12 5533; GFX9-NEXT: s_lshr_b32 s1, s3, 12 5534; GFX9-NEXT: v_mov_b32_e32 v0, s0 5535; GFX9-NEXT: v_mov_b32_e32 v1, s1 5536; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 5537; GFX9-NEXT: s_endpgm 5538 %r = udiv <2 x i32> %x, <i32 4096, i32 4096> 5539 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 5540 ret void 5541} 5542 5543define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 5544; CHECK-LABEL: @udiv_v2i32_mixed_pow2k_denom( 5545; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 5546; CHECK-NEXT: [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096 5547; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 5548; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 5549; CHECK-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP4]], 4095 5550; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 5551; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 5552; CHECK-NEXT: ret void 5553; 5554; GFX6-LABEL: udiv_v2i32_mixed_pow2k_denom: 5555; GFX6: ; %bb.0:
; Doc for @udiv_v2i32_pow2_shl_denom (its definition begins further below):
; <2 x i32> udiv by (<4096, 4096> << %y). Unlike the scalar
; @udiv_i32_pow2_shl_denom, whose expected code is an add plus a shift, the
; assertions recorded for this vector form show the full per-element 32-bit
; expansion: uitofp, llvm.amdgcn.rcp.f32, fmul by 0x41EFFFFFC0000000, fptoui,
; a mul-high refinement and two compare/select correction steps.
5556; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 5557; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5558; GFX6-NEXT: v_mov_b32_e32 v0, 0x100101 5559; GFX6-NEXT: s_mov_b32 s3, 0xf000 5560; GFX6-NEXT: s_mov_b32 s2, -1 5561; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5562; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 5563; GFX6-NEXT: s_lshr_b32 s4, s4, 12 5564; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v0 5565; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 5566; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 5567; GFX6-NEXT: v_lshrrev_b32_e32 v1, 11, v0 5568; GFX6-NEXT: v_mov_b32_e32 v0, s4 5569; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5570; GFX6-NEXT: s_endpgm 5571; 5572; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom: 5573; GFX9: ; %bb.0: 5574; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 5575; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 5576; GFX9-NEXT: v_mov_b32_e32 v2, 0 5577; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5578; GFX9-NEXT: s_mul_hi_u32 s1, s3, 0x100101 5579; GFX9-NEXT: s_lshr_b32 s0, s2, 12 5580; GFX9-NEXT: s_sub_i32 s2, s3, s1 5581; GFX9-NEXT: s_lshr_b32 s2, s2, 1 5582; GFX9-NEXT: s_add_i32 s2, s2, s1 5583; GFX9-NEXT: s_lshr_b32 s1, s2, 11 5584; GFX9-NEXT: v_mov_b32_e32 v0, s0 5585; GFX9-NEXT: v_mov_b32_e32 v1, s1 5586; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 5587; GFX9-NEXT: s_endpgm 5588 %r = udiv <2 x i32> %x, <i32 4096, i32 4095> 5589 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 5590 ret void 5591} 5592 5593define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 5594; CHECK-LABEL: @udiv_v2i32_pow2_shl_denom( 5595; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 5596; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 5597; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 5598; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 5599; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 5600; CHECK-NEXT:
[[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 5601; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 5602; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 5603; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 5604; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 5605; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 5606; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 5607; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 5608; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 5609; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 5610; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 5611; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 5612; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 5613; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 5614; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 5615; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 5616; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 5617; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 5618; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 5619; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 5620; CHECK-NEXT: [[TMP25:%.*]] = add i32 [[TMP21]], 1 5621; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]] 5622; CHECK-NEXT: [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]] 5623; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]] 5624; CHECK-NEXT: [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]] 5625; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP26]], 1 5626; CHECK-NEXT: [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]] 5627; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x i32> undef, i32 [[TMP31]], i64 0 5628; CHECK-NEXT: [[TMP33:%.*]] = extractelement <2 x i32> [[X]], i64 1 5629; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 5630; CHECK-NEXT: [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float 
5631; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 5632; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000 5633; CHECK-NEXT: [[TMP38:%.*]] = fptoui float [[TMP37]] to i32 5634; CHECK-NEXT: [[TMP39:%.*]] = sub i32 0, [[TMP34]] 5635; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]] 5636; CHECK-NEXT: [[TMP41:%.*]] = zext i32 [[TMP38]] to i64 5637; CHECK-NEXT: [[TMP42:%.*]] = zext i32 [[TMP40]] to i64 5638; CHECK-NEXT: [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]] 5639; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 5640; CHECK-NEXT: [[TMP45:%.*]] = lshr i64 [[TMP43]], 32 5641; CHECK-NEXT: [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32 5642; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]] 5643; CHECK-NEXT: [[TMP48:%.*]] = zext i32 [[TMP33]] to i64 5644; CHECK-NEXT: [[TMP49:%.*]] = zext i32 [[TMP47]] to i64 5645; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]] 5646; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 5647; CHECK-NEXT: [[TMP52:%.*]] = lshr i64 [[TMP50]], 32 5648; CHECK-NEXT: [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32 5649; CHECK-NEXT: [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]] 5650; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]] 5651; CHECK-NEXT: [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]] 5652; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP53]], 1 5653; CHECK-NEXT: [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]] 5654; CHECK-NEXT: [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]] 5655; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]] 5656; CHECK-NEXT: [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]] 5657; CHECK-NEXT: [[TMP62:%.*]] = add i32 [[TMP58]], 1 5658; CHECK-NEXT: [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]] 5659; CHECK-NEXT: [[TMP64:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP63]], i64 1 5660; CHECK-NEXT: store <2 x i32> [[TMP64]], <2 x i32> addrspace(1)* [[OUT:%.*]], 
align 8 5661; CHECK-NEXT: ret void 5662; 5663; GFX6-LABEL: udiv_v2i32_pow2_shl_denom: 5664; GFX6: ; %bb.0: 5665; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb 5666; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 5667; GFX6-NEXT: s_mov_b32 s11, 0xf000 5668; GFX6-NEXT: s_mov_b32 s10, -1 5669; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5670; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s6 5671; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 5672; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s7 5673; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 5674; GFX6-NEXT: s_sub_i32 s0, 0, s2 5675; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 5676; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 5677; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 5678; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 5679; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 5680; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 5681; GFX6-NEXT: v_mul_lo_u32 v2, s0, v0 5682; GFX6-NEXT: s_sub_i32 s0, 0, s3 5683; GFX6-NEXT: v_mul_lo_u32 v3, s0, v1 5684; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 5685; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 5686; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 5687; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 5688; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 5689; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 5690; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 5691; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 5692; GFX6-NEXT: v_mul_lo_u32 v4, v1, s3 5693; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s4, v2 5694; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 5695; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] 5696; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v2 5697; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 5698; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 5699; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 5700; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 5701; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v4 5702; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 5703; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v2 5704; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 5705; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s3, v2 5706; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, 
s[0:1] 5707; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 5708; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 5709; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 5710; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 5711; GFX6-NEXT: s_endpgm 5712; 5713; GFX9-LABEL: udiv_v2i32_pow2_shl_denom: 5714; GFX9: ; %bb.0: 5715; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 5716; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5717; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s6 5718; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s7 5719; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 5720; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 5721; GFX9-NEXT: s_sub_i32 s2, 0, s6 5722; GFX9-NEXT: s_sub_i32 s3, 0, s7 5723; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 5724; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 5725; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 5726; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 5727; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 5728; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 5729; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 5730; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1 5731; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5732; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 5733; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 5734; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 5735; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 5736; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 5737; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 5738; GFX9-NEXT: v_mov_b32_e32 v2, 0 5739; GFX9-NEXT: v_mul_lo_u32 v3, v0, s6 5740; GFX9-NEXT: v_mul_lo_u32 v4, v1, s7 5741; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 5742; GFX9-NEXT: v_add_u32_e32 v6, 1, v1 5743; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 5744; GFX9-NEXT: v_sub_u32_e32 v4, s5, v4 5745; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 5746; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc 5747; GFX9-NEXT: v_subrev_u32_e32 v5, s6, v3 5748; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v4 5749; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[0:1] 5750; GFX9-NEXT: v_subrev_u32_e32 v6, s7, v4 5751; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 5752; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 5753; GFX9-NEXT: 
; Doc for @urem_i32_oddk_denom (its definition begins further below): urem of
; an i32 kernel argument by the constant 1235195. The opt assertions expect
; the urem to be left as-is. The llc assertions for tahiti and gfx900 expect
; the quotient to be formed with a mul_hi by 0xb2a50881 plus shifts, then
; multiplied by 0x12d8fb (= 1235195) and subtracted from the numerator.
v_cndmask_b32_e64 v4, v4, v6, s[0:1] 5754; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 5755; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc 5756; GFX9-NEXT: v_add_u32_e32 v3, 1, v1 5757; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v4 5758; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 5759; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5760; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 5761; GFX9-NEXT: s_endpgm 5762 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 5763 %r = udiv <2 x i32> %x, %shl.y 5764 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 5765 ret void 5766} 5767 5768define amdgpu_kernel void @urem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 5769; CHECK-LABEL: @urem_i32_oddk_denom( 5770; CHECK-NEXT: [[R:%.*]] = urem i32 [[X:%.*]], 1235195 5771; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 5772; CHECK-NEXT: ret void 5773; 5774; GFX6-LABEL: urem_i32_oddk_denom: 5775; GFX6: ; %bb.0: 5776; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 5777; GFX6-NEXT: v_mov_b32_e32 v0, 0xb2a50881 5778; GFX6-NEXT: s_mov_b32 s2, 0x12d8fb 5779; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5780; GFX6-NEXT: s_mov_b32 s3, 0xf000 5781; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5782; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 5783; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v0 5784; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 5785; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 5786; GFX6-NEXT: v_lshrrev_b32_e32 v0, 20, v0 5787; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 5788; GFX6-NEXT: s_mov_b32 s2, -1 5789; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 5790; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 5791; GFX6-NEXT: s_endpgm 5792; 5793; GFX9-LABEL: urem_i32_oddk_denom: 5794; GFX9: ; %bb.0: 5795; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 5796; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5797; GFX9-NEXT: v_mov_b32_e32 v0, 0 5798; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5799; GFX9-NEXT: s_mul_hi_u32 s0, s4, 0xb2a50881 5800; GFX9-NEXT: s_sub_i32 s1, s4, s0 5801; GFX9-NEXT: s_lshr_b32 s1, s1, 1 5802; GFX9-NEXT: s_add_i32 s1,
; Doc for @urem_i32_pow2k_denom (its definition begins further below): urem of
; an i32 kernel argument by the constant 4096. The llc assertions for tahiti
; and gfx900 expect a single s_and_b32 with 0xfff.
; Doc for @urem_i32_pow2_shl_denom (defined after it): urem of %x by
; (4096 << %y). The llc assertions expect s_lshl_b32 of 0x1000, an s_add_i32 of
; -1 to form the mask, and one s_and_b32.
; In both cases the opt assertions expect the urem to be left as-is.
s1, s0 5803; GFX9-NEXT: s_lshr_b32 s0, s1, 20 5804; GFX9-NEXT: s_mul_i32 s0, s0, 0x12d8fb 5805; GFX9-NEXT: s_sub_i32 s0, s4, s0 5806; GFX9-NEXT: v_mov_b32_e32 v1, s0 5807; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 5808; GFX9-NEXT: s_endpgm 5809 %r = urem i32 %x, 1235195 5810 store i32 %r, i32 addrspace(1)* %out 5811 ret void 5812} 5813 5814define amdgpu_kernel void @urem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 5815; CHECK-LABEL: @urem_i32_pow2k_denom( 5816; CHECK-NEXT: [[R:%.*]] = urem i32 [[X:%.*]], 4096 5817; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 5818; CHECK-NEXT: ret void 5819; 5820; GFX6-LABEL: urem_i32_pow2k_denom: 5821; GFX6: ; %bb.0: 5822; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 5823; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5824; GFX6-NEXT: s_mov_b32 s3, 0xf000 5825; GFX6-NEXT: s_mov_b32 s2, -1 5826; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5827; GFX6-NEXT: s_and_b32 s4, s4, 0xfff 5828; GFX6-NEXT: v_mov_b32_e32 v0, s4 5829; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 5830; GFX6-NEXT: s_endpgm 5831; 5832; GFX9-LABEL: urem_i32_pow2k_denom: 5833; GFX9: ; %bb.0: 5834; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 5835; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5836; GFX9-NEXT: v_mov_b32_e32 v0, 0 5837; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5838; GFX9-NEXT: s_and_b32 s0, s4, 0xfff 5839; GFX9-NEXT: v_mov_b32_e32 v1, s0 5840; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 5841; GFX9-NEXT: s_endpgm 5842 %r = urem i32 %x, 4096 5843 store i32 %r, i32 addrspace(1)* %out 5844 ret void 5845} 5846 5847define amdgpu_kernel void @urem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 5848; CHECK-LABEL: @urem_i32_pow2_shl_denom( 5849; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 5850; CHECK-NEXT: [[R:%.*]] = urem i32 [[X:%.*]], [[SHL_Y]] 5851; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 5852; CHECK-NEXT: ret void 5853; 5854; GFX6-LABEL: urem_i32_pow2_shl_denom: 5855; GFX6: ; %bb.0: 5856;
GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 5857; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5858; GFX6-NEXT: s_mov_b32 s3, 0xf000 5859; GFX6-NEXT: s_mov_b32 s2, -1 5860; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5861; GFX6-NEXT: s_lshl_b32 s5, 0x1000, s5 5862; GFX6-NEXT: s_add_i32 s5, s5, -1 5863; GFX6-NEXT: s_and_b32 s4, s4, s5 5864; GFX6-NEXT: v_mov_b32_e32 v0, s4 5865; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 5866; GFX6-NEXT: s_endpgm 5867; 5868; GFX9-LABEL: urem_i32_pow2_shl_denom: 5869; GFX9: ; %bb.0: 5870; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 5871; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 5872; GFX9-NEXT: v_mov_b32_e32 v0, 0 5873; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5874; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s3 5875; GFX9-NEXT: s_add_i32 s0, s0, -1 5876; GFX9-NEXT: s_and_b32 s0, s2, s0 5877; GFX9-NEXT: v_mov_b32_e32 v1, s0 5878; GFX9-NEXT: global_store_dword v0, v1, s[4:5] 5879; GFX9-NEXT: s_endpgm 5880 %shl.y = shl i32 4096, %y 5881 %r = urem i32 %x, %shl.y 5882 store i32 %r, i32 addrspace(1)* %out 5883 ret void 5884} 5885 5886define amdgpu_kernel void @urem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 5887; CHECK-LABEL: @urem_v2i32_pow2k_denom( 5888; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 5889; CHECK-NEXT: [[TMP2:%.*]] = urem i32 [[TMP1]], 4096 5890; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 5891; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 5892; CHECK-NEXT: [[TMP5:%.*]] = urem i32 [[TMP4]], 4096 5893; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 5894; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 5895; CHECK-NEXT: ret void 5896; 5897; GFX6-LABEL: urem_v2i32_pow2k_denom: 5898; GFX6: ; %bb.0: 5899; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 5900; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5901; GFX6-NEXT: s_mov_b32 s3, 0xf000 5902; GFX6-NEXT: s_mov_b32 s2, -1 
5903; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5904; GFX6-NEXT: s_and_b32 s4, s4, 0xfff 5905; GFX6-NEXT: s_and_b32 s5, s5, 0xfff 5906; GFX6-NEXT: v_mov_b32_e32 v0, s4 5907; GFX6-NEXT: v_mov_b32_e32 v1, s5 5908; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5909; GFX6-NEXT: s_endpgm 5910; 5911; GFX9-LABEL: urem_v2i32_pow2k_denom: 5912; GFX9: ; %bb.0: 5913; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 5914; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 5915; GFX9-NEXT: v_mov_b32_e32 v2, 0 5916; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5917; GFX9-NEXT: s_and_b32 s0, s2, 0xfff 5918; GFX9-NEXT: s_and_b32 s1, s3, 0xfff 5919; GFX9-NEXT: v_mov_b32_e32 v0, s0 5920; GFX9-NEXT: v_mov_b32_e32 v1, s1 5921; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 5922; GFX9-NEXT: s_endpgm 5923 %r = urem <2 x i32> %x, <i32 4096, i32 4096> 5924 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 5925 ret void 5926} 5927 5928define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 5929; CHECK-LABEL: @urem_v2i32_pow2_shl_denom( 5930; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 5931; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 5932; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 5933; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 5934; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 5935; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 5936; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 5937; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 5938; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 5939; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 5940; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 5941; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 5942; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 5943; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 
5944; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 5945; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 5946; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 5947; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 5948; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 5949; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 5950; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 5951; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 5952; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 5953; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 5954; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 5955; CHECK-NEXT: [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]] 5956; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]] 5957; CHECK-NEXT: [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]] 5958; CHECK-NEXT: [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]] 5959; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]] 5960; CHECK-NEXT: [[TMP30:%.*]] = insertelement <2 x i32> undef, i32 [[TMP29]], i64 0 5961; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x i32> [[X]], i64 1 5962; CHECK-NEXT: [[TMP32:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 5963; CHECK-NEXT: [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float 5964; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 5965; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000 5966; CHECK-NEXT: [[TMP36:%.*]] = fptoui float [[TMP35]] to i32 5967; CHECK-NEXT: [[TMP37:%.*]] = sub i32 0, [[TMP32]] 5968; CHECK-NEXT: [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]] 5969; CHECK-NEXT: [[TMP39:%.*]] = zext i32 [[TMP36]] to i64 5970; CHECK-NEXT: [[TMP40:%.*]] = zext i32 [[TMP38]] to i64 5971; CHECK-NEXT: [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]] 5972; CHECK-NEXT: [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32 5973; CHECK-NEXT: [[TMP43:%.*]] = lshr i64 [[TMP41]], 32 5974; 
CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 5975; CHECK-NEXT: [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]] 5976; CHECK-NEXT: [[TMP46:%.*]] = zext i32 [[TMP31]] to i64 5977; CHECK-NEXT: [[TMP47:%.*]] = zext i32 [[TMP45]] to i64 5978; CHECK-NEXT: [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]] 5979; CHECK-NEXT: [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32 5980; CHECK-NEXT: [[TMP50:%.*]] = lshr i64 [[TMP48]], 32 5981; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 5982; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]] 5983; CHECK-NEXT: [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]] 5984; CHECK-NEXT: [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]] 5985; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]] 5986; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]] 5987; CHECK-NEXT: [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]] 5988; CHECK-NEXT: [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]] 5989; CHECK-NEXT: [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]] 5990; CHECK-NEXT: [[TMP60:%.*]] = insertelement <2 x i32> [[TMP30]], i32 [[TMP59]], i64 1 5991; CHECK-NEXT: store <2 x i32> [[TMP60]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 5992; CHECK-NEXT: ret void 5993; 5994; GFX6-LABEL: urem_v2i32_pow2_shl_denom: 5995; GFX6: ; %bb.0: 5996; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb 5997; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5998; GFX6-NEXT: s_mov_b32 s3, 0xf000 5999; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6000; GFX6-NEXT: s_lshl_b32 s6, 0x1000, s6 6001; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 6002; GFX6-NEXT: s_lshl_b32 s7, 0x1000, s7 6003; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7 6004; GFX6-NEXT: s_sub_i32 s2, 0, s6 6005; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 6006; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 6007; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 6008; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 6009; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 6010; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 6011; GFX6-NEXT: 
v_mul_lo_u32 v2, s2, v0 6012; GFX6-NEXT: s_sub_i32 s2, 0, s7 6013; GFX6-NEXT: v_mul_lo_u32 v3, s2, v1 6014; GFX6-NEXT: s_mov_b32 s2, -1 6015; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 6016; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 6017; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 6018; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 6019; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 6020; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 6021; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 6022; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 6023; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 6024; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s6, v0 6025; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 6026; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 6027; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6028; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s6, v0 6029; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 6030; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6031; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s7, v1 6032; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 6033; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 6034; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s7, v1 6035; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 6036; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 6037; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6038; GFX6-NEXT: s_endpgm 6039; 6040; GFX9-LABEL: urem_v2i32_pow2_shl_denom: 6041; GFX9: ; %bb.0: 6042; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 6043; GFX9-NEXT: v_mov_b32_e32 v2, 0 6044; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6045; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6046; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s6 6047; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 6048; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s7 6049; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 6050; GFX9-NEXT: s_sub_i32 s6, 0, s3 6051; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 6052; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 6053; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 6054; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 6055; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 6056; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 6057; GFX9-NEXT: 
v_readfirstlane_b32 s7, v0 6058; GFX9-NEXT: s_mul_i32 s6, s6, s7 6059; GFX9-NEXT: s_mul_hi_u32 s6, s7, s6 6060; GFX9-NEXT: s_add_i32 s7, s7, s6 6061; GFX9-NEXT: s_mul_hi_u32 s6, s4, s7 6062; GFX9-NEXT: s_mul_i32 s6, s6, s3 6063; GFX9-NEXT: s_sub_i32 s4, s4, s6 6064; GFX9-NEXT: s_sub_i32 s6, s4, s3 6065; GFX9-NEXT: s_cmp_ge_u32 s4, s3 6066; GFX9-NEXT: s_cselect_b32 s4, s6, s4 6067; GFX9-NEXT: s_sub_i32 s6, s4, s3 6068; GFX9-NEXT: s_cmp_ge_u32 s4, s3 6069; GFX9-NEXT: v_readfirstlane_b32 s8, v1 6070; GFX9-NEXT: s_cselect_b32 s3, s6, s4 6071; GFX9-NEXT: s_sub_i32 s4, 0, s2 6072; GFX9-NEXT: s_mul_i32 s4, s4, s8 6073; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4 6074; GFX9-NEXT: s_add_i32 s8, s8, s4 6075; GFX9-NEXT: s_mul_hi_u32 s4, s5, s8 6076; GFX9-NEXT: s_mul_i32 s4, s4, s2 6077; GFX9-NEXT: s_sub_i32 s4, s5, s4 6078; GFX9-NEXT: s_sub_i32 s5, s4, s2 6079; GFX9-NEXT: s_cmp_ge_u32 s4, s2 6080; GFX9-NEXT: s_cselect_b32 s4, s5, s4 6081; GFX9-NEXT: s_sub_i32 s5, s4, s2 6082; GFX9-NEXT: s_cmp_ge_u32 s4, s2 6083; GFX9-NEXT: s_cselect_b32 s2, s5, s4 6084; GFX9-NEXT: v_mov_b32_e32 v0, s3 6085; GFX9-NEXT: v_mov_b32_e32 v1, s2 6086; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 6087; GFX9-NEXT: s_endpgm 6088 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 6089 %r = urem <2 x i32> %x, %shl.y 6090 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 6091 ret void 6092} 6093 6094define amdgpu_kernel void @sdiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 6095; CHECK-LABEL: @sdiv_i32_oddk_denom( 6096; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], 1235195 6097; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 6098; CHECK-NEXT: ret void 6099; 6100; GFX6-LABEL: sdiv_i32_oddk_denom: 6101; GFX6: ; %bb.0: 6102; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 6103; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 6104; GFX6-NEXT: v_mov_b32_e32 v0, 0xd9528441 6105; GFX6-NEXT: s_mov_b32 s3, 0xf000 6106; GFX6-NEXT: s_mov_b32 s2, -1 6107; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6108; GFX6-NEXT: 
v_mul_hi_i32 v0, s4, v0 6109; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0 6110; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0 6111; GFX6-NEXT: v_ashrrev_i32_e32 v0, 20, v0 6112; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 6113; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 6114; GFX6-NEXT: s_endpgm 6115; 6116; GFX9-LABEL: sdiv_i32_oddk_denom: 6117; GFX9: ; %bb.0: 6118; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 6119; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6120; GFX9-NEXT: v_mov_b32_e32 v0, 0 6121; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6122; GFX9-NEXT: s_mul_hi_i32 s0, s4, 0xd9528441 6123; GFX9-NEXT: s_add_i32 s0, s0, s4 6124; GFX9-NEXT: s_lshr_b32 s1, s0, 31 6125; GFX9-NEXT: s_ashr_i32 s0, s0, 20 6126; GFX9-NEXT: s_add_i32 s0, s0, s1 6127; GFX9-NEXT: v_mov_b32_e32 v1, s0 6128; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 6129; GFX9-NEXT: s_endpgm 6130 %r = sdiv i32 %x, 1235195 6131 store i32 %r, i32 addrspace(1)* %out 6132 ret void 6133} 6134 6135define amdgpu_kernel void @sdiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 6136; CHECK-LABEL: @sdiv_i32_pow2k_denom( 6137; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], 4096 6138; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 6139; CHECK-NEXT: ret void 6140; 6141; GFX6-LABEL: sdiv_i32_pow2k_denom: 6142; GFX6: ; %bb.0: 6143; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 6144; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 6145; GFX6-NEXT: s_mov_b32 s3, 0xf000 6146; GFX6-NEXT: s_mov_b32 s2, -1 6147; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6148; GFX6-NEXT: s_ashr_i32 s5, s4, 31 6149; GFX6-NEXT: s_lshr_b32 s5, s5, 20 6150; GFX6-NEXT: s_add_i32 s4, s4, s5 6151; GFX6-NEXT: s_ashr_i32 s4, s4, 12 6152; GFX6-NEXT: v_mov_b32_e32 v0, s4 6153; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 6154; GFX6-NEXT: s_endpgm 6155; 6156; GFX9-LABEL: sdiv_i32_pow2k_denom: 6157; GFX9: ; %bb.0: 6158; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 6159; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6160; GFX9-NEXT: v_mov_b32_e32 v0, 0 6161; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) 6162; GFX9-NEXT: s_ashr_i32 s0, s4, 31 6163; GFX9-NEXT: s_lshr_b32 s0, s0, 20 6164; GFX9-NEXT: s_add_i32 s4, s4, s0 6165; GFX9-NEXT: s_ashr_i32 s0, s4, 12 6166; GFX9-NEXT: v_mov_b32_e32 v1, s0 6167; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 6168; GFX9-NEXT: s_endpgm 6169 %r = sdiv i32 %x, 4096 6170 store i32 %r, i32 addrspace(1)* %out 6171 ret void 6172} 6173 6174define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 6175; CHECK-LABEL: @sdiv_i32_pow2_shl_denom( 6176; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 6177; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], [[SHL_Y]] 6178; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 6179; CHECK-NEXT: ret void 6180; 6181; GFX6-LABEL: sdiv_i32_pow2_shl_denom: 6182; GFX6: ; %bb.0: 6183; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 6184; GFX6-NEXT: s_mov_b32 s7, 0xf000 6185; GFX6-NEXT: s_mov_b32 s6, -1 6186; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6187; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 6188; GFX6-NEXT: s_ashr_i32 s8, s3, 31 6189; GFX6-NEXT: s_add_i32 s3, s3, s8 6190; GFX6-NEXT: s_xor_b32 s3, s3, s8 6191; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 6192; GFX6-NEXT: s_sub_i32 s4, 0, s3 6193; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 6194; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 6195; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 6196; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 6197; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 6198; GFX6-NEXT: s_ashr_i32 s0, s2, 31 6199; GFX6-NEXT: s_add_i32 s1, s2, s0 6200; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 6201; GFX6-NEXT: s_xor_b32 s1, s1, s0 6202; GFX6-NEXT: s_xor_b32 s2, s0, s8 6203; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 6204; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 6205; GFX6-NEXT: v_mul_lo_u32 v1, v0, s3 6206; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 6207; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 6208; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1 6209; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 6210; GFX6-NEXT: 
v_subrev_i32_e32 v2, vcc, s3, v1 6211; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 6212; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] 6213; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 6214; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 6215; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 6216; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 6217; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6218; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 6219; GFX6-NEXT: s_endpgm 6220; 6221; GFX9-LABEL: sdiv_i32_pow2_shl_denom: 6222; GFX9: ; %bb.0: 6223; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 6224; GFX9-NEXT: v_mov_b32_e32 v2, 0 6225; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6226; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6227; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 6228; GFX9-NEXT: s_ashr_i32 s4, s3, 31 6229; GFX9-NEXT: s_add_i32 s3, s3, s4 6230; GFX9-NEXT: s_xor_b32 s3, s3, s4 6231; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 6232; GFX9-NEXT: s_sub_i32 s5, 0, s3 6233; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 6234; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 6235; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 6236; GFX9-NEXT: v_mul_lo_u32 v1, s5, v0 6237; GFX9-NEXT: s_ashr_i32 s5, s2, 31 6238; GFX9-NEXT: s_add_i32 s2, s2, s5 6239; GFX9-NEXT: s_xor_b32 s2, s2, s5 6240; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 6241; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 6242; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 6243; GFX9-NEXT: v_mul_lo_u32 v1, v0, s3 6244; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 6245; GFX9-NEXT: v_sub_u32_e32 v1, s2, v1 6246; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 6247; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 6248; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v1 6249; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 6250; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 6251; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 6252; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 6253; GFX9-NEXT: s_xor_b32 s2, s5, s4 6254; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 6255; GFX9-NEXT: v_subrev_u32_e32 v0, s2, v0 6256; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 6257; GFX9-NEXT: 
s_endpgm 6258 %shl.y = shl i32 4096, %y 6259 %r = sdiv i32 %x, %shl.y 6260 store i32 %r, i32 addrspace(1)* %out 6261 ret void 6262} 6263 6264define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 6265; CHECK-LABEL: @sdiv_v2i32_pow2k_denom( 6266; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 6267; CHECK-NEXT: [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096 6268; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 6269; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 6270; CHECK-NEXT: [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4096 6271; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 6272; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 6273; CHECK-NEXT: ret void 6274; 6275; GFX6-LABEL: sdiv_v2i32_pow2k_denom: 6276; GFX6: ; %bb.0: 6277; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 6278; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 6279; GFX6-NEXT: s_mov_b32 s3, 0xf000 6280; GFX6-NEXT: s_mov_b32 s2, -1 6281; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6282; GFX6-NEXT: s_ashr_i32 s6, s4, 31 6283; GFX6-NEXT: s_ashr_i32 s7, s5, 31 6284; GFX6-NEXT: s_lshr_b32 s6, s6, 20 6285; GFX6-NEXT: s_add_i32 s4, s4, s6 6286; GFX6-NEXT: s_lshr_b32 s6, s7, 20 6287; GFX6-NEXT: s_add_i32 s5, s5, s6 6288; GFX6-NEXT: s_ashr_i32 s4, s4, 12 6289; GFX6-NEXT: s_ashr_i32 s5, s5, 12 6290; GFX6-NEXT: v_mov_b32_e32 v0, s4 6291; GFX6-NEXT: v_mov_b32_e32 v1, s5 6292; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6293; GFX6-NEXT: s_endpgm 6294; 6295; GFX9-LABEL: sdiv_v2i32_pow2k_denom: 6296; GFX9: ; %bb.0: 6297; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 6298; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 6299; GFX9-NEXT: v_mov_b32_e32 v2, 0 6300; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6301; GFX9-NEXT: s_ashr_i32 s0, s2, 31 6302; GFX9-NEXT: s_ashr_i32 s1, s3, 31 6303; GFX9-NEXT: s_lshr_b32 s0, s0, 20 6304; GFX9-NEXT: s_lshr_b32 s1, s1, 20 6305; 
GFX9-NEXT: s_add_i32 s0, s2, s0 6306; GFX9-NEXT: s_add_i32 s1, s3, s1 6307; GFX9-NEXT: s_ashr_i32 s0, s0, 12 6308; GFX9-NEXT: s_ashr_i32 s1, s1, 12 6309; GFX9-NEXT: v_mov_b32_e32 v0, s0 6310; GFX9-NEXT: v_mov_b32_e32 v1, s1 6311; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 6312; GFX9-NEXT: s_endpgm 6313 %r = sdiv <2 x i32> %x, <i32 4096, i32 4096> 6314 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 6315 ret void 6316} 6317 6318define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 6319; CHECK-LABEL: @ssdiv_v2i32_mixed_pow2k_denom( 6320; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 6321; CHECK-NEXT: [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096 6322; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 6323; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 6324; CHECK-NEXT: [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4095 6325; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 6326; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 6327; CHECK-NEXT: ret void 6328; 6329; GFX6-LABEL: ssdiv_v2i32_mixed_pow2k_denom: 6330; GFX6: ; %bb.0: 6331; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 6332; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 6333; GFX6-NEXT: v_mov_b32_e32 v0, 0x80080081 6334; GFX6-NEXT: s_mov_b32 s3, 0xf000 6335; GFX6-NEXT: s_mov_b32 s2, -1 6336; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6337; GFX6-NEXT: v_mul_hi_i32 v0, s5, v0 6338; GFX6-NEXT: s_ashr_i32 s6, s4, 31 6339; GFX6-NEXT: s_lshr_b32 s6, s6, 20 6340; GFX6-NEXT: s_add_i32 s4, s4, s6 6341; GFX6-NEXT: v_add_i32_e32 v0, vcc, s5, v0 6342; GFX6-NEXT: s_ashr_i32 s4, s4, 12 6343; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0 6344; GFX6-NEXT: v_ashrrev_i32_e32 v0, 11, v0 6345; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v0 6346; GFX6-NEXT: v_mov_b32_e32 v0, s4 6347; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6348; GFX6-NEXT: s_endpgm 6349; 6350; 
GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom: 6351; GFX9: ; %bb.0: 6352; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 6353; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 6354; GFX9-NEXT: v_mov_b32_e32 v2, 0 6355; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6356; GFX9-NEXT: s_ashr_i32 s0, s2, 31 6357; GFX9-NEXT: s_mul_hi_i32 s1, s3, 0x80080081 6358; GFX9-NEXT: s_lshr_b32 s0, s0, 20 6359; GFX9-NEXT: s_add_i32 s1, s1, s3 6360; GFX9-NEXT: s_add_i32 s0, s2, s0 6361; GFX9-NEXT: s_lshr_b32 s2, s1, 31 6362; GFX9-NEXT: s_ashr_i32 s1, s1, 11 6363; GFX9-NEXT: s_ashr_i32 s0, s0, 12 6364; GFX9-NEXT: s_add_i32 s1, s1, s2 6365; GFX9-NEXT: v_mov_b32_e32 v0, s0 6366; GFX9-NEXT: v_mov_b32_e32 v1, s1 6367; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 6368; GFX9-NEXT: s_endpgm 6369 %r = sdiv <2 x i32> %x, <i32 4096, i32 4095> 6370 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 6371 ret void 6372} 6373 6374define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 6375; CHECK-LABEL: @sdiv_v2i32_pow2_shl_denom( 6376; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 6377; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 6378; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 6379; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 6380; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 6381; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 6382; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]] 6383; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]] 6384; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]] 6385; CHECK-NEXT: [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]] 6386; CHECK-NEXT: [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float 6387; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]]) 6388; CHECK-NEXT: [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000 6389; CHECK-NEXT: [[TMP13:%.*]] = fptoui float [[TMP12]] to i32 6390; 
CHECK-NEXT: [[TMP14:%.*]] = sub i32 0, [[TMP9]] 6391; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]] 6392; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP13]] to i64 6393; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 6394; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 6395; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 6396; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 6397; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 6398; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]] 6399; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP8]] to i64 6400; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[TMP22]] to i64 6401; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]] 6402; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32 6403; CHECK-NEXT: [[TMP27:%.*]] = lshr i64 [[TMP25]], 32 6404; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 6405; CHECK-NEXT: [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]] 6406; CHECK-NEXT: [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]] 6407; CHECK-NEXT: [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]] 6408; CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP28]], 1 6409; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]] 6410; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]] 6411; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]] 6412; CHECK-NEXT: [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]] 6413; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP33]], 1 6414; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]] 6415; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]] 6416; CHECK-NEXT: [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]] 6417; CHECK-NEXT: [[TMP41:%.*]] = insertelement <2 x i32> undef, i32 [[TMP40]], i64 0 6418; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x i32> [[X]], i64 1 6419; CHECK-NEXT: [[TMP43:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 6420; CHECK-NEXT: [[TMP44:%.*]] = ashr i32 [[TMP42]], 31 6421; 
CHECK-NEXT: [[TMP45:%.*]] = ashr i32 [[TMP43]], 31 6422; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]] 6423; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]] 6424; CHECK-NEXT: [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]] 6425; CHECK-NEXT: [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]] 6426; CHECK-NEXT: [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]] 6427; CHECK-NEXT: [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float 6428; CHECK-NEXT: [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]]) 6429; CHECK-NEXT: [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000 6430; CHECK-NEXT: [[TMP54:%.*]] = fptoui float [[TMP53]] to i32 6431; CHECK-NEXT: [[TMP55:%.*]] = sub i32 0, [[TMP50]] 6432; CHECK-NEXT: [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]] 6433; CHECK-NEXT: [[TMP57:%.*]] = zext i32 [[TMP54]] to i64 6434; CHECK-NEXT: [[TMP58:%.*]] = zext i32 [[TMP56]] to i64 6435; CHECK-NEXT: [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]] 6436; CHECK-NEXT: [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32 6437; CHECK-NEXT: [[TMP61:%.*]] = lshr i64 [[TMP59]], 32 6438; CHECK-NEXT: [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32 6439; CHECK-NEXT: [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]] 6440; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[TMP49]] to i64 6441; CHECK-NEXT: [[TMP65:%.*]] = zext i32 [[TMP63]] to i64 6442; CHECK-NEXT: [[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]] 6443; CHECK-NEXT: [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32 6444; CHECK-NEXT: [[TMP68:%.*]] = lshr i64 [[TMP66]], 32 6445; CHECK-NEXT: [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32 6446; CHECK-NEXT: [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]] 6447; CHECK-NEXT: [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]] 6448; CHECK-NEXT: [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]] 6449; CHECK-NEXT: [[TMP73:%.*]] = add i32 [[TMP69]], 1 6450; CHECK-NEXT: [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]] 6451; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]] 6452; CHECK-NEXT: [[TMP76:%.*]] = 
select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]] 6453; CHECK-NEXT: [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]] 6454; CHECK-NEXT: [[TMP78:%.*]] = add i32 [[TMP74]], 1 6455; CHECK-NEXT: [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]] 6456; CHECK-NEXT: [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]] 6457; CHECK-NEXT: [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]] 6458; CHECK-NEXT: [[TMP82:%.*]] = insertelement <2 x i32> [[TMP41]], i32 [[TMP81]], i64 1 6459; CHECK-NEXT: store <2 x i32> [[TMP82]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 6460; CHECK-NEXT: ret void 6461; 6462; GFX6-LABEL: sdiv_v2i32_pow2_shl_denom: 6463; GFX6: ; %bb.0: 6464; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xb 6465; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 6466; GFX6-NEXT: s_mov_b32 s7, 0xf000 6467; GFX6-NEXT: s_mov_b32 s6, -1 6468; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6469; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s10 6470; GFX6-NEXT: s_ashr_i32 s1, s0, 31 6471; GFX6-NEXT: s_add_i32 s0, s0, s1 6472; GFX6-NEXT: s_xor_b32 s2, s0, s1 6473; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 6474; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s11 6475; GFX6-NEXT: s_ashr_i32 s3, s0, 31 6476; GFX6-NEXT: s_add_i32 s0, s0, s3 6477; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 6478; GFX6-NEXT: s_sub_i32 s11, 0, s2 6479; GFX6-NEXT: s_xor_b32 s10, s0, s3 6480; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s10 6481; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 6482; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 6483; GFX6-NEXT: s_ashr_i32 s0, s8, 31 6484; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 6485; GFX6-NEXT: s_add_i32 s8, s8, s0 6486; GFX6-NEXT: v_mul_lo_u32 v2, s11, v0 6487; GFX6-NEXT: s_xor_b32 s8, s8, s0 6488; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 6489; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 6490; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 6491; GFX6-NEXT: s_xor_b32 s11, s0, s1 6492; GFX6-NEXT: s_sub_i32 s0, 0, s10 6493; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 6494; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 6495; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 
6496; GFX6-NEXT: v_mul_lo_u32 v3, v0, s2 6497; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v0 6498; GFX6-NEXT: v_mul_hi_u32 v2, v1, v2 6499; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s8, v3 6500; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v3 6501; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] 6502; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s2, v3 6503; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] 6504; GFX6-NEXT: s_ashr_i32 s0, s9, 31 6505; GFX6-NEXT: s_add_i32 s1, s9, s0 6506; GFX6-NEXT: s_xor_b32 s1, s1, s0 6507; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v2 6508; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 6509; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v0 6510; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 6511; GFX6-NEXT: v_mul_lo_u32 v2, v1, s10 6512; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 6513; GFX6-NEXT: s_xor_b32 s2, s0, s3 6514; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 6515; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 6516; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2 6517; GFX6-NEXT: v_xor_b32_e32 v0, s11, v0 6518; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 6519; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s10, v2 6520; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s11, v0 6521; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 6522; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 6523; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 6524; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 6525; GFX6-NEXT: v_xor_b32_e32 v1, s2, v1 6526; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s2, v1 6527; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 6528; GFX6-NEXT: s_endpgm 6529; 6530; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom: 6531; GFX9: ; %bb.0: 6532; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 6533; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6534; GFX9-NEXT: v_mov_b32_e32 v2, 0 6535; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6536; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s6 6537; GFX9-NEXT: s_ashr_i32 s1, s0, 31 6538; GFX9-NEXT: s_add_i32 s0, s0, s1 6539; GFX9-NEXT: s_xor_b32 s0, s0, s1 6540; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 6541; GFX9-NEXT: 
s_lshl_b32 s6, 0x1000, s7 6542; GFX9-NEXT: s_ashr_i32 s8, s6, 31 6543; GFX9-NEXT: s_add_i32 s6, s6, s8 6544; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 6545; GFX9-NEXT: s_xor_b32 s6, s6, s8 6546; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 6547; GFX9-NEXT: s_sub_i32 s10, 0, s0 6548; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 6549; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 6550; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 6551; GFX9-NEXT: s_ashr_i32 s7, s4, 31 6552; GFX9-NEXT: s_add_i32 s4, s4, s7 6553; GFX9-NEXT: v_mul_lo_u32 v3, s10, v0 6554; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 6555; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 6556; GFX9-NEXT: s_sub_i32 s10, 0, s6 6557; GFX9-NEXT: v_mul_hi_u32 v3, v0, v3 6558; GFX9-NEXT: s_xor_b32 s4, s4, s7 6559; GFX9-NEXT: v_mul_lo_u32 v4, s10, v1 6560; GFX9-NEXT: s_ashr_i32 s9, s5, 31 6561; GFX9-NEXT: v_add_u32_e32 v0, v0, v3 6562; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 6563; GFX9-NEXT: v_mul_hi_u32 v3, v1, v4 6564; GFX9-NEXT: s_add_i32 s5, s5, s9 6565; GFX9-NEXT: s_xor_b32 s5, s5, s9 6566; GFX9-NEXT: v_mul_lo_u32 v4, v0, s0 6567; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 6568; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 6569; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 6570; GFX9-NEXT: v_sub_u32_e32 v4, s4, v4 6571; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v4 6572; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 6573; GFX9-NEXT: v_subrev_u32_e32 v3, s0, v4 6574; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc 6575; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v3 6576; GFX9-NEXT: v_mul_lo_u32 v3, v1, s6 6577; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 6578; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 6579; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 6580; GFX9-NEXT: v_sub_u32_e32 v3, s5, v3 6581; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 6582; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 6583; GFX9-NEXT: v_subrev_u32_e32 v4, s6, v3 6584; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 6585; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 6586; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 6587; GFX9-NEXT: s_xor_b32 s1, s7, s1 6588; 
GFX9-NEXT: s_xor_b32 s0, s9, s8 6589; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 6590; GFX9-NEXT: v_xor_b32_e32 v0, s1, v0 6591; GFX9-NEXT: v_xor_b32_e32 v1, s0, v1 6592; GFX9-NEXT: v_subrev_u32_e32 v0, s1, v0 6593; GFX9-NEXT: v_subrev_u32_e32 v1, s0, v1 6594; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 6595; GFX9-NEXT: s_endpgm 6596 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 6597 %r = sdiv <2 x i32> %x, %shl.y 6598 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 6599 ret void 6600} 6601 6602define amdgpu_kernel void @srem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 6603; CHECK-LABEL: @srem_i32_oddk_denom( 6604; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], 1235195 6605; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 6606; CHECK-NEXT: ret void 6607; 6608; GFX6-LABEL: srem_i32_oddk_denom: 6609; GFX6: ; %bb.0: 6610; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 6611; GFX6-NEXT: v_mov_b32_e32 v0, 0xd9528441 6612; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 6613; GFX6-NEXT: s_mov_b32 s3, 0xf000 6614; GFX6-NEXT: s_mov_b32 s2, -1 6615; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6616; GFX6-NEXT: v_mul_hi_i32 v0, s4, v0 6617; GFX6-NEXT: v_readfirstlane_b32 s5, v0 6618; GFX6-NEXT: s_add_i32 s5, s5, s4 6619; GFX6-NEXT: s_lshr_b32 s6, s5, 31 6620; GFX6-NEXT: s_ashr_i32 s5, s5, 20 6621; GFX6-NEXT: s_add_i32 s5, s5, s6 6622; GFX6-NEXT: s_mul_i32 s5, s5, 0x12d8fb 6623; GFX6-NEXT: s_sub_i32 s4, s4, s5 6624; GFX6-NEXT: v_mov_b32_e32 v0, s4 6625; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 6626; GFX6-NEXT: s_endpgm 6627; 6628; GFX9-LABEL: srem_i32_oddk_denom: 6629; GFX9: ; %bb.0: 6630; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 6631; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6632; GFX9-NEXT: v_mov_b32_e32 v0, 0 6633; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6634; GFX9-NEXT: s_mul_hi_i32 s0, s4, 0xd9528441 6635; GFX9-NEXT: s_add_i32 s0, s0, s4 6636; GFX9-NEXT: s_lshr_b32 s1, s0, 31 6637; GFX9-NEXT: s_ashr_i32 s0, s0, 20 6638; GFX9-NEXT: s_add_i32 s0, s0, s1 
; GFX9-NEXT:    s_mul_i32 s0, s0, 0x12d8fb
; GFX9-NEXT:    s_sub_i32 s0, s4, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-NEXT:    s_endpgm
  %r = srem i32 %x, 1235195
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; Signed remainder of a kernel argument by the constant 4096.
; In the CHECK lines (output of the opt -amdgpu-codegenprepare RUN line) the
; srem is still a plain `srem i32 ..., 4096`. The GFX6 and GFX9 lines (llc
; output) contain no reciprocal or multiply-high sequence: the remainder is
; formed with ashr 31 / lshr 20 / add / and 0xfffff000 / sub.
; The assertions are autogenerated (see the note on the first line of this
; file); regenerate them with that script instead of editing them by hand.
define amdgpu_kernel void @srem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
; CHECK-LABEL: @srem_i32_pow2k_denom(
; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], 4096
; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
; GFX6-LABEL: srem_i32_pow2k_denom:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_ashr_i32 s5, s4, 31
; GFX6-NEXT:    s_lshr_b32 s5, s5, 20
; GFX6-NEXT:    s_add_i32 s5, s4, s5
; GFX6-NEXT:    s_and_b32 s5, s5, 0xfffff000
; GFX6-NEXT:    s_sub_i32 s4, s4, s5
; GFX6-NEXT:    v_mov_b32_e32 v0, s4
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX9-LABEL: srem_i32_pow2k_denom:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_ashr_i32 s0, s4, 31
; GFX9-NEXT:    s_lshr_b32 s0, s0, 20
; GFX9-NEXT:    s_add_i32 s0, s4, s0
; GFX9-NEXT:    s_and_b32 s0, s0, 0xfffff000
; GFX9-NEXT:    s_sub_i32 s0, s4, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-NEXT:    s_endpgm
  %r = srem i32 %x, 4096
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; Signed remainder where the divisor is computed at run time as `4096 << %y`.
; In the CHECK lines the shl and the srem are left in the IR as written. In
; the GFX6 and GFX9 lines the remainder is computed with the reciprocal-based
; sequence (v_cvt_f32_u32, v_rcp_iflag_f32, scale by 0x4f7ffffe, mul_hi,
; then two compare-and-subtract correction steps). Both operands are first
; made non-negative with ashr/add/xor, and the dividend's sign is re-applied
; with xor/sub just before the store.
define amdgpu_kernel void @srem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
; CHECK-LABEL: @srem_i32_pow2_shl_denom(
; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
; CHECK-NEXT:    [[R:%.*]] = srem i32 [[X:%.*]], [[SHL_Y]]
; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
; GFX6-LABEL: srem_i32_pow2_shl_denom:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_lshl_b32 s3, 0x1000, s3
; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
; GFX6-NEXT:    s_add_i32 s3, s3, s4
; GFX6-NEXT:    s_xor_b32 s4, s3, s4
; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s4
; GFX6-NEXT:    s_sub_i32 s3, 0, s4
; GFX6-NEXT:    s_ashr_i32 s5, s2, 31
; GFX6-NEXT:    s_add_i32 s2, s2, s5
; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX6-NEXT:    s_xor_b32 s6, s2, s5
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT:    v_mul_hi_u32 v0, s6, v0
; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s4
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX6-NEXT:    v_xor_b32_e32 v0, s5, v0
; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s5, v0
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX9-LABEL: srem_i32_pow2_shl_denom:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_lshl_b32 s3, 0x1000, s3
; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
; GFX9-NEXT:    s_add_i32 s3, s3, s4
; GFX9-NEXT:    s_xor_b32 s3, s3, s4
; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
; GFX9-NEXT:    s_sub_i32 s5, 0, s3
; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
; GFX9-NEXT:    s_add_i32 s2, s2, s4
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT:    s_xor_b32 s2, s2, s4
; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT:    v_readfirstlane_b32 s6, v0
; GFX9-NEXT:    s_mul_i32 s5, s5, s6
; GFX9-NEXT:    s_mul_hi_u32 s5, s6, s5
; GFX9-NEXT:    s_add_i32 s6, s6, s5
; GFX9-NEXT:    s_mul_hi_u32 s5, s2, s6
; GFX9-NEXT:    s_mul_i32 s5, s5, s3
; GFX9-NEXT:    s_sub_i32 s2, s2, s5
; GFX9-NEXT:    s_sub_i32 s5, s2, s3
; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
; GFX9-NEXT:    s_sub_i32 s5, s2, s3
; GFX9-NEXT:    s_cmp_ge_u32 s2, s3
; GFX9-NEXT:    s_cselect_b32 s2, s5, s2
; GFX9-NEXT:    s_xor_b32 s2, s2, s4
; GFX9-NEXT:    s_sub_i32 s2, s2, s4
; GFX9-NEXT:    v_mov_b32_e32 v0, s2
; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX9-NEXT:    s_endpgm
  %shl.y = shl i32 4096, %y
  %r = srem i32 %x, %shl.y
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @srem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
; CHECK-LABEL: @srem_v2i32_pow2k_denom(
; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
; CHECK-NEXT:    [[TMP2:%.*]] = srem i32 [[TMP1]], 4096
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
; CHECK-NEXT:    [[TMP5:%.*]] = srem i32 [[TMP4]], 4096
; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
; CHECK-NEXT:    ret void
;
; GFX6-LABEL: srem_v2i32_pow2k_denom:
6787; GFX6: ; %bb.0: 6788; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 6789; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 6790; GFX6-NEXT: s_mov_b32 s3, 0xf000 6791; GFX6-NEXT: s_mov_b32 s2, -1 6792; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6793; GFX6-NEXT: s_ashr_i32 s6, s4, 31 6794; GFX6-NEXT: s_lshr_b32 s6, s6, 20 6795; GFX6-NEXT: s_add_i32 s6, s4, s6 6796; GFX6-NEXT: s_ashr_i32 s7, s5, 31 6797; GFX6-NEXT: s_and_b32 s6, s6, 0xfffff000 6798; GFX6-NEXT: s_sub_i32 s4, s4, s6 6799; GFX6-NEXT: s_lshr_b32 s6, s7, 20 6800; GFX6-NEXT: s_add_i32 s6, s5, s6 6801; GFX6-NEXT: s_and_b32 s6, s6, 0xfffff000 6802; GFX6-NEXT: s_sub_i32 s5, s5, s6 6803; GFX6-NEXT: v_mov_b32_e32 v0, s4 6804; GFX6-NEXT: v_mov_b32_e32 v1, s5 6805; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6806; GFX6-NEXT: s_endpgm 6807; 6808; GFX9-LABEL: srem_v2i32_pow2k_denom: 6809; GFX9: ; %bb.0: 6810; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 6811; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 6812; GFX9-NEXT: v_mov_b32_e32 v2, 0 6813; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6814; GFX9-NEXT: s_ashr_i32 s0, s2, 31 6815; GFX9-NEXT: s_ashr_i32 s1, s3, 31 6816; GFX9-NEXT: s_lshr_b32 s0, s0, 20 6817; GFX9-NEXT: s_lshr_b32 s1, s1, 20 6818; GFX9-NEXT: s_add_i32 s0, s2, s0 6819; GFX9-NEXT: s_add_i32 s1, s3, s1 6820; GFX9-NEXT: s_and_b32 s0, s0, 0xfffff000 6821; GFX9-NEXT: s_and_b32 s1, s1, 0xfffff000 6822; GFX9-NEXT: s_sub_i32 s0, s2, s0 6823; GFX9-NEXT: s_sub_i32 s1, s3, s1 6824; GFX9-NEXT: v_mov_b32_e32 v0, s0 6825; GFX9-NEXT: v_mov_b32_e32 v1, s1 6826; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 6827; GFX9-NEXT: s_endpgm 6828 %r = srem <2 x i32> %x, <i32 4096, i32 4096> 6829 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 6830 ret void 6831} 6832 6833define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 6834; CHECK-LABEL: @srem_v2i32_pow2_shl_denom( 6835; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 6836; CHECK-NEXT: 
[[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 6837; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 6838; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 6839; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 6840; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]] 6841; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]] 6842; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]] 6843; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]] 6844; CHECK-NEXT: [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float 6845; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 6846; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000 6847; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP11]] to i32 6848; CHECK-NEXT: [[TMP13:%.*]] = sub i32 0, [[TMP8]] 6849; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]] 6850; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP12]] to i64 6851; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 6852; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]] 6853; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 6854; CHECK-NEXT: [[TMP19:%.*]] = lshr i64 [[TMP17]], 32 6855; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 6856; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]] 6857; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP7]] to i64 6858; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP21]] to i64 6859; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]] 6860; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32 6861; CHECK-NEXT: [[TMP26:%.*]] = lshr i64 [[TMP24]], 32 6862; CHECK-NEXT: [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32 6863; CHECK-NEXT: [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]] 6864; CHECK-NEXT: [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]] 6865; CHECK-NEXT: [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]] 6866; CHECK-NEXT: [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]] 6867; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP30]], i32 
[[TMP31]], i32 [[TMP29]] 6868; CHECK-NEXT: [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]] 6869; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]] 6870; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]] 6871; CHECK-NEXT: [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]] 6872; CHECK-NEXT: [[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]] 6873; CHECK-NEXT: [[TMP38:%.*]] = insertelement <2 x i32> undef, i32 [[TMP37]], i64 0 6874; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i32> [[X]], i64 1 6875; CHECK-NEXT: [[TMP40:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 6876; CHECK-NEXT: [[TMP41:%.*]] = ashr i32 [[TMP39]], 31 6877; CHECK-NEXT: [[TMP42:%.*]] = ashr i32 [[TMP40]], 31 6878; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]] 6879; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]] 6880; CHECK-NEXT: [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]] 6881; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]] 6882; CHECK-NEXT: [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float 6883; CHECK-NEXT: [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]]) 6884; CHECK-NEXT: [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000 6885; CHECK-NEXT: [[TMP50:%.*]] = fptoui float [[TMP49]] to i32 6886; CHECK-NEXT: [[TMP51:%.*]] = sub i32 0, [[TMP46]] 6887; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]] 6888; CHECK-NEXT: [[TMP53:%.*]] = zext i32 [[TMP50]] to i64 6889; CHECK-NEXT: [[TMP54:%.*]] = zext i32 [[TMP52]] to i64 6890; CHECK-NEXT: [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]] 6891; CHECK-NEXT: [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32 6892; CHECK-NEXT: [[TMP57:%.*]] = lshr i64 [[TMP55]], 32 6893; CHECK-NEXT: [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32 6894; CHECK-NEXT: [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]] 6895; CHECK-NEXT: [[TMP60:%.*]] = zext i32 [[TMP45]] to i64 6896; CHECK-NEXT: [[TMP61:%.*]] = zext i32 [[TMP59]] to i64 6897; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]] 6898; 
CHECK-NEXT: [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32 6899; CHECK-NEXT: [[TMP64:%.*]] = lshr i64 [[TMP62]], 32 6900; CHECK-NEXT: [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32 6901; CHECK-NEXT: [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]] 6902; CHECK-NEXT: [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]] 6903; CHECK-NEXT: [[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]] 6904; CHECK-NEXT: [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]] 6905; CHECK-NEXT: [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]] 6906; CHECK-NEXT: [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]] 6907; CHECK-NEXT: [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]] 6908; CHECK-NEXT: [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]] 6909; CHECK-NEXT: [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]] 6910; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]] 6911; CHECK-NEXT: [[TMP76:%.*]] = insertelement <2 x i32> [[TMP38]], i32 [[TMP75]], i64 1 6912; CHECK-NEXT: store <2 x i32> [[TMP76]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 6913; CHECK-NEXT: ret void 6914; 6915; GFX6-LABEL: srem_v2i32_pow2_shl_denom: 6916; GFX6: ; %bb.0: 6917; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb 6918; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 6919; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6920; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s6 6921; GFX6-NEXT: s_ashr_i32 s3, s2, 31 6922; GFX6-NEXT: s_add_i32 s2, s2, s3 6923; GFX6-NEXT: s_xor_b32 s6, s2, s3 6924; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 6925; GFX6-NEXT: s_lshl_b32 s7, 0x1000, s7 6926; GFX6-NEXT: s_ashr_i32 s8, s7, 31 6927; GFX6-NEXT: s_add_i32 s7, s7, s8 6928; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 6929; GFX6-NEXT: s_xor_b32 s7, s7, s8 6930; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7 6931; GFX6-NEXT: s_sub_i32 s9, 0, s6 6932; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 6933; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 6934; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 6935; GFX6-NEXT: s_ashr_i32 s8, s4, 31 6936; GFX6-NEXT: s_add_i32 s4, s4, s8 6937; GFX6-NEXT: v_mul_lo_u32 
v2, s9, v0 6938; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 6939; GFX6-NEXT: s_xor_b32 s4, s4, s8 6940; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 6941; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 6942; GFX6-NEXT: s_sub_i32 s9, 0, s7 6943; GFX6-NEXT: s_mov_b32 s3, 0xf000 6944; GFX6-NEXT: s_mov_b32 s2, -1 6945; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 6946; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 6947; GFX6-NEXT: v_mul_lo_u32 v2, s9, v1 6948; GFX6-NEXT: s_ashr_i32 s9, s5, 31 6949; GFX6-NEXT: s_add_i32 s5, s5, s9 6950; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 6951; GFX6-NEXT: v_mul_hi_u32 v2, v1, v2 6952; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 6953; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s6, v0 6954; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 6955; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 6956; GFX6-NEXT: s_xor_b32 s4, s5, s9 6957; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 6958; GFX6-NEXT: v_mul_hi_u32 v1, s4, v1 6959; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s6, v0 6960; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 6961; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 6962; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 6963; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 6964; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 6965; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v1 6966; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s7, v1 6967; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 6968; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 6969; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s7, v1 6970; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 6971; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 6972; GFX6-NEXT: v_xor_b32_e32 v1, s9, v1 6973; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s9, v1 6974; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6975; GFX6-NEXT: s_endpgm 6976; 6977; GFX9-LABEL: srem_v2i32_pow2_shl_denom: 6978; GFX9: ; %bb.0: 6979; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 6980; GFX9-NEXT: v_mov_b32_e32 v2, 0 6981; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6982; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6983; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s6 
6984; GFX9-NEXT: s_ashr_i32 s3, s2, 31 6985; GFX9-NEXT: s_add_i32 s2, s2, s3 6986; GFX9-NEXT: s_xor_b32 s2, s2, s3 6987; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 6988; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s7 6989; GFX9-NEXT: s_sub_i32 s7, 0, s2 6990; GFX9-NEXT: s_ashr_i32 s6, s4, 31 6991; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 6992; GFX9-NEXT: s_add_i32 s4, s4, s6 6993; GFX9-NEXT: s_xor_b32 s4, s4, s6 6994; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 6995; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 6996; GFX9-NEXT: v_readfirstlane_b32 s8, v0 6997; GFX9-NEXT: s_mul_i32 s7, s7, s8 6998; GFX9-NEXT: s_mul_hi_u32 s7, s8, s7 6999; GFX9-NEXT: s_add_i32 s8, s8, s7 7000; GFX9-NEXT: s_mul_hi_u32 s7, s4, s8 7001; GFX9-NEXT: s_mul_i32 s7, s7, s2 7002; GFX9-NEXT: s_sub_i32 s4, s4, s7 7003; GFX9-NEXT: s_sub_i32 s7, s4, s2 7004; GFX9-NEXT: s_cmp_ge_u32 s4, s2 7005; GFX9-NEXT: s_cselect_b32 s4, s7, s4 7006; GFX9-NEXT: s_sub_i32 s7, s4, s2 7007; GFX9-NEXT: s_cmp_ge_u32 s4, s2 7008; GFX9-NEXT: s_cselect_b32 s2, s7, s4 7009; GFX9-NEXT: s_ashr_i32 s4, s3, 31 7010; GFX9-NEXT: s_add_i32 s3, s3, s4 7011; GFX9-NEXT: s_xor_b32 s3, s3, s4 7012; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 7013; GFX9-NEXT: s_xor_b32 s2, s2, s6 7014; GFX9-NEXT: s_sub_i32 s2, s2, s6 7015; GFX9-NEXT: s_sub_i32 s6, 0, s3 7016; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 7017; GFX9-NEXT: s_ashr_i32 s4, s5, 31 7018; GFX9-NEXT: s_add_i32 s5, s5, s4 7019; GFX9-NEXT: s_xor_b32 s5, s5, s4 7020; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 7021; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 7022; GFX9-NEXT: v_readfirstlane_b32 s7, v0 7023; GFX9-NEXT: s_mul_i32 s6, s6, s7 7024; GFX9-NEXT: s_mul_hi_u32 s6, s7, s6 7025; GFX9-NEXT: s_add_i32 s7, s7, s6 7026; GFX9-NEXT: s_mul_hi_u32 s6, s5, s7 7027; GFX9-NEXT: s_mul_i32 s6, s6, s3 7028; GFX9-NEXT: s_sub_i32 s5, s5, s6 7029; GFX9-NEXT: s_sub_i32 s6, s5, s3 7030; GFX9-NEXT: s_cmp_ge_u32 s5, s3 7031; GFX9-NEXT: s_cselect_b32 s5, s6, s5 7032; GFX9-NEXT: s_sub_i32 s6, s5, s3 7033; GFX9-NEXT: s_cmp_ge_u32 s5, s3 7034; 
GFX9-NEXT: s_cselect_b32 s3, s6, s5 7035; GFX9-NEXT: s_xor_b32 s3, s3, s4 7036; GFX9-NEXT: s_sub_i32 s3, s3, s4 7037; GFX9-NEXT: v_mov_b32_e32 v0, s2 7038; GFX9-NEXT: v_mov_b32_e32 v1, s3 7039; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 7040; GFX9-NEXT: s_endpgm 7041 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 7042 %r = srem <2 x i32> %x, %shl.y 7043 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 7044 ret void 7045} 7046 7047define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 7048; CHECK-LABEL: @udiv_i64_oddk_denom( 7049; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], 1235195949943 7050; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 7051; CHECK-NEXT: ret void 7052; 7053; GFX6-LABEL: udiv_i64_oddk_denom: 7054; GFX6: ; %bb.0: 7055; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f176a73 7056; GFX6-NEXT: v_mov_b32_e32 v1, 0x4f800000 7057; GFX6-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 7058; GFX6-NEXT: v_rcp_f32_e32 v0, v0 7059; GFX6-NEXT: s_movk_i32 s4, 0xfee0 7060; GFX6-NEXT: s_mov_b32 s5, 0x68958c89 7061; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 7062; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 7063; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 7064; GFX6-NEXT: v_trunc_f32_e32 v1, v1 7065; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 7066; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 7067; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 7068; GFX6-NEXT: s_movk_i32 s8, 0x11f 7069; GFX6-NEXT: s_mov_b32 s9, 0x976a7377 7070; GFX6-NEXT: v_mul_lo_u32 v2, v0, s4 7071; GFX6-NEXT: v_mul_hi_u32 v3, v0, s5 7072; GFX6-NEXT: v_mul_lo_u32 v4, v1, s5 7073; GFX6-NEXT: v_mul_lo_u32 v5, v0, s5 7074; GFX6-NEXT: s_mov_b32 s7, 0xf000 7075; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 7076; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 7077; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 7078; GFX6-NEXT: v_mul_hi_u32 v4, v0, v5 7079; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 7080; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 7081; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 7082; GFX6-NEXT: v_add_i32_e32 
v3, vcc, v4, v3 7083; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc 7084; GFX6-NEXT: v_mul_lo_u32 v6, v1, v5 7085; GFX6-NEXT: v_mul_hi_u32 v5, v1, v5 7086; GFX6-NEXT: s_mov_b32 s6, -1 7087; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 7088; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc 7089; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc 7090; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7091; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 7092; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 7093; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 7094; GFX6-NEXT: v_mul_lo_u32 v2, v0, s4 7095; GFX6-NEXT: v_mul_hi_u32 v3, v0, s5 7096; GFX6-NEXT: v_mul_lo_u32 v4, v1, s5 7097; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7098; GFX6-NEXT: s_mov_b32 s4, s0 7099; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 7100; GFX6-NEXT: v_mul_lo_u32 v3, v0, s5 7101; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 7102; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 7103; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 7104; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 7105; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 7106; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 7107; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 7108; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 7109; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 7110; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 7111; GFX6-NEXT: s_mov_b32 s5, s1 7112; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 7113; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc 7114; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc 7115; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7116; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 7117; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 7118; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 7119; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 7120; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 7121; GFX6-NEXT: v_mul_hi_u32 v4, s2, v1 7122; GFX6-NEXT: v_mul_hi_u32 v5, s3, v1 7123; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1 7124; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7125; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 7126; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 7127; GFX6-NEXT: 
v_mul_hi_u32 v0, s3, v0 7128; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 7129; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 7130; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc 7131; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 7132; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 7133; GFX6-NEXT: v_mul_lo_u32 v2, v0, s8 7134; GFX6-NEXT: v_mul_hi_u32 v3, v0, s9 7135; GFX6-NEXT: v_mul_lo_u32 v4, v1, s9 7136; GFX6-NEXT: v_mov_b32_e32 v5, 0x11f 7137; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7138; GFX6-NEXT: v_mul_lo_u32 v3, v0, s9 7139; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 7140; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s3, v2 7141; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s2, v3 7142; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc 7143; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s9, v3 7144; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] 7145; GFX6-NEXT: s_movk_i32 s2, 0x11e 7146; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s2, v4 7147; GFX6-NEXT: s_mov_b32 s9, 0x976a7376 7148; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 7149; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s9, v5 7150; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 7151; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s8, v4 7152; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] 7153; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 7154; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] 7155; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 1, v0 7156; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] 7157; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 7158; GFX6-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] 7159; GFX6-NEXT: v_mov_b32_e32 v6, s3 7160; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc 7161; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s2, v2 7162; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 7163; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s9, v3 7164; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 7165; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s8, v2 7166; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc 7167; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 7168; GFX6-NEXT: 
v_cndmask_b32_e64 v2, v7, v5, s[0:1] 7169; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 7170; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 7171; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 7172; GFX6-NEXT: s_endpgm 7173; 7174; GFX9-LABEL: udiv_i64_oddk_denom: 7175; GFX9: ; %bb.0: 7176; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f176a73 7177; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000 7178; GFX9-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 7179; GFX9-NEXT: v_rcp_f32_e32 v0, v0 7180; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 7181; GFX9-NEXT: v_mov_b32_e32 v2, 0 7182; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 7183; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 7184; GFX9-NEXT: v_trunc_f32_e32 v1, v1 7185; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 7186; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 7187; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 7188; GFX9-NEXT: v_readfirstlane_b32 s0, v0 7189; GFX9-NEXT: s_mul_i32 s1, s0, 0xfffffee0 7190; GFX9-NEXT: s_mul_hi_u32 s2, s0, 0x68958c89 7191; GFX9-NEXT: s_add_i32 s1, s2, s1 7192; GFX9-NEXT: v_readfirstlane_b32 s2, v1 7193; GFX9-NEXT: s_mul_i32 s3, s2, 0x68958c89 7194; GFX9-NEXT: s_add_i32 s1, s1, s3 7195; GFX9-NEXT: s_mul_i32 s9, s0, 0x68958c89 7196; GFX9-NEXT: s_mul_hi_u32 s3, s0, s1 7197; GFX9-NEXT: s_mul_i32 s8, s0, s1 7198; GFX9-NEXT: s_mul_hi_u32 s0, s0, s9 7199; GFX9-NEXT: s_add_u32 s0, s0, s8 7200; GFX9-NEXT: s_addc_u32 s3, 0, s3 7201; GFX9-NEXT: s_mul_hi_u32 s10, s2, s9 7202; GFX9-NEXT: s_mul_i32 s9, s2, s9 7203; GFX9-NEXT: s_add_u32 s0, s0, s9 7204; GFX9-NEXT: s_mul_hi_u32 s8, s2, s1 7205; GFX9-NEXT: s_addc_u32 s0, s3, s10 7206; GFX9-NEXT: s_addc_u32 s3, s8, 0 7207; GFX9-NEXT: s_mul_i32 s1, s2, s1 7208; GFX9-NEXT: s_add_u32 s0, s0, s1 7209; GFX9-NEXT: s_addc_u32 s1, 0, s3 7210; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 7211; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 7212; GFX9-NEXT: s_addc_u32 s0, s2, s1 7213; GFX9-NEXT: v_readfirstlane_b32 s2, v0 7214; GFX9-NEXT: s_mul_i32 s3, s2, 0xfffffee0 7215; GFX9-NEXT: s_mul_hi_u32 s8, s2, 
0x68958c89 7216; GFX9-NEXT: s_mul_i32 s1, s0, 0x68958c89 7217; GFX9-NEXT: s_add_i32 s3, s8, s3 7218; GFX9-NEXT: s_add_i32 s3, s3, s1 7219; GFX9-NEXT: s_mul_i32 s9, s2, 0x68958c89 7220; GFX9-NEXT: s_mul_hi_u32 s1, s2, s3 7221; GFX9-NEXT: s_mul_i32 s8, s2, s3 7222; GFX9-NEXT: s_mul_hi_u32 s2, s2, s9 7223; GFX9-NEXT: s_add_u32 s2, s2, s8 7224; GFX9-NEXT: s_addc_u32 s1, 0, s1 7225; GFX9-NEXT: s_mul_hi_u32 s10, s0, s9 7226; GFX9-NEXT: s_mul_i32 s9, s0, s9 7227; GFX9-NEXT: s_add_u32 s2, s2, s9 7228; GFX9-NEXT: s_mul_hi_u32 s8, s0, s3 7229; GFX9-NEXT: s_addc_u32 s1, s1, s10 7230; GFX9-NEXT: s_addc_u32 s2, s8, 0 7231; GFX9-NEXT: s_mul_i32 s3, s0, s3 7232; GFX9-NEXT: s_add_u32 s1, s1, s3 7233; GFX9-NEXT: s_addc_u32 s2, 0, s2 7234; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s1, v0 7235; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 7236; GFX9-NEXT: s_addc_u32 s0, s0, s2 7237; GFX9-NEXT: v_readfirstlane_b32 s3, v0 7238; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7239; GFX9-NEXT: s_mul_i32 s2, s6, s0 7240; GFX9-NEXT: s_mul_hi_u32 s8, s6, s3 7241; GFX9-NEXT: s_mul_hi_u32 s1, s6, s0 7242; GFX9-NEXT: s_add_u32 s2, s8, s2 7243; GFX9-NEXT: s_addc_u32 s1, 0, s1 7244; GFX9-NEXT: s_mul_hi_u32 s9, s7, s3 7245; GFX9-NEXT: s_mul_i32 s3, s7, s3 7246; GFX9-NEXT: s_add_u32 s2, s2, s3 7247; GFX9-NEXT: s_mul_hi_u32 s8, s7, s0 7248; GFX9-NEXT: s_addc_u32 s1, s1, s9 7249; GFX9-NEXT: s_addc_u32 s2, s8, 0 7250; GFX9-NEXT: s_mul_i32 s0, s7, s0 7251; GFX9-NEXT: s_add_u32 s3, s1, s0 7252; GFX9-NEXT: s_addc_u32 s2, 0, s2 7253; GFX9-NEXT: s_mul_i32 s0, s3, 0x11f 7254; GFX9-NEXT: s_mul_hi_u32 s8, s3, 0x976a7377 7255; GFX9-NEXT: s_add_i32 s0, s8, s0 7256; GFX9-NEXT: s_mul_i32 s8, s2, 0x976a7377 7257; GFX9-NEXT: s_mul_i32 s9, s3, 0x976a7377 7258; GFX9-NEXT: s_add_i32 s8, s0, s8 7259; GFX9-NEXT: v_mov_b32_e32 v0, s9 7260; GFX9-NEXT: s_sub_i32 s0, s7, s8 7261; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 7262; GFX9-NEXT: s_mov_b32 s1, 0x976a7377 7263; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 7264; GFX9-NEXT: s_subb_u32 s6, s0, 0x11f 7265; 
GFX9-NEXT: v_subrev_co_u32_e64 v1, s[0:1], s1, v0 7266; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 7267; GFX9-NEXT: s_subb_u32 s6, s6, 0 7268; GFX9-NEXT: s_cmpk_gt_u32 s6, 0x11e 7269; GFX9-NEXT: s_mov_b32 s10, 0x976a7376 7270; GFX9-NEXT: s_cselect_b32 s9, -1, 0 7271; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s10, v1 7272; GFX9-NEXT: s_cmpk_eq_i32 s6, 0x11f 7273; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] 7274; GFX9-NEXT: v_mov_b32_e32 v3, s9 7275; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 7276; GFX9-NEXT: s_add_u32 s6, s3, 2 7277; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[0:1] 7278; GFX9-NEXT: s_addc_u32 s0, s2, 0 7279; GFX9-NEXT: s_add_u32 s9, s3, 1 7280; GFX9-NEXT: s_addc_u32 s1, s2, 0 7281; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 7282; GFX9-NEXT: s_subb_u32 s7, s7, s8 7283; GFX9-NEXT: s_cmpk_gt_u32 s7, 0x11e 7284; GFX9-NEXT: v_mov_b32_e32 v3, s1 7285; GFX9-NEXT: v_mov_b32_e32 v4, s0 7286; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 7287; GFX9-NEXT: s_cselect_b32 s8, -1, 0 7288; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s10, v0 7289; GFX9-NEXT: s_cmpk_eq_i32 s7, 0x11f 7290; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v4, s[0:1] 7291; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 7292; GFX9-NEXT: v_mov_b32_e32 v3, s8 7293; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 7294; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc 7295; GFX9-NEXT: v_mov_b32_e32 v3, s2 7296; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 7297; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 7298; GFX9-NEXT: v_mov_b32_e32 v0, s9 7299; GFX9-NEXT: v_mov_b32_e32 v3, s6 7300; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] 7301; GFX9-NEXT: v_mov_b32_e32 v3, s3 7302; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc 7303; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 7304; GFX9-NEXT: s_endpgm 7305 %r = udiv i64 %x, 1235195949943 7306 store i64 %r, i64 addrspace(1)* %out 7307 ret void 7308} 7309 7310define amdgpu_kernel void @udiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 7311; CHECK-LABEL: @udiv_i64_pow2k_denom( 7312; CHECK-NEXT: [[R:%.*]] = udiv i64 
[[X:%.*]], 4096 7313; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 7314; CHECK-NEXT: ret void 7315; 7316; GFX6-LABEL: udiv_i64_pow2k_denom: 7317; GFX6: ; %bb.0: 7318; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 7319; GFX6-NEXT: s_mov_b32 s7, 0xf000 7320; GFX6-NEXT: s_mov_b32 s6, -1 7321; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7322; GFX6-NEXT: s_mov_b32 s4, s0 7323; GFX6-NEXT: s_mov_b32 s5, s1 7324; GFX6-NEXT: s_lshr_b64 s[0:1], s[2:3], 12 7325; GFX6-NEXT: v_mov_b32_e32 v0, s0 7326; GFX6-NEXT: v_mov_b32_e32 v1, s1 7327; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 7328; GFX6-NEXT: s_endpgm 7329; 7330; GFX9-LABEL: udiv_i64_pow2k_denom: 7331; GFX9: ; %bb.0: 7332; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 7333; GFX9-NEXT: v_mov_b32_e32 v2, 0 7334; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7335; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 12 7336; GFX9-NEXT: v_mov_b32_e32 v0, s2 7337; GFX9-NEXT: v_mov_b32_e32 v1, s3 7338; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 7339; GFX9-NEXT: s_endpgm 7340 %r = udiv i64 %x, 4096 7341 store i64 %r, i64 addrspace(1)* %out 7342 ret void 7343} 7344 7345define amdgpu_kernel void @udiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 7346; CHECK-LABEL: @udiv_i64_pow2_shl_denom( 7347; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 7348; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], [[SHL_Y]] 7349; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 7350; CHECK-NEXT: ret void 7351; 7352; GFX6-LABEL: udiv_i64_pow2_shl_denom: 7353; GFX6: ; %bb.0: 7354; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 7355; GFX6-NEXT: s_load_dword s8, s[0:1], 0xd 7356; GFX6-NEXT: s_mov_b32 s3, 0xf000 7357; GFX6-NEXT: s_mov_b32 s2, -1 7358; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7359; GFX6-NEXT: s_mov_b32 s0, s4 7360; GFX6-NEXT: s_add_i32 s8, s8, 12 7361; GFX6-NEXT: s_mov_b32 s1, s5 7362; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], s8 7363; GFX6-NEXT: v_mov_b32_e32 v0, s4 7364; GFX6-NEXT: v_mov_b32_e32 v1, s5 7365; 
GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 7366; GFX6-NEXT: s_endpgm 7367; 7368; GFX9-LABEL: udiv_i64_pow2_shl_denom: 7369; GFX9: ; %bb.0: 7370; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 7371; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 7372; GFX9-NEXT: v_mov_b32_e32 v2, 0 7373; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7374; GFX9-NEXT: s_add_i32 s2, s2, 12 7375; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], s2 7376; GFX9-NEXT: v_mov_b32_e32 v0, s0 7377; GFX9-NEXT: v_mov_b32_e32 v1, s1 7378; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 7379; GFX9-NEXT: s_endpgm 7380 %shl.y = shl i64 4096, %y 7381 %r = udiv i64 %x, %shl.y 7382 store i64 %r, i64 addrspace(1)* %out 7383 ret void 7384} 7385 7386define amdgpu_kernel void @udiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 7387; CHECK-LABEL: @udiv_v2i64_pow2k_denom( 7388; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 7389; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096 7390; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 7391; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 7392; CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[TMP4]], 4096 7393; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 7394; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 7395; CHECK-NEXT: ret void 7396; 7397; GFX6-LABEL: udiv_v2i64_pow2k_denom: 7398; GFX6: ; %bb.0: 7399; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 7400; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 7401; GFX6-NEXT: s_mov_b32 s3, 0xf000 7402; GFX6-NEXT: s_mov_b32 s2, -1 7403; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7404; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], 12 7405; GFX6-NEXT: s_lshr_b64 s[6:7], s[6:7], 12 7406; GFX6-NEXT: v_mov_b32_e32 v0, s4 7407; GFX6-NEXT: v_mov_b32_e32 v1, s5 7408; GFX6-NEXT: v_mov_b32_e32 v2, s6 7409; GFX6-NEXT: v_mov_b32_e32 v3, s7 7410; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 7411; GFX6-NEXT: 
s_endpgm 7412; 7413; GFX9-LABEL: udiv_v2i64_pow2k_denom: 7414; GFX9: ; %bb.0: 7415; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 7416; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7417; GFX9-NEXT: v_mov_b32_e32 v4, 0 7418; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7419; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 12 7420; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], 12 7421; GFX9-NEXT: v_mov_b32_e32 v0, s0 7422; GFX9-NEXT: v_mov_b32_e32 v1, s1 7423; GFX9-NEXT: v_mov_b32_e32 v2, s4 7424; GFX9-NEXT: v_mov_b32_e32 v3, s5 7425; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 7426; GFX9-NEXT: s_endpgm 7427 %r = udiv <2 x i64> %x, <i64 4096, i64 4096> 7428 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 7429 ret void 7430} 7431 7432define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 7433; CHECK-LABEL: @udiv_v2i64_mixed_pow2k_denom( 7434; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 7435; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096 7436; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 7437; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 7438; CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[TMP4]], 4095 7439; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 7440; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 7441; CHECK-NEXT: ret void 7442; 7443; GFX6-LABEL: udiv_v2i64_mixed_pow2k_denom: 7444; GFX6: ; %bb.0: 7445; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f800000 7446; GFX6-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 7447; GFX6-NEXT: v_rcp_f32_e32 v0, v0 7448; GFX6-NEXT: s_movk_i32 s6, 0xf001 7449; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 7450; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 7451; GFX6-NEXT: s_mov_b32 s7, 0xf000 7452; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 7453; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 7454; GFX6-NEXT: v_trunc_f32_e32 v1, v1 7455; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, 
v1 7456; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 7457; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 7458; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7459; GFX6-NEXT: s_lshr_b64 s[8:9], s[0:1], 12 7460; GFX6-NEXT: s_movk_i32 s0, 0xfff 7461; GFX6-NEXT: v_mul_hi_u32 v2, v0, s6 7462; GFX6-NEXT: v_mul_lo_u32 v4, v1, s6 7463; GFX6-NEXT: v_mul_lo_u32 v3, v0, s6 7464; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 7465; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 7466; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 7467; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 7468; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 7469; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 7470; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 7471; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 7472; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 7473; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 7474; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 7475; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 7476; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc 7477; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc 7478; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7479; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 7480; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 7481; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 7482; GFX6-NEXT: v_mul_hi_u32 v2, v0, s6 7483; GFX6-NEXT: v_mul_lo_u32 v3, v1, s6 7484; GFX6-NEXT: v_mul_lo_u32 v4, v0, s6 7485; GFX6-NEXT: s_mov_b32 s6, -1 7486; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 7487; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7488; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 7489; GFX6-NEXT: v_mul_hi_u32 v5, v0, v4 7490; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 7491; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 7492; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 7493; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 7494; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 7495; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 7496; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 7497; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 7498; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc 7499; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc 7500; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 
7501; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 7502; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 7503; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 7504; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 7505; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 7506; GFX6-NEXT: v_mul_hi_u32 v4, s2, v1 7507; GFX6-NEXT: v_mul_hi_u32 v5, s3, v1 7508; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1 7509; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7510; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 7511; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 7512; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 7513; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 7514; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 7515; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc 7516; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 7517; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 7518; GFX6-NEXT: v_mul_lo_u32 v4, v1, s0 7519; GFX6-NEXT: v_mul_hi_u32 v5, v0, s0 7520; GFX6-NEXT: v_add_i32_e32 v2, vcc, 2, v0 7521; GFX6-NEXT: v_mul_lo_u32 v8, v0, s0 7522; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 7523; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v0 7524; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc 7525; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 7526; GFX6-NEXT: v_mov_b32_e32 v5, s3 7527; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s2, v8 7528; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc 7529; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s0, v8 7530; GFX6-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v4, vcc 7531; GFX6-NEXT: s_movk_i32 s0, 0xffe 7532; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s0, v5 7533; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 7534; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 7535; GFX6-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc 7536; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v8 7537; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 7538; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 7539; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 7540; GFX6-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[0:1] 7541; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc 7542; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 7543; GFX6-NEXT: 
v_cndmask_b32_e64 v3, v1, v3, s[0:1] 7544; GFX6-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc 7545; GFX6-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[0:1] 7546; GFX6-NEXT: v_mov_b32_e32 v0, s8 7547; GFX6-NEXT: v_mov_b32_e32 v1, s9 7548; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 7549; GFX6-NEXT: s_endpgm 7550; 7551; GFX9-LABEL: udiv_v2i64_mixed_pow2k_denom: 7552; GFX9: ; %bb.0: 7553; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f800000 7554; GFX9-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 7555; GFX9-NEXT: v_rcp_f32_e32 v0, v0 7556; GFX9-NEXT: s_movk_i32 s2, 0xf001 7557; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 7558; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 7559; GFX9-NEXT: v_trunc_f32_e32 v1, v1 7560; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 7561; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 7562; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 7563; GFX9-NEXT: v_mul_hi_u32 v2, v0, s2 7564; GFX9-NEXT: v_mul_lo_u32 v4, v1, s2 7565; GFX9-NEXT: v_mul_lo_u32 v3, v0, s2 7566; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 7567; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 7568; GFX9-NEXT: v_mul_hi_u32 v5, v0, v3 7569; GFX9-NEXT: v_mul_lo_u32 v4, v0, v2 7570; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 7571; GFX9-NEXT: v_mul_lo_u32 v6, v1, v3 7572; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 7573; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 7574; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 7575; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc 7576; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 7577; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 7578; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc 7579; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v8, vcc 7580; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 7581; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 7582; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 7583; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 7584; GFX9-NEXT: v_mul_hi_u32 v2, v0, s2 7585; GFX9-NEXT: v_mul_lo_u32 v3, v1, s2 7586; GFX9-NEXT: v_mul_lo_u32 v5, v0, s2 7587; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7588; GFX9-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x34 7589; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 7590; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 7591; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 7592; GFX9-NEXT: v_mul_hi_u32 v6, v0, v5 7593; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 7594; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 7595; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 7596; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v6, v3 7597; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc 7598; GFX9-NEXT: v_mul_lo_u32 v7, v1, v5 7599; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5 7600; GFX9-NEXT: s_movk_i32 s0, 0xfff 7601; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7602; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], 12 7603; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 7604; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc 7605; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v8, vcc 7606; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 7607; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc 7608; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 7609; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 7610; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 7611; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 7612; GFX9-NEXT: v_mul_hi_u32 v5, s6, v1 7613; GFX9-NEXT: v_mul_hi_u32 v6, s7, v1 7614; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 7615; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 7616; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc 7617; GFX9-NEXT: v_mul_lo_u32 v5, s7, v0 7618; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 7619; GFX9-NEXT: v_mov_b32_e32 v4, 0 7620; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 7621; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 7622; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v6, vcc 7623; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 7624; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc 7625; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 2, v0 7626; GFX9-NEXT: v_mul_lo_u32 v5, v1, s0 7627; GFX9-NEXT: v_mul_hi_u32 v6, v0, s0 7628; GFX9-NEXT: v_mul_lo_u32 v9, v0, s0 7629; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc 7630; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0 7631; GFX9-NEXT: 
v_addc_co_u32_e32 v8, vcc, 0, v1, vcc 7632; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 7633; GFX9-NEXT: v_mov_b32_e32 v6, s7 7634; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s6, v9 7635; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc 7636; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s0, v9 7637; GFX9-NEXT: v_subbrev_co_u32_e32 v10, vcc, 0, v5, vcc 7638; GFX9-NEXT: s_movk_i32 s0, 0xffe 7639; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s0, v6 7640; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 7641; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 7642; GFX9-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc 7643; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v9 7644; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 7645; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 7646; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 7647; GFX9-NEXT: v_cndmask_b32_e64 v5, -1, v6, s[0:1] 7648; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc 7649; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 7650; GFX9-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[0:1] 7651; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc 7652; GFX9-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[0:1] 7653; GFX9-NEXT: v_mov_b32_e32 v0, s4 7654; GFX9-NEXT: v_mov_b32_e32 v1, s5 7655; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 7656; GFX9-NEXT: s_endpgm 7657 %r = udiv <2 x i64> %x, <i64 4096, i64 4095> 7658 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 7659 ret void 7660} 7661 7662define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 7663; CHECK-LABEL: @udiv_v2i64_pow2_shl_denom( 7664; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 7665; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 7666; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 7667; CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[TMP1]], [[TMP2]] 7668; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 7669; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 7670; CHECK-NEXT: 
[[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 7671; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP5]], [[TMP6]] 7672; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 7673; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 7674; CHECK-NEXT: ret void 7675; 7676; GFX6-LABEL: udiv_v2i64_pow2_shl_denom: 7677; GFX6: ; %bb.0: 7678; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 7679; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 7680; GFX6-NEXT: s_mov_b32 s3, 0xf000 7681; GFX6-NEXT: s_mov_b32 s2, -1 7682; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7683; GFX6-NEXT: s_add_i32 s8, s8, 12 7684; GFX6-NEXT: s_add_i32 s9, s10, 12 7685; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 7686; GFX6-NEXT: s_lshr_b64 s[6:7], s[6:7], s9 7687; GFX6-NEXT: v_mov_b32_e32 v0, s4 7688; GFX6-NEXT: v_mov_b32_e32 v1, s5 7689; GFX6-NEXT: v_mov_b32_e32 v2, s6 7690; GFX6-NEXT: v_mov_b32_e32 v3, s7 7691; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 7692; GFX6-NEXT: s_endpgm 7693; 7694; GFX9-LABEL: udiv_v2i64_pow2_shl_denom: 7695; GFX9: ; %bb.0: 7696; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 7697; GFX9-NEXT: v_mov_b32_e32 v4, 0 7698; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 7699; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7700; GFX9-NEXT: s_add_i32 s2, s8, 12 7701; GFX9-NEXT: s_add_i32 s8, s10, 12 7702; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 7703; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], s8 7704; GFX9-NEXT: v_mov_b32_e32 v0, s2 7705; GFX9-NEXT: v_mov_b32_e32 v1, s3 7706; GFX9-NEXT: v_mov_b32_e32 v2, s4 7707; GFX9-NEXT: v_mov_b32_e32 v3, s5 7708; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 7709; GFX9-NEXT: s_endpgm 7710 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 7711 %r = udiv <2 x i64> %x, %shl.y 7712 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 7713 ret void 7714} 7715 7716define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 7717; CHECK-LABEL: @urem_i64_oddk_denom( 7718; CHECK-NEXT: 
[[R:%.*]] = urem i64 [[X:%.*]], 1235195393993 7719; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 7720; CHECK-NEXT: ret void 7721; 7722; GFX6-LABEL: urem_i64_oddk_denom: 7723; GFX6: ; %bb.0: 7724; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f1761f8 7725; GFX6-NEXT: v_mov_b32_e32 v1, 0x4f800000 7726; GFX6-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 7727; GFX6-NEXT: v_rcp_f32_e32 v0, v0 7728; GFX6-NEXT: s_movk_i32 s2, 0xfee0 7729; GFX6-NEXT: s_mov_b32 s3, 0x689e0837 7730; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 7731; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 7732; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 7733; GFX6-NEXT: v_trunc_f32_e32 v1, v1 7734; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 7735; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 7736; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 7737; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7738; GFX6-NEXT: s_mov_b32 s8, s4 7739; GFX6-NEXT: s_movk_i32 s4, 0x11f 7740; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 7741; GFX6-NEXT: v_mul_hi_u32 v3, v0, s3 7742; GFX6-NEXT: v_mul_lo_u32 v4, v1, s3 7743; GFX6-NEXT: v_mul_lo_u32 v5, v0, s3 7744; GFX6-NEXT: s_mov_b32 s12, 0x9761f7c9 7745; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 7746; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 7747; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 7748; GFX6-NEXT: v_mul_hi_u32 v4, v0, v5 7749; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 7750; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 7751; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 7752; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 7753; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc 7754; GFX6-NEXT: v_mul_lo_u32 v6, v1, v5 7755; GFX6-NEXT: v_mul_hi_u32 v5, v1, v5 7756; GFX6-NEXT: s_mov_b32 s9, s5 7757; GFX6-NEXT: s_movk_i32 s5, 0x11e 7758; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 7759; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc 7760; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc 7761; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7762; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 7763; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 7764; GFX6-NEXT: 
v_addc_u32_e32 v1, vcc, v1, v3, vcc 7765; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 7766; GFX6-NEXT: v_mul_hi_u32 v3, v0, s3 7767; GFX6-NEXT: v_mul_lo_u32 v4, v1, s3 7768; GFX6-NEXT: s_mov_b32 s11, 0xf000 7769; GFX6-NEXT: s_mov_b32 s10, -1 7770; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 7771; GFX6-NEXT: v_mul_lo_u32 v3, v0, s3 7772; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 7773; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 7774; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 7775; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 7776; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 7777; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 7778; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 7779; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 7780; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 7781; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 7782; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 7783; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc 7784; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc 7785; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7786; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 7787; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 7788; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 7789; GFX6-NEXT: v_mul_lo_u32 v2, s6, v1 7790; GFX6-NEXT: v_mul_hi_u32 v3, s6, v0 7791; GFX6-NEXT: v_mul_hi_u32 v4, s6, v1 7792; GFX6-NEXT: v_mul_hi_u32 v5, s7, v1 7793; GFX6-NEXT: v_mul_lo_u32 v1, s7, v1 7794; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7795; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 7796; GFX6-NEXT: v_mul_lo_u32 v4, s7, v0 7797; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 7798; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 7799; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 7800; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc 7801; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 7802; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 7803; GFX6-NEXT: v_mul_lo_u32 v2, v0, s4 7804; GFX6-NEXT: v_mul_hi_u32 v3, v0, s12 7805; GFX6-NEXT: v_mul_lo_u32 v1, v1, s12 7806; GFX6-NEXT: v_mul_lo_u32 v0, v0, s12 7807; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 7808; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, 
v2 7809; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s7, v1 7810; GFX6-NEXT: v_mov_b32_e32 v3, 0x11f 7811; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 7812; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc 7813; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s12, v0 7814; GFX6-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] 7815; GFX6-NEXT: v_cmp_lt_u32_e64 s[2:3], s5, v5 7816; GFX6-NEXT: s_mov_b32 s6, 0x9761f7c8 7817; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] 7818; GFX6-NEXT: v_cmp_lt_u32_e64 s[2:3], s6, v4 7819; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] 7820; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 7821; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, v5 7822; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s12, v4 7823; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] 7824; GFX6-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] 7825; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 7826; GFX6-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] 7827; GFX6-NEXT: v_mov_b32_e32 v5, s7 7828; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc 7829; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s5, v1 7830; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 7831; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s6, v0 7832; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 7833; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s4, v1 7834; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 7835; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 7836; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 7837; GFX6-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] 7838; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 7839; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 7840; GFX6-NEXT: s_endpgm 7841; 7842; GFX9-LABEL: urem_i64_oddk_denom: 7843; GFX9: ; %bb.0: 7844; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f1761f8 7845; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000 7846; GFX9-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 7847; GFX9-NEXT: v_rcp_f32_e32 v0, v0 7848; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 7849; GFX9-NEXT: s_mov_b32 s12, 0x9761f7c8 7850; GFX9-NEXT: v_mov_b32_e32 v2, 0 7851; 
GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 7852; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 7853; GFX9-NEXT: v_trunc_f32_e32 v1, v1 7854; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 7855; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 7856; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 7857; GFX9-NEXT: v_readfirstlane_b32 s0, v0 7858; GFX9-NEXT: s_mul_i32 s1, s0, 0xfffffee0 7859; GFX9-NEXT: s_mul_hi_u32 s2, s0, 0x689e0837 7860; GFX9-NEXT: s_add_i32 s1, s2, s1 7861; GFX9-NEXT: v_readfirstlane_b32 s2, v1 7862; GFX9-NEXT: s_mul_i32 s3, s2, 0x689e0837 7863; GFX9-NEXT: s_add_i32 s1, s1, s3 7864; GFX9-NEXT: s_mul_i32 s9, s0, 0x689e0837 7865; GFX9-NEXT: s_mul_hi_u32 s3, s0, s1 7866; GFX9-NEXT: s_mul_i32 s8, s0, s1 7867; GFX9-NEXT: s_mul_hi_u32 s0, s0, s9 7868; GFX9-NEXT: s_add_u32 s0, s0, s8 7869; GFX9-NEXT: s_addc_u32 s3, 0, s3 7870; GFX9-NEXT: s_mul_hi_u32 s10, s2, s9 7871; GFX9-NEXT: s_mul_i32 s9, s2, s9 7872; GFX9-NEXT: s_add_u32 s0, s0, s9 7873; GFX9-NEXT: s_mul_hi_u32 s8, s2, s1 7874; GFX9-NEXT: s_addc_u32 s0, s3, s10 7875; GFX9-NEXT: s_addc_u32 s3, s8, 0 7876; GFX9-NEXT: s_mul_i32 s1, s2, s1 7877; GFX9-NEXT: s_add_u32 s0, s0, s1 7878; GFX9-NEXT: s_addc_u32 s1, 0, s3 7879; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 7880; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 7881; GFX9-NEXT: s_addc_u32 s0, s2, s1 7882; GFX9-NEXT: v_readfirstlane_b32 s2, v0 7883; GFX9-NEXT: s_mul_i32 s3, s2, 0xfffffee0 7884; GFX9-NEXT: s_mul_hi_u32 s8, s2, 0x689e0837 7885; GFX9-NEXT: s_mul_i32 s1, s0, 0x689e0837 7886; GFX9-NEXT: s_add_i32 s3, s8, s3 7887; GFX9-NEXT: s_add_i32 s3, s3, s1 7888; GFX9-NEXT: s_mul_i32 s9, s2, 0x689e0837 7889; GFX9-NEXT: s_mul_hi_u32 s1, s2, s3 7890; GFX9-NEXT: s_mul_i32 s8, s2, s3 7891; GFX9-NEXT: s_mul_hi_u32 s2, s2, s9 7892; GFX9-NEXT: s_add_u32 s2, s2, s8 7893; GFX9-NEXT: s_addc_u32 s1, 0, s1 7894; GFX9-NEXT: s_mul_hi_u32 s10, s0, s9 7895; GFX9-NEXT: s_mul_i32 s9, s0, s9 7896; GFX9-NEXT: s_add_u32 s2, s2, s9 7897; GFX9-NEXT: s_mul_hi_u32 s8, s0, s3 7898; GFX9-NEXT: s_addc_u32 s1, s1, s10 7899; 
GFX9-NEXT: s_addc_u32 s2, s8, 0 7900; GFX9-NEXT: s_mul_i32 s3, s0, s3 7901; GFX9-NEXT: s_add_u32 s1, s1, s3 7902; GFX9-NEXT: s_addc_u32 s2, 0, s2 7903; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s1, v0 7904; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 7905; GFX9-NEXT: s_addc_u32 s0, s0, s2 7906; GFX9-NEXT: v_readfirstlane_b32 s3, v0 7907; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7908; GFX9-NEXT: s_mul_i32 s2, s6, s0 7909; GFX9-NEXT: s_mul_hi_u32 s8, s6, s3 7910; GFX9-NEXT: s_mul_hi_u32 s1, s6, s0 7911; GFX9-NEXT: s_add_u32 s2, s8, s2 7912; GFX9-NEXT: s_addc_u32 s1, 0, s1 7913; GFX9-NEXT: s_mul_hi_u32 s9, s7, s3 7914; GFX9-NEXT: s_mul_i32 s3, s7, s3 7915; GFX9-NEXT: s_add_u32 s2, s2, s3 7916; GFX9-NEXT: s_mul_hi_u32 s8, s7, s0 7917; GFX9-NEXT: s_addc_u32 s1, s1, s9 7918; GFX9-NEXT: s_addc_u32 s2, s8, 0 7919; GFX9-NEXT: s_mul_i32 s0, s7, s0 7920; GFX9-NEXT: s_add_u32 s0, s1, s0 7921; GFX9-NEXT: s_addc_u32 s1, 0, s2 7922; GFX9-NEXT: s_mul_i32 s2, s0, 0x11f 7923; GFX9-NEXT: s_mul_hi_u32 s3, s0, 0x9761f7c9 7924; GFX9-NEXT: s_add_i32 s2, s3, s2 7925; GFX9-NEXT: s_mul_i32 s1, s1, 0x9761f7c9 7926; GFX9-NEXT: s_mul_i32 s0, s0, 0x9761f7c9 7927; GFX9-NEXT: s_add_i32 s9, s2, s1 7928; GFX9-NEXT: v_mov_b32_e32 v0, s0 7929; GFX9-NEXT: s_sub_i32 s1, s7, s9 7930; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 7931; GFX9-NEXT: s_mov_b32 s8, 0x9761f7c9 7932; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 7933; GFX9-NEXT: s_subb_u32 s6, s1, 0x11f 7934; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s8, v0 7935; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 7936; GFX9-NEXT: s_subb_u32 s10, s6, 0 7937; GFX9-NEXT: s_cmpk_gt_u32 s10, 0x11e 7938; GFX9-NEXT: s_cselect_b32 s11, -1, 0 7939; GFX9-NEXT: v_cmp_lt_u32_e64 s[2:3], s12, v3 7940; GFX9-NEXT: s_cmpk_eq_i32 s10, 0x11f 7941; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[2:3] 7942; GFX9-NEXT: v_mov_b32_e32 v4, s11 7943; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 7944; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 7945; GFX9-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[2:3] 7946; GFX9-NEXT: s_subb_u32 s2, s6, 0x11f 7947; 
GFX9-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s8, v3 7948; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 7949; GFX9-NEXT: s_subb_u32 s0, s2, 0 7950; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 7951; GFX9-NEXT: s_subb_u32 s2, s7, s9 7952; GFX9-NEXT: s_cmpk_gt_u32 s2, 0x11e 7953; GFX9-NEXT: v_mov_b32_e32 v5, s10 7954; GFX9-NEXT: v_mov_b32_e32 v6, s0 7955; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 7956; GFX9-NEXT: s_cselect_b32 s3, -1, 0 7957; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s12, v0 7958; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x11f 7959; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, v6, s[0:1] 7960; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 7961; GFX9-NEXT: v_mov_b32_e32 v6, s3 7962; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 7963; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc 7964; GFX9-NEXT: v_mov_b32_e32 v6, s2 7965; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 7966; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] 7967; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc 7968; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 7969; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 7970; GFX9-NEXT: s_endpgm 7971 %r = urem i64 %x, 1235195393993 7972 store i64 %r, i64 addrspace(1)* %out 7973 ret void 7974} 7975 7976define amdgpu_kernel void @urem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 7977; CHECK-LABEL: @urem_i64_pow2k_denom( 7978; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], 4096 7979; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 7980; CHECK-NEXT: ret void 7981; 7982; GFX6-LABEL: urem_i64_pow2k_denom: 7983; GFX6: ; %bb.0: 7984; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 7985; GFX6-NEXT: s_mov_b32 s7, 0xf000 7986; GFX6-NEXT: s_mov_b32 s6, -1 7987; GFX6-NEXT: v_mov_b32_e32 v1, 0 7988; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7989; GFX6-NEXT: s_mov_b32 s4, s0 7990; GFX6-NEXT: s_and_b32 s0, s2, 0xfff 7991; GFX6-NEXT: s_mov_b32 s5, s1 7992; GFX6-NEXT: v_mov_b32_e32 v0, s0 7993; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 7994; GFX6-NEXT: s_endpgm 7995; 7996; GFX9-LABEL: urem_i64_pow2k_denom: 7997; 
GFX9: ; %bb.0: 7998; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 7999; GFX9-NEXT: v_mov_b32_e32 v1, 0 8000; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8001; GFX9-NEXT: s_and_b32 s2, s2, 0xfff 8002; GFX9-NEXT: v_mov_b32_e32 v0, s2 8003; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] 8004; GFX9-NEXT: s_endpgm 8005 %r = urem i64 %x, 4096 8006 store i64 %r, i64 addrspace(1)* %out 8007 ret void 8008} 8009 8010define amdgpu_kernel void @urem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 8011; CHECK-LABEL: @urem_i64_pow2_shl_denom( 8012; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 8013; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], [[SHL_Y]] 8014; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 8015; CHECK-NEXT: ret void 8016; 8017; GFX6-LABEL: urem_i64_pow2_shl_denom: 8018; GFX6: ; %bb.0: 8019; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 8020; GFX6-NEXT: s_load_dword s8, s[0:1], 0xd 8021; GFX6-NEXT: s_mov_b32 s3, 0xf000 8022; GFX6-NEXT: s_mov_b32 s2, -1 8023; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8024; GFX6-NEXT: s_mov_b32 s0, s4 8025; GFX6-NEXT: s_mov_b32 s1, s5 8026; GFX6-NEXT: s_mov_b64 s[4:5], 0x1000 8027; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s8 8028; GFX6-NEXT: s_add_u32 s4, s4, -1 8029; GFX6-NEXT: s_addc_u32 s5, s5, -1 8030; GFX6-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] 8031; GFX6-NEXT: v_mov_b32_e32 v0, s4 8032; GFX6-NEXT: v_mov_b32_e32 v1, s5 8033; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 8034; GFX6-NEXT: s_endpgm 8035; 8036; GFX9-LABEL: urem_i64_pow2_shl_denom: 8037; GFX9: ; %bb.0: 8038; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 8039; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 8040; GFX9-NEXT: s_mov_b64 s[0:1], 0x1000 8041; GFX9-NEXT: v_mov_b32_e32 v2, 0 8042; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8043; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 8044; GFX9-NEXT: s_add_u32 s0, s0, -1 8045; GFX9-NEXT: s_addc_u32 s1, s1, -1 8046; GFX9-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1] 8047; GFX9-NEXT: v_mov_b32_e32 v0, s0 8048; 
GFX9-NEXT: v_mov_b32_e32 v1, s1 8049; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 8050; GFX9-NEXT: s_endpgm 8051 %shl.y = shl i64 4096, %y 8052 %r = urem i64 %x, %shl.y 8053 store i64 %r, i64 addrspace(1)* %out 8054 ret void 8055} 8056 8057define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 8058; CHECK-LABEL: @urem_v2i64_pow2k_denom( 8059; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 8060; CHECK-NEXT: [[TMP2:%.*]] = urem i64 [[TMP1]], 4096 8061; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 8062; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 8063; CHECK-NEXT: [[TMP5:%.*]] = urem i64 [[TMP4]], 4096 8064; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 8065; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 8066; CHECK-NEXT: ret void 8067; 8068; GFX6-LABEL: urem_v2i64_pow2k_denom: 8069; GFX6: ; %bb.0: 8070; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 8071; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 8072; GFX6-NEXT: v_mov_b32_e32 v1, 0 8073; GFX6-NEXT: s_mov_b32 s3, 0xf000 8074; GFX6-NEXT: s_mov_b32 s2, -1 8075; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8076; GFX6-NEXT: s_and_b32 s4, s4, 0xfff 8077; GFX6-NEXT: s_and_b32 s5, s6, 0xfff 8078; GFX6-NEXT: v_mov_b32_e32 v0, s4 8079; GFX6-NEXT: v_mov_b32_e32 v2, s5 8080; GFX6-NEXT: v_mov_b32_e32 v3, v1 8081; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 8082; GFX6-NEXT: s_endpgm 8083; 8084; GFX9-LABEL: urem_v2i64_pow2k_denom: 8085; GFX9: ; %bb.0: 8086; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 8087; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8088; GFX9-NEXT: v_mov_b32_e32 v1, 0 8089; GFX9-NEXT: v_mov_b32_e32 v3, v1 8090; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8091; GFX9-NEXT: s_and_b32 s0, s4, 0xfff 8092; GFX9-NEXT: s_and_b32 s1, s6, 0xfff 8093; GFX9-NEXT: v_mov_b32_e32 v0, s0 8094; GFX9-NEXT: v_mov_b32_e32 v2, s1 8095; 
GFX9-NEXT: global_store_dwordx4 v1, v[0:3], s[2:3] 8096; GFX9-NEXT: s_endpgm 8097 %r = urem <2 x i64> %x, <i64 4096, i64 4096> 8098 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 8099 ret void 8100} 8101 8102define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 8103; CHECK-LABEL: @urem_v2i64_pow2_shl_denom( 8104; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 8105; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 8106; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 8107; CHECK-NEXT: [[TMP3:%.*]] = urem i64 [[TMP1]], [[TMP2]] 8108; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 8109; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 8110; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 8111; CHECK-NEXT: [[TMP7:%.*]] = urem i64 [[TMP5]], [[TMP6]] 8112; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 8113; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 8114; CHECK-NEXT: ret void 8115; 8116; GFX6-LABEL: urem_v2i64_pow2_shl_denom: 8117; GFX6: ; %bb.0: 8118; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 8119; GFX6-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0xd 8120; GFX6-NEXT: s_mov_b64 s[12:13], 0x1000 8121; GFX6-NEXT: s_mov_b32 s11, 0xf000 8122; GFX6-NEXT: s_mov_b32 s10, -1 8123; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8124; GFX6-NEXT: s_lshl_b64 s[6:7], s[12:13], s6 8125; GFX6-NEXT: s_lshl_b64 s[4:5], s[12:13], s4 8126; GFX6-NEXT: s_add_u32 s4, s4, -1 8127; GFX6-NEXT: s_addc_u32 s5, s5, -1 8128; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5] 8129; GFX6-NEXT: s_add_u32 s4, s6, -1 8130; GFX6-NEXT: s_addc_u32 s5, s7, -1 8131; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], s[4:5] 8132; GFX6-NEXT: v_mov_b32_e32 v0, s0 8133; GFX6-NEXT: v_mov_b32_e32 v1, s1 8134; GFX6-NEXT: v_mov_b32_e32 v2, s2 8135; GFX6-NEXT: v_mov_b32_e32 v3, s3 8136; 
GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 8137; GFX6-NEXT: s_endpgm 8138; 8139; GFX9-LABEL: urem_v2i64_pow2_shl_denom: 8140; GFX9: ; %bb.0: 8141; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 8142; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 8143; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 8144; GFX9-NEXT: v_mov_b32_e32 v4, 0 8145; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8146; GFX9-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 8147; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 8148; GFX9-NEXT: s_add_u32 s2, s2, -1 8149; GFX9-NEXT: s_addc_u32 s3, s3, -1 8150; GFX9-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3] 8151; GFX9-NEXT: s_add_u32 s4, s10, -1 8152; GFX9-NEXT: s_addc_u32 s5, s11, -1 8153; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] 8154; GFX9-NEXT: v_mov_b32_e32 v0, s2 8155; GFX9-NEXT: v_mov_b32_e32 v1, s3 8156; GFX9-NEXT: v_mov_b32_e32 v2, s4 8157; GFX9-NEXT: v_mov_b32_e32 v3, s5 8158; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 8159; GFX9-NEXT: s_endpgm 8160 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 8161 %r = urem <2 x i64> %x, %shl.y 8162 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 8163 ret void 8164} 8165 8166define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 8167; CHECK-LABEL: @sdiv_i64_oddk_denom( 8168; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], 1235195 8169; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 8170; CHECK-NEXT: ret void 8171; 8172; GFX6-LABEL: sdiv_i64_oddk_denom: 8173; GFX6: ; %bb.0: 8174; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f800000 8175; GFX6-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 8176; GFX6-NEXT: v_rcp_f32_e32 v0, v0 8177; GFX6-NEXT: s_mov_b32 s5, 0xffed2705 8178; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 8179; GFX6-NEXT: s_mov_b32 s7, 0xf000 8180; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 8181; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 8182; GFX6-NEXT: v_trunc_f32_e32 v1, v1 8183; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 8184; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 8185; 
GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 8186; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8187; GFX6-NEXT: s_ashr_i32 s8, s3, 31 8188; GFX6-NEXT: s_add_u32 s2, s2, s8 8189; GFX6-NEXT: v_mul_lo_u32 v2, v1, s5 8190; GFX6-NEXT: v_mul_hi_u32 v3, v0, s5 8191; GFX6-NEXT: v_mul_lo_u32 v4, v0, s5 8192; GFX6-NEXT: s_mov_b32 s9, s8 8193; GFX6-NEXT: s_addc_u32 s3, s3, s8 8194; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 8195; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 8196; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 8197; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 8198; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 8199; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 8200; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 8201; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 8202; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 8203; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 8204; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 8205; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[8:9] 8206; GFX6-NEXT: s_mov_b32 s4, s0 8207; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 8208; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc 8209; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc 8210; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8211; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 8212; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 8213; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 8214; GFX6-NEXT: v_mul_lo_u32 v2, v1, s5 8215; GFX6-NEXT: v_mul_hi_u32 v3, v0, s5 8216; GFX6-NEXT: s_mov_b32 s0, 0x12d8fb 8217; GFX6-NEXT: s_mov_b32 s6, -1 8218; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 8219; GFX6-NEXT: v_mul_lo_u32 v3, v0, s5 8220; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 8221; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 8222; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 8223; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 8224; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 8225; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 8226; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 8227; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 8228; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc 8229; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 8230; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 8231; 
GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc 8232; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc 8233; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8234; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 8235; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 8236; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 8237; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 8238; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 8239; GFX6-NEXT: v_mul_hi_u32 v4, s2, v1 8240; GFX6-NEXT: v_mul_hi_u32 v5, s3, v1 8241; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1 8242; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8243; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 8244; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 8245; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 8246; GFX6-NEXT: s_mov_b32 s5, s1 8247; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 8248; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 8249; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc 8250; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 8251; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 8252; GFX6-NEXT: v_mul_lo_u32 v4, v1, s0 8253; GFX6-NEXT: v_mul_hi_u32 v5, v0, s0 8254; GFX6-NEXT: v_add_i32_e32 v2, vcc, 2, v0 8255; GFX6-NEXT: v_mul_lo_u32 v8, v0, s0 8256; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 8257; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v0 8258; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc 8259; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 8260; GFX6-NEXT: v_mov_b32_e32 v5, s3 8261; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s2, v8 8262; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc 8263; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s0, v8 8264; GFX6-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v4, vcc 8265; GFX6-NEXT: s_mov_b32 s0, 0x12d8fa 8266; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s0, v5 8267; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 8268; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 8269; GFX6-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc 8270; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v8 8271; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 8272; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 8273; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 
0, v4 8274; GFX6-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[0:1] 8275; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 8276; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 8277; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc 8278; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 8279; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 8280; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 8281; GFX6-NEXT: v_xor_b32_e32 v1, s8, v1 8282; GFX6-NEXT: v_mov_b32_e32 v2, s8 8283; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 8284; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 8285; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 8286; GFX6-NEXT: s_endpgm 8287; 8288; GFX9-LABEL: sdiv_i64_oddk_denom: 8289; GFX9: ; %bb.0: 8290; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f800000 8291; GFX9-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 8292; GFX9-NEXT: v_rcp_f32_e32 v0, v0 8293; GFX9-NEXT: s_mov_b32 s2, 0xffed2705 8294; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 8295; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 8296; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 8297; GFX9-NEXT: v_trunc_f32_e32 v1, v1 8298; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 8299; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 8300; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 8301; GFX9-NEXT: v_mul_lo_u32 v2, v1, s2 8302; GFX9-NEXT: v_mul_hi_u32 v3, v0, s2 8303; GFX9-NEXT: v_mul_lo_u32 v4, v0, s2 8304; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 8305; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 8306; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 8307; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 8308; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 8309; GFX9-NEXT: v_mul_lo_u32 v5, v1, v4 8310; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 8311; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 8312; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 8313; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc 8314; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 8315; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 8316; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc 8317; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v8, vcc 8318; GFX9-NEXT: v_add_co_u32_e32 v2, 
vcc, v3, v2 8319; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 8320; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 8321; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 8322; GFX9-NEXT: v_mul_lo_u32 v2, v1, s2 8323; GFX9-NEXT: v_mul_hi_u32 v3, v0, s2 8324; GFX9-NEXT: v_mul_lo_u32 v4, v0, s2 8325; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8326; GFX9-NEXT: s_ashr_i32 s2, s7, 31 8327; GFX9-NEXT: s_add_u32 s0, s6, s2 8328; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 8329; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 8330; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 8331; GFX9-NEXT: v_mul_hi_u32 v7, v0, v4 8332; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 8333; GFX9-NEXT: v_mul_hi_u32 v5, v1, v4 8334; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 8335; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 8336; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 8337; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc 8338; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 8339; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v6, v4 8340; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v5, vcc 8341; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 8342; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 8343; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 8344; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 8345; GFX9-NEXT: s_mov_b32 s3, s2 8346; GFX9-NEXT: s_addc_u32 s1, s7, s2 8347; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 8348; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] 8349; GFX9-NEXT: v_mul_lo_u32 v2, s0, v1 8350; GFX9-NEXT: v_mul_hi_u32 v3, s0, v0 8351; GFX9-NEXT: v_mul_hi_u32 v5, s0, v1 8352; GFX9-NEXT: v_mul_hi_u32 v6, s1, v1 8353; GFX9-NEXT: v_mul_lo_u32 v1, s1, v1 8354; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 8355; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc 8356; GFX9-NEXT: v_mul_lo_u32 v5, s1, v0 8357; GFX9-NEXT: v_mul_hi_u32 v0, s1, v0 8358; GFX9-NEXT: s_mov_b32 s3, 0x12d8fb 8359; GFX9-NEXT: v_mov_b32_e32 v4, 0 8360; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 8361; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 8362; GFX9-NEXT: v_addc_co_u32_e32 v2, 
vcc, 0, v6, vcc 8363; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 8364; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc 8365; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 2, v0 8366; GFX9-NEXT: v_mul_lo_u32 v5, v1, s3 8367; GFX9-NEXT: v_mul_hi_u32 v6, v0, s3 8368; GFX9-NEXT: v_mul_lo_u32 v9, v0, s3 8369; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc 8370; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0 8371; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc 8372; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 8373; GFX9-NEXT: v_mov_b32_e32 v6, s1 8374; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s0, v9 8375; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc 8376; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s3, v9 8377; GFX9-NEXT: v_subbrev_co_u32_e32 v10, vcc, 0, v5, vcc 8378; GFX9-NEXT: s_mov_b32 s0, 0x12d8fa 8379; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s0, v6 8380; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 8381; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 8382; GFX9-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc 8383; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v9 8384; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 8385; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 8386; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 8387; GFX9-NEXT: v_cndmask_b32_e64 v5, -1, v6, s[0:1] 8388; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 8389; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc 8390; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc 8391; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 8392; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 8393; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 8394; GFX9-NEXT: v_xor_b32_e32 v1, s2, v1 8395; GFX9-NEXT: v_mov_b32_e32 v2, s2 8396; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s2, v0 8397; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc 8398; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] 8399; GFX9-NEXT: s_endpgm 8400 %r = sdiv i64 %x, 1235195 8401 store i64 %r, i64 addrspace(1)* %out 8402 ret void 8403} 8404 8405define amdgpu_kernel void @sdiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 8406; 
CHECK-LABEL: @sdiv_i64_pow2k_denom( 8407; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], 4096 8408; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 8409; CHECK-NEXT: ret void 8410; 8411; GFX6-LABEL: sdiv_i64_pow2k_denom: 8412; GFX6: ; %bb.0: 8413; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 8414; GFX6-NEXT: s_mov_b32 s7, 0xf000 8415; GFX6-NEXT: s_mov_b32 s6, -1 8416; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8417; GFX6-NEXT: s_mov_b32 s4, s0 8418; GFX6-NEXT: s_ashr_i32 s0, s3, 31 8419; GFX6-NEXT: s_lshr_b32 s0, s0, 20 8420; GFX6-NEXT: s_add_u32 s0, s2, s0 8421; GFX6-NEXT: s_mov_b32 s5, s1 8422; GFX6-NEXT: s_addc_u32 s1, s3, 0 8423; GFX6-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 8424; GFX6-NEXT: v_mov_b32_e32 v0, s0 8425; GFX6-NEXT: v_mov_b32_e32 v1, s1 8426; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 8427; GFX6-NEXT: s_endpgm 8428; 8429; GFX9-LABEL: sdiv_i64_pow2k_denom: 8430; GFX9: ; %bb.0: 8431; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 8432; GFX9-NEXT: v_mov_b32_e32 v2, 0 8433; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8434; GFX9-NEXT: s_ashr_i32 s4, s3, 31 8435; GFX9-NEXT: s_lshr_b32 s4, s4, 20 8436; GFX9-NEXT: s_add_u32 s2, s2, s4 8437; GFX9-NEXT: s_addc_u32 s3, s3, 0 8438; GFX9-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 8439; GFX9-NEXT: v_mov_b32_e32 v0, s2 8440; GFX9-NEXT: v_mov_b32_e32 v1, s3 8441; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 8442; GFX9-NEXT: s_endpgm 8443 %r = sdiv i64 %x, 4096 8444 store i64 %r, i64 addrspace(1)* %out 8445 ret void 8446} 8447 8448define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 8449; CHECK-LABEL: @sdiv_i64_pow2_shl_denom( 8450; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 8451; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], [[SHL_Y]] 8452; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 8453; CHECK-NEXT: ret void 8454; 8455; GFX6-LABEL: sdiv_i64_pow2_shl_denom: 8456; GFX6: ; %bb.0: 8457; GFX6-NEXT: s_load_dword s4, s[0:1], 0xd 8458; GFX6-NEXT: 
s_mov_b64 s[2:3], 0x1000 8459; GFX6-NEXT: s_mov_b32 s7, 0xf000 8460; GFX6-NEXT: s_mov_b32 s6, -1 8461; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8462; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 8463; GFX6-NEXT: s_ashr_i32 s8, s3, 31 8464; GFX6-NEXT: s_add_u32 s2, s2, s8 8465; GFX6-NEXT: s_mov_b32 s9, s8 8466; GFX6-NEXT: s_addc_u32 s3, s3, s8 8467; GFX6-NEXT: s_xor_b64 s[10:11], s[2:3], s[8:9] 8468; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s10 8469; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s11 8470; GFX6-NEXT: s_sub_u32 s4, 0, s10 8471; GFX6-NEXT: s_subb_u32 s5, 0, s11 8472; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 8473; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 8474; GFX6-NEXT: v_rcp_f32_e32 v0, v0 8475; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8476; GFX6-NEXT: s_ashr_i32 s12, s3, 31 8477; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 8478; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 8479; GFX6-NEXT: v_trunc_f32_e32 v1, v1 8480; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 8481; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 8482; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 8483; GFX6-NEXT: s_add_u32 s2, s2, s12 8484; GFX6-NEXT: s_mov_b32 s13, s12 8485; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 8486; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 8487; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0 8488; GFX6-NEXT: v_mul_lo_u32 v4, s4, v0 8489; GFX6-NEXT: s_addc_u32 s3, s3, s12 8490; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 8491; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 8492; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 8493; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 8494; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 8495; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 8496; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 8497; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 8498; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 8499; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 8500; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 8501; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[12:13] 8502; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 8503; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc 8504; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, 
vcc 8505; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8506; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 8507; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 8508; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 8509; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 8510; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 8511; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0 8512; GFX6-NEXT: s_mov_b32 s5, s1 8513; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8514; GFX6-NEXT: v_mul_lo_u32 v3, s4, v0 8515; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 8516; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 8517; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 8518; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 8519; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 8520; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 8521; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 8522; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 8523; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc 8524; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 8525; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 8526; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc 8527; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc 8528; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8529; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 8530; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 8531; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 8532; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 8533; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 8534; GFX6-NEXT: v_mul_hi_u32 v4, s2, v1 8535; GFX6-NEXT: v_mul_hi_u32 v5, s3, v1 8536; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1 8537; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8538; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 8539; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 8540; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 8541; GFX6-NEXT: s_mov_b32 s4, s0 8542; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 8543; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 8544; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc 8545; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 8546; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 8547; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 8548; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 8549; 
GFX6-NEXT: v_mul_lo_u32 v4, s11, v0 8550; GFX6-NEXT: v_mov_b32_e32 v5, s11 8551; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 8552; GFX6-NEXT: v_mul_lo_u32 v3, s10, v0 8553; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 8554; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s3, v2 8555; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s2, v3 8556; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc 8557; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s10, v3 8558; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] 8559; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v4 8560; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 8561; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v5 8562; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 8563; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v4 8564; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] 8565; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 8566; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] 8567; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 1, v0 8568; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] 8569; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 8570; GFX6-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] 8571; GFX6-NEXT: v_mov_b32_e32 v6, s3 8572; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc 8573; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v2 8574; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 8575; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 8576; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 8577; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s11, v2 8578; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc 8579; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 8580; GFX6-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] 8581; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 8582; GFX6-NEXT: s_xor_b64 s[0:1], s[12:13], s[8:9] 8583; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 8584; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 8585; GFX6-NEXT: v_xor_b32_e32 v1, s1, v1 8586; GFX6-NEXT: v_mov_b32_e32 v2, s1 8587; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 8588; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 8589; GFX6-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[4:7], 0 8590; GFX6-NEXT: s_endpgm 8591; 8592; GFX9-LABEL: sdiv_i64_pow2_shl_denom: 8593; GFX9: ; %bb.0: 8594; GFX9-NEXT: s_load_dword s4, s[0:1], 0x34 8595; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 8596; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8597; GFX9-NEXT: s_lshl_b64 s[4:5], s[2:3], s4 8598; GFX9-NEXT: s_ashr_i32 s2, s5, 31 8599; GFX9-NEXT: s_add_u32 s4, s4, s2 8600; GFX9-NEXT: s_mov_b32 s3, s2 8601; GFX9-NEXT: s_addc_u32 s5, s5, s2 8602; GFX9-NEXT: s_xor_b64 s[8:9], s[4:5], s[2:3] 8603; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 8604; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 8605; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 8606; GFX9-NEXT: s_sub_u32 s0, 0, s8 8607; GFX9-NEXT: s_subb_u32 s1, 0, s9 8608; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 8609; GFX9-NEXT: v_rcp_f32_e32 v1, v0 8610; GFX9-NEXT: v_mov_b32_e32 v0, 0 8611; GFX9-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 8612; GFX9-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 8613; GFX9-NEXT: v_trunc_f32_e32 v2, v2 8614; GFX9-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2 8615; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 8616; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 8617; GFX9-NEXT: v_readfirstlane_b32 s10, v2 8618; GFX9-NEXT: v_readfirstlane_b32 s11, v1 8619; GFX9-NEXT: s_mul_i32 s12, s0, s10 8620; GFX9-NEXT: s_mul_hi_u32 s14, s0, s11 8621; GFX9-NEXT: s_mul_i32 s13, s1, s11 8622; GFX9-NEXT: s_add_i32 s12, s14, s12 8623; GFX9-NEXT: s_add_i32 s12, s12, s13 8624; GFX9-NEXT: s_mul_i32 s15, s0, s11 8625; GFX9-NEXT: s_mul_hi_u32 s13, s11, s12 8626; GFX9-NEXT: s_mul_i32 s14, s11, s12 8627; GFX9-NEXT: s_mul_hi_u32 s11, s11, s15 8628; GFX9-NEXT: s_add_u32 s11, s11, s14 8629; GFX9-NEXT: s_addc_u32 s13, 0, s13 8630; GFX9-NEXT: s_mul_hi_u32 s16, s10, s15 8631; GFX9-NEXT: s_mul_i32 s15, s10, s15 8632; GFX9-NEXT: s_add_u32 s11, s11, s15 8633; GFX9-NEXT: s_mul_hi_u32 s14, s10, s12 8634; GFX9-NEXT: s_addc_u32 s11, s13, s16 8635; GFX9-NEXT: s_addc_u32 s13, s14, 0 8636; GFX9-NEXT: s_mul_i32 s12, s10, s12 8637; GFX9-NEXT: s_add_u32 s11, s11, 
s12 8638; GFX9-NEXT: s_addc_u32 s12, 0, s13 8639; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s11, v1 8640; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 8641; GFX9-NEXT: s_addc_u32 s10, s10, s12 8642; GFX9-NEXT: v_readfirstlane_b32 s12, v1 8643; GFX9-NEXT: s_mul_i32 s11, s0, s10 8644; GFX9-NEXT: s_mul_hi_u32 s13, s0, s12 8645; GFX9-NEXT: s_add_i32 s11, s13, s11 8646; GFX9-NEXT: s_mul_i32 s1, s1, s12 8647; GFX9-NEXT: s_add_i32 s11, s11, s1 8648; GFX9-NEXT: s_mul_i32 s0, s0, s12 8649; GFX9-NEXT: s_mul_hi_u32 s13, s10, s0 8650; GFX9-NEXT: s_mul_i32 s14, s10, s0 8651; GFX9-NEXT: s_mul_i32 s16, s12, s11 8652; GFX9-NEXT: s_mul_hi_u32 s0, s12, s0 8653; GFX9-NEXT: s_mul_hi_u32 s15, s12, s11 8654; GFX9-NEXT: s_add_u32 s0, s0, s16 8655; GFX9-NEXT: s_addc_u32 s12, 0, s15 8656; GFX9-NEXT: s_add_u32 s0, s0, s14 8657; GFX9-NEXT: s_mul_hi_u32 s1, s10, s11 8658; GFX9-NEXT: s_addc_u32 s0, s12, s13 8659; GFX9-NEXT: s_addc_u32 s1, s1, 0 8660; GFX9-NEXT: s_mul_i32 s11, s10, s11 8661; GFX9-NEXT: s_add_u32 s0, s0, s11 8662; GFX9-NEXT: s_addc_u32 s1, 0, s1 8663; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1 8664; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 8665; GFX9-NEXT: s_addc_u32 s12, s10, s1 8666; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8667; GFX9-NEXT: s_ashr_i32 s10, s7, 31 8668; GFX9-NEXT: s_add_u32 s0, s6, s10 8669; GFX9-NEXT: s_mov_b32 s11, s10 8670; GFX9-NEXT: s_addc_u32 s1, s7, s10 8671; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] 8672; GFX9-NEXT: v_readfirstlane_b32 s13, v1 8673; GFX9-NEXT: s_mul_i32 s1, s6, s12 8674; GFX9-NEXT: s_mul_hi_u32 s14, s6, s13 8675; GFX9-NEXT: s_mul_hi_u32 s0, s6, s12 8676; GFX9-NEXT: s_add_u32 s1, s14, s1 8677; GFX9-NEXT: s_addc_u32 s0, 0, s0 8678; GFX9-NEXT: s_mul_hi_u32 s15, s7, s13 8679; GFX9-NEXT: s_mul_i32 s13, s7, s13 8680; GFX9-NEXT: s_add_u32 s1, s1, s13 8681; GFX9-NEXT: s_mul_hi_u32 s14, s7, s12 8682; GFX9-NEXT: s_addc_u32 s0, s0, s15 8683; GFX9-NEXT: s_addc_u32 s1, s14, 0 8684; GFX9-NEXT: s_mul_i32 s12, s7, s12 8685; GFX9-NEXT: s_add_u32 s12, s0, s12 8686; GFX9-NEXT: s_addc_u32 
s13, 0, s1 8687; GFX9-NEXT: s_mul_i32 s0, s8, s13 8688; GFX9-NEXT: s_mul_hi_u32 s1, s8, s12 8689; GFX9-NEXT: s_add_i32 s0, s1, s0 8690; GFX9-NEXT: s_mul_i32 s1, s9, s12 8691; GFX9-NEXT: s_add_i32 s14, s0, s1 8692; GFX9-NEXT: s_mul_i32 s1, s8, s12 8693; GFX9-NEXT: v_mov_b32_e32 v1, s1 8694; GFX9-NEXT: s_sub_i32 s0, s7, s14 8695; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s6, v1 8696; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 8697; GFX9-NEXT: s_subb_u32 s6, s0, s9 8698; GFX9-NEXT: v_subrev_co_u32_e64 v2, s[0:1], s8, v1 8699; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 8700; GFX9-NEXT: s_subb_u32 s6, s6, 0 8701; GFX9-NEXT: s_cmp_ge_u32 s6, s9 8702; GFX9-NEXT: s_cselect_b32 s15, -1, 0 8703; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v2 8704; GFX9-NEXT: s_cmp_eq_u32 s6, s9 8705; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] 8706; GFX9-NEXT: v_mov_b32_e32 v3, s15 8707; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 8708; GFX9-NEXT: s_add_u32 s6, s12, 2 8709; GFX9-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] 8710; GFX9-NEXT: s_addc_u32 s0, s13, 0 8711; GFX9-NEXT: s_add_u32 s15, s12, 1 8712; GFX9-NEXT: s_addc_u32 s1, s13, 0 8713; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 8714; GFX9-NEXT: s_subb_u32 s7, s7, s14 8715; GFX9-NEXT: s_cmp_ge_u32 s7, s9 8716; GFX9-NEXT: v_mov_b32_e32 v3, s1 8717; GFX9-NEXT: v_mov_b32_e32 v4, s0 8718; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 8719; GFX9-NEXT: s_cselect_b32 s14, -1, 0 8720; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v1 8721; GFX9-NEXT: s_cmp_eq_u32 s7, s9 8722; GFX9-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[0:1] 8723; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc 8724; GFX9-NEXT: v_mov_b32_e32 v3, s14 8725; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 8726; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 8727; GFX9-NEXT: v_mov_b32_e32 v3, s13 8728; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 8729; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc 8730; GFX9-NEXT: v_mov_b32_e32 v2, s15 8731; GFX9-NEXT: v_mov_b32_e32 v3, s6 8732; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 8733; GFX9-NEXT: v_mov_b32_e32 v3, s12 
8734; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc 8735; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], s[2:3] 8736; GFX9-NEXT: v_xor_b32_e32 v2, s0, v2 8737; GFX9-NEXT: v_xor_b32_e32 v3, s1, v1 8738; GFX9-NEXT: v_mov_b32_e32 v4, s1 8739; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s0, v2 8740; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v3, v4, vcc 8741; GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[4:5] 8742; GFX9-NEXT: s_endpgm 8743 %shl.y = shl i64 4096, %y 8744 %r = sdiv i64 %x, %shl.y 8745 store i64 %r, i64 addrspace(1)* %out 8746 ret void 8747} 8748 8749define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 8750; CHECK-LABEL: @sdiv_v2i64_pow2k_denom( 8751; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 8752; CHECK-NEXT: [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096 8753; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 8754; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 8755; CHECK-NEXT: [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4096 8756; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 8757; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 8758; CHECK-NEXT: ret void 8759; 8760; GFX6-LABEL: sdiv_v2i64_pow2k_denom: 8761; GFX6: ; %bb.0: 8762; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 8763; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 8764; GFX6-NEXT: s_mov_b32 s3, 0xf000 8765; GFX6-NEXT: s_mov_b32 s2, -1 8766; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8767; GFX6-NEXT: s_ashr_i32 s8, s5, 31 8768; GFX6-NEXT: s_lshr_b32 s8, s8, 20 8769; GFX6-NEXT: s_add_u32 s4, s4, s8 8770; GFX6-NEXT: s_addc_u32 s5, s5, 0 8771; GFX6-NEXT: s_ashr_i32 s8, s7, 31 8772; GFX6-NEXT: s_ashr_i64 s[4:5], s[4:5], 12 8773; GFX6-NEXT: s_lshr_b32 s8, s8, 20 8774; GFX6-NEXT: s_add_u32 s6, s6, s8 8775; GFX6-NEXT: s_addc_u32 s7, s7, 0 8776; GFX6-NEXT: s_ashr_i64 s[6:7], s[6:7], 12 8777; GFX6-NEXT: v_mov_b32_e32 v0, s4 8778; GFX6-NEXT: v_mov_b32_e32 v1, s5 
8779; GFX6-NEXT: v_mov_b32_e32 v2, s6 8780; GFX6-NEXT: v_mov_b32_e32 v3, s7 8781; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 8782; GFX6-NEXT: s_endpgm 8783; 8784; GFX9-LABEL: sdiv_v2i64_pow2k_denom: 8785; GFX9: ; %bb.0: 8786; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 8787; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8788; GFX9-NEXT: v_mov_b32_e32 v4, 0 8789; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8790; GFX9-NEXT: s_ashr_i32 s0, s5, 31 8791; GFX9-NEXT: s_lshr_b32 s0, s0, 20 8792; GFX9-NEXT: s_add_u32 s0, s4, s0 8793; GFX9-NEXT: s_addc_u32 s1, s5, 0 8794; GFX9-NEXT: s_ashr_i32 s4, s7, 31 8795; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 8796; GFX9-NEXT: s_lshr_b32 s4, s4, 20 8797; GFX9-NEXT: s_add_u32 s4, s6, s4 8798; GFX9-NEXT: s_addc_u32 s5, s7, 0 8799; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 12 8800; GFX9-NEXT: v_mov_b32_e32 v0, s0 8801; GFX9-NEXT: v_mov_b32_e32 v1, s1 8802; GFX9-NEXT: v_mov_b32_e32 v2, s4 8803; GFX9-NEXT: v_mov_b32_e32 v3, s5 8804; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 8805; GFX9-NEXT: s_endpgm 8806 %r = sdiv <2 x i64> %x, <i64 4096, i64 4096> 8807 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 8808 ret void 8809} 8810 8811define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 8812; CHECK-LABEL: @ssdiv_v2i64_mixed_pow2k_denom( 8813; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 8814; CHECK-NEXT: [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096 8815; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 8816; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 8817; CHECK-NEXT: [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4095 8818; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 8819; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 8820; CHECK-NEXT: ret void 8821; 8822; GFX6-LABEL: ssdiv_v2i64_mixed_pow2k_denom: 8823; GFX6: ; %bb.0: 8824; GFX6-NEXT: v_mov_b32_e32 v0, 
0x457ff000 8825; GFX6-NEXT: v_mov_b32_e32 v1, 0x4f800000 8826; GFX6-NEXT: v_mac_f32_e32 v0, 0, v1 8827; GFX6-NEXT: v_rcp_f32_e32 v0, v0 8828; GFX6-NEXT: s_movk_i32 s6, 0xf001 8829; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 8830; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 8831; GFX6-NEXT: s_mov_b32 s7, 0xf000 8832; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 8833; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 8834; GFX6-NEXT: v_trunc_f32_e32 v1, v1 8835; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 8836; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 8837; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 8838; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8839; GFX6-NEXT: s_ashr_i32 s8, s1, 31 8840; GFX6-NEXT: s_lshr_b32 s8, s8, 20 8841; GFX6-NEXT: v_mul_hi_u32 v2, v0, s6 8842; GFX6-NEXT: v_mul_lo_u32 v3, v1, s6 8843; GFX6-NEXT: s_add_u32 s0, s0, s8 8844; GFX6-NEXT: s_addc_u32 s1, s1, 0 8845; GFX6-NEXT: s_ashr_i64 s[8:9], s[0:1], 12 8846; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8847; GFX6-NEXT: v_mul_lo_u32 v3, v0, s6 8848; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 8849; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 8850; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 8851; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 8852; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 8853; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 8854; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 8855; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 8856; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 8857; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 8858; GFX6-NEXT: s_ashr_i32 s10, s3, 31 8859; GFX6-NEXT: s_add_u32 s0, s2, s10 8860; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 8861; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc 8862; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc 8863; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8864; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 8865; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 8866; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 8867; GFX6-NEXT: v_mul_lo_u32 v2, v1, s6 8868; GFX6-NEXT: v_mul_hi_u32 v3, v0, s6 8869; GFX6-NEXT: s_mov_b32 s11, s10 8870; GFX6-NEXT: 
s_addc_u32 s1, s3, s10 8871; GFX6-NEXT: s_xor_b64 s[0:1], s[0:1], s[10:11] 8872; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 8873; GFX6-NEXT: v_mul_lo_u32 v3, v0, s6 8874; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 8875; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 8876; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 8877; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 8878; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 8879; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 8880; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 8881; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 8882; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc 8883; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 8884; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 8885; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc 8886; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc 8887; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8888; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 8889; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 8890; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 8891; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 8892; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 8893; GFX6-NEXT: v_mul_hi_u32 v4, s0, v1 8894; GFX6-NEXT: v_mul_hi_u32 v5, s1, v1 8895; GFX6-NEXT: v_mul_lo_u32 v1, s1, v1 8896; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8897; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 8898; GFX6-NEXT: v_mul_lo_u32 v4, s1, v0 8899; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 8900; GFX6-NEXT: s_movk_i32 s2, 0xfff 8901; GFX6-NEXT: s_mov_b32 s6, -1 8902; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 8903; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 8904; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc 8905; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 8906; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 8907; GFX6-NEXT: v_mul_lo_u32 v4, v1, s2 8908; GFX6-NEXT: v_mul_hi_u32 v5, v0, s2 8909; GFX6-NEXT: v_add_i32_e32 v2, vcc, 2, v0 8910; GFX6-NEXT: v_mul_lo_u32 v8, v0, s2 8911; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 8912; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v0 8913; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc 8914; GFX6-NEXT: 
v_add_i32_e32 v4, vcc, v5, v4 8915; GFX6-NEXT: v_mov_b32_e32 v5, s1 8916; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s0, v8 8917; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc 8918; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s2, v8 8919; GFX6-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v4, vcc 8920; GFX6-NEXT: s_movk_i32 s0, 0xffe 8921; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s0, v5 8922; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 8923; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 8924; GFX6-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc 8925; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v8 8926; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 8927; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 8928; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 8929; GFX6-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[0:1] 8930; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 8931; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 8932; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc 8933; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 8934; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 8935; GFX6-NEXT: v_xor_b32_e32 v0, s10, v0 8936; GFX6-NEXT: v_xor_b32_e32 v1, s10, v1 8937; GFX6-NEXT: v_mov_b32_e32 v3, s10 8938; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s10, v0 8939; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc 8940; GFX6-NEXT: v_mov_b32_e32 v0, s8 8941; GFX6-NEXT: v_mov_b32_e32 v1, s9 8942; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 8943; GFX6-NEXT: s_endpgm 8944; 8945; GFX9-LABEL: ssdiv_v2i64_mixed_pow2k_denom: 8946; GFX9: ; %bb.0: 8947; GFX9-NEXT: v_mov_b32_e32 v0, 0x457ff000 8948; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000 8949; GFX9-NEXT: v_mac_f32_e32 v0, 0, v1 8950; GFX9-NEXT: v_rcp_f32_e32 v0, v0 8951; GFX9-NEXT: s_movk_i32 s8, 0xf001 8952; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 8953; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8954; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 8955; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 8956; GFX9-NEXT: v_trunc_f32_e32 v1, v1 8957; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 8958; 
GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 8959; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 8960; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8961; GFX9-NEXT: s_ashr_i32 s0, s5, 31 8962; GFX9-NEXT: s_lshr_b32 s0, s0, 20 8963; GFX9-NEXT: v_mul_hi_u32 v2, v0, s8 8964; GFX9-NEXT: v_mul_lo_u32 v3, v1, s8 8965; GFX9-NEXT: v_mul_lo_u32 v4, v0, s8 8966; GFX9-NEXT: s_add_u32 s0, s4, s0 8967; GFX9-NEXT: s_addc_u32 s1, s5, 0 8968; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 8969; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 8970; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 8971; GFX9-NEXT: v_mul_hi_u32 v5, v0, v4 8972; GFX9-NEXT: v_mul_hi_u32 v6, v0, v2 8973; GFX9-NEXT: v_mul_hi_u32 v7, v1, v2 8974; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 8975; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 8976; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc 8977; GFX9-NEXT: v_mul_lo_u32 v6, v1, v4 8978; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 8979; GFX9-NEXT: s_ashr_i64 s[4:5], s[0:1], 12 8980; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 8981; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc 8982; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc 8983; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 8984; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 8985; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 8986; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 8987; GFX9-NEXT: v_mul_lo_u32 v2, v1, s8 8988; GFX9-NEXT: v_mul_hi_u32 v3, v0, s8 8989; GFX9-NEXT: v_mul_lo_u32 v4, v0, s8 8990; GFX9-NEXT: s_ashr_i32 s8, s7, 31 8991; GFX9-NEXT: s_add_u32 s0, s6, s8 8992; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 8993; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 8994; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 8995; GFX9-NEXT: v_mul_hi_u32 v7, v0, v4 8996; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 8997; GFX9-NEXT: v_mul_hi_u32 v5, v1, v4 8998; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 8999; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 9000; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 9001; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc 9002; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 9003; GFX9-NEXT: v_add_co_u32_e32 
v4, vcc, v6, v4 9004; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v5, vcc 9005; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 9006; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 9007; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 9008; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 9009; GFX9-NEXT: s_mov_b32 s9, s8 9010; GFX9-NEXT: s_addc_u32 s1, s7, s8 9011; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 9012; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], s[8:9] 9013; GFX9-NEXT: v_mul_lo_u32 v2, s0, v1 9014; GFX9-NEXT: v_mul_hi_u32 v3, s0, v0 9015; GFX9-NEXT: v_mul_hi_u32 v5, s0, v1 9016; GFX9-NEXT: v_mul_hi_u32 v6, s1, v1 9017; GFX9-NEXT: v_mul_lo_u32 v1, s1, v1 9018; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 9019; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc 9020; GFX9-NEXT: v_mul_lo_u32 v5, s1, v0 9021; GFX9-NEXT: v_mul_hi_u32 v0, s1, v0 9022; GFX9-NEXT: s_movk_i32 s6, 0xfff 9023; GFX9-NEXT: v_mov_b32_e32 v4, 0 9024; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 9025; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 9026; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v6, vcc 9027; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 9028; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc 9029; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 2, v0 9030; GFX9-NEXT: v_mul_lo_u32 v5, v1, s6 9031; GFX9-NEXT: v_mul_hi_u32 v6, v0, s6 9032; GFX9-NEXT: v_mul_lo_u32 v9, v0, s6 9033; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc 9034; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0 9035; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc 9036; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 9037; GFX9-NEXT: v_mov_b32_e32 v6, s1 9038; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s0, v9 9039; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc 9040; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v9 9041; GFX9-NEXT: v_subbrev_co_u32_e32 v10, vcc, 0, v5, vcc 9042; GFX9-NEXT: s_movk_i32 s0, 0xffe 9043; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s0, v6 9044; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 9045; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, 
v10 9046; GFX9-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc 9047; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v9 9048; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 9049; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 9050; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 9051; GFX9-NEXT: v_cndmask_b32_e64 v5, -1, v6, s[0:1] 9052; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 9053; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc 9054; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc 9055; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 9056; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 9057; GFX9-NEXT: v_xor_b32_e32 v0, s8, v0 9058; GFX9-NEXT: v_xor_b32_e32 v1, s8, v1 9059; GFX9-NEXT: v_mov_b32_e32 v3, s8 9060; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s8, v0 9061; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc 9062; GFX9-NEXT: v_mov_b32_e32 v0, s4 9063; GFX9-NEXT: v_mov_b32_e32 v1, s5 9064; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 9065; GFX9-NEXT: s_endpgm 9066 %r = sdiv <2 x i64> %x, <i64 4096, i64 4095> 9067 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 9068 ret void 9069} 9070; Signed <2 x i64> division where each divisor lane is 4096 shifted left by the matching lane of %y, so the divisor is not a compile-time constant. The opt-side assertions expect the vector sdiv to be split into two scalar i64 sdiv instructions (extractelement / sdiv / insertelement) that are themselves left unexpanded in IR; the llc-side assertions pin the fully expanded 64-bit division sequence for the tahiti and gfx900 RUN lines. 9071define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 9072; CHECK-LABEL: @sdiv_v2i64_pow2_shl_denom( 9073; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 9074; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 9075; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 9076; CHECK-NEXT: [[TMP3:%.*]] = sdiv i64 [[TMP1]], [[TMP2]] 9077; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 9078; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 9079; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 9080; CHECK-NEXT: [[TMP7:%.*]] = sdiv i64 [[TMP5]], [[TMP6]] 9081; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 9082; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]],
align 16 9083; CHECK-NEXT: ret void 9084; 9085; GFX6-LABEL: sdiv_v2i64_pow2_shl_denom: 9086; GFX6: ; %bb.0: 9087; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 9088; GFX6-NEXT: s_mov_b64 s[12:13], 0x1000 9089; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9090; GFX6-NEXT: s_lshl_b64 s[8:9], s[12:13], s8 9091; GFX6-NEXT: s_lshl_b64 s[2:3], s[12:13], s10 9092; GFX6-NEXT: s_ashr_i32 s14, s9, 31 9093; GFX6-NEXT: s_add_u32 s8, s8, s14 9094; GFX6-NEXT: s_mov_b32 s15, s14 9095; GFX6-NEXT: s_addc_u32 s9, s9, s14 9096; GFX6-NEXT: s_xor_b64 s[12:13], s[8:9], s[14:15] 9097; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s12 9098; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s13 9099; GFX6-NEXT: s_sub_u32 s10, 0, s12 9100; GFX6-NEXT: s_subb_u32 s11, 0, s13 9101; GFX6-NEXT: s_ashr_i32 s16, s5, 31 9102; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 9103; GFX6-NEXT: v_rcp_f32_e32 v0, v0 9104; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 9105; GFX6-NEXT: s_add_u32 s0, s4, s16 9106; GFX6-NEXT: s_mov_b32 s17, s16 9107; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 9108; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 9109; GFX6-NEXT: v_trunc_f32_e32 v1, v1 9110; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 9111; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 9112; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 9113; GFX6-NEXT: s_addc_u32 s1, s5, s16 9114; GFX6-NEXT: s_xor_b64 s[4:5], s[0:1], s[16:17] 9115; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 9116; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 9117; GFX6-NEXT: v_mul_lo_u32 v5, s11, v0 9118; GFX6-NEXT: v_mul_lo_u32 v4, s10, v0 9119; GFX6-NEXT: s_xor_b64 s[14:15], s[16:17], s[14:15] 9120; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 9121; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 9122; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 9123; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 9124; GFX6-NEXT: v_mul_hi_u32 v7, v0, v2 9125; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 9126; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 9127; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 9128; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc 9129; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2
9130; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 9131; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 9132; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc 9133; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc 9134; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9135; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 9136; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 9137; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 9138; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 9139; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 9140; GFX6-NEXT: v_mul_lo_u32 v4, s11, v0 9141; GFX6-NEXT: s_mov_b32 s11, 0xf000 9142; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9143; GFX6-NEXT: v_mul_lo_u32 v3, s10, v0 9144; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 9145; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 9146; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 9147; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 9148; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 9149; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 9150; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 9151; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 9152; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc 9153; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 9154; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 9155; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc 9156; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc 9157; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9158; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 9159; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 9160; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 9161; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 9162; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 9163; GFX6-NEXT: v_mul_hi_u32 v4, s4, v1 9164; GFX6-NEXT: v_mul_hi_u32 v5, s5, v1 9165; GFX6-NEXT: v_mul_lo_u32 v1, s5, v1 9166; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9167; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 9168; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0 9169; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 9170; GFX6-NEXT: s_mov_b32 s10, -1 9171; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 9172; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 9173; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5,
vcc 9174; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 9175; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 9176; GFX6-NEXT: v_mul_lo_u32 v2, s12, v1 9177; GFX6-NEXT: v_mul_hi_u32 v3, s12, v0 9178; GFX6-NEXT: v_mul_lo_u32 v4, s13, v0 9179; GFX6-NEXT: v_mov_b32_e32 v5, s13 9180; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 9181; GFX6-NEXT: v_mul_lo_u32 v3, s12, v0 9182; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 9183; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v2 9184; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3 9185; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc 9186; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s12, v3 9187; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] 9188; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4 9189; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 9190; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v5 9191; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 9192; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v4 9193; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] 9194; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 9195; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] 9196; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 1, v0 9197; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] 9198; GFX6-NEXT: s_ashr_i32 s4, s3, 31 9199; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 9200; GFX6-NEXT: s_add_u32 s2, s2, s4 9201; GFX6-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] 9202; GFX6-NEXT: v_mov_b32_e32 v6, s5 9203; GFX6-NEXT: s_mov_b32 s5, s4 9204; GFX6-NEXT: s_addc_u32 s3, s3, s4 9205; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] 9206; GFX6-NEXT: v_cvt_f32_u32_e32 v8, s2 9207; GFX6-NEXT: v_cvt_f32_u32_e32 v9, s3 9208; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc 9209; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 9210; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 9211; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 9212; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 9213; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s13, v2 9214; GFX6-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 9215; GFX6-NEXT: v_cndmask_b32_e32 v2,
v6, v3, vcc 9216; GFX6-NEXT: v_rcp_f32_e32 v3, v8 9217; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 9218; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 9219; GFX6-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] 9220; GFX6-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 9221; GFX6-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 9222; GFX6-NEXT: v_trunc_f32_e32 v4, v4 9223; GFX6-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 9224; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 9225; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 9226; GFX6-NEXT: s_sub_u32 s0, 0, s2 9227; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 9228; GFX6-NEXT: v_mul_hi_u32 v2, s0, v3 9229; GFX6-NEXT: v_mul_lo_u32 v5, s0, v4 9230; GFX6-NEXT: s_subb_u32 s1, 0, s3 9231; GFX6-NEXT: v_mul_lo_u32 v6, s1, v3 9232; GFX6-NEXT: s_ashr_i32 s12, s7, 31 9233; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 9234; GFX6-NEXT: v_mul_lo_u32 v5, s0, v3 9235; GFX6-NEXT: v_add_i32_e32 v2, vcc, v6, v2 9236; GFX6-NEXT: v_mul_lo_u32 v6, v3, v2 9237; GFX6-NEXT: v_mul_hi_u32 v7, v3, v5 9238; GFX6-NEXT: v_mul_hi_u32 v8, v3, v2 9239; GFX6-NEXT: v_mul_hi_u32 v9, v4, v2 9240; GFX6-NEXT: v_mul_lo_u32 v2, v4, v2 9241; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 9242; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc 9243; GFX6-NEXT: v_mul_lo_u32 v8, v4, v5 9244; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5 9245; GFX6-NEXT: s_mov_b32 s13, s12 9246; GFX6-NEXT: v_xor_b32_e32 v0, s14, v0 9247; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 9248; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc 9249; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc 9250; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 9251; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 9252; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9253; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc 9254; GFX6-NEXT: v_mul_lo_u32 v4, s0, v3 9255; GFX6-NEXT: v_mul_hi_u32 v5, s0, v2 9256; GFX6-NEXT: v_mul_lo_u32 v6, s1, v2 9257; GFX6-NEXT: v_xor_b32_e32 v1, s15, v1 9258; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 9259; GFX6-NEXT: v_mul_lo_u32 v5, s0, v2 9260; GFX6-NEXT: v_add_i32_e32
v4, vcc, v6, v4 9261; GFX6-NEXT: v_mul_lo_u32 v8, v2, v4 9262; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5 9263; GFX6-NEXT: v_mul_hi_u32 v10, v2, v4 9264; GFX6-NEXT: v_mul_hi_u32 v7, v3, v5 9265; GFX6-NEXT: v_mul_lo_u32 v5, v3, v5 9266; GFX6-NEXT: v_mul_hi_u32 v6, v3, v4 9267; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 9268; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc 9269; GFX6-NEXT: v_mul_lo_u32 v4, v3, v4 9270; GFX6-NEXT: v_add_i32_e32 v5, vcc, v8, v5 9271; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v7, vcc 9272; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc 9273; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 9274; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 9275; GFX6-NEXT: s_add_u32 s0, s6, s12 9276; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 9277; GFX6-NEXT: s_addc_u32 s1, s7, s12 9278; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc 9279; GFX6-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13] 9280; GFX6-NEXT: v_mul_lo_u32 v4, s6, v3 9281; GFX6-NEXT: v_mul_hi_u32 v5, s6, v2 9282; GFX6-NEXT: v_mul_hi_u32 v7, s6, v3 9283; GFX6-NEXT: v_mul_hi_u32 v8, s7, v3 9284; GFX6-NEXT: v_mul_lo_u32 v3, s7, v3 9285; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 9286; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc 9287; GFX6-NEXT: v_mul_lo_u32 v7, s7, v2 9288; GFX6-NEXT: v_mul_hi_u32 v2, s7, v2 9289; GFX6-NEXT: v_mov_b32_e32 v6, s15 9290; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v7 9291; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v2, vcc 9292; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc 9293; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 9294; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 9295; GFX6-NEXT: v_mul_lo_u32 v4, s2, v3 9296; GFX6-NEXT: v_mul_hi_u32 v5, s2, v2 9297; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s14, v0 9298; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc 9299; GFX6-NEXT: v_mul_lo_u32 v6, s3, v2 9300; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 9301; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 9302; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 9303; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s7, v4 9304; GFX6-NEXT:
v_mov_b32_e32 v7, s3 9305; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s6, v5 9306; GFX6-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v7, vcc 9307; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s2, v5 9308; GFX6-NEXT: v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1] 9309; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v6 9310; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] 9311; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v7 9312; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 9313; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v6 9314; GFX6-NEXT: v_cndmask_b32_e64 v6, v8, v7, s[0:1] 9315; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 2, v2 9316; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1] 9317; GFX6-NEXT: v_add_i32_e64 v9, s[0:1], 1, v2 9318; GFX6-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] 9319; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 9320; GFX6-NEXT: v_cndmask_b32_e64 v6, v10, v8, s[0:1] 9321; GFX6-NEXT: v_mov_b32_e32 v8, s7 9322; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v8, v4, vcc 9323; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v4 9324; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 9325; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v5 9326; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 9327; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s3, v4 9328; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc 9329; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 9330; GFX6-NEXT: v_cndmask_b32_e64 v4, v9, v7, s[0:1] 9331; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 9332; GFX6-NEXT: s_xor_b64 s[0:1], s[12:13], s[4:5] 9333; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 9334; GFX6-NEXT: v_xor_b32_e32 v2, s0, v2 9335; GFX6-NEXT: v_xor_b32_e32 v3, s1, v3 9336; GFX6-NEXT: v_mov_b32_e32 v4, s1 9337; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s0, v2 9338; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc 9339; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9340; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 9341; GFX6-NEXT: s_endpgm 9342; 9343; GFX9-LABEL: sdiv_v2i64_pow2_shl_denom: 9344; GFX9: ; %bb.0: 9345; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 9346; GFX9-NEXT: s_mov_b64
s[2:3], 0x1000 9347; GFX9-NEXT: v_mov_b32_e32 v4, 0 9348; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9349; GFX9-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 9350; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 9351; GFX9-NEXT: s_ashr_i32 s8, s3, 31 9352; GFX9-NEXT: s_add_u32 s2, s2, s8 9353; GFX9-NEXT: s_mov_b32 s9, s8 9354; GFX9-NEXT: s_addc_u32 s3, s3, s8 9355; GFX9-NEXT: s_xor_b64 s[12:13], s[2:3], s[8:9] 9356; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12 9357; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 9358; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 9359; GFX9-NEXT: s_sub_u32 s0, 0, s12 9360; GFX9-NEXT: s_subb_u32 s1, 0, s13 9361; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 9362; GFX9-NEXT: v_rcp_f32_e32 v0, v0 9363; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 9364; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 9365; GFX9-NEXT: v_trunc_f32_e32 v1, v1 9366; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 9367; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 9368; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 9369; GFX9-NEXT: v_readfirstlane_b32 s14, v1 9370; GFX9-NEXT: v_readfirstlane_b32 s15, v0 9371; GFX9-NEXT: s_mul_i32 s16, s0, s14 9372; GFX9-NEXT: s_mul_hi_u32 s18, s0, s15 9373; GFX9-NEXT: s_mul_i32 s17, s1, s15 9374; GFX9-NEXT: s_add_i32 s16, s18, s16 9375; GFX9-NEXT: s_add_i32 s16, s16, s17 9376; GFX9-NEXT: s_mul_i32 s19, s0, s15 9377; GFX9-NEXT: s_mul_hi_u32 s17, s15, s16 9378; GFX9-NEXT: s_mul_i32 s18, s15, s16 9379; GFX9-NEXT: s_mul_hi_u32 s15, s15, s19 9380; GFX9-NEXT: s_add_u32 s15, s15, s18 9381; GFX9-NEXT: s_addc_u32 s17, 0, s17 9382; GFX9-NEXT: s_mul_hi_u32 s20, s14, s19 9383; GFX9-NEXT: s_mul_i32 s19, s14, s19 9384; GFX9-NEXT: s_add_u32 s15, s15, s19 9385; GFX9-NEXT: s_mul_hi_u32 s18, s14, s16 9386; GFX9-NEXT: s_addc_u32 s15, s17, s20 9387; GFX9-NEXT: s_addc_u32 s17, s18, 0 9388; GFX9-NEXT: s_mul_i32 s16, s14, s16 9389; GFX9-NEXT: s_add_u32 s15, s15, s16 9390; GFX9-NEXT: s_addc_u32 s16, 0, s17 9391; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s15, v0 9392; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 9393; GFX9-NEXT:
s_addc_u32 s14, s14, s16 9394; GFX9-NEXT: v_readfirstlane_b32 s16, v0 9395; GFX9-NEXT: s_mul_i32 s15, s0, s14 9396; GFX9-NEXT: s_mul_hi_u32 s17, s0, s16 9397; GFX9-NEXT: s_add_i32 s15, s17, s15 9398; GFX9-NEXT: s_mul_i32 s1, s1, s16 9399; GFX9-NEXT: s_add_i32 s15, s15, s1 9400; GFX9-NEXT: s_mul_i32 s0, s0, s16 9401; GFX9-NEXT: s_mul_hi_u32 s17, s14, s0 9402; GFX9-NEXT: s_mul_i32 s18, s14, s0 9403; GFX9-NEXT: s_mul_i32 s20, s16, s15 9404; GFX9-NEXT: s_mul_hi_u32 s0, s16, s0 9405; GFX9-NEXT: s_mul_hi_u32 s19, s16, s15 9406; GFX9-NEXT: s_add_u32 s0, s0, s20 9407; GFX9-NEXT: s_addc_u32 s16, 0, s19 9408; GFX9-NEXT: s_add_u32 s0, s0, s18 9409; GFX9-NEXT: s_mul_hi_u32 s1, s14, s15 9410; GFX9-NEXT: s_addc_u32 s0, s16, s17 9411; GFX9-NEXT: s_addc_u32 s1, s1, 0 9412; GFX9-NEXT: s_mul_i32 s15, s14, s15 9413; GFX9-NEXT: s_add_u32 s0, s0, s15 9414; GFX9-NEXT: s_addc_u32 s1, 0, s1 9415; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 9416; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 9417; GFX9-NEXT: s_addc_u32 s16, s14, s1 9418; GFX9-NEXT: s_ashr_i32 s14, s5, 31 9419; GFX9-NEXT: s_add_u32 s0, s4, s14 9420; GFX9-NEXT: s_mov_b32 s15, s14 9421; GFX9-NEXT: s_addc_u32 s1, s5, s14 9422; GFX9-NEXT: s_xor_b64 s[4:5], s[0:1], s[14:15] 9423; GFX9-NEXT: v_readfirstlane_b32 s17, v0 9424; GFX9-NEXT: s_mul_i32 s1, s4, s16 9425; GFX9-NEXT: s_mul_hi_u32 s18, s4, s17 9426; GFX9-NEXT: s_mul_hi_u32 s0, s4, s16 9427; GFX9-NEXT: s_add_u32 s1, s18, s1 9428; GFX9-NEXT: s_addc_u32 s0, 0, s0 9429; GFX9-NEXT: s_mul_hi_u32 s19, s5, s17 9430; GFX9-NEXT: s_mul_i32 s17, s5, s17 9431; GFX9-NEXT: s_add_u32 s1, s1, s17 9432; GFX9-NEXT: s_mul_hi_u32 s18, s5, s16 9433; GFX9-NEXT: s_addc_u32 s0, s0, s19 9434; GFX9-NEXT: s_addc_u32 s1, s18, 0 9435; GFX9-NEXT: s_mul_i32 s16, s5, s16 9436; GFX9-NEXT: s_add_u32 s16, s0, s16 9437; GFX9-NEXT: s_addc_u32 s17, 0, s1 9438; GFX9-NEXT: s_mul_i32 s0, s12, s17 9439; GFX9-NEXT: s_mul_hi_u32 s1, s12, s16 9440; GFX9-NEXT: s_add_i32 s0, s1, s0 9441; GFX9-NEXT: s_mul_i32 s1, s13, s16 9442;
GFX9-NEXT: s_add_i32 s18, s0, s1 9443; GFX9-NEXT: s_mul_i32 s1, s12, s16 9444; GFX9-NEXT: v_mov_b32_e32 v0, s1 9445; GFX9-NEXT: s_sub_i32 s0, s5, s18 9446; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0 9447; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 9448; GFX9-NEXT: s_subb_u32 s4, s0, s13 9449; GFX9-NEXT: v_subrev_co_u32_e64 v1, s[0:1], s12, v0 9450; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 9451; GFX9-NEXT: s_subb_u32 s4, s4, 0 9452; GFX9-NEXT: s_cmp_ge_u32 s4, s13 9453; GFX9-NEXT: s_cselect_b32 s19, -1, 0 9454; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v1 9455; GFX9-NEXT: s_cmp_eq_u32 s4, s13 9456; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] 9457; GFX9-NEXT: v_mov_b32_e32 v2, s19 9458; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 9459; GFX9-NEXT: s_add_u32 s4, s16, 2 9460; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[0:1] 9461; GFX9-NEXT: s_addc_u32 s0, s17, 0 9462; GFX9-NEXT: s_add_u32 s19, s16, 1 9463; GFX9-NEXT: s_addc_u32 s1, s17, 0 9464; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 9465; GFX9-NEXT: s_subb_u32 s5, s5, s18 9466; GFX9-NEXT: s_cmp_ge_u32 s5, s13 9467; GFX9-NEXT: v_mov_b32_e32 v2, s1 9468; GFX9-NEXT: v_mov_b32_e32 v3, s0 9469; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 9470; GFX9-NEXT: s_cselect_b32 s18, -1, 0 9471; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 9472; GFX9-NEXT: s_cmp_eq_u32 s5, s13 9473; GFX9-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[0:1] 9474; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc 9475; GFX9-NEXT: v_mov_b32_e32 v2, s18 9476; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 9477; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc 9478; GFX9-NEXT: v_mov_b32_e32 v2, s17 9479; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 9480; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc 9481; GFX9-NEXT: v_mov_b32_e32 v1, s19 9482; GFX9-NEXT: v_mov_b32_e32 v2, s4 9483; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] 9484; GFX9-NEXT: s_xor_b64 s[0:1], s[14:15], s[8:9] 9485; GFX9-NEXT: s_ashr_i32 s4, s11, 31 9486; GFX9-NEXT: s_add_u32 s8, s10, s4 9487; GFX9-NEXT: s_mov_b32 s5, s4 9488; GFX9-NEXT: s_addc_u32 s9, s11, s4
9489; GFX9-NEXT: v_mov_b32_e32 v2, s16 9490; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[4:5] 9491; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc 9492; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s8 9493; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s9 9494; GFX9-NEXT: v_xor_b32_e32 v1, s0, v1 9495; GFX9-NEXT: v_xor_b32_e32 v5, s1, v0 9496; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v1 9497; GFX9-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 9498; GFX9-NEXT: v_rcp_f32_e32 v2, v2 9499; GFX9-NEXT: s_sub_u32 s0, 0, s8 9500; GFX9-NEXT: v_mov_b32_e32 v6, s1 9501; GFX9-NEXT: s_subb_u32 s1, 0, s9 9502; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 9503; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 9504; GFX9-NEXT: v_trunc_f32_e32 v3, v3 9505; GFX9-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 9506; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 9507; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 9508; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v5, v6, vcc 9509; GFX9-NEXT: v_readfirstlane_b32 s10, v2 9510; GFX9-NEXT: v_readfirstlane_b32 s13, v3 9511; GFX9-NEXT: s_mul_hi_u32 s12, s0, s10 9512; GFX9-NEXT: s_mul_i32 s14, s0, s13 9513; GFX9-NEXT: s_mul_i32 s11, s1, s10 9514; GFX9-NEXT: s_add_i32 s12, s12, s14 9515; GFX9-NEXT: s_add_i32 s12, s12, s11 9516; GFX9-NEXT: s_mul_i32 s15, s0, s10 9517; GFX9-NEXT: s_mul_hi_u32 s11, s10, s12 9518; GFX9-NEXT: s_mul_i32 s14, s10, s12 9519; GFX9-NEXT: s_mul_hi_u32 s10, s10, s15 9520; GFX9-NEXT: s_add_u32 s10, s10, s14 9521; GFX9-NEXT: s_addc_u32 s11, 0, s11 9522; GFX9-NEXT: s_mul_hi_u32 s16, s13, s15 9523; GFX9-NEXT: s_mul_i32 s15, s13, s15 9524; GFX9-NEXT: s_add_u32 s10, s10, s15 9525; GFX9-NEXT: s_mul_hi_u32 s14, s13, s12 9526; GFX9-NEXT: s_addc_u32 s10, s11, s16 9527; GFX9-NEXT: s_addc_u32 s11, s14, 0 9528; GFX9-NEXT: s_mul_i32 s12, s13, s12 9529; GFX9-NEXT: s_add_u32 s10, s10, s12 9530; GFX9-NEXT: s_addc_u32 s11, 0, s11 9531; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s10, v2 9532; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 9533; GFX9-NEXT: s_addc_u32 s10, s13, s11 9534; GFX9-NEXT: v_readfirstlane_b32 s12, v2 9535;
GFX9-NEXT: s_mul_i32 s11, s0, s10 9536; GFX9-NEXT: s_mul_hi_u32 s13, s0, s12 9537; GFX9-NEXT: s_add_i32 s11, s13, s11 9538; GFX9-NEXT: s_mul_i32 s1, s1, s12 9539; GFX9-NEXT: s_add_i32 s11, s11, s1 9540; GFX9-NEXT: s_mul_i32 s0, s0, s12 9541; GFX9-NEXT: s_mul_hi_u32 s13, s10, s0 9542; GFX9-NEXT: s_mul_i32 s14, s10, s0 9543; GFX9-NEXT: s_mul_i32 s16, s12, s11 9544; GFX9-NEXT: s_mul_hi_u32 s0, s12, s0 9545; GFX9-NEXT: s_mul_hi_u32 s15, s12, s11 9546; GFX9-NEXT: s_add_u32 s0, s0, s16 9547; GFX9-NEXT: s_addc_u32 s12, 0, s15 9548; GFX9-NEXT: s_add_u32 s0, s0, s14 9549; GFX9-NEXT: s_mul_hi_u32 s1, s10, s11 9550; GFX9-NEXT: s_addc_u32 s0, s12, s13 9551; GFX9-NEXT: s_addc_u32 s1, s1, 0 9552; GFX9-NEXT: s_mul_i32 s11, s10, s11 9553; GFX9-NEXT: s_add_u32 s0, s0, s11 9554; GFX9-NEXT: s_addc_u32 s1, 0, s1 9555; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 9556; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 9557; GFX9-NEXT: s_addc_u32 s12, s10, s1 9558; GFX9-NEXT: s_ashr_i32 s10, s7, 31 9559; GFX9-NEXT: s_add_u32 s0, s6, s10 9560; GFX9-NEXT: s_mov_b32 s11, s10 9561; GFX9-NEXT: s_addc_u32 s1, s7, s10 9562; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] 9563; GFX9-NEXT: v_readfirstlane_b32 s13, v2 9564; GFX9-NEXT: s_mul_i32 s1, s6, s12 9565; GFX9-NEXT: s_mul_hi_u32 s14, s6, s13 9566; GFX9-NEXT: s_mul_hi_u32 s0, s6, s12 9567; GFX9-NEXT: s_add_u32 s1, s14, s1 9568; GFX9-NEXT: s_addc_u32 s0, 0, s0 9569; GFX9-NEXT: s_mul_hi_u32 s15, s7, s13 9570; GFX9-NEXT: s_mul_i32 s13, s7, s13 9571; GFX9-NEXT: s_add_u32 s1, s1, s13 9572; GFX9-NEXT: s_mul_hi_u32 s14, s7, s12 9573; GFX9-NEXT: s_addc_u32 s0, s0, s15 9574; GFX9-NEXT: s_addc_u32 s1, s14, 0 9575; GFX9-NEXT: s_mul_i32 s12, s7, s12 9576; GFX9-NEXT: s_add_u32 s12, s0, s12 9577; GFX9-NEXT: s_addc_u32 s13, 0, s1 9578; GFX9-NEXT: s_mul_i32 s0, s8, s13 9579; GFX9-NEXT: s_mul_hi_u32 s1, s8, s12 9580; GFX9-NEXT: s_add_i32 s0, s1, s0 9581; GFX9-NEXT: s_mul_i32 s1, s9, s12 9582; GFX9-NEXT: s_add_i32 s14, s0, s1 9583; GFX9-NEXT: s_mul_i32 s1, s8, s12 9584;
GFX9-NEXT: v_mov_b32_e32 v2, s1 9585; GFX9-NEXT: s_sub_i32 s0, s7, s14 9586; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s6, v2 9587; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 9588; GFX9-NEXT: s_subb_u32 s6, s0, s9 9589; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s8, v2 9590; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 9591; GFX9-NEXT: s_subb_u32 s6, s6, 0 9592; GFX9-NEXT: s_cmp_ge_u32 s6, s9 9593; GFX9-NEXT: s_cselect_b32 s15, -1, 0 9594; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v3 9595; GFX9-NEXT: s_cmp_eq_u32 s6, s9 9596; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] 9597; GFX9-NEXT: v_mov_b32_e32 v5, s15 9598; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 9599; GFX9-NEXT: s_add_u32 s6, s12, 2 9600; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[0:1] 9601; GFX9-NEXT: s_addc_u32 s0, s13, 0 9602; GFX9-NEXT: s_add_u32 s15, s12, 1 9603; GFX9-NEXT: s_addc_u32 s1, s13, 0 9604; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 9605; GFX9-NEXT: s_subb_u32 s7, s7, s14 9606; GFX9-NEXT: s_cmp_ge_u32 s7, s9 9607; GFX9-NEXT: v_mov_b32_e32 v5, s1 9608; GFX9-NEXT: v_mov_b32_e32 v6, s0 9609; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3 9610; GFX9-NEXT: s_cselect_b32 s14, -1, 0 9611; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 9612; GFX9-NEXT: s_cmp_eq_u32 s7, s9 9613; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1] 9614; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc 9615; GFX9-NEXT: v_mov_b32_e32 v5, s14 9616; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 9617; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc 9618; GFX9-NEXT: v_mov_b32_e32 v5, s13 9619; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 9620; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc 9621; GFX9-NEXT: v_mov_b32_e32 v3, s15 9622; GFX9-NEXT: v_mov_b32_e32 v5, s6 9623; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 9624; GFX9-NEXT: v_mov_b32_e32 v5, s12 9625; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc 9626; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], s[4:5] 9627; GFX9-NEXT: v_xor_b32_e32 v3, s0, v3 9628; GFX9-NEXT: v_xor_b32_e32 v5, s1, v2 9629; GFX9-NEXT: v_mov_b32_e32 v6, s1 9630; GFX9-NEXT:
v_subrev_co_u32_e32 v2, vcc, s0, v3 9631; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v6, vcc 9632; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9633; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 9634; GFX9-NEXT: s_endpgm 9635 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 9636 %r = sdiv <2 x i64> %x, %shl.y 9637 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 9638 ret void 9639} 9640 9641define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 9642; CHECK-LABEL: @srem_i64_oddk_denom( 9643; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], 1235195 9644; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 9645; CHECK-NEXT: ret void 9646; 9647; GFX6-LABEL: srem_i64_oddk_denom: 9648; GFX6: ; %bb.0: 9649; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f800000 9650; GFX6-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 9651; GFX6-NEXT: v_rcp_f32_e32 v0, v0 9652; GFX6-NEXT: s_mov_b32 s4, 0xffed2705 9653; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 9654; GFX6-NEXT: s_mov_b32 s7, 0xf000 9655; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 9656; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 9657; GFX6-NEXT: v_trunc_f32_e32 v1, v1 9658; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 9659; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 9660; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 9661; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9662; GFX6-NEXT: s_ashr_i32 s8, s3, 31 9663; GFX6-NEXT: s_add_u32 s2, s2, s8 9664; GFX6-NEXT: v_mul_lo_u32 v2, v1, s4 9665; GFX6-NEXT: v_mul_hi_u32 v3, v0, s4 9666; GFX6-NEXT: v_mul_lo_u32 v4, v0, s4 9667; GFX6-NEXT: s_mov_b32 s9, s8 9668; GFX6-NEXT: s_addc_u32 s3, s3, s8 9669; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 9670; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v0 9671; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 9672; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 9673; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 9674; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 9675; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 9676; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 9677; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 9678; GFX6-NEXT:
v_mul_lo_u32 v6, v1, v4 9679; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 9680; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[8:9] 9681; GFX6-NEXT: s_mov_b32 s5, s1 9682; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 9683; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc 9684; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc 9685; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9686; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 9687; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 9688; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 9689; GFX6-NEXT: v_mul_lo_u32 v2, v1, s4 9690; GFX6-NEXT: v_mul_hi_u32 v3, v0, s4 9691; GFX6-NEXT: s_mov_b32 s6, -1 9692; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9693; GFX6-NEXT: v_mul_lo_u32 v3, v0, s4 9694; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 9695; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 9696; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 9697; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 9698; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 9699; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 9700; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 9701; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 9702; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc 9703; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 9704; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 9705; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc 9706; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc 9707; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9708; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 9709; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 9710; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 9711; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 9712; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 9713; GFX6-NEXT: v_mul_hi_u32 v4, s2, v1 9714; GFX6-NEXT: v_mul_hi_u32 v5, s3, v1 9715; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1 9716; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9717; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 9718; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 9719; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 9720; GFX6-NEXT: s_mov_b32 s4, s0 9721; GFX6-NEXT: s_mov_b32 s0, 0x12d8fb 9722; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 
9723; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 9724; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc 9725; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 9726; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 9727; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 9728; GFX6-NEXT: v_mul_hi_u32 v2, v0, s0 9729; GFX6-NEXT: v_mul_lo_u32 v0, v0, s0 9730; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 9731; GFX6-NEXT: v_mov_b32_e32 v2, s3 9732; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 9733; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 9734; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s0, v0 9735; GFX6-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v1, vcc 9736; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s0, v2 9737; GFX6-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v3, vcc 9738; GFX6-NEXT: s_mov_b32 s0, 0x12d8fa 9739; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s0, v2 9740; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 9741; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 9742; GFX6-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc 9743; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 9744; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v0 9745; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 9746; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 9747; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 9748; GFX6-NEXT: v_cndmask_b32_e64 v5, -1, v5, s[0:1] 9749; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 9750; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 9751; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 9752; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 9753; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 9754; GFX6-NEXT: v_xor_b32_e32 v1, s8, v1 9755; GFX6-NEXT: v_mov_b32_e32 v2, s8 9756; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 9757; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 9758; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 9759; GFX6-NEXT: s_endpgm 9760; 9761; GFX9-LABEL: srem_i64_oddk_denom: 9762; GFX9: ; %bb.0: 9763; GFX9-NEXT: v_mov_b32_e32 v0, 0x4996c7d8 9764; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000 9765; GFX9-NEXT: v_mac_f32_e32 v0, 0, v1 9766; GFX9-NEXT: 
v_rcp_f32_e32 v0, v0 9767; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 9768; GFX9-NEXT: v_mov_b32_e32 v2, 0 9769; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 9770; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 9771; GFX9-NEXT: v_trunc_f32_e32 v1, v1 9772; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 9773; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 9774; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 9775; GFX9-NEXT: v_readfirstlane_b32 s0, v1 9776; GFX9-NEXT: v_readfirstlane_b32 s1, v0 9777; GFX9-NEXT: s_mul_hi_u32 s2, s1, 0xffed2705 9778; GFX9-NEXT: s_mul_i32 s3, s0, 0xffed2705 9779; GFX9-NEXT: s_add_i32 s2, s2, s3 9780; GFX9-NEXT: s_sub_i32 s2, s2, s1 9781; GFX9-NEXT: s_mul_i32 s9, s1, 0xffed2705 9782; GFX9-NEXT: s_mul_hi_u32 s3, s1, s2 9783; GFX9-NEXT: s_mul_i32 s8, s1, s2 9784; GFX9-NEXT: s_mul_hi_u32 s1, s1, s9 9785; GFX9-NEXT: s_add_u32 s1, s1, s8 9786; GFX9-NEXT: s_addc_u32 s3, 0, s3 9787; GFX9-NEXT: s_mul_hi_u32 s10, s0, s9 9788; GFX9-NEXT: s_mul_i32 s9, s0, s9 9789; GFX9-NEXT: s_add_u32 s1, s1, s9 9790; GFX9-NEXT: s_mul_hi_u32 s8, s0, s2 9791; GFX9-NEXT: s_addc_u32 s1, s3, s10 9792; GFX9-NEXT: s_addc_u32 s3, s8, 0 9793; GFX9-NEXT: s_mul_i32 s2, s0, s2 9794; GFX9-NEXT: s_add_u32 s1, s1, s2 9795; GFX9-NEXT: s_addc_u32 s2, 0, s3 9796; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s1, v0 9797; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 9798; GFX9-NEXT: s_addc_u32 s0, s0, s2 9799; GFX9-NEXT: v_readfirstlane_b32 s2, v0 9800; GFX9-NEXT: s_mul_i32 s1, s0, 0xffed2705 9801; GFX9-NEXT: s_mul_hi_u32 s3, s2, 0xffed2705 9802; GFX9-NEXT: s_add_i32 s3, s3, s1 9803; GFX9-NEXT: s_sub_i32 s1, s3, s2 9804; GFX9-NEXT: s_mul_i32 s8, s2, 0xffed2705 9805; GFX9-NEXT: s_mul_hi_u32 s11, s2, s1 9806; GFX9-NEXT: s_mul_i32 s12, s2, s1 9807; GFX9-NEXT: s_mul_hi_u32 s2, s2, s8 9808; GFX9-NEXT: s_add_u32 s2, s2, s12 9809; GFX9-NEXT: s_mul_hi_u32 s9, s0, s8 9810; GFX9-NEXT: s_mul_i32 s10, s0, s8 9811; GFX9-NEXT: s_addc_u32 s8, 0, s11 9812; GFX9-NEXT: s_add_u32 s2, s2, s10 9813; GFX9-NEXT: s_mul_hi_u32 s3, s0, s1 9814; 
GFX9-NEXT: s_addc_u32 s2, s8, s9 9815; GFX9-NEXT: s_addc_u32 s3, s3, 0 9816; GFX9-NEXT: s_mul_i32 s1, s0, s1 9817; GFX9-NEXT: s_add_u32 s1, s2, s1 9818; GFX9-NEXT: s_addc_u32 s2, 0, s3 9819; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s1, v0 9820; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 9821; GFX9-NEXT: s_addc_u32 s8, s0, s2 9822; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9823; GFX9-NEXT: s_ashr_i32 s2, s7, 31 9824; GFX9-NEXT: s_add_u32 s0, s6, s2 9825; GFX9-NEXT: s_mov_b32 s3, s2 9826; GFX9-NEXT: s_addc_u32 s1, s7, s2 9827; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] 9828; GFX9-NEXT: v_readfirstlane_b32 s7, v0 9829; GFX9-NEXT: s_mul_i32 s6, s0, s8 9830; GFX9-NEXT: s_mul_hi_u32 s9, s0, s7 9831; GFX9-NEXT: s_mul_hi_u32 s3, s0, s8 9832; GFX9-NEXT: s_add_u32 s6, s9, s6 9833; GFX9-NEXT: s_addc_u32 s3, 0, s3 9834; GFX9-NEXT: s_mul_hi_u32 s10, s1, s7 9835; GFX9-NEXT: s_mul_i32 s7, s1, s7 9836; GFX9-NEXT: s_add_u32 s6, s6, s7 9837; GFX9-NEXT: s_mul_hi_u32 s9, s1, s8 9838; GFX9-NEXT: s_addc_u32 s3, s3, s10 9839; GFX9-NEXT: s_addc_u32 s6, s9, 0 9840; GFX9-NEXT: s_mul_i32 s7, s1, s8 9841; GFX9-NEXT: s_add_u32 s3, s3, s7 9842; GFX9-NEXT: s_addc_u32 s6, 0, s6 9843; GFX9-NEXT: s_mul_hi_u32 s8, s3, 0x12d8fb 9844; GFX9-NEXT: s_mul_i32 s3, s3, 0x12d8fb 9845; GFX9-NEXT: s_mul_i32 s6, s6, 0x12d8fb 9846; GFX9-NEXT: v_mov_b32_e32 v0, s3 9847; GFX9-NEXT: s_add_i32 s8, s8, s6 9848; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 9849; GFX9-NEXT: s_mov_b32 s7, 0x12d8fb 9850; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 9851; GFX9-NEXT: s_subb_u32 s3, s1, s8 9852; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s7, v0 9853; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 9854; GFX9-NEXT: s_subb_u32 s0, s3, 0 9855; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s7, v1 9856; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 9857; GFX9-NEXT: s_subb_u32 s1, s0, 0 9858; GFX9-NEXT: s_mov_b32 s6, 0x12d8fa 9859; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s6, v1 9860; GFX9-NEXT: s_cmp_eq_u32 s0, 0 9861; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc 9862; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 9863; 
GFX9-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc 9864; GFX9-NEXT: v_mov_b32_e32 v5, s0 9865; GFX9-NEXT: v_mov_b32_e32 v6, s1 9866; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 9867; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s6, v0 9868; GFX9-NEXT: s_cmp_eq_u32 s3, 0 9869; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc 9870; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 9871; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0 9872; GFX9-NEXT: v_cndmask_b32_e64 v5, -1, v5, s[0:1] 9873; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 9874; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 9875; GFX9-NEXT: v_mov_b32_e32 v6, s3 9876; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] 9877; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[0:1] 9878; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 9879; GFX9-NEXT: v_xor_b32_e32 v1, s2, v4 9880; GFX9-NEXT: v_mov_b32_e32 v3, s2 9881; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s2, v0 9882; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc 9883; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 9884; GFX9-NEXT: s_endpgm 9885 %r = srem i64 %x, 1235195 9886 store i64 %r, i64 addrspace(1)* %out 9887 ret void 9888} 9889 9890define amdgpu_kernel void @srem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 9891; CHECK-LABEL: @srem_i64_pow2k_denom( 9892; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], 4096 9893; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 9894; CHECK-NEXT: ret void 9895; 9896; GFX6-LABEL: srem_i64_pow2k_denom: 9897; GFX6: ; %bb.0: 9898; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 9899; GFX6-NEXT: s_mov_b32 s7, 0xf000 9900; GFX6-NEXT: s_mov_b32 s6, -1 9901; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9902; GFX6-NEXT: s_mov_b32 s4, s0 9903; GFX6-NEXT: s_ashr_i32 s0, s3, 31 9904; GFX6-NEXT: s_lshr_b32 s0, s0, 20 9905; GFX6-NEXT: s_add_u32 s0, s2, s0 9906; GFX6-NEXT: s_mov_b32 s5, s1 9907; GFX6-NEXT: s_addc_u32 s1, s3, 0 9908; GFX6-NEXT: s_and_b32 s0, s0, 0xfffff000 9909; GFX6-NEXT: s_sub_u32 s0, s2, s0 9910; GFX6-NEXT: s_subb_u32 s1, s3, s1 9911; GFX6-NEXT: v_mov_b32_e32 
v0, s0 9912; GFX6-NEXT: v_mov_b32_e32 v1, s1 9913; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 9914; GFX6-NEXT: s_endpgm 9915; 9916; GFX9-LABEL: srem_i64_pow2k_denom: 9917; GFX9: ; %bb.0: 9918; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 9919; GFX9-NEXT: v_mov_b32_e32 v2, 0 9920; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9921; GFX9-NEXT: s_ashr_i32 s4, s3, 31 9922; GFX9-NEXT: s_lshr_b32 s4, s4, 20 9923; GFX9-NEXT: s_add_u32 s4, s2, s4 9924; GFX9-NEXT: s_addc_u32 s5, s3, 0 9925; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000 9926; GFX9-NEXT: s_sub_u32 s2, s2, s4 9927; GFX9-NEXT: s_subb_u32 s3, s3, s5 9928; GFX9-NEXT: v_mov_b32_e32 v0, s2 9929; GFX9-NEXT: v_mov_b32_e32 v1, s3 9930; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 9931; GFX9-NEXT: s_endpgm 9932 %r = srem i64 %x, 4096 9933 store i64 %r, i64 addrspace(1)* %out 9934 ret void 9935} 9936 9937define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 9938; CHECK-LABEL: @srem_i64_pow2_shl_denom( 9939; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 9940; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], [[SHL_Y]] 9941; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 9942; CHECK-NEXT: ret void 9943; 9944; GFX6-LABEL: srem_i64_pow2_shl_denom: 9945; GFX6: ; %bb.0: 9946; GFX6-NEXT: s_load_dword s4, s[0:1], 0xd 9947; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000 9948; GFX6-NEXT: s_mov_b32 s7, 0xf000 9949; GFX6-NEXT: s_mov_b32 s6, -1 9950; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9951; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 9952; GFX6-NEXT: s_ashr_i32 s4, s3, 31 9953; GFX6-NEXT: s_add_u32 s2, s2, s4 9954; GFX6-NEXT: s_mov_b32 s5, s4 9955; GFX6-NEXT: s_addc_u32 s3, s3, s4 9956; GFX6-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5] 9957; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 9958; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 9959; GFX6-NEXT: s_sub_u32 s4, 0, s8 9960; GFX6-NEXT: s_subb_u32 s5, 0, s9 9961; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 9962; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 
9963; GFX6-NEXT: v_rcp_f32_e32 v0, v0 9964; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9965; GFX6-NEXT: s_ashr_i32 s10, s3, 31 9966; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 9967; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 9968; GFX6-NEXT: v_trunc_f32_e32 v1, v1 9969; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 9970; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 9971; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 9972; GFX6-NEXT: s_add_u32 s2, s2, s10 9973; GFX6-NEXT: s_mov_b32 s11, s10 9974; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 9975; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 9976; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0 9977; GFX6-NEXT: v_mul_lo_u32 v4, s4, v0 9978; GFX6-NEXT: s_addc_u32 s3, s3, s10 9979; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 9980; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 9981; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 9982; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 9983; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 9984; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 9985; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 9986; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 9987; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 9988; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 9989; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 9990; GFX6-NEXT: s_xor_b64 s[12:13], s[2:3], s[10:11] 9991; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 9992; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc 9993; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc 9994; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9995; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 9996; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 9997; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 9998; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 9999; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 10000; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0 10001; GFX6-NEXT: s_mov_b32 s5, s1 10002; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10003; GFX6-NEXT: v_mul_lo_u32 v3, s4, v0 10004; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 10005; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 10006; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 10007; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 10008; GFX6-NEXT: 
v_mul_hi_u32 v5, v1, v3 10009; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 10010; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 10011; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 10012; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc 10013; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 10014; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 10015; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc 10016; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc 10017; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10018; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 10019; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 10020; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 10021; GFX6-NEXT: v_mul_lo_u32 v2, s12, v1 10022; GFX6-NEXT: v_mul_hi_u32 v3, s12, v0 10023; GFX6-NEXT: v_mul_hi_u32 v4, s12, v1 10024; GFX6-NEXT: v_mul_hi_u32 v5, s13, v1 10025; GFX6-NEXT: v_mul_lo_u32 v1, s13, v1 10026; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10027; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 10028; GFX6-NEXT: v_mul_lo_u32 v4, s13, v0 10029; GFX6-NEXT: v_mul_hi_u32 v0, s13, v0 10030; GFX6-NEXT: s_mov_b32 s4, s0 10031; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 10032; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 10033; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc 10034; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 10035; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 10036; GFX6-NEXT: v_mul_lo_u32 v1, s8, v1 10037; GFX6-NEXT: v_mul_hi_u32 v2, s8, v0 10038; GFX6-NEXT: v_mul_lo_u32 v3, s9, v0 10039; GFX6-NEXT: v_mul_lo_u32 v0, s8, v0 10040; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 10041; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 10042; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s13, v1 10043; GFX6-NEXT: v_mov_b32_e32 v3, s9 10044; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s12, v0 10045; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc 10046; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s8, v0 10047; GFX6-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] 10048; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v5 10049; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] 10050; GFX6-NEXT: 
v_cmp_le_u32_e64 s[2:3], s8, v4 10051; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] 10052; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 10053; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v5 10054; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s8, v4 10055; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] 10056; GFX6-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] 10057; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 10058; GFX6-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] 10059; GFX6-NEXT: v_mov_b32_e32 v5, s13 10060; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc 10061; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 10062; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 10063; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 10064; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 10065; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s9, v1 10066; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 10067; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 10068; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 10069; GFX6-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] 10070; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 10071; GFX6-NEXT: v_xor_b32_e32 v0, s10, v0 10072; GFX6-NEXT: v_xor_b32_e32 v1, s10, v1 10073; GFX6-NEXT: v_mov_b32_e32 v2, s10 10074; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s10, v0 10075; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 10076; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 10077; GFX6-NEXT: s_endpgm 10078; 10079; GFX9-LABEL: srem_i64_pow2_shl_denom: 10080; GFX9: ; %bb.0: 10081; GFX9-NEXT: s_load_dword s4, s[0:1], 0x34 10082; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 10083; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10084; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 10085; GFX9-NEXT: s_ashr_i32 s4, s3, 31 10086; GFX9-NEXT: s_add_u32 s2, s2, s4 10087; GFX9-NEXT: s_mov_b32 s5, s4 10088; GFX9-NEXT: s_addc_u32 s3, s3, s4 10089; GFX9-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5] 10090; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 10091; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 10092; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 10093; 
GFX9-NEXT: s_sub_u32 s0, 0, s8 10094; GFX9-NEXT: s_subb_u32 s1, 0, s9 10095; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 10096; GFX9-NEXT: v_rcp_f32_e32 v1, v0 10097; GFX9-NEXT: v_mov_b32_e32 v0, 0 10098; GFX9-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 10099; GFX9-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1 10100; GFX9-NEXT: v_trunc_f32_e32 v2, v2 10101; GFX9-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2 10102; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 10103; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 10104; GFX9-NEXT: v_readfirstlane_b32 s2, v2 10105; GFX9-NEXT: v_readfirstlane_b32 s3, v1 10106; GFX9-NEXT: s_mul_i32 s10, s0, s2 10107; GFX9-NEXT: s_mul_hi_u32 s12, s0, s3 10108; GFX9-NEXT: s_mul_i32 s11, s1, s3 10109; GFX9-NEXT: s_add_i32 s10, s12, s10 10110; GFX9-NEXT: s_add_i32 s10, s10, s11 10111; GFX9-NEXT: s_mul_i32 s13, s0, s3 10112; GFX9-NEXT: s_mul_hi_u32 s11, s3, s10 10113; GFX9-NEXT: s_mul_i32 s12, s3, s10 10114; GFX9-NEXT: s_mul_hi_u32 s3, s3, s13 10115; GFX9-NEXT: s_add_u32 s3, s3, s12 10116; GFX9-NEXT: s_addc_u32 s11, 0, s11 10117; GFX9-NEXT: s_mul_hi_u32 s14, s2, s13 10118; GFX9-NEXT: s_mul_i32 s13, s2, s13 10119; GFX9-NEXT: s_add_u32 s3, s3, s13 10120; GFX9-NEXT: s_mul_hi_u32 s12, s2, s10 10121; GFX9-NEXT: s_addc_u32 s3, s11, s14 10122; GFX9-NEXT: s_addc_u32 s11, s12, 0 10123; GFX9-NEXT: s_mul_i32 s10, s2, s10 10124; GFX9-NEXT: s_add_u32 s3, s3, s10 10125; GFX9-NEXT: s_addc_u32 s10, 0, s11 10126; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s3, v1 10127; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 10128; GFX9-NEXT: s_addc_u32 s2, s2, s10 10129; GFX9-NEXT: v_readfirstlane_b32 s10, v1 10130; GFX9-NEXT: s_mul_i32 s3, s0, s2 10131; GFX9-NEXT: s_mul_hi_u32 s11, s0, s10 10132; GFX9-NEXT: s_add_i32 s3, s11, s3 10133; GFX9-NEXT: s_mul_i32 s1, s1, s10 10134; GFX9-NEXT: s_add_i32 s3, s3, s1 10135; GFX9-NEXT: s_mul_i32 s0, s0, s10 10136; GFX9-NEXT: s_mul_hi_u32 s11, s2, s0 10137; GFX9-NEXT: s_mul_i32 s12, s2, s0 10138; GFX9-NEXT: s_mul_i32 s14, s10, s3 10139; GFX9-NEXT: s_mul_hi_u32 s0, s10, s0 10140; 
GFX9-NEXT: s_mul_hi_u32 s13, s10, s3 10141; GFX9-NEXT: s_add_u32 s0, s0, s14 10142; GFX9-NEXT: s_addc_u32 s10, 0, s13 10143; GFX9-NEXT: s_add_u32 s0, s0, s12 10144; GFX9-NEXT: s_mul_hi_u32 s1, s2, s3 10145; GFX9-NEXT: s_addc_u32 s0, s10, s11 10146; GFX9-NEXT: s_addc_u32 s1, s1, 0 10147; GFX9-NEXT: s_mul_i32 s3, s2, s3 10148; GFX9-NEXT: s_add_u32 s0, s0, s3 10149; GFX9-NEXT: s_addc_u32 s1, 0, s1 10150; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1 10151; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 10152; GFX9-NEXT: s_addc_u32 s2, s2, s1 10153; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10154; GFX9-NEXT: s_ashr_i32 s10, s7, 31 10155; GFX9-NEXT: s_add_u32 s0, s6, s10 10156; GFX9-NEXT: s_mov_b32 s11, s10 10157; GFX9-NEXT: s_addc_u32 s1, s7, s10 10158; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] 10159; GFX9-NEXT: v_readfirstlane_b32 s3, v1 10160; GFX9-NEXT: s_mul_i32 s1, s6, s2 10161; GFX9-NEXT: s_mul_hi_u32 s11, s6, s3 10162; GFX9-NEXT: s_mul_hi_u32 s0, s6, s2 10163; GFX9-NEXT: s_add_u32 s1, s11, s1 10164; GFX9-NEXT: s_addc_u32 s0, 0, s0 10165; GFX9-NEXT: s_mul_hi_u32 s12, s7, s3 10166; GFX9-NEXT: s_mul_i32 s3, s7, s3 10167; GFX9-NEXT: s_add_u32 s1, s1, s3 10168; GFX9-NEXT: s_mul_hi_u32 s11, s7, s2 10169; GFX9-NEXT: s_addc_u32 s0, s0, s12 10170; GFX9-NEXT: s_addc_u32 s1, s11, 0 10171; GFX9-NEXT: s_mul_i32 s2, s7, s2 10172; GFX9-NEXT: s_add_u32 s0, s0, s2 10173; GFX9-NEXT: s_addc_u32 s1, 0, s1 10174; GFX9-NEXT: s_mul_i32 s1, s8, s1 10175; GFX9-NEXT: s_mul_hi_u32 s2, s8, s0 10176; GFX9-NEXT: s_add_i32 s1, s2, s1 10177; GFX9-NEXT: s_mul_i32 s2, s9, s0 10178; GFX9-NEXT: s_mul_i32 s0, s8, s0 10179; GFX9-NEXT: s_add_i32 s11, s1, s2 10180; GFX9-NEXT: v_mov_b32_e32 v1, s0 10181; GFX9-NEXT: s_sub_i32 s1, s7, s11 10182; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s6, v1 10183; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 10184; GFX9-NEXT: s_subb_u32 s6, s1, s9 10185; GFX9-NEXT: v_subrev_co_u32_e64 v2, s[0:1], s8, v1 10186; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 10187; GFX9-NEXT: s_subb_u32 s12, s6, 0 10188; GFX9-NEXT: 
s_cmp_ge_u32 s12, s9 10189; GFX9-NEXT: s_cselect_b32 s13, -1, 0 10190; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v2 10191; GFX9-NEXT: s_cmp_eq_u32 s12, s9 10192; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[2:3] 10193; GFX9-NEXT: v_mov_b32_e32 v4, s13 10194; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 10195; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 10196; GFX9-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[2:3] 10197; GFX9-NEXT: s_subb_u32 s2, s6, s9 10198; GFX9-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s8, v2 10199; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 10200; GFX9-NEXT: s_subb_u32 s0, s2, 0 10201; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 10202; GFX9-NEXT: s_subb_u32 s2, s7, s11 10203; GFX9-NEXT: s_cmp_ge_u32 s2, s9 10204; GFX9-NEXT: v_mov_b32_e32 v5, s12 10205; GFX9-NEXT: v_mov_b32_e32 v6, s0 10206; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3 10207; GFX9-NEXT: s_cselect_b32 s3, -1, 0 10208; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v1 10209; GFX9-NEXT: s_cmp_eq_u32 s2, s9 10210; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v6, s[0:1] 10211; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 10212; GFX9-NEXT: v_mov_b32_e32 v6, s3 10213; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 10214; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc 10215; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 10216; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] 10217; GFX9-NEXT: v_mov_b32_e32 v6, s2 10218; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 10219; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc 10220; GFX9-NEXT: v_xor_b32_e32 v1, s10, v1 10221; GFX9-NEXT: v_xor_b32_e32 v2, s10, v3 10222; GFX9-NEXT: v_mov_b32_e32 v3, s10 10223; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s10, v1 10224; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v3, vcc 10225; GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[4:5] 10226; GFX9-NEXT: s_endpgm 10227 %shl.y = shl i64 4096, %y 10228 %r = srem i64 %x, %shl.y 10229 store i64 %r, i64 addrspace(1)* %out 10230 ret void 10231} 10232 10233define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 10234; 
CHECK-LABEL: @srem_v2i64_pow2k_denom( 10235; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 10236; CHECK-NEXT: [[TMP2:%.*]] = srem i64 [[TMP1]], 4096 10237; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 10238; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 10239; CHECK-NEXT: [[TMP5:%.*]] = srem i64 [[TMP4]], 4096 10240; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 10241; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 10242; CHECK-NEXT: ret void 10243; 10244; GFX6-LABEL: srem_v2i64_pow2k_denom: 10245; GFX6: ; %bb.0: 10246; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 10247; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 10248; GFX6-NEXT: s_mov_b32 s3, 0xf000 10249; GFX6-NEXT: s_mov_b32 s2, -1 10250; GFX6-NEXT: s_waitcnt lgkmcnt(0) 10251; GFX6-NEXT: s_ashr_i32 s8, s5, 31 10252; GFX6-NEXT: s_lshr_b32 s8, s8, 20 10253; GFX6-NEXT: s_add_u32 s8, s4, s8 10254; GFX6-NEXT: s_addc_u32 s9, s5, 0 10255; GFX6-NEXT: s_and_b32 s8, s8, 0xfffff000 10256; GFX6-NEXT: s_sub_u32 s4, s4, s8 10257; GFX6-NEXT: s_subb_u32 s5, s5, s9 10258; GFX6-NEXT: s_ashr_i32 s8, s7, 31 10259; GFX6-NEXT: s_lshr_b32 s8, s8, 20 10260; GFX6-NEXT: s_add_u32 s8, s6, s8 10261; GFX6-NEXT: s_addc_u32 s9, s7, 0 10262; GFX6-NEXT: s_and_b32 s8, s8, 0xfffff000 10263; GFX6-NEXT: s_sub_u32 s6, s6, s8 10264; GFX6-NEXT: s_subb_u32 s7, s7, s9 10265; GFX6-NEXT: v_mov_b32_e32 v0, s4 10266; GFX6-NEXT: v_mov_b32_e32 v1, s5 10267; GFX6-NEXT: v_mov_b32_e32 v2, s6 10268; GFX6-NEXT: v_mov_b32_e32 v3, s7 10269; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 10270; GFX6-NEXT: s_endpgm 10271; 10272; GFX9-LABEL: srem_v2i64_pow2k_denom: 10273; GFX9: ; %bb.0: 10274; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 10275; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 10276; GFX9-NEXT: v_mov_b32_e32 v4, 0 10277; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10278; GFX9-NEXT: s_ashr_i32 s0, s5, 31 
10279; GFX9-NEXT: s_lshr_b32 s0, s0, 20 10280; GFX9-NEXT: s_add_u32 s0, s4, s0 10281; GFX9-NEXT: s_addc_u32 s1, s5, 0 10282; GFX9-NEXT: s_and_b32 s0, s0, 0xfffff000 10283; GFX9-NEXT: s_sub_u32 s0, s4, s0 10284; GFX9-NEXT: s_subb_u32 s1, s5, s1 10285; GFX9-NEXT: s_ashr_i32 s4, s7, 31 10286; GFX9-NEXT: s_lshr_b32 s4, s4, 20 10287; GFX9-NEXT: s_add_u32 s4, s6, s4 10288; GFX9-NEXT: s_addc_u32 s5, s7, 0 10289; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000 10290; GFX9-NEXT: s_sub_u32 s4, s6, s4 10291; GFX9-NEXT: s_subb_u32 s5, s7, s5 10292; GFX9-NEXT: v_mov_b32_e32 v0, s0 10293; GFX9-NEXT: v_mov_b32_e32 v1, s1 10294; GFX9-NEXT: v_mov_b32_e32 v2, s4 10295; GFX9-NEXT: v_mov_b32_e32 v3, s5 10296; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 10297; GFX9-NEXT: s_endpgm 10298 %r = srem <2 x i64> %x, <i64 4096, i64 4096> 10299 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 10300 ret void 10301} 10302 10303define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 10304; CHECK-LABEL: @srem_v2i64_pow2_shl_denom( 10305; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 10306; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 10307; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 10308; CHECK-NEXT: [[TMP3:%.*]] = srem i64 [[TMP1]], [[TMP2]] 10309; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 10310; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 10311; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 10312; CHECK-NEXT: [[TMP7:%.*]] = srem i64 [[TMP5]], [[TMP6]] 10313; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 10314; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 10315; CHECK-NEXT: ret void 10316; 10317; GFX6-LABEL: srem_v2i64_pow2_shl_denom: 10318; GFX6: ; %bb.0: 10319; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 
10320; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000 10321; GFX6-NEXT: s_waitcnt lgkmcnt(0) 10322; GFX6-NEXT: s_mov_b32 s11, 0xf000 10323; GFX6-NEXT: s_lshl_b64 s[14:15], s[2:3], s10 10324; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 10325; GFX6-NEXT: s_ashr_i32 s8, s3, 31 10326; GFX6-NEXT: s_add_u32 s2, s2, s8 10327; GFX6-NEXT: s_mov_b32 s9, s8 10328; GFX6-NEXT: s_addc_u32 s3, s3, s8 10329; GFX6-NEXT: s_xor_b64 s[16:17], s[2:3], s[8:9] 10330; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s16 10331; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s17 10332; GFX6-NEXT: s_sub_u32 s2, 0, s16 10333; GFX6-NEXT: s_subb_u32 s3, 0, s17 10334; GFX6-NEXT: s_ashr_i32 s12, s5, 31 10335; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 10336; GFX6-NEXT: v_rcp_f32_e32 v0, v0 10337; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 10338; GFX6-NEXT: s_add_u32 s0, s4, s12 10339; GFX6-NEXT: s_mov_b32 s13, s12 10340; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 10341; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 10342; GFX6-NEXT: v_trunc_f32_e32 v1, v1 10343; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 10344; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 10345; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 10346; GFX6-NEXT: s_addc_u32 s1, s5, s12 10347; GFX6-NEXT: s_xor_b64 s[4:5], s[0:1], s[12:13] 10348; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 10349; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 10350; GFX6-NEXT: v_mul_lo_u32 v5, s3, v0 10351; GFX6-NEXT: v_mul_lo_u32 v4, s2, v0 10352; GFX6-NEXT: s_mov_b32 s10, -1 10353; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 10354; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 10355; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 10356; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 10357; GFX6-NEXT: v_mul_hi_u32 v7, v0, v2 10358; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 10359; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 10360; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 10361; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc 10362; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 10363; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 10364; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 10365; GFX6-NEXT: 
v_addc_u32_e32 v3, vcc, v5, v4, vcc 10366; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc 10367; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10368; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 10369; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 10370; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 10371; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 10372; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 10373; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 10374; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10375; GFX6-NEXT: v_mul_lo_u32 v3, s2, v0 10376; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 10377; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 10378; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 10379; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 10380; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 10381; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 10382; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 10383; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 10384; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc 10385; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 10386; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 10387; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc 10388; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc 10389; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10390; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 10391; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 10392; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 10393; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 10394; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 10395; GFX6-NEXT: v_mul_hi_u32 v4, s4, v1 10396; GFX6-NEXT: v_mul_hi_u32 v5, s5, v1 10397; GFX6-NEXT: v_mul_lo_u32 v1, s5, v1 10398; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10399; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 10400; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0 10401; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 10402; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 10403; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 10404; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc 10405; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 10406; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 10407; GFX6-NEXT: v_mul_lo_u32 v1, s16, 
v1 10408; GFX6-NEXT: v_mul_hi_u32 v2, s16, v0 10409; GFX6-NEXT: v_mul_lo_u32 v3, s17, v0 10410; GFX6-NEXT: v_mul_lo_u32 v0, s16, v0 10411; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 10412; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 10413; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v1 10414; GFX6-NEXT: v_mov_b32_e32 v3, s17 10415; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 10416; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc 10417; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s16, v0 10418; GFX6-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] 10419; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s17, v5 10420; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] 10421; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s16, v4 10422; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] 10423; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 10424; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s17, v5 10425; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s16, v4 10426; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] 10427; GFX6-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] 10428; GFX6-NEXT: s_ashr_i32 s2, s15, 31 10429; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 10430; GFX6-NEXT: s_add_u32 s4, s14, s2 10431; GFX6-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] 10432; GFX6-NEXT: v_mov_b32_e32 v5, s5 10433; GFX6-NEXT: s_mov_b32 s3, s2 10434; GFX6-NEXT: s_addc_u32 s5, s15, s2 10435; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] 10436; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s4 10437; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s5 10438; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc 10439; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s17, v1 10440; GFX6-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 10441; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 10442; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s16, v0 10443; GFX6-NEXT: v_rcp_f32_e32 v6, v6 10444; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 10445; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s17, v1 10446; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc 10447; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 10448; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, 
v2, vcc 10449; GFX6-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] 10450; GFX6-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v6 10451; GFX6-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 10452; GFX6-NEXT: v_trunc_f32_e32 v4, v4 10453; GFX6-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 10454; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 10455; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 10456; GFX6-NEXT: s_sub_u32 s0, 0, s4 10457; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 10458; GFX6-NEXT: v_mul_hi_u32 v2, s0, v3 10459; GFX6-NEXT: v_mul_lo_u32 v5, s0, v4 10460; GFX6-NEXT: s_subb_u32 s1, 0, s5 10461; GFX6-NEXT: v_mul_lo_u32 v6, s1, v3 10462; GFX6-NEXT: s_ashr_i32 s14, s7, 31 10463; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 10464; GFX6-NEXT: v_mul_lo_u32 v5, s0, v3 10465; GFX6-NEXT: v_add_i32_e32 v2, vcc, v6, v2 10466; GFX6-NEXT: v_mul_lo_u32 v6, v3, v2 10467; GFX6-NEXT: v_mul_hi_u32 v7, v3, v5 10468; GFX6-NEXT: v_mul_hi_u32 v8, v3, v2 10469; GFX6-NEXT: v_mul_hi_u32 v9, v4, v2 10470; GFX6-NEXT: v_mul_lo_u32 v2, v4, v2 10471; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 10472; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc 10473; GFX6-NEXT: v_mul_lo_u32 v8, v4, v5 10474; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5 10475; GFX6-NEXT: s_mov_b32 s15, s14 10476; GFX6-NEXT: v_xor_b32_e32 v0, s12, v0 10477; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 10478; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc 10479; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc 10480; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 10481; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 10482; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10483; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc 10484; GFX6-NEXT: v_mul_lo_u32 v4, s0, v3 10485; GFX6-NEXT: v_mul_hi_u32 v5, s0, v2 10486; GFX6-NEXT: v_mul_lo_u32 v6, s1, v2 10487; GFX6-NEXT: v_xor_b32_e32 v1, s12, v1 10488; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 10489; GFX6-NEXT: v_mul_lo_u32 v5, s0, v2 10490; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 10491; GFX6-NEXT: v_mul_lo_u32 v8, v2, v4 10492; GFX6-NEXT: v_mul_hi_u32 v9, v2, 
v5 10493; GFX6-NEXT: v_mul_hi_u32 v10, v2, v4 10494; GFX6-NEXT: v_mul_hi_u32 v7, v3, v5 10495; GFX6-NEXT: v_mul_lo_u32 v5, v3, v5 10496; GFX6-NEXT: v_mul_hi_u32 v6, v3, v4 10497; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 10498; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc 10499; GFX6-NEXT: v_mul_lo_u32 v4, v3, v4 10500; GFX6-NEXT: v_add_i32_e32 v5, vcc, v8, v5 10501; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v7, vcc 10502; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc 10503; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 10504; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 10505; GFX6-NEXT: s_add_u32 s0, s6, s14 10506; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 10507; GFX6-NEXT: s_addc_u32 s1, s7, s14 10508; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc 10509; GFX6-NEXT: s_xor_b64 s[6:7], s[0:1], s[14:15] 10510; GFX6-NEXT: v_mul_lo_u32 v4, s6, v3 10511; GFX6-NEXT: v_mul_hi_u32 v5, s6, v2 10512; GFX6-NEXT: v_mul_hi_u32 v7, s6, v3 10513; GFX6-NEXT: v_mul_hi_u32 v8, s7, v3 10514; GFX6-NEXT: v_mul_lo_u32 v3, s7, v3 10515; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 10516; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc 10517; GFX6-NEXT: v_mul_lo_u32 v7, s7, v2 10518; GFX6-NEXT: v_mul_hi_u32 v2, s7, v2 10519; GFX6-NEXT: v_mov_b32_e32 v6, s12 10520; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v7 10521; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v2, vcc 10522; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc 10523; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 10524; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 10525; GFX6-NEXT: v_mul_lo_u32 v3, s4, v3 10526; GFX6-NEXT: v_mul_hi_u32 v4, s4, v2 10527; GFX6-NEXT: v_mul_lo_u32 v5, s5, v2 10528; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 10529; GFX6-NEXT: v_mul_lo_u32 v2, s4, v2 10530; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc 10531; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 10532; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 10533; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s7, v3 10534; GFX6-NEXT: v_mov_b32_e32 v5, s5 10535; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 
s6, v2 10536; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc 10537; GFX6-NEXT: v_subrev_i32_e64 v6, s[0:1], s4, v2 10538; GFX6-NEXT: v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1] 10539; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s5, v7 10540; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] 10541; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v6 10542; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, s[0:1] 10543; GFX6-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] 10544; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s5, v7 10545; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s4, v6 10546; GFX6-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[2:3] 10547; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] 10548; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 10549; GFX6-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[0:1] 10550; GFX6-NEXT: v_mov_b32_e32 v7, s7 10551; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc 10552; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 10553; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 10554; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 10555; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 10556; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s5, v3 10557; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc 10558; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 10559; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 10560; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] 10561; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 10562; GFX6-NEXT: v_xor_b32_e32 v2, s14, v2 10563; GFX6-NEXT: v_xor_b32_e32 v3, s14, v3 10564; GFX6-NEXT: v_mov_b32_e32 v4, s14 10565; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s14, v2 10566; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc 10567; GFX6-NEXT: s_waitcnt lgkmcnt(0) 10568; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 10569; GFX6-NEXT: s_endpgm 10570; 10571; GFX9-LABEL: srem_v2i64_pow2_shl_denom: 10572; GFX9: ; %bb.0: 10573; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 10574; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 10575; GFX9-NEXT: v_mov_b32_e32 v4, 0 10576; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10577; GFX9-NEXT: 
s_lshl_b64 s[10:11], s[2:3], s10 10578; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 10579; GFX9-NEXT: s_ashr_i32 s8, s3, 31 10580; GFX9-NEXT: s_add_u32 s2, s2, s8 10581; GFX9-NEXT: s_mov_b32 s9, s8 10582; GFX9-NEXT: s_addc_u32 s3, s3, s8 10583; GFX9-NEXT: s_xor_b64 s[12:13], s[2:3], s[8:9] 10584; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12 10585; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 10586; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 10587; GFX9-NEXT: s_sub_u32 s0, 0, s12 10588; GFX9-NEXT: s_subb_u32 s1, 0, s13 10589; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 10590; GFX9-NEXT: v_rcp_f32_e32 v0, v0 10591; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 10592; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 10593; GFX9-NEXT: v_trunc_f32_e32 v1, v1 10594; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 10595; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 10596; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 10597; GFX9-NEXT: v_readfirstlane_b32 s2, v1 10598; GFX9-NEXT: v_readfirstlane_b32 s3, v0 10599; GFX9-NEXT: s_mul_i32 s14, s0, s2 10600; GFX9-NEXT: s_mul_hi_u32 s16, s0, s3 10601; GFX9-NEXT: s_mul_i32 s15, s1, s3 10602; GFX9-NEXT: s_add_i32 s14, s16, s14 10603; GFX9-NEXT: s_add_i32 s14, s14, s15 10604; GFX9-NEXT: s_mul_i32 s17, s0, s3 10605; GFX9-NEXT: s_mul_hi_u32 s15, s3, s14 10606; GFX9-NEXT: s_mul_i32 s16, s3, s14 10607; GFX9-NEXT: s_mul_hi_u32 s3, s3, s17 10608; GFX9-NEXT: s_add_u32 s3, s3, s16 10609; GFX9-NEXT: s_addc_u32 s15, 0, s15 10610; GFX9-NEXT: s_mul_hi_u32 s18, s2, s17 10611; GFX9-NEXT: s_mul_i32 s17, s2, s17 10612; GFX9-NEXT: s_add_u32 s3, s3, s17 10613; GFX9-NEXT: s_mul_hi_u32 s16, s2, s14 10614; GFX9-NEXT: s_addc_u32 s3, s15, s18 10615; GFX9-NEXT: s_addc_u32 s15, s16, 0 10616; GFX9-NEXT: s_mul_i32 s14, s2, s14 10617; GFX9-NEXT: s_add_u32 s3, s3, s14 10618; GFX9-NEXT: s_addc_u32 s14, 0, s15 10619; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s3, v0 10620; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 10621; GFX9-NEXT: s_addc_u32 s2, s2, s14 10622; GFX9-NEXT: v_readfirstlane_b32 s14, v0 10623; GFX9-NEXT: 
s_mul_i32 s3, s0, s2 10624; GFX9-NEXT: s_mul_hi_u32 s15, s0, s14 10625; GFX9-NEXT: s_add_i32 s3, s15, s3 10626; GFX9-NEXT: s_mul_i32 s1, s1, s14 10627; GFX9-NEXT: s_add_i32 s3, s3, s1 10628; GFX9-NEXT: s_mul_i32 s0, s0, s14 10629; GFX9-NEXT: s_mul_hi_u32 s15, s2, s0 10630; GFX9-NEXT: s_mul_i32 s16, s2, s0 10631; GFX9-NEXT: s_mul_i32 s18, s14, s3 10632; GFX9-NEXT: s_mul_hi_u32 s0, s14, s0 10633; GFX9-NEXT: s_mul_hi_u32 s17, s14, s3 10634; GFX9-NEXT: s_add_u32 s0, s0, s18 10635; GFX9-NEXT: s_addc_u32 s14, 0, s17 10636; GFX9-NEXT: s_add_u32 s0, s0, s16 10637; GFX9-NEXT: s_mul_hi_u32 s1, s2, s3 10638; GFX9-NEXT: s_addc_u32 s0, s14, s15 10639; GFX9-NEXT: s_addc_u32 s1, s1, 0 10640; GFX9-NEXT: s_mul_i32 s3, s2, s3 10641; GFX9-NEXT: s_add_u32 s0, s0, s3 10642; GFX9-NEXT: s_addc_u32 s1, 0, s1 10643; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 10644; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 10645; GFX9-NEXT: s_addc_u32 s2, s2, s1 10646; GFX9-NEXT: s_ashr_i32 s14, s5, 31 10647; GFX9-NEXT: s_add_u32 s0, s4, s14 10648; GFX9-NEXT: s_mov_b32 s15, s14 10649; GFX9-NEXT: s_addc_u32 s1, s5, s14 10650; GFX9-NEXT: s_xor_b64 s[4:5], s[0:1], s[14:15] 10651; GFX9-NEXT: v_readfirstlane_b32 s3, v0 10652; GFX9-NEXT: s_mul_i32 s1, s4, s2 10653; GFX9-NEXT: s_mul_hi_u32 s15, s4, s3 10654; GFX9-NEXT: s_mul_hi_u32 s0, s4, s2 10655; GFX9-NEXT: s_add_u32 s1, s15, s1 10656; GFX9-NEXT: s_addc_u32 s0, 0, s0 10657; GFX9-NEXT: s_mul_hi_u32 s16, s5, s3 10658; GFX9-NEXT: s_mul_i32 s3, s5, s3 10659; GFX9-NEXT: s_add_u32 s1, s1, s3 10660; GFX9-NEXT: s_mul_hi_u32 s15, s5, s2 10661; GFX9-NEXT: s_addc_u32 s0, s0, s16 10662; GFX9-NEXT: s_addc_u32 s1, s15, 0 10663; GFX9-NEXT: s_mul_i32 s2, s5, s2 10664; GFX9-NEXT: s_add_u32 s0, s0, s2 10665; GFX9-NEXT: s_addc_u32 s1, 0, s1 10666; GFX9-NEXT: s_mul_i32 s1, s12, s1 10667; GFX9-NEXT: s_mul_hi_u32 s2, s12, s0 10668; GFX9-NEXT: s_add_i32 s1, s2, s1 10669; GFX9-NEXT: s_mul_i32 s2, s13, s0 10670; GFX9-NEXT: s_mul_i32 s0, s12, s0 10671; GFX9-NEXT: s_add_i32 s15, s1, s2 10672; 
GFX9-NEXT: v_mov_b32_e32 v0, s0 10673; GFX9-NEXT: s_sub_i32 s1, s5, s15 10674; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0 10675; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 10676; GFX9-NEXT: s_subb_u32 s4, s1, s13 10677; GFX9-NEXT: v_subrev_co_u32_e64 v1, s[0:1], s12, v0 10678; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 10679; GFX9-NEXT: s_subb_u32 s16, s4, 0 10680; GFX9-NEXT: s_cmp_ge_u32 s16, s13 10681; GFX9-NEXT: s_cselect_b32 s17, -1, 0 10682; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v1 10683; GFX9-NEXT: s_cmp_eq_u32 s16, s13 10684; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3] 10685; GFX9-NEXT: v_mov_b32_e32 v3, s17 10686; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 10687; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 10688; GFX9-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[2:3] 10689; GFX9-NEXT: s_subb_u32 s2, s4, s13 10690; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s12, v1 10691; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 10692; GFX9-NEXT: s_subb_u32 s0, s2, 0 10693; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 10694; GFX9-NEXT: s_subb_u32 s2, s5, s15 10695; GFX9-NEXT: s_cmp_ge_u32 s2, s13 10696; GFX9-NEXT: v_mov_b32_e32 v5, s16 10697; GFX9-NEXT: v_mov_b32_e32 v6, s0 10698; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 10699; GFX9-NEXT: s_cselect_b32 s3, -1, 0 10700; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 10701; GFX9-NEXT: s_cmp_eq_u32 s2, s13 10702; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1] 10703; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 10704; GFX9-NEXT: v_mov_b32_e32 v6, s3 10705; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 10706; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 10707; GFX9-NEXT: s_ashr_i32 s0, s11, 31 10708; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc 10709; GFX9-NEXT: v_mov_b32_e32 v6, s2 10710; GFX9-NEXT: s_add_u32 s2, s10, s0 10711; GFX9-NEXT: s_mov_b32 s1, s0 10712; GFX9-NEXT: s_addc_u32 s3, s11, s0 10713; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 10714; GFX9-NEXT: s_xor_b64 s[4:5], s[2:3], s[0:1] 10715; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 10716; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s4 10717; 
GFX9-NEXT: v_cvt_f32_u32_e32 v3, s5 10718; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 10719; GFX9-NEXT: v_xor_b32_e32 v0, s14, v0 10720; GFX9-NEXT: v_xor_b32_e32 v2, s14, v2 10721; GFX9-NEXT: v_mac_f32_e32 v1, 0x4f800000, v3 10722; GFX9-NEXT: v_rcp_f32_e32 v3, v1 10723; GFX9-NEXT: v_mov_b32_e32 v5, s14 10724; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s14, v0 10725; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v5, vcc 10726; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v3 10727; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 10728; GFX9-NEXT: v_trunc_f32_e32 v3, v3 10729; GFX9-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3 10730; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 10731; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 10732; GFX9-NEXT: s_sub_u32 s0, 0, s4 10733; GFX9-NEXT: s_subb_u32 s1, 0, s5 10734; GFX9-NEXT: v_readfirstlane_b32 s2, v2 10735; GFX9-NEXT: v_readfirstlane_b32 s11, v3 10736; GFX9-NEXT: s_mul_hi_u32 s10, s0, s2 10737; GFX9-NEXT: s_mul_i32 s12, s0, s11 10738; GFX9-NEXT: s_mul_i32 s3, s1, s2 10739; GFX9-NEXT: s_add_i32 s10, s10, s12 10740; GFX9-NEXT: s_add_i32 s10, s10, s3 10741; GFX9-NEXT: s_mul_i32 s13, s0, s2 10742; GFX9-NEXT: s_mul_hi_u32 s3, s2, s10 10743; GFX9-NEXT: s_mul_i32 s12, s2, s10 10744; GFX9-NEXT: s_mul_hi_u32 s2, s2, s13 10745; GFX9-NEXT: s_add_u32 s2, s2, s12 10746; GFX9-NEXT: s_addc_u32 s3, 0, s3 10747; GFX9-NEXT: s_mul_hi_u32 s14, s11, s13 10748; GFX9-NEXT: s_mul_i32 s13, s11, s13 10749; GFX9-NEXT: s_add_u32 s2, s2, s13 10750; GFX9-NEXT: s_mul_hi_u32 s12, s11, s10 10751; GFX9-NEXT: s_addc_u32 s2, s3, s14 10752; GFX9-NEXT: s_addc_u32 s3, s12, 0 10753; GFX9-NEXT: s_mul_i32 s10, s11, s10 10754; GFX9-NEXT: s_add_u32 s2, s2, s10 10755; GFX9-NEXT: s_addc_u32 s3, 0, s3 10756; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2 10757; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 10758; GFX9-NEXT: s_addc_u32 s2, s11, s3 10759; GFX9-NEXT: v_readfirstlane_b32 s10, v2 10760; GFX9-NEXT: s_mul_i32 s3, s0, s2 10761; GFX9-NEXT: s_mul_hi_u32 s11, s0, s10 10762; GFX9-NEXT: s_add_i32 s3, s11, s3 10763; 
GFX9-NEXT: s_mul_i32 s1, s1, s10 10764; GFX9-NEXT: s_add_i32 s3, s3, s1 10765; GFX9-NEXT: s_mul_i32 s0, s0, s10 10766; GFX9-NEXT: s_mul_hi_u32 s11, s2, s0 10767; GFX9-NEXT: s_mul_i32 s12, s2, s0 10768; GFX9-NEXT: s_mul_i32 s14, s10, s3 10769; GFX9-NEXT: s_mul_hi_u32 s0, s10, s0 10770; GFX9-NEXT: s_mul_hi_u32 s13, s10, s3 10771; GFX9-NEXT: s_add_u32 s0, s0, s14 10772; GFX9-NEXT: s_addc_u32 s10, 0, s13 10773; GFX9-NEXT: s_add_u32 s0, s0, s12 10774; GFX9-NEXT: s_mul_hi_u32 s1, s2, s3 10775; GFX9-NEXT: s_addc_u32 s0, s10, s11 10776; GFX9-NEXT: s_addc_u32 s1, s1, 0 10777; GFX9-NEXT: s_mul_i32 s3, s2, s3 10778; GFX9-NEXT: s_add_u32 s0, s0, s3 10779; GFX9-NEXT: s_addc_u32 s1, 0, s1 10780; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 10781; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 10782; GFX9-NEXT: s_addc_u32 s2, s2, s1 10783; GFX9-NEXT: s_ashr_i32 s10, s7, 31 10784; GFX9-NEXT: s_add_u32 s0, s6, s10 10785; GFX9-NEXT: s_mov_b32 s11, s10 10786; GFX9-NEXT: s_addc_u32 s1, s7, s10 10787; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] 10788; GFX9-NEXT: v_readfirstlane_b32 s3, v2 10789; GFX9-NEXT: s_mul_i32 s1, s6, s2 10790; GFX9-NEXT: s_mul_hi_u32 s11, s6, s3 10791; GFX9-NEXT: s_mul_hi_u32 s0, s6, s2 10792; GFX9-NEXT: s_add_u32 s1, s11, s1 10793; GFX9-NEXT: s_addc_u32 s0, 0, s0 10794; GFX9-NEXT: s_mul_hi_u32 s12, s7, s3 10795; GFX9-NEXT: s_mul_i32 s3, s7, s3 10796; GFX9-NEXT: s_add_u32 s1, s1, s3 10797; GFX9-NEXT: s_mul_hi_u32 s11, s7, s2 10798; GFX9-NEXT: s_addc_u32 s0, s0, s12 10799; GFX9-NEXT: s_addc_u32 s1, s11, 0 10800; GFX9-NEXT: s_mul_i32 s2, s7, s2 10801; GFX9-NEXT: s_add_u32 s0, s0, s2 10802; GFX9-NEXT: s_addc_u32 s1, 0, s1 10803; GFX9-NEXT: s_mul_i32 s1, s4, s1 10804; GFX9-NEXT: s_mul_hi_u32 s2, s4, s0 10805; GFX9-NEXT: s_add_i32 s1, s2, s1 10806; GFX9-NEXT: s_mul_i32 s2, s5, s0 10807; GFX9-NEXT: s_mul_i32 s0, s4, s0 10808; GFX9-NEXT: s_add_i32 s11, s1, s2 10809; GFX9-NEXT: v_mov_b32_e32 v2, s0 10810; GFX9-NEXT: s_sub_i32 s1, s7, s11 10811; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s6, 
v2 10812; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 10813; GFX9-NEXT: s_subb_u32 s6, s1, s5 10814; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s4, v2 10815; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 10816; GFX9-NEXT: s_subb_u32 s12, s6, 0 10817; GFX9-NEXT: s_cmp_ge_u32 s12, s5 10818; GFX9-NEXT: s_cselect_b32 s13, -1, 0 10819; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v3 10820; GFX9-NEXT: s_cmp_eq_u32 s12, s5 10821; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[2:3] 10822; GFX9-NEXT: v_mov_b32_e32 v6, s13 10823; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 10824; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 10825; GFX9-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[2:3] 10826; GFX9-NEXT: s_subb_u32 s2, s6, s5 10827; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s4, v3 10828; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 10829; GFX9-NEXT: s_subb_u32 s0, s2, 0 10830; GFX9-NEXT: s_cmp_lg_u64 vcc, 0 10831; GFX9-NEXT: s_subb_u32 s2, s7, s11 10832; GFX9-NEXT: s_cmp_ge_u32 s2, s5 10833; GFX9-NEXT: v_mov_b32_e32 v7, s12 10834; GFX9-NEXT: v_mov_b32_e32 v8, s0 10835; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 10836; GFX9-NEXT: s_cselect_b32 s3, -1, 0 10837; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 10838; GFX9-NEXT: s_cmp_eq_u32 s2, s5 10839; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v8, s[0:1] 10840; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 10841; GFX9-NEXT: v_mov_b32_e32 v8, s3 10842; GFX9-NEXT: s_cselect_b64 vcc, -1, 0 10843; GFX9-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc 10844; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 10845; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] 10846; GFX9-NEXT: v_mov_b32_e32 v8, s2 10847; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc 10848; GFX9-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc 10849; GFX9-NEXT: v_xor_b32_e32 v2, s10, v2 10850; GFX9-NEXT: v_xor_b32_e32 v3, s10, v5 10851; GFX9-NEXT: v_mov_b32_e32 v5, s10 10852; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s10, v2 10853; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v5, vcc 10854; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10855; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] 
10856; GFX9-NEXT: s_endpgm 10857 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 10858 %r = srem <2 x i64> %x, %shl.y 10859 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 10860 ret void 10861} 10862