; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update
; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare -amdgpu-bypass-slow-div=0 %s | FileCheck %s
; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX6 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX9 %s

; Unsigned 32-bit division of two kernel arguments (%x / %y); the quotient is stored to %out.
define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
; CHECK-LABEL: @udiv_i32(
; CHECK-NEXT:    [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float
; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000
; CHECK-NEXT:    [[TMP4:%.*]] = fptoui float [[TMP3]] to i32
; CHECK-NEXT:    [[TMP5:%.*]] = sub i32 0, [[Y]]
; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]]
; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP4]] to i64
; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]]
; CHECK-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
; CHECK-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP9]], 32
; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]]
; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64
; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
; CHECK-NEXT:    [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]]
; CHECK-NEXT:    [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]]
; CHECK-NEXT:    [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]]
; CHECK-NEXT:    [[TMP23:%.*]] = add i32 [[TMP19]], 1
; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP19]]
; CHECK-NEXT:    [[TMP25:%.*]] = sub i32 [[TMP21]], [[Y]]
; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP22]], i32 [[TMP25]], i32 [[TMP21]]
; CHECK-NEXT:    [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[Y]]
; CHECK-NEXT:    [[TMP28:%.*]] = add i32 [[TMP24]], 1
; CHECK-NEXT:    [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP24]]
; CHECK-NEXT:    store i32 [[TMP29]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
; GFX6-LABEL: udiv_i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
; GFX6-NEXT:    s_sub_i32 s4, 0, s3
; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT:    v_mul_lo_u32 v1, s4, v0
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
; GFX6-NEXT:    v_mul_lo_u32 v1, v0, s3
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s2, v1
; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX9-LABEL: udiv_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
; GFX9-NEXT:    s_sub_i32 s4, 0, s3
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT:    v_mul_lo_u32 v1, s4, v0
; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s3
; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
; GFX9-NEXT:    v_sub_u32_e32 v1, s2, v1
; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v1
; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
; GFX9-NEXT:    s_endpgm
  %r = udiv i32 %x, %y
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; Unsigned 32-bit remainder of two kernel arguments (%x urem %y); the result is stored to %out.
define amdgpu_kernel void @urem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
; CHECK-LABEL: @urem_i32(
; CHECK-NEXT:    [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float
; CHECK-NEXT:    [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000
; CHECK-NEXT:    [[TMP4:%.*]] = fptoui float [[TMP3]] to i32
; CHECK-NEXT:    [[TMP5:%.*]] = sub i32 0, [[Y]]
; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]]
; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP4]] to i64
; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]]
; CHECK-NEXT:    [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
; CHECK-NEXT:    [[TMP11:%.*]] = lshr i64 [[TMP9]], 32
; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]]
; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64
; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
; CHECK-NEXT:    [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]]
; CHECK-NEXT:    [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]]
; CHECK-NEXT:    [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]]
; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP21]], [[Y]]
; CHECK-NEXT:    [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP21]]
; CHECK-NEXT:    [[TMP25:%.*]] = icmp uge i32 [[TMP24]], [[Y]]
; CHECK-NEXT:    [[TMP26:%.*]] = sub i32 [[TMP24]], [[Y]]
; CHECK-NEXT:    [[TMP27:%.*]] = select i1 [[TMP25]], i32 [[TMP26]], i32 [[TMP24]]
; CHECK-NEXT:    store i32 [[TMP27]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
; GFX6-LABEL: urem_i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s5
; GFX6-NEXT:    s_sub_i32 s2, 0, s5
; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT:    v_mul_lo_u32 v1, s2, v0
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s5
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s5, v0
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s5, v0
; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s5, v0
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s5, v0
; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX9-LABEL: urem_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
; GFX9-NEXT:    s_sub_i32 s4, 0, s3
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT:    v_mul_lo_u32 v1, s4, v0
; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v0
; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v0
; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX9-NEXT:    s_endpgm
  %r = urem i32 %x, %y
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; Signed 32-bit division of two kernel arguments (%x sdiv %y); the quotient is stored to %out.
define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
; CHECK-LABEL: @sdiv_i32(
; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[X]], [[TMP1]]
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[Y]], [[TMP2]]
; CHECK-NEXT:    [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP1]]
; CHECK-NEXT:    [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP2]]
; CHECK-NEXT:    [[TMP8:%.*]] = uitofp i32 [[TMP7]] to float
; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP8]])
; CHECK-NEXT:    [[TMP10:%.*]] = fmul fast float [[TMP9]], 0x41EFFFFFC0000000
; CHECK-NEXT:    [[TMP11:%.*]] = fptoui float [[TMP10]] to i32
; CHECK-NEXT:    [[TMP12:%.*]] = sub i32 0, [[TMP7]]
; CHECK-NEXT:    [[TMP13:%.*]] = mul i32 [[TMP12]], [[TMP11]]
; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP11]] to i64
; CHECK-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
; CHECK-NEXT:    [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP11]], [[TMP19]]
; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP6]] to i64
; CHECK-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP20]] to i64
; CHECK-NEXT:    [[TMP23:%.*]] = mul i64 [[TMP21]], [[TMP22]]
; CHECK-NEXT:    [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
; CHECK-NEXT:    [[TMP25:%.*]] = lshr i64 [[TMP23]], 32
; CHECK-NEXT:    [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
; CHECK-NEXT:    [[TMP27:%.*]] = mul i32 [[TMP26]], [[TMP7]]
; CHECK-NEXT:    [[TMP28:%.*]] = sub i32 [[TMP6]], [[TMP27]]
; CHECK-NEXT:    [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP7]]
; CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[TMP26]], 1
; CHECK-NEXT:    [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]]
; CHECK-NEXT:    [[TMP32:%.*]] = sub i32 [[TMP28]], [[TMP7]]
; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP29]], i32 [[TMP32]], i32 [[TMP28]]
; CHECK-NEXT:    [[TMP34:%.*]] = icmp uge i32 [[TMP33]], [[TMP7]]
; CHECK-NEXT:    [[TMP35:%.*]] = add i32 [[TMP31]], 1
; CHECK-NEXT:    [[TMP36:%.*]] = select i1 [[TMP34]], i32 [[TMP35]], i32 [[TMP31]]
; CHECK-NEXT:    [[TMP37:%.*]] = xor i32 [[TMP36]], [[TMP3]]
; CHECK-NEXT:    [[TMP38:%.*]] = sub i32 [[TMP37]], [[TMP3]]
; CHECK-NEXT:    store i32 [[TMP38]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
; GFX6-LABEL: sdiv_i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
; GFX6-NEXT:    s_add_i32 s3, s3, s8
; GFX6-NEXT:    s_xor_b32 s3, s3, s8
; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
; GFX6-NEXT:    s_sub_i32 s4, 0, s3
; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT:    v_mul_lo_u32 v1, s4, v0
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    s_ashr_i32 s0, s2, 31
; GFX6-NEXT:    s_add_i32 s1, s2, s0
; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
; GFX6-NEXT:    s_xor_b32 s1, s1, s0
; GFX6-NEXT:    s_xor_b32 s2, s0, s8
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
; GFX6-NEXT:    v_mul_lo_u32 v1, v0, s3
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s1, v1
; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v1
; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v1
; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX6-NEXT:    v_xor_b32_e32 v0, s2, v0
; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX9-LABEL: sdiv_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
; GFX9-NEXT:    s_add_i32 s3, s3, s4
; GFX9-NEXT:    s_xor_b32 s3, s3, s4
; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
; GFX9-NEXT:    s_sub_i32 s5, 0, s3
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT:    v_mul_lo_u32 v1, s5, v0
; GFX9-NEXT:    s_ashr_i32 s5, s2, 31
; GFX9-NEXT:    s_add_i32 s2, s2, s5
; GFX9-NEXT:    s_xor_b32 s2, s2, s5
; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
; GFX9-NEXT:    s_xor_b32 s4, s5, s4
; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
; GFX9-NEXT:    v_mul_lo_u32 v1, v0, s3
; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
; GFX9-NEXT:    v_sub_u32_e32 v1, s2, v1
; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX9-NEXT:    v_subrev_u32_e32 v3, s3, v1
; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX9-NEXT:    v_add_u32_e32 v3, 1, v0
; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
; GFX9-NEXT:    s_endpgm
  %r = sdiv i32 %x, %y
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; Signed 32-bit remainder of two kernel arguments (%x srem %y); the result is stored to %out.
define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
; CHECK-LABEL: @srem_i32(
; CHECK-NEXT:    [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
; CHECK-NEXT:    [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[X]], [[TMP1]]
; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[Y]], [[TMP2]]
; CHECK-NEXT:    [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP1]]
; CHECK-NEXT:    [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP2]]
; CHECK-NEXT:    [[TMP7:%.*]] = uitofp i32 [[TMP6]] to float
; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP8]], 0x41EFFFFFC0000000
; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP9]] to i32
; CHECK-NEXT:    [[TMP11:%.*]] = sub i32 0, [[TMP6]]
; CHECK-NEXT:    [[TMP12:%.*]] = mul i32 [[TMP11]], [[TMP10]]
; CHECK-NEXT:    [[TMP13:%.*]] = zext i32 [[TMP10]] to i64
; CHECK-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP12]] to i64
; CHECK-NEXT:    [[TMP15:%.*]] = mul i64 [[TMP13]], [[TMP14]]
; CHECK-NEXT:    [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
; CHECK-NEXT:    [[TMP17:%.*]] = lshr i64 [[TMP15]], 32
; CHECK-NEXT:    [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
; CHECK-NEXT:    [[TMP19:%.*]] = add i32 [[TMP10]], [[TMP18]]
; CHECK-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP19]] to i64
; CHECK-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP20]], [[TMP21]]
; CHECK-NEXT:    [[TMP23:%.*]] = trunc i64 [[TMP22]] to i32
; CHECK-NEXT:    [[TMP24:%.*]] = lshr i64 [[TMP22]], 32
; CHECK-NEXT:    [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
; CHECK-NEXT:    [[TMP26:%.*]] = mul i32 [[TMP25]], [[TMP6]]
; CHECK-NEXT:    [[TMP27:%.*]] = sub i32 [[TMP5]], [[TMP26]]
; CHECK-NEXT:    [[TMP28:%.*]] = icmp uge i32 [[TMP27]], [[TMP6]]
; CHECK-NEXT:    [[TMP29:%.*]] = sub i32 [[TMP27]], [[TMP6]]
; CHECK-NEXT:    [[TMP30:%.*]] = select i1 [[TMP28]], i32 [[TMP29]], i32 [[TMP27]]
; CHECK-NEXT:    [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP6]]
; CHECK-NEXT:    [[TMP32:%.*]] = sub i32 [[TMP30]], [[TMP6]]
; CHECK-NEXT:    [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP30]]
; CHECK-NEXT:    [[TMP34:%.*]] = xor i32 [[TMP33]], [[TMP1]]
; CHECK-NEXT:    [[TMP35:%.*]] = sub i32 [[TMP34]], [[TMP1]]
; CHECK-NEXT:    store i32 [[TMP35]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
; GFX6-LABEL: srem_i32:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_ashr_i32 s4, s3, 31
; GFX6-NEXT:    s_add_i32 s3, s3, s4
; GFX6-NEXT:    s_xor_b32 s4, s3, s4
; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s4
; GFX6-NEXT:    s_sub_i32 s3, 0, s4
; GFX6-NEXT:    s_ashr_i32 s5, s2, 31
; GFX6-NEXT:    s_add_i32 s2, s2, s5
; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX6-NEXT:    s_xor_b32 s6, s2, s5
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v0
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
; GFX6-NEXT:    v_mul_hi_u32 v0, s6, v0
; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s4
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX6-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX6-NEXT:    v_xor_b32_e32 v0, s5, v0
; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s5, v0
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX9-LABEL: srem_i32:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
; GFX9-NEXT:    s_add_i32 s3, s3, s4
; GFX9-NEXT:    s_xor_b32 s3, s3, s4
; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
; GFX9-NEXT:    s_sub_i32 s4, 0, s3
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT:    v_mul_lo_u32 v1, s4, v0
; GFX9-NEXT:    s_ashr_i32 s4, s2, 31
; GFX9-NEXT:    s_add_i32 s2, s2, s4
; GFX9-NEXT:    s_xor_b32 s2, s2, s4
; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v0
; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT:    v_subrev_u32_e32 v2, s3, v0
; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s3, v0
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
; GFX9-NEXT:    v_subrev_u32_e32 v0, s4, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
; GFX9-NEXT:    s_endpgm
  %r = srem i32 %x, %y
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; Unsigned 16-bit division of two kernel arguments (%x udiv %y); the quotient is stored to %out.
define amdgpu_kernel void @udiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
; CHECK-LABEL: @udiv_i16(
; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 65535
; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
; CHECK-NEXT:    store i16 [[TMP17]], i16 addrspace(1)* [[OUT:%.*]], align 2
; CHECK-NEXT:    ret void
;
; GFX6-LABEL: udiv_i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s2, s[0:1], 0xb
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_lshr_b32 s3, s2, 16
; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s3
; GFX6-NEXT:    s_and_b32 s2, s2, 0xffff
; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s2
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX9-LABEL: udiv_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT:    v_mov_b32_e32 v3, 0
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_lshr_b32 s3, s2, 16
; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
; GFX9-NEXT:    s_and_b32 s2, s2, 0xffff
; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s2
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v2
; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
; GFX9-NEXT:    global_store_short v3, v0, s[0:1]
; GFX9-NEXT:    s_endpgm
  %r = udiv i16 %x, %y
  store i16 %r, i16 addrspace(1)* %out
  ret void
}

; Unsigned 16-bit remainder of two kernel arguments (%x urem %y); the result is stored to %out.
define amdgpu_kernel void @urem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
; CHECK-LABEL: @urem_i16(
; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
; CHECK-NEXT:    [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
; CHECK-NEXT:    [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
; CHECK-NEXT:    [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
; CHECK-NEXT:    [[TMP18:%.*]] = and i32 [[TMP17]], 65535
; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
; CHECK-NEXT:    store i16 [[TMP19]], i16 addrspace(1)* [[OUT:%.*]], align 2
; CHECK-NEXT:    ret void
;
; GFX6-LABEL: urem_i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_lshr_b32 s2, s4, 16
; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s2
; GFX6-NEXT:    s_and_b32 s3, s4, 0xffff
; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s3
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v2
; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX9-LABEL: urem_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_lshr_b32 s3, s2, 16
; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s3
; GFX9-NEXT:    s_and_b32 s4, s2, 0xffff
; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s4
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v2
; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s3
; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
; GFX9-NEXT:    s_endpgm
  %r = urem i16 %x, %y
  store i16 %r, i16 addrspace(1)* %out
  ret void
}

; Signed 16-bit division of two kernel arguments (%x sdiv %y); the quotient is stored to %out.
define amdgpu_kernel void @sdiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
; CHECK-LABEL: @sdiv_i16(
; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 16
; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 16
; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
; CHECK-NEXT:    store i16 [[TMP21]], i16 addrspace(1)* [[OUT:%.*]], align 2
; CHECK-NEXT:    ret void
;
; GFX6-LABEL: sdiv_i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_ashr_i32 s5, s4, 16
; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s5
; GFX6-NEXT:    s_sext_i32_i16 s4, s4
; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s4
; GFX6-NEXT:    s_xor_b32 s4, s4, s5
; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
; GFX6-NEXT:    s_ashr_i32 s4, s4, 30
; GFX6-NEXT:    s_or_b32 s4, s4, 1
; GFX6-NEXT:    v_mov_b32_e32 v3, s4
; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX9-LABEL: sdiv_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_ashr_i32 s0, s4, 16
; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s0
; GFX9-NEXT:    s_sext_i32_i16 s1, s4
; GFX9-NEXT:    v_cvt_f32_i32_e32 v2, s1
; GFX9-NEXT:    s_xor_b32 s0, s1, s0
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v3, v0
; GFX9-NEXT:    s_ashr_i32 s0, s0, 30
; GFX9-NEXT:    s_or_b32 s4, s0, 1
; GFX9-NEXT:    v_mul_f32_e32 v3, v2, v3
; GFX9-NEXT:    v_trunc_f32_e32 v3, v3
; GFX9-NEXT:    v_mad_f32 v2, -v3, v0, v2
; GFX9-NEXT:    v_cvt_i32_f32_e32 v3, v3
; GFX9-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
; GFX9-NEXT:    s_and_b64 s[0:1], s[0:1], exec
; GFX9-NEXT:    s_cselect_b32 s0, s4, 0
; GFX9-NEXT:    v_add_u32_e32 v0, s0, v3
; GFX9-NEXT:    global_store_short v1, v0, s[2:3]
; GFX9-NEXT:    s_endpgm
  %r = sdiv i16 %x, %y
  store i16 %r, i16 addrspace(1)* %out
  ret void
}

; Signed 16-bit remainder of two kernel arguments (%x srem %y); the result is stored to %out.
define amdgpu_kernel void @srem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
; CHECK-LABEL: @srem_i16(
; CHECK-NEXT:    [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
; CHECK-NEXT:    [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
; CHECK-NEXT:    [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
; CHECK-NEXT:    [[TMP5:%.*]] = or i32 [[TMP4]], 1
; CHECK-NEXT:    [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
; CHECK-NEXT:    [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
; CHECK-NEXT:    [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
; CHECK-NEXT:    [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
; CHECK-NEXT:    [[TMP11:%.*]] = fneg fast float [[TMP10]]
; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
; CHECK-NEXT:    [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
; CHECK-NEXT:    [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
; CHECK-NEXT:    [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 16
; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
; CHECK-NEXT:    store i16 [[TMP23]], i16 addrspace(1)* [[OUT:%.*]], align 2
; CHECK-NEXT:    ret void
;
; GFX6-LABEL: srem_i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_ashr_i32 s2, s4, 16
; GFX6-NEXT:    v_cvt_f32_i32_e32 v0, s2
; GFX6-NEXT:    s_sext_i32_i16 s3, s4
; GFX6-NEXT:    v_cvt_f32_i32_e32 v1, s3
; GFX6-NEXT:    s_xor_b32 s3, s3, s2
; GFX6-NEXT:    v_rcp_iflag_f32_e32 v2, v0
; GFX6-NEXT:    s_ashr_i32 s3, s3, 30
; GFX6-NEXT:    s_or_b32 s3, s3, 1
; GFX6-NEXT:    v_mov_b32_e32 v3, s3
; GFX6-NEXT:    v_mul_f32_e32 v2, v1, v2
; GFX6-NEXT:    v_trunc_f32_e32 v2, v2
; GFX6-NEXT:    v_mad_f32 v1, -v2, v0, v1
; GFX6-NEXT:    v_cvt_i32_f32_e32 v2, v2
; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v0|
; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT:    buffer_store_short v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX9-LABEL: srem_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_ashr_i32 s5, s4, 16
; GFX9-NEXT:    v_cvt_f32_i32_e32 v0, s5
; GFX9-NEXT:    s_sext_i32_i16 s2, s4
; GFX9-NEXT:    v_cvt_f32_i32_e32 v1, s2
; GFX9-NEXT:    s_xor_b32 s2, s2, s5
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v0
; GFX9-NEXT:    s_ashr_i32 s2, s2, 30
; GFX9-NEXT:    s_or_b32 s6, s2, 1
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    v_mul_f32_e32 v2, v1, v2
; GFX9-NEXT:    v_trunc_f32_e32 v2, v2
; GFX9-NEXT:    v_mad_f32 v1, -v2, v0, v1
; GFX9-NEXT:    v_cvt_i32_f32_e32 v2, v2
; GFX9-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
; GFX9-NEXT:    s_and_b64 s[2:3], s[2:3], exec
; GFX9-NEXT:    s_cselect_b32 s2, s6, 0
; GFX9-NEXT:    v_add_u32_e32 v0, s2, v2
; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s5
; GFX9-NEXT:    v_mov_b32_e32 v1, 0
; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
; GFX9-NEXT:    s_endpgm
  %r = srem i16 %x, %y
  store i16 %r, i16 addrspace(1)* %out
  ret void
}

; Unsigned 8-bit division of two kernel arguments (%x udiv %y); the quotient is stored to %out.
define amdgpu_kernel void @udiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
; CHECK-LABEL: @udiv_i8(
; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
; CHECK-NEXT:    [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
; CHECK-NEXT:    [[TMP16:%.*]] = and i32 [[TMP15]], 255
; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
; CHECK-NEXT:    store i8 [[TMP17]], i8 addrspace(1)* [[OUT:%.*]], align 1
; CHECK-NEXT:    ret void
;
; GFX6-LABEL: udiv_i8:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_cvt_f32_ubyte1_e32 v0, s4
; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v0
; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v2, s4
; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v1
; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v1
; GFX6-NEXT:    v_mad_f32 v1, -v1, v0, v2
; GFX6-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
; GFX6-NEXT:    buffer_store_byte v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX9-LABEL: udiv_i8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v0, s2
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v0
; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v3, s2
; GFX9-NEXT:    v_mul_f32_e32 v1, v3, v1
; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v1
; GFX9-NEXT:    v_mad_f32 v1, -v1, v0, v3
; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
; GFX9-NEXT:    global_store_byte v2, v0, s[0:1]
; GFX9-NEXT:    s_endpgm
  %r = udiv i8 %x, %y
  store i8 %r, i8 addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @urem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
; CHECK-LABEL: @urem_i8(
; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
; CHECK-NEXT:    [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
; CHECK-NEXT:    [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
; CHECK-NEXT:    [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
; CHECK-NEXT:    [[TMP8:%.*]] = fneg fast float [[TMP7]]
; CHECK-NEXT:    [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
; CHECK-NEXT:    [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
; CHECK-NEXT:    [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
; CHECK-NEXT:    [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
; CHECK-NEXT:    [[TMP13:%.*]]
= fcmp fast oge float [[TMP11]], [[TMP12]] 817; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 818; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 819; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]] 820; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]] 821; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 255 822; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i8 823; CHECK-NEXT: store i8 [[TMP19]], i8 addrspace(1)* [[OUT:%.*]], align 1 824; CHECK-NEXT: ret void 825; 826; GFX6-LABEL: urem_i8: 827; GFX6: ; %bb.0: 828; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 829; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 830; GFX6-NEXT: s_mov_b32 s3, 0xf000 831; GFX6-NEXT: s_waitcnt lgkmcnt(0) 832; GFX6-NEXT: v_cvt_f32_ubyte1_e32 v0, s4 833; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 834; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 835; GFX6-NEXT: s_lshr_b32 s2, s4, 8 836; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 837; GFX6-NEXT: v_trunc_f32_e32 v1, v1 838; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 839; GFX6-NEXT: v_mad_f32 v1, -v1, v0, v2 840; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 841; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 842; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 843; GFX6-NEXT: s_mov_b32 s2, -1 844; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 845; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 846; GFX6-NEXT: s_endpgm 847; 848; GFX9-LABEL: urem_i8: 849; GFX9: ; %bb.0: 850; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 851; GFX9-NEXT: s_waitcnt lgkmcnt(0) 852; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s2 853; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 854; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 855; GFX9-NEXT: s_lshr_b32 s3, s2, 8 856; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 857; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1 858; GFX9-NEXT: v_trunc_f32_e32 v1, v1 859; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v1 860; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v2 861; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 862; GFX9-NEXT: v_mov_b32_e32 v1, 0 863; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 
0, v3, vcc 864; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 865; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 866; GFX9-NEXT: s_waitcnt lgkmcnt(0) 867; GFX9-NEXT: global_store_byte v1, v0, s[0:1] 868; GFX9-NEXT: s_endpgm 869 %r = urem i8 %x, %y 870 store i8 %r, i8 addrspace(1)* %out 871 ret void 872} 873 874define amdgpu_kernel void @sdiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) { 875; CHECK-LABEL: @sdiv_i8( 876; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32 877; CHECK-NEXT: [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32 878; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 879; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 880; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 881; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 882; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 883; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 884; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 885; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 886; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 887; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 888; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 889; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 890; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 891; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 892; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 893; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 894; CHECK-NEXT: [[TMP19:%.*]] = shl i32 [[TMP18]], 24 895; CHECK-NEXT: [[TMP20:%.*]] = ashr i32 [[TMP19]], 24 896; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i8 897; CHECK-NEXT: store i8 [[TMP21]], i8 addrspace(1)* [[OUT:%.*]], align 1 898; CHECK-NEXT: ret void 899; 900; GFX6-LABEL: sdiv_i8: 901; GFX6: ; %bb.0: 902; GFX6-NEXT: s_load_dword s4, 
s[0:1], 0xb 903; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 904; GFX6-NEXT: s_mov_b32 s3, 0xf000 905; GFX6-NEXT: s_mov_b32 s2, -1 906; GFX6-NEXT: s_waitcnt lgkmcnt(0) 907; GFX6-NEXT: s_bfe_i32 s5, s4, 0x80008 908; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 909; GFX6-NEXT: s_sext_i32_i8 s4, s4 910; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 911; GFX6-NEXT: s_xor_b32 s4, s4, s5 912; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 913; GFX6-NEXT: s_ashr_i32 s4, s4, 30 914; GFX6-NEXT: s_or_b32 s4, s4, 1 915; GFX6-NEXT: v_mov_b32_e32 v3, s4 916; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 917; GFX6-NEXT: v_trunc_f32_e32 v2, v2 918; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 919; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 920; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 921; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 922; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 923; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 924; GFX6-NEXT: s_endpgm 925; 926; GFX9-LABEL: sdiv_i8: 927; GFX9: ; %bb.0: 928; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 929; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 930; GFX9-NEXT: v_mov_b32_e32 v1, 0 931; GFX9-NEXT: s_waitcnt lgkmcnt(0) 932; GFX9-NEXT: s_bfe_i32 s0, s4, 0x80008 933; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 934; GFX9-NEXT: s_sext_i32_i8 s1, s4 935; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 936; GFX9-NEXT: s_xor_b32 s0, s1, s0 937; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 938; GFX9-NEXT: s_ashr_i32 s0, s0, 30 939; GFX9-NEXT: s_or_b32 s4, s0, 1 940; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 941; GFX9-NEXT: v_trunc_f32_e32 v3, v3 942; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 943; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 944; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 945; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 946; GFX9-NEXT: s_cselect_b32 s0, s4, 0 947; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 948; GFX9-NEXT: global_store_byte v1, v0, s[2:3] 949; GFX9-NEXT: s_endpgm 950 %r = sdiv i8 %x, %y 951 store i8 %r, i8 addrspace(1)* %out 952 ret void 953} 954 955define amdgpu_kernel void @srem_i8(i8 addrspace(1)* 
%out, i8 %x, i8 %y) { 956; CHECK-LABEL: @srem_i8( 957; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32 958; CHECK-NEXT: [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32 959; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 960; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 961; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 962; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 963; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 964; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 965; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 966; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 967; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 968; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 969; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 970; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 971; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 972; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 973; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 974; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 975; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]] 976; CHECK-NEXT: [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]] 977; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 24 978; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 24 979; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i8 980; CHECK-NEXT: store i8 [[TMP23]], i8 addrspace(1)* [[OUT:%.*]], align 1 981; CHECK-NEXT: ret void 982; 983; GFX6-LABEL: srem_i8: 984; GFX6: ; %bb.0: 985; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 986; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 987; GFX6-NEXT: s_waitcnt lgkmcnt(0) 988; GFX6-NEXT: s_bfe_i32 s2, s4, 0x80008 989; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s2 990; GFX6-NEXT: s_sext_i32_i8 s5, s4 
991; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 992; GFX6-NEXT: s_xor_b32 s2, s5, s2 993; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 994; GFX6-NEXT: s_ashr_i32 s2, s2, 30 995; GFX6-NEXT: s_or_b32 s2, s2, 1 996; GFX6-NEXT: v_mov_b32_e32 v3, s2 997; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 998; GFX6-NEXT: v_trunc_f32_e32 v2, v2 999; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 1000; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 1001; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 1002; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 1003; GFX6-NEXT: s_lshr_b32 s3, s4, 8 1004; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 1005; GFX6-NEXT: v_mul_lo_u32 v0, v0, s3 1006; GFX6-NEXT: s_mov_b32 s3, 0xf000 1007; GFX6-NEXT: s_mov_b32 s2, -1 1008; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 1009; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 1010; GFX6-NEXT: s_endpgm 1011; 1012; GFX9-LABEL: srem_i8: 1013; GFX9: ; %bb.0: 1014; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 1015; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1016; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1017; GFX9-NEXT: s_bfe_i32 s0, s4, 0x80008 1018; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 1019; GFX9-NEXT: s_sext_i32_i8 s1, s4 1020; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s1 1021; GFX9-NEXT: s_xor_b32 s0, s1, s0 1022; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 1023; GFX9-NEXT: s_ashr_i32 s0, s0, 30 1024; GFX9-NEXT: s_lshr_b32 s5, s4, 8 1025; GFX9-NEXT: s_or_b32 s6, s0, 1 1026; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 1027; GFX9-NEXT: v_trunc_f32_e32 v2, v2 1028; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 1029; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 1030; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 1031; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 1032; GFX9-NEXT: s_cselect_b32 s0, s6, 0 1033; GFX9-NEXT: v_add_u32_e32 v0, s0, v2 1034; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 1035; GFX9-NEXT: v_mov_b32_e32 v1, 0 1036; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 1037; GFX9-NEXT: global_store_byte v1, v0, s[2:3] 1038; GFX9-NEXT: s_endpgm 1039 %r = srem i8 %x, %y 1040 store i8 %r, i8 addrspace(1)* %out 1041 ret void 
1042} 1043 1044define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 1045; CHECK-LABEL: @udiv_v4i32( 1046; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 1047; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 1048; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 1049; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 1050; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 1051; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 1052; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 1053; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 1054; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 1055; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 1056; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 1057; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 1058; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 1059; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 1060; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 1061; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 1062; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 1063; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 1064; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 1065; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 1066; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 1067; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 1068; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 1069; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 1070; CHECK-NEXT: [[TMP25:%.*]] = add i32 [[TMP21]], 1 1071; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]] 1072; CHECK-NEXT: [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]] 1073; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]] 1074; CHECK-NEXT: [[TMP29:%.*]] = icmp uge i32 
[[TMP28]], [[TMP2]] 1075; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP26]], 1 1076; CHECK-NEXT: [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]] 1077; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> undef, i32 [[TMP31]], i64 0 1078; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i32> [[X]], i64 1 1079; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[Y]], i64 1 1080; CHECK-NEXT: [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float 1081; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 1082; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000 1083; CHECK-NEXT: [[TMP38:%.*]] = fptoui float [[TMP37]] to i32 1084; CHECK-NEXT: [[TMP39:%.*]] = sub i32 0, [[TMP34]] 1085; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]] 1086; CHECK-NEXT: [[TMP41:%.*]] = zext i32 [[TMP38]] to i64 1087; CHECK-NEXT: [[TMP42:%.*]] = zext i32 [[TMP40]] to i64 1088; CHECK-NEXT: [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]] 1089; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 1090; CHECK-NEXT: [[TMP45:%.*]] = lshr i64 [[TMP43]], 32 1091; CHECK-NEXT: [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32 1092; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]] 1093; CHECK-NEXT: [[TMP48:%.*]] = zext i32 [[TMP33]] to i64 1094; CHECK-NEXT: [[TMP49:%.*]] = zext i32 [[TMP47]] to i64 1095; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]] 1096; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 1097; CHECK-NEXT: [[TMP52:%.*]] = lshr i64 [[TMP50]], 32 1098; CHECK-NEXT: [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32 1099; CHECK-NEXT: [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]] 1100; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]] 1101; CHECK-NEXT: [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]] 1102; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP53]], 1 1103; CHECK-NEXT: [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]] 1104; CHECK-NEXT: [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]] 1105; 
CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]] 1106; CHECK-NEXT: [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]] 1107; CHECK-NEXT: [[TMP62:%.*]] = add i32 [[TMP58]], 1 1108; CHECK-NEXT: [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]] 1109; CHECK-NEXT: [[TMP64:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP63]], i64 1 1110; CHECK-NEXT: [[TMP65:%.*]] = extractelement <4 x i32> [[X]], i64 2 1111; CHECK-NEXT: [[TMP66:%.*]] = extractelement <4 x i32> [[Y]], i64 2 1112; CHECK-NEXT: [[TMP67:%.*]] = uitofp i32 [[TMP66]] to float 1113; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP67]]) 1114; CHECK-NEXT: [[TMP69:%.*]] = fmul fast float [[TMP68]], 0x41EFFFFFC0000000 1115; CHECK-NEXT: [[TMP70:%.*]] = fptoui float [[TMP69]] to i32 1116; CHECK-NEXT: [[TMP71:%.*]] = sub i32 0, [[TMP66]] 1117; CHECK-NEXT: [[TMP72:%.*]] = mul i32 [[TMP71]], [[TMP70]] 1118; CHECK-NEXT: [[TMP73:%.*]] = zext i32 [[TMP70]] to i64 1119; CHECK-NEXT: [[TMP74:%.*]] = zext i32 [[TMP72]] to i64 1120; CHECK-NEXT: [[TMP75:%.*]] = mul i64 [[TMP73]], [[TMP74]] 1121; CHECK-NEXT: [[TMP76:%.*]] = trunc i64 [[TMP75]] to i32 1122; CHECK-NEXT: [[TMP77:%.*]] = lshr i64 [[TMP75]], 32 1123; CHECK-NEXT: [[TMP78:%.*]] = trunc i64 [[TMP77]] to i32 1124; CHECK-NEXT: [[TMP79:%.*]] = add i32 [[TMP70]], [[TMP78]] 1125; CHECK-NEXT: [[TMP80:%.*]] = zext i32 [[TMP65]] to i64 1126; CHECK-NEXT: [[TMP81:%.*]] = zext i32 [[TMP79]] to i64 1127; CHECK-NEXT: [[TMP82:%.*]] = mul i64 [[TMP80]], [[TMP81]] 1128; CHECK-NEXT: [[TMP83:%.*]] = trunc i64 [[TMP82]] to i32 1129; CHECK-NEXT: [[TMP84:%.*]] = lshr i64 [[TMP82]], 32 1130; CHECK-NEXT: [[TMP85:%.*]] = trunc i64 [[TMP84]] to i32 1131; CHECK-NEXT: [[TMP86:%.*]] = mul i32 [[TMP85]], [[TMP66]] 1132; CHECK-NEXT: [[TMP87:%.*]] = sub i32 [[TMP65]], [[TMP86]] 1133; CHECK-NEXT: [[TMP88:%.*]] = icmp uge i32 [[TMP87]], [[TMP66]] 1134; CHECK-NEXT: [[TMP89:%.*]] = add i32 [[TMP85]], 1 1135; CHECK-NEXT: 
[[TMP90:%.*]] = select i1 [[TMP88]], i32 [[TMP89]], i32 [[TMP85]] 1136; CHECK-NEXT: [[TMP91:%.*]] = sub i32 [[TMP87]], [[TMP66]] 1137; CHECK-NEXT: [[TMP92:%.*]] = select i1 [[TMP88]], i32 [[TMP91]], i32 [[TMP87]] 1138; CHECK-NEXT: [[TMP93:%.*]] = icmp uge i32 [[TMP92]], [[TMP66]] 1139; CHECK-NEXT: [[TMP94:%.*]] = add i32 [[TMP90]], 1 1140; CHECK-NEXT: [[TMP95:%.*]] = select i1 [[TMP93]], i32 [[TMP94]], i32 [[TMP90]] 1141; CHECK-NEXT: [[TMP96:%.*]] = insertelement <4 x i32> [[TMP64]], i32 [[TMP95]], i64 2 1142; CHECK-NEXT: [[TMP97:%.*]] = extractelement <4 x i32> [[X]], i64 3 1143; CHECK-NEXT: [[TMP98:%.*]] = extractelement <4 x i32> [[Y]], i64 3 1144; CHECK-NEXT: [[TMP99:%.*]] = uitofp i32 [[TMP98]] to float 1145; CHECK-NEXT: [[TMP100:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP99]]) 1146; CHECK-NEXT: [[TMP101:%.*]] = fmul fast float [[TMP100]], 0x41EFFFFFC0000000 1147; CHECK-NEXT: [[TMP102:%.*]] = fptoui float [[TMP101]] to i32 1148; CHECK-NEXT: [[TMP103:%.*]] = sub i32 0, [[TMP98]] 1149; CHECK-NEXT: [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP102]] 1150; CHECK-NEXT: [[TMP105:%.*]] = zext i32 [[TMP102]] to i64 1151; CHECK-NEXT: [[TMP106:%.*]] = zext i32 [[TMP104]] to i64 1152; CHECK-NEXT: [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]] 1153; CHECK-NEXT: [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32 1154; CHECK-NEXT: [[TMP109:%.*]] = lshr i64 [[TMP107]], 32 1155; CHECK-NEXT: [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32 1156; CHECK-NEXT: [[TMP111:%.*]] = add i32 [[TMP102]], [[TMP110]] 1157; CHECK-NEXT: [[TMP112:%.*]] = zext i32 [[TMP97]] to i64 1158; CHECK-NEXT: [[TMP113:%.*]] = zext i32 [[TMP111]] to i64 1159; CHECK-NEXT: [[TMP114:%.*]] = mul i64 [[TMP112]], [[TMP113]] 1160; CHECK-NEXT: [[TMP115:%.*]] = trunc i64 [[TMP114]] to i32 1161; CHECK-NEXT: [[TMP116:%.*]] = lshr i64 [[TMP114]], 32 1162; CHECK-NEXT: [[TMP117:%.*]] = trunc i64 [[TMP116]] to i32 1163; CHECK-NEXT: [[TMP118:%.*]] = mul i32 [[TMP117]], [[TMP98]] 1164; CHECK-NEXT: [[TMP119:%.*]] = sub i32 
[[TMP97]], [[TMP118]] 1165; CHECK-NEXT: [[TMP120:%.*]] = icmp uge i32 [[TMP119]], [[TMP98]] 1166; CHECK-NEXT: [[TMP121:%.*]] = add i32 [[TMP117]], 1 1167; CHECK-NEXT: [[TMP122:%.*]] = select i1 [[TMP120]], i32 [[TMP121]], i32 [[TMP117]] 1168; CHECK-NEXT: [[TMP123:%.*]] = sub i32 [[TMP119]], [[TMP98]] 1169; CHECK-NEXT: [[TMP124:%.*]] = select i1 [[TMP120]], i32 [[TMP123]], i32 [[TMP119]] 1170; CHECK-NEXT: [[TMP125:%.*]] = icmp uge i32 [[TMP124]], [[TMP98]] 1171; CHECK-NEXT: [[TMP126:%.*]] = add i32 [[TMP122]], 1 1172; CHECK-NEXT: [[TMP127:%.*]] = select i1 [[TMP125]], i32 [[TMP126]], i32 [[TMP122]] 1173; CHECK-NEXT: [[TMP128:%.*]] = insertelement <4 x i32> [[TMP96]], i32 [[TMP127]], i64 3 1174; CHECK-NEXT: store <4 x i32> [[TMP128]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 1175; CHECK-NEXT: ret void 1176; 1177; GFX6-LABEL: udiv_v4i32: 1178; GFX6: ; %bb.0: 1179; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 1180; GFX6-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 1181; GFX6-NEXT: s_mov_b32 s15, 0xf000 1182; GFX6-NEXT: s_mov_b32 s14, -1 1183; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1184; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 1185; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 1186; GFX6-NEXT: s_sub_i32 s2, 0, s8 1187; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s10 1188; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 1189; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 1190; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s11 1191; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 1192; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 1193; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 1194; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 1195; GFX6-NEXT: v_mul_lo_u32 v2, s2, v0 1196; GFX6-NEXT: s_sub_i32 s2, 0, s9 1197; GFX6-NEXT: v_mul_lo_u32 v3, s2, v1 1198; GFX6-NEXT: s_sub_i32 s2, 0, s10 1199; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 1200; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 1201; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 1202; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 1203; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 1204; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 1205; 
GFX6-NEXT: v_mul_lo_u32 v2, v0, s8 1206; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 1207; GFX6-NEXT: v_mul_lo_u32 v5, v1, s9 1208; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s4, v2 1209; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v2 1210; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] 1211; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v2 1212; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 1213; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 1214; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 1215; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v4 1216; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1217; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s5, v5 1218; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 1219; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 1220; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v1 1221; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v3 1222; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] 1223; GFX6-NEXT: v_mul_lo_u32 v4, s2, v2 1224; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s9, v3 1225; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 1226; GFX6-NEXT: v_mul_hi_u32 v4, v2, v4 1227; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v1 1228; GFX6-NEXT: s_sub_i32 s0, 0, s11 1229; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 1230; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v6 1231; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 1232; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 1233; GFX6-NEXT: v_mul_hi_u32 v2, s6, v2 1234; GFX6-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 1235; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 1236; GFX6-NEXT: v_mul_lo_u32 v3, v2, s10 1237; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v2 1238; GFX6-NEXT: v_mul_lo_u32 v5, s0, v4 1239; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s6, v3 1240; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v3 1241; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5 1242; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] 1243; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, s10, v3 1244; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 1245; GFX6-NEXT: v_mul_hi_u32 v4, s7, v4 1246; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] 1247; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, 
v2 1248; GFX6-NEXT: v_mul_lo_u32 v6, v4, s11 1249; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 1250; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 1251; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 1252; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s7, v6 1253; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v3 1254; GFX6-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] 1255; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s11, v3 1256; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 1257; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 1258; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1259; GFX6-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 1260; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 1261; GFX6-NEXT: s_endpgm 1262; 1263; GFX9-LABEL: udiv_v4i32: 1264; GFX9: ; %bb.0: 1265; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 1266; GFX9-NEXT: v_mov_b32_e32 v4, 0 1267; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1268; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1269; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 1270; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 1271; GFX9-NEXT: s_sub_i32 s2, 0, s8 1272; GFX9-NEXT: s_sub_i32 s3, 0, s9 1273; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 1274; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 1275; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s10 1276; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s11 1277; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 1278; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 1279; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 1280; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 1281; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v5 1282; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 1283; GFX9-NEXT: s_sub_i32 s2, 0, s10 1284; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1 1285; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v6 1286; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 1287; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 1288; GFX9-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 1289; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 1290; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 1291; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 1292; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v5 1293; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 1294; 
GFX9-NEXT: v_mul_lo_u32 v3, v0, s8 1295; GFX9-NEXT: v_add_u32_e32 v7, 1, v0 1296; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6 1297; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 1298; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 1299; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 1300; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc 1301; GFX9-NEXT: v_subrev_u32_e32 v7, s8, v3 1302; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc 1303; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 1304; GFX9-NEXT: v_mul_lo_u32 v3, s2, v2 1305; GFX9-NEXT: s_sub_i32 s2, 0, s11 1306; GFX9-NEXT: v_mul_lo_u32 v5, v1, s9 1307; GFX9-NEXT: v_add_u32_e32 v7, 1, v0 1308; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 1309; GFX9-NEXT: v_add_u32_e32 v8, 1, v1 1310; GFX9-NEXT: v_sub_u32_e32 v5, s5, v5 1311; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc 1312; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 1313; GFX9-NEXT: v_mul_lo_u32 v3, s2, v6 1314; GFX9-NEXT: v_mul_hi_u32 v2, s6, v2 1315; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v5 1316; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc 1317; GFX9-NEXT: v_mul_hi_u32 v3, v6, v3 1318; GFX9-NEXT: v_mul_lo_u32 v8, v2, s10 1319; GFX9-NEXT: v_subrev_u32_e32 v7, s9, v5 1320; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc 1321; GFX9-NEXT: v_add_u32_e32 v3, v6, v3 1322; GFX9-NEXT: v_mul_hi_u32 v3, s7, v3 1323; GFX9-NEXT: v_add_u32_e32 v7, 1, v1 1324; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v5 1325; GFX9-NEXT: v_sub_u32_e32 v5, s6, v8 1326; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc 1327; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v5 1328; GFX9-NEXT: v_subrev_u32_e32 v6, s10, v5 1329; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 1330; GFX9-NEXT: v_mul_lo_u32 v6, v3, s11 1331; GFX9-NEXT: v_add_u32_e32 v7, 1, v2 1332; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc 1333; GFX9-NEXT: v_add_u32_e32 v7, 1, v2 1334; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v5 1335; GFX9-NEXT: v_sub_u32_e32 v5, s7, v6 1336; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc 1337; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 1338; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v5 1339; 
GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 1340; GFX9-NEXT: v_subrev_u32_e32 v6, s11, v5 1341; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 1342; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 1343; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v5 1344; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 1345; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 1346; GFX9-NEXT: s_endpgm 1347 %r = udiv <4 x i32> %x, %y 1348 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 1349 ret void 1350} 1351 1352define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 1353; CHECK-LABEL: @urem_v4i32( 1354; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 1355; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 1356; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 1357; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 1358; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 1359; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 1360; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 1361; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 1362; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 1363; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 1364; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 1365; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 1366; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 1367; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 1368; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 1369; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 1370; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 1371; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 1372; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 1373; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 1374; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 1375; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], 
[[TMP2]] 1376; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 1377; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 1378; CHECK-NEXT: [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]] 1379; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]] 1380; CHECK-NEXT: [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]] 1381; CHECK-NEXT: [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]] 1382; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]] 1383; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x i32> undef, i32 [[TMP29]], i64 0 1384; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[X]], i64 1 1385; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[Y]], i64 1 1386; CHECK-NEXT: [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float 1387; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 1388; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000 1389; CHECK-NEXT: [[TMP36:%.*]] = fptoui float [[TMP35]] to i32 1390; CHECK-NEXT: [[TMP37:%.*]] = sub i32 0, [[TMP32]] 1391; CHECK-NEXT: [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]] 1392; CHECK-NEXT: [[TMP39:%.*]] = zext i32 [[TMP36]] to i64 1393; CHECK-NEXT: [[TMP40:%.*]] = zext i32 [[TMP38]] to i64 1394; CHECK-NEXT: [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]] 1395; CHECK-NEXT: [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32 1396; CHECK-NEXT: [[TMP43:%.*]] = lshr i64 [[TMP41]], 32 1397; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 1398; CHECK-NEXT: [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]] 1399; CHECK-NEXT: [[TMP46:%.*]] = zext i32 [[TMP31]] to i64 1400; CHECK-NEXT: [[TMP47:%.*]] = zext i32 [[TMP45]] to i64 1401; CHECK-NEXT: [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]] 1402; CHECK-NEXT: [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32 1403; CHECK-NEXT: [[TMP50:%.*]] = lshr i64 [[TMP48]], 32 1404; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 1405; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]] 1406; 
CHECK-NEXT: [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]] 1407; CHECK-NEXT: [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]] 1408; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]] 1409; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]] 1410; CHECK-NEXT: [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]] 1411; CHECK-NEXT: [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]] 1412; CHECK-NEXT: [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]] 1413; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP59]], i64 1 1414; CHECK-NEXT: [[TMP61:%.*]] = extractelement <4 x i32> [[X]], i64 2 1415; CHECK-NEXT: [[TMP62:%.*]] = extractelement <4 x i32> [[Y]], i64 2 1416; CHECK-NEXT: [[TMP63:%.*]] = uitofp i32 [[TMP62]] to float 1417; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP63]]) 1418; CHECK-NEXT: [[TMP65:%.*]] = fmul fast float [[TMP64]], 0x41EFFFFFC0000000 1419; CHECK-NEXT: [[TMP66:%.*]] = fptoui float [[TMP65]] to i32 1420; CHECK-NEXT: [[TMP67:%.*]] = sub i32 0, [[TMP62]] 1421; CHECK-NEXT: [[TMP68:%.*]] = mul i32 [[TMP67]], [[TMP66]] 1422; CHECK-NEXT: [[TMP69:%.*]] = zext i32 [[TMP66]] to i64 1423; CHECK-NEXT: [[TMP70:%.*]] = zext i32 [[TMP68]] to i64 1424; CHECK-NEXT: [[TMP71:%.*]] = mul i64 [[TMP69]], [[TMP70]] 1425; CHECK-NEXT: [[TMP72:%.*]] = trunc i64 [[TMP71]] to i32 1426; CHECK-NEXT: [[TMP73:%.*]] = lshr i64 [[TMP71]], 32 1427; CHECK-NEXT: [[TMP74:%.*]] = trunc i64 [[TMP73]] to i32 1428; CHECK-NEXT: [[TMP75:%.*]] = add i32 [[TMP66]], [[TMP74]] 1429; CHECK-NEXT: [[TMP76:%.*]] = zext i32 [[TMP61]] to i64 1430; CHECK-NEXT: [[TMP77:%.*]] = zext i32 [[TMP75]] to i64 1431; CHECK-NEXT: [[TMP78:%.*]] = mul i64 [[TMP76]], [[TMP77]] 1432; CHECK-NEXT: [[TMP79:%.*]] = trunc i64 [[TMP78]] to i32 1433; CHECK-NEXT: [[TMP80:%.*]] = lshr i64 [[TMP78]], 32 1434; CHECK-NEXT: [[TMP81:%.*]] = trunc i64 [[TMP80]] to i32 1435; CHECK-NEXT: [[TMP82:%.*]] = mul i32 [[TMP81]], [[TMP62]] 1436; 
CHECK-NEXT: [[TMP83:%.*]] = sub i32 [[TMP61]], [[TMP82]] 1437; CHECK-NEXT: [[TMP84:%.*]] = icmp uge i32 [[TMP83]], [[TMP62]] 1438; CHECK-NEXT: [[TMP85:%.*]] = sub i32 [[TMP83]], [[TMP62]] 1439; CHECK-NEXT: [[TMP86:%.*]] = select i1 [[TMP84]], i32 [[TMP85]], i32 [[TMP83]] 1440; CHECK-NEXT: [[TMP87:%.*]] = icmp uge i32 [[TMP86]], [[TMP62]] 1441; CHECK-NEXT: [[TMP88:%.*]] = sub i32 [[TMP86]], [[TMP62]] 1442; CHECK-NEXT: [[TMP89:%.*]] = select i1 [[TMP87]], i32 [[TMP88]], i32 [[TMP86]] 1443; CHECK-NEXT: [[TMP90:%.*]] = insertelement <4 x i32> [[TMP60]], i32 [[TMP89]], i64 2 1444; CHECK-NEXT: [[TMP91:%.*]] = extractelement <4 x i32> [[X]], i64 3 1445; CHECK-NEXT: [[TMP92:%.*]] = extractelement <4 x i32> [[Y]], i64 3 1446; CHECK-NEXT: [[TMP93:%.*]] = uitofp i32 [[TMP92]] to float 1447; CHECK-NEXT: [[TMP94:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP93]]) 1448; CHECK-NEXT: [[TMP95:%.*]] = fmul fast float [[TMP94]], 0x41EFFFFFC0000000 1449; CHECK-NEXT: [[TMP96:%.*]] = fptoui float [[TMP95]] to i32 1450; CHECK-NEXT: [[TMP97:%.*]] = sub i32 0, [[TMP92]] 1451; CHECK-NEXT: [[TMP98:%.*]] = mul i32 [[TMP97]], [[TMP96]] 1452; CHECK-NEXT: [[TMP99:%.*]] = zext i32 [[TMP96]] to i64 1453; CHECK-NEXT: [[TMP100:%.*]] = zext i32 [[TMP98]] to i64 1454; CHECK-NEXT: [[TMP101:%.*]] = mul i64 [[TMP99]], [[TMP100]] 1455; CHECK-NEXT: [[TMP102:%.*]] = trunc i64 [[TMP101]] to i32 1456; CHECK-NEXT: [[TMP103:%.*]] = lshr i64 [[TMP101]], 32 1457; CHECK-NEXT: [[TMP104:%.*]] = trunc i64 [[TMP103]] to i32 1458; CHECK-NEXT: [[TMP105:%.*]] = add i32 [[TMP96]], [[TMP104]] 1459; CHECK-NEXT: [[TMP106:%.*]] = zext i32 [[TMP91]] to i64 1460; CHECK-NEXT: [[TMP107:%.*]] = zext i32 [[TMP105]] to i64 1461; CHECK-NEXT: [[TMP108:%.*]] = mul i64 [[TMP106]], [[TMP107]] 1462; CHECK-NEXT: [[TMP109:%.*]] = trunc i64 [[TMP108]] to i32 1463; CHECK-NEXT: [[TMP110:%.*]] = lshr i64 [[TMP108]], 32 1464; CHECK-NEXT: [[TMP111:%.*]] = trunc i64 [[TMP110]] to i32 1465; CHECK-NEXT: [[TMP112:%.*]] = mul i32 [[TMP111]], 
[[TMP92]] 1466; CHECK-NEXT: [[TMP113:%.*]] = sub i32 [[TMP91]], [[TMP112]] 1467; CHECK-NEXT: [[TMP114:%.*]] = icmp uge i32 [[TMP113]], [[TMP92]] 1468; CHECK-NEXT: [[TMP115:%.*]] = sub i32 [[TMP113]], [[TMP92]] 1469; CHECK-NEXT: [[TMP116:%.*]] = select i1 [[TMP114]], i32 [[TMP115]], i32 [[TMP113]] 1470; CHECK-NEXT: [[TMP117:%.*]] = icmp uge i32 [[TMP116]], [[TMP92]] 1471; CHECK-NEXT: [[TMP118:%.*]] = sub i32 [[TMP116]], [[TMP92]] 1472; CHECK-NEXT: [[TMP119:%.*]] = select i1 [[TMP117]], i32 [[TMP118]], i32 [[TMP116]] 1473; CHECK-NEXT: [[TMP120:%.*]] = insertelement <4 x i32> [[TMP90]], i32 [[TMP119]], i64 3 1474; CHECK-NEXT: store <4 x i32> [[TMP120]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 1475; CHECK-NEXT: ret void 1476; 1477; GFX6-LABEL: urem_v4i32: 1478; GFX6: ; %bb.0: 1479; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 1480; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1481; GFX6-NEXT: s_mov_b32 s3, 0xf000 1482; GFX6-NEXT: s_mov_b32 s2, -1 1483; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1484; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 1485; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 1486; GFX6-NEXT: s_sub_i32 s12, 0, s8 1487; GFX6-NEXT: s_sub_i32 s13, 0, s9 1488; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 1489; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 1490; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s10 1491; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s11 1492; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 1493; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 1494; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 1495; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 1496; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 1497; GFX6-NEXT: v_mul_lo_u32 v2, s12, v0 1498; GFX6-NEXT: v_mul_lo_u32 v4, s13, v1 1499; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 1500; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 1501; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 1502; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 1503; GFX6-NEXT: v_add_i32_e32 v1, vcc, v4, v1 1504; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 1505; GFX6-NEXT: v_mul_lo_u32 v0, v0, s8 1506; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v3 
1507; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 1508; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9 1509; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 1510; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 1511; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 1512; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1513; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 1514; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 1515; GFX6-NEXT: s_sub_i32 s4, 0, s10 1516; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1517; GFX6-NEXT: v_mul_lo_u32 v3, s4, v2 1518; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 1519; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s9, v1 1520; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 1521; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 1522; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 1523; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 1524; GFX6-NEXT: s_sub_i32 s4, 0, s11 1525; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 1526; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v4 1527; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 1528; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s9, v1 1529; GFX6-NEXT: v_mul_hi_u32 v2, s6, v2 1530; GFX6-NEXT: v_mul_lo_u32 v5, s4, v3 1531; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 1532; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 1533; GFX6-NEXT: v_mul_lo_u32 v2, v2, s10 1534; GFX6-NEXT: v_mul_hi_u32 v4, v3, v5 1535; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 1536; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 1537; GFX6-NEXT: v_mul_hi_u32 v3, s7, v3 1538; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s10, v2 1539; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 1540; GFX6-NEXT: v_mul_lo_u32 v3, v3, s11 1541; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 1542; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s10, v2 1543; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 1544; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 1545; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s7, v3 1546; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s11, v3 1547; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1548; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 1549; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s11, v3 1550; 
GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1551; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 1552; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1553; GFX6-NEXT: s_endpgm 1554; 1555; GFX9-LABEL: urem_v4i32: 1556; GFX9: ; %bb.0: 1557; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 1558; GFX9-NEXT: v_mov_b32_e32 v4, 0 1559; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1560; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1561; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 1562; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 1563; GFX9-NEXT: s_sub_i32 s2, 0, s8 1564; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s10 1565; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 1566; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 1567; GFX9-NEXT: s_sub_i32 s3, 0, s9 1568; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 1569; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 1570; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 1571; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 1572; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 1573; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 1574; GFX9-NEXT: v_mul_lo_u32 v3, s2, v0 1575; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 1576; GFX9-NEXT: v_mul_lo_u32 v5, s3, v1 1577; GFX9-NEXT: s_sub_i32 s2, 0, s10 1578; GFX9-NEXT: v_mul_hi_u32 v3, v0, v3 1579; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5 1580; GFX9-NEXT: v_add_u32_e32 v0, v0, v3 1581; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s11 1582; GFX9-NEXT: v_add_u32_e32 v1, v1, v5 1583; GFX9-NEXT: v_mul_lo_u32 v5, s2, v2 1584; GFX9-NEXT: s_sub_i32 s2, 0, s11 1585; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 1586; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 1587; GFX9-NEXT: v_mul_hi_u32 v5, v2, v5 1588; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 1589; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 1590; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 1591; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 1592; GFX9-NEXT: v_mul_lo_u32 v0, v0, s8 1593; GFX9-NEXT: v_mul_hi_u32 v2, s6, v2 1594; GFX9-NEXT: v_mul_lo_u32 v5, s2, v3 1595; GFX9-NEXT: v_mul_lo_u32 v1, v1, s9 1596; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 1597; GFX9-NEXT: v_subrev_u32_e32 v6, s8, v0 1598; 
GFX9-NEXT: v_mul_hi_u32 v5, v3, v5 1599; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 1600; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc 1601; GFX9-NEXT: v_mul_lo_u32 v2, v2, s10 1602; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 1603; GFX9-NEXT: v_mul_hi_u32 v3, s7, v3 1604; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 1605; GFX9-NEXT: v_subrev_u32_e32 v6, s8, v0 1606; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 1607; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc 1608; GFX9-NEXT: v_subrev_u32_e32 v6, s9, v1 1609; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 1610; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 1611; GFX9-NEXT: v_mul_lo_u32 v3, v3, s11 1612; GFX9-NEXT: v_subrev_u32_e32 v6, s9, v1 1613; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 1614; GFX9-NEXT: v_sub_u32_e32 v2, s6, v2 1615; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 1616; GFX9-NEXT: v_subrev_u32_e32 v5, s10, v2 1617; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 1618; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 1619; GFX9-NEXT: v_subrev_u32_e32 v5, s10, v2 1620; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 1621; GFX9-NEXT: v_sub_u32_e32 v3, s7, v3 1622; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 1623; GFX9-NEXT: v_subrev_u32_e32 v5, s11, v3 1624; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1625; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 1626; GFX9-NEXT: v_subrev_u32_e32 v5, s11, v3 1627; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1628; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 1629; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 1630; GFX9-NEXT: s_endpgm 1631 %r = urem <4 x i32> %x, %y 1632 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 1633 ret void 1634} 1635 1636define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 1637; CHECK-LABEL: @sdiv_v4i32( 1638; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 1639; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 1640; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 1641; CHECK-NEXT: [[TMP4:%.*]] = ashr 
i32 [[TMP2]], 31 1642; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 1643; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]] 1644; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]] 1645; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]] 1646; CHECK-NEXT: [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]] 1647; CHECK-NEXT: [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float 1648; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]]) 1649; CHECK-NEXT: [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000 1650; CHECK-NEXT: [[TMP13:%.*]] = fptoui float [[TMP12]] to i32 1651; CHECK-NEXT: [[TMP14:%.*]] = sub i32 0, [[TMP9]] 1652; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]] 1653; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP13]] to i64 1654; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 1655; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 1656; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 1657; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 1658; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 1659; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]] 1660; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP8]] to i64 1661; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[TMP22]] to i64 1662; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]] 1663; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32 1664; CHECK-NEXT: [[TMP27:%.*]] = lshr i64 [[TMP25]], 32 1665; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 1666; CHECK-NEXT: [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]] 1667; CHECK-NEXT: [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]] 1668; CHECK-NEXT: [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]] 1669; CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP28]], 1 1670; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]] 1671; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]] 1672; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]] 1673; 
CHECK-NEXT: [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]] 1674; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP33]], 1 1675; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]] 1676; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]] 1677; CHECK-NEXT: [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]] 1678; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x i32> undef, i32 [[TMP40]], i64 0 1679; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i32> [[X]], i64 1 1680; CHECK-NEXT: [[TMP43:%.*]] = extractelement <4 x i32> [[Y]], i64 1 1681; CHECK-NEXT: [[TMP44:%.*]] = ashr i32 [[TMP42]], 31 1682; CHECK-NEXT: [[TMP45:%.*]] = ashr i32 [[TMP43]], 31 1683; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]] 1684; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]] 1685; CHECK-NEXT: [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]] 1686; CHECK-NEXT: [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]] 1687; CHECK-NEXT: [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]] 1688; CHECK-NEXT: [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float 1689; CHECK-NEXT: [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]]) 1690; CHECK-NEXT: [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000 1691; CHECK-NEXT: [[TMP54:%.*]] = fptoui float [[TMP53]] to i32 1692; CHECK-NEXT: [[TMP55:%.*]] = sub i32 0, [[TMP50]] 1693; CHECK-NEXT: [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]] 1694; CHECK-NEXT: [[TMP57:%.*]] = zext i32 [[TMP54]] to i64 1695; CHECK-NEXT: [[TMP58:%.*]] = zext i32 [[TMP56]] to i64 1696; CHECK-NEXT: [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]] 1697; CHECK-NEXT: [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32 1698; CHECK-NEXT: [[TMP61:%.*]] = lshr i64 [[TMP59]], 32 1699; CHECK-NEXT: [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32 1700; CHECK-NEXT: [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]] 1701; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[TMP49]] to i64 1702; CHECK-NEXT: [[TMP65:%.*]] = zext i32 [[TMP63]] to i64 1703; CHECK-NEXT: [[TMP66:%.*]] = mul i64 [[TMP64]], 
[[TMP65]] 1704; CHECK-NEXT: [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32 1705; CHECK-NEXT: [[TMP68:%.*]] = lshr i64 [[TMP66]], 32 1706; CHECK-NEXT: [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32 1707; CHECK-NEXT: [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]] 1708; CHECK-NEXT: [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]] 1709; CHECK-NEXT: [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]] 1710; CHECK-NEXT: [[TMP73:%.*]] = add i32 [[TMP69]], 1 1711; CHECK-NEXT: [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]] 1712; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]] 1713; CHECK-NEXT: [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]] 1714; CHECK-NEXT: [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]] 1715; CHECK-NEXT: [[TMP78:%.*]] = add i32 [[TMP74]], 1 1716; CHECK-NEXT: [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]] 1717; CHECK-NEXT: [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]] 1718; CHECK-NEXT: [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]] 1719; CHECK-NEXT: [[TMP82:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[TMP81]], i64 1 1720; CHECK-NEXT: [[TMP83:%.*]] = extractelement <4 x i32> [[X]], i64 2 1721; CHECK-NEXT: [[TMP84:%.*]] = extractelement <4 x i32> [[Y]], i64 2 1722; CHECK-NEXT: [[TMP85:%.*]] = ashr i32 [[TMP83]], 31 1723; CHECK-NEXT: [[TMP86:%.*]] = ashr i32 [[TMP84]], 31 1724; CHECK-NEXT: [[TMP87:%.*]] = xor i32 [[TMP85]], [[TMP86]] 1725; CHECK-NEXT: [[TMP88:%.*]] = add i32 [[TMP83]], [[TMP85]] 1726; CHECK-NEXT: [[TMP89:%.*]] = add i32 [[TMP84]], [[TMP86]] 1727; CHECK-NEXT: [[TMP90:%.*]] = xor i32 [[TMP88]], [[TMP85]] 1728; CHECK-NEXT: [[TMP91:%.*]] = xor i32 [[TMP89]], [[TMP86]] 1729; CHECK-NEXT: [[TMP92:%.*]] = uitofp i32 [[TMP91]] to float 1730; CHECK-NEXT: [[TMP93:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP92]]) 1731; CHECK-NEXT: [[TMP94:%.*]] = fmul fast float [[TMP93]], 0x41EFFFFFC0000000 1732; CHECK-NEXT: [[TMP95:%.*]] = fptoui float [[TMP94]] to i32 1733; CHECK-NEXT: [[TMP96:%.*]] = sub 
i32 0, [[TMP91]] 1734; CHECK-NEXT: [[TMP97:%.*]] = mul i32 [[TMP96]], [[TMP95]] 1735; CHECK-NEXT: [[TMP98:%.*]] = zext i32 [[TMP95]] to i64 1736; CHECK-NEXT: [[TMP99:%.*]] = zext i32 [[TMP97]] to i64 1737; CHECK-NEXT: [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]] 1738; CHECK-NEXT: [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32 1739; CHECK-NEXT: [[TMP102:%.*]] = lshr i64 [[TMP100]], 32 1740; CHECK-NEXT: [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32 1741; CHECK-NEXT: [[TMP104:%.*]] = add i32 [[TMP95]], [[TMP103]] 1742; CHECK-NEXT: [[TMP105:%.*]] = zext i32 [[TMP90]] to i64 1743; CHECK-NEXT: [[TMP106:%.*]] = zext i32 [[TMP104]] to i64 1744; CHECK-NEXT: [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]] 1745; CHECK-NEXT: [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32 1746; CHECK-NEXT: [[TMP109:%.*]] = lshr i64 [[TMP107]], 32 1747; CHECK-NEXT: [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32 1748; CHECK-NEXT: [[TMP111:%.*]] = mul i32 [[TMP110]], [[TMP91]] 1749; CHECK-NEXT: [[TMP112:%.*]] = sub i32 [[TMP90]], [[TMP111]] 1750; CHECK-NEXT: [[TMP113:%.*]] = icmp uge i32 [[TMP112]], [[TMP91]] 1751; CHECK-NEXT: [[TMP114:%.*]] = add i32 [[TMP110]], 1 1752; CHECK-NEXT: [[TMP115:%.*]] = select i1 [[TMP113]], i32 [[TMP114]], i32 [[TMP110]] 1753; CHECK-NEXT: [[TMP116:%.*]] = sub i32 [[TMP112]], [[TMP91]] 1754; CHECK-NEXT: [[TMP117:%.*]] = select i1 [[TMP113]], i32 [[TMP116]], i32 [[TMP112]] 1755; CHECK-NEXT: [[TMP118:%.*]] = icmp uge i32 [[TMP117]], [[TMP91]] 1756; CHECK-NEXT: [[TMP119:%.*]] = add i32 [[TMP115]], 1 1757; CHECK-NEXT: [[TMP120:%.*]] = select i1 [[TMP118]], i32 [[TMP119]], i32 [[TMP115]] 1758; CHECK-NEXT: [[TMP121:%.*]] = xor i32 [[TMP120]], [[TMP87]] 1759; CHECK-NEXT: [[TMP122:%.*]] = sub i32 [[TMP121]], [[TMP87]] 1760; CHECK-NEXT: [[TMP123:%.*]] = insertelement <4 x i32> [[TMP82]], i32 [[TMP122]], i64 2 1761; CHECK-NEXT: [[TMP124:%.*]] = extractelement <4 x i32> [[X]], i64 3 1762; CHECK-NEXT: [[TMP125:%.*]] = extractelement <4 x i32> [[Y]], i64 3 1763; CHECK-NEXT: 
[[TMP126:%.*]] = ashr i32 [[TMP124]], 31 1764; CHECK-NEXT: [[TMP127:%.*]] = ashr i32 [[TMP125]], 31 1765; CHECK-NEXT: [[TMP128:%.*]] = xor i32 [[TMP126]], [[TMP127]] 1766; CHECK-NEXT: [[TMP129:%.*]] = add i32 [[TMP124]], [[TMP126]] 1767; CHECK-NEXT: [[TMP130:%.*]] = add i32 [[TMP125]], [[TMP127]] 1768; CHECK-NEXT: [[TMP131:%.*]] = xor i32 [[TMP129]], [[TMP126]] 1769; CHECK-NEXT: [[TMP132:%.*]] = xor i32 [[TMP130]], [[TMP127]] 1770; CHECK-NEXT: [[TMP133:%.*]] = uitofp i32 [[TMP132]] to float 1771; CHECK-NEXT: [[TMP134:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP133]]) 1772; CHECK-NEXT: [[TMP135:%.*]] = fmul fast float [[TMP134]], 0x41EFFFFFC0000000 1773; CHECK-NEXT: [[TMP136:%.*]] = fptoui float [[TMP135]] to i32 1774; CHECK-NEXT: [[TMP137:%.*]] = sub i32 0, [[TMP132]] 1775; CHECK-NEXT: [[TMP138:%.*]] = mul i32 [[TMP137]], [[TMP136]] 1776; CHECK-NEXT: [[TMP139:%.*]] = zext i32 [[TMP136]] to i64 1777; CHECK-NEXT: [[TMP140:%.*]] = zext i32 [[TMP138]] to i64 1778; CHECK-NEXT: [[TMP141:%.*]] = mul i64 [[TMP139]], [[TMP140]] 1779; CHECK-NEXT: [[TMP142:%.*]] = trunc i64 [[TMP141]] to i32 1780; CHECK-NEXT: [[TMP143:%.*]] = lshr i64 [[TMP141]], 32 1781; CHECK-NEXT: [[TMP144:%.*]] = trunc i64 [[TMP143]] to i32 1782; CHECK-NEXT: [[TMP145:%.*]] = add i32 [[TMP136]], [[TMP144]] 1783; CHECK-NEXT: [[TMP146:%.*]] = zext i32 [[TMP131]] to i64 1784; CHECK-NEXT: [[TMP147:%.*]] = zext i32 [[TMP145]] to i64 1785; CHECK-NEXT: [[TMP148:%.*]] = mul i64 [[TMP146]], [[TMP147]] 1786; CHECK-NEXT: [[TMP149:%.*]] = trunc i64 [[TMP148]] to i32 1787; CHECK-NEXT: [[TMP150:%.*]] = lshr i64 [[TMP148]], 32 1788; CHECK-NEXT: [[TMP151:%.*]] = trunc i64 [[TMP150]] to i32 1789; CHECK-NEXT: [[TMP152:%.*]] = mul i32 [[TMP151]], [[TMP132]] 1790; CHECK-NEXT: [[TMP153:%.*]] = sub i32 [[TMP131]], [[TMP152]] 1791; CHECK-NEXT: [[TMP154:%.*]] = icmp uge i32 [[TMP153]], [[TMP132]] 1792; CHECK-NEXT: [[TMP155:%.*]] = add i32 [[TMP151]], 1 1793; CHECK-NEXT: [[TMP156:%.*]] = select i1 [[TMP154]], i32 
[[TMP155]], i32 [[TMP151]] 1794; CHECK-NEXT: [[TMP157:%.*]] = sub i32 [[TMP153]], [[TMP132]] 1795; CHECK-NEXT: [[TMP158:%.*]] = select i1 [[TMP154]], i32 [[TMP157]], i32 [[TMP153]] 1796; CHECK-NEXT: [[TMP159:%.*]] = icmp uge i32 [[TMP158]], [[TMP132]] 1797; CHECK-NEXT: [[TMP160:%.*]] = add i32 [[TMP156]], 1 1798; CHECK-NEXT: [[TMP161:%.*]] = select i1 [[TMP159]], i32 [[TMP160]], i32 [[TMP156]] 1799; CHECK-NEXT: [[TMP162:%.*]] = xor i32 [[TMP161]], [[TMP128]] 1800; CHECK-NEXT: [[TMP163:%.*]] = sub i32 [[TMP162]], [[TMP128]] 1801; CHECK-NEXT: [[TMP164:%.*]] = insertelement <4 x i32> [[TMP123]], i32 [[TMP163]], i64 3 1802; CHECK-NEXT: store <4 x i32> [[TMP164]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 1803; CHECK-NEXT: ret void 1804; 1805; GFX6-LABEL: sdiv_v4i32: 1806; GFX6: ; %bb.0: 1807; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 1808; GFX6-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 1809; GFX6-NEXT: s_mov_b32 s15, 0xf000 1810; GFX6-NEXT: s_mov_b32 s14, -1 1811; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1812; GFX6-NEXT: s_ashr_i32 s2, s8, 31 1813; GFX6-NEXT: s_add_i32 s3, s8, s2 1814; GFX6-NEXT: s_xor_b32 s3, s3, s2 1815; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 1816; GFX6-NEXT: s_ashr_i32 s8, s9, 31 1817; GFX6-NEXT: s_add_i32 s0, s9, s8 1818; GFX6-NEXT: s_xor_b32 s9, s0, s8 1819; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 1820; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 1821; GFX6-NEXT: s_sub_i32 s1, 0, s3 1822; GFX6-NEXT: s_ashr_i32 s0, s4, 31 1823; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 1824; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 1825; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 1826; GFX6-NEXT: s_xor_b32 s2, s0, s2 1827; GFX6-NEXT: v_mul_lo_u32 v2, s1, v0 1828; GFX6-NEXT: s_add_i32 s1, s4, s0 1829; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 1830; GFX6-NEXT: s_xor_b32 s1, s1, s0 1831; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 1832; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 1833; GFX6-NEXT: s_sub_i32 s0, 0, s9 1834; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 1835; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 
1836; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 1837; GFX6-NEXT: v_mul_lo_u32 v3, v0, s3 1838; GFX6-NEXT: v_mul_hi_u32 v2, v1, v2 1839; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v0 1840; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s1, v3 1841; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v3 1842; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] 1843; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s3, v3 1844; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] 1845; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v0 1846; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 1847; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 1848; GFX6-NEXT: s_ashr_i32 s0, s5, 31 1849; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 1850; GFX6-NEXT: s_add_i32 s1, s5, s0 1851; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 1852; GFX6-NEXT: s_ashr_i32 s3, s10, 31 1853; GFX6-NEXT: s_xor_b32 s1, s1, s0 1854; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 1855; GFX6-NEXT: s_xor_b32 s2, s0, s8 1856; GFX6-NEXT: s_add_i32 s0, s10, s3 1857; GFX6-NEXT: s_xor_b32 s4, s0, s3 1858; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s4 1859; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 1860; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 1861; GFX6-NEXT: v_mul_lo_u32 v2, v1, s9 1862; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v1 1863; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 1864; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 1865; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 1866; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2 1867; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] 1868; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s9, v2 1869; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] 1870; GFX6-NEXT: s_sub_i32 s0, 0, s4 1871; GFX6-NEXT: v_mul_lo_u32 v5, s0, v3 1872; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v1 1873; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 1874; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 1875; GFX6-NEXT: v_mul_hi_u32 v2, v3, v5 1876; GFX6-NEXT: v_xor_b32_e32 v1, s2, v1 1877; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s2, v1 1878; GFX6-NEXT: s_ashr_i32 s2, s11, 31 1879; GFX6-NEXT: s_ashr_i32 s0, s6, 31 1880; GFX6-NEXT: s_add_i32 s5, 
s11, s2 1881; GFX6-NEXT: s_add_i32 s1, s6, s0 1882; GFX6-NEXT: s_xor_b32 s5, s5, s2 1883; GFX6-NEXT: s_xor_b32 s1, s1, s0 1884; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 1885; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s5 1886; GFX6-NEXT: v_mul_hi_u32 v2, s1, v2 1887; GFX6-NEXT: s_xor_b32 s3, s0, s3 1888; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v4 1889; GFX6-NEXT: v_mul_lo_u32 v3, v2, s4 1890; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v2 1891; GFX6-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 1892; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s1, v3 1893; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 1894; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v3 1895; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] 1896; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s4, v3 1897; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 1898; GFX6-NEXT: s_sub_i32 s0, 0, s5 1899; GFX6-NEXT: v_mul_lo_u32 v5, s0, v4 1900; GFX6-NEXT: s_ashr_i32 s0, s7, 31 1901; GFX6-NEXT: s_add_i32 s1, s7, s0 1902; GFX6-NEXT: s_xor_b32 s1, s1, s0 1903; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5 1904; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v2 1905; GFX6-NEXT: s_xor_b32 s2, s0, s2 1906; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 1907; GFX6-NEXT: v_mul_hi_u32 v4, s1, v4 1908; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 1909; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 1910; GFX6-NEXT: v_xor_b32_e32 v2, s3, v2 1911; GFX6-NEXT: v_mul_lo_u32 v3, v4, s5 1912; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 1913; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v2 1914; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s1, v3 1915; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v3 1916; GFX6-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] 1917; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v3 1918; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 1919; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 1920; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 1921; GFX6-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 1922; GFX6-NEXT: v_xor_b32_e32 v3, s2, v3 1923; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v3 1924; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 
0 1925; GFX6-NEXT: s_endpgm 1926; 1927; GFX9-LABEL: sdiv_v4i32: 1928; GFX9: ; %bb.0: 1929; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 1930; GFX9-NEXT: v_mov_b32_e32 v4, 0 1931; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1932; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1933; GFX9-NEXT: s_ashr_i32 s2, s8, 31 1934; GFX9-NEXT: s_add_i32 s3, s8, s2 1935; GFX9-NEXT: s_xor_b32 s3, s3, s2 1936; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 1937; GFX9-NEXT: s_ashr_i32 s12, s9, 31 1938; GFX9-NEXT: s_add_i32 s9, s9, s12 1939; GFX9-NEXT: s_xor_b32 s9, s9, s12 1940; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 1941; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 1942; GFX9-NEXT: s_sub_i32 s14, 0, s3 1943; GFX9-NEXT: s_ashr_i32 s8, s4, 31 1944; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 1945; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 1946; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 1947; GFX9-NEXT: s_add_i32 s4, s4, s8 1948; GFX9-NEXT: s_xor_b32 s4, s4, s8 1949; GFX9-NEXT: v_mul_lo_u32 v2, s14, v0 1950; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 1951; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 1952; GFX9-NEXT: s_sub_i32 s14, 0, s9 1953; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 1954; GFX9-NEXT: s_ashr_i32 s13, s5, 31 1955; GFX9-NEXT: v_mul_lo_u32 v3, s14, v1 1956; GFX9-NEXT: s_add_i32 s5, s5, s13 1957; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 1958; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 1959; GFX9-NEXT: v_mul_hi_u32 v2, v1, v3 1960; GFX9-NEXT: s_xor_b32 s5, s5, s13 1961; GFX9-NEXT: s_xor_b32 s2, s8, s2 1962; GFX9-NEXT: v_mul_lo_u32 v3, v0, s3 1963; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 1964; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 1965; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 1966; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 1967; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 1968; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 1969; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v3 1970; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc 1971; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 1972; GFX9-NEXT: s_ashr_i32 s3, s10, 31 1973; GFX9-NEXT: s_add_i32 s4, s10, s3 1974; GFX9-NEXT: 
v_add_u32_e32 v3, 1, v0 1975; GFX9-NEXT: s_xor_b32 s4, s4, s3 1976; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1977; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 1978; GFX9-NEXT: v_mul_lo_u32 v2, v1, s9 1979; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 1980; GFX9-NEXT: s_ashr_i32 s8, s11, 31 1981; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 1982; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 1983; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 1984; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 1985; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 1986; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 1987; GFX9-NEXT: v_subrev_u32_e32 v5, s9, v2 1988; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 1989; GFX9-NEXT: s_sub_i32 s5, 0, s4 1990; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 1991; GFX9-NEXT: v_mul_lo_u32 v2, s5, v3 1992; GFX9-NEXT: s_add_i32 s9, s11, s8 1993; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 1994; GFX9-NEXT: s_xor_b32 s9, s9, s8 1995; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 1996; GFX9-NEXT: v_mul_hi_u32 v2, v3, v2 1997; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s9 1998; GFX9-NEXT: s_ashr_i32 s5, s6, 31 1999; GFX9-NEXT: s_add_i32 s6, s6, s5 2000; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 2001; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v5 2002; GFX9-NEXT: s_xor_b32 s6, s6, s5 2003; GFX9-NEXT: v_mul_hi_u32 v2, s6, v2 2004; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 2005; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 2006; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 2007; GFX9-NEXT: v_subrev_u32_e32 v0, s2, v0 2008; GFX9-NEXT: s_xor_b32 s2, s13, s12 2009; GFX9-NEXT: v_mul_lo_u32 v5, v2, s4 2010; GFX9-NEXT: v_xor_b32_e32 v1, s2, v1 2011; GFX9-NEXT: v_subrev_u32_e32 v1, s2, v1 2012; GFX9-NEXT: s_xor_b32 s2, s5, s3 2013; GFX9-NEXT: s_sub_i32 s3, 0, s9 2014; GFX9-NEXT: v_mul_lo_u32 v7, s3, v3 2015; GFX9-NEXT: v_sub_u32_e32 v5, s6, v5 2016; GFX9-NEXT: v_add_u32_e32 v6, 1, v2 2017; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v5 2018; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 2019; GFX9-NEXT: v_subrev_u32_e32 v6, s4, v5 2020; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, 
v6, vcc 2021; GFX9-NEXT: v_mul_hi_u32 v6, v3, v7 2022; GFX9-NEXT: s_ashr_i32 s3, s7, 31 2023; GFX9-NEXT: s_add_i32 s5, s7, s3 2024; GFX9-NEXT: s_xor_b32 s5, s5, s3 2025; GFX9-NEXT: v_add_u32_e32 v3, v3, v6 2026; GFX9-NEXT: v_mul_hi_u32 v3, s5, v3 2027; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v5 2028; GFX9-NEXT: v_add_u32_e32 v6, 1, v2 2029; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 2030; GFX9-NEXT: v_mul_lo_u32 v5, v3, s9 2031; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 2032; GFX9-NEXT: v_xor_b32_e32 v2, s2, v2 2033; GFX9-NEXT: v_subrev_u32_e32 v2, s2, v2 2034; GFX9-NEXT: v_sub_u32_e32 v5, s5, v5 2035; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v5 2036; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 2037; GFX9-NEXT: v_subrev_u32_e32 v6, s9, v5 2038; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 2039; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 2040; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v5 2041; GFX9-NEXT: s_xor_b32 s2, s3, s8 2042; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 2043; GFX9-NEXT: v_xor_b32_e32 v3, s2, v3 2044; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v3 2045; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 2046; GFX9-NEXT: s_endpgm 2047 %r = sdiv <4 x i32> %x, %y 2048 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 2049 ret void 2050} 2051 2052define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 2053; CHECK-LABEL: @srem_v4i32( 2054; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 2055; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 2056; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 2057; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 2058; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]] 2059; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]] 2060; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]] 2061; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]] 2062; CHECK-NEXT: [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float 2063; CHECK-NEXT: [[TMP10:%.*]] = call fast float 
@llvm.amdgcn.rcp.f32(float [[TMP9]]) 2064; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000 2065; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP11]] to i32 2066; CHECK-NEXT: [[TMP13:%.*]] = sub i32 0, [[TMP8]] 2067; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]] 2068; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP12]] to i64 2069; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 2070; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]] 2071; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 2072; CHECK-NEXT: [[TMP19:%.*]] = lshr i64 [[TMP17]], 32 2073; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 2074; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]] 2075; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP7]] to i64 2076; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP21]] to i64 2077; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]] 2078; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32 2079; CHECK-NEXT: [[TMP26:%.*]] = lshr i64 [[TMP24]], 32 2080; CHECK-NEXT: [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32 2081; CHECK-NEXT: [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]] 2082; CHECK-NEXT: [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]] 2083; CHECK-NEXT: [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]] 2084; CHECK-NEXT: [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]] 2085; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP30]], i32 [[TMP31]], i32 [[TMP29]] 2086; CHECK-NEXT: [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]] 2087; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]] 2088; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]] 2089; CHECK-NEXT: [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]] 2090; CHECK-NEXT: [[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]] 2091; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i32> undef, i32 [[TMP37]], i64 0 2092; CHECK-NEXT: [[TMP39:%.*]] = extractelement <4 x i32> [[X]], i64 1 2093; CHECK-NEXT: [[TMP40:%.*]] = extractelement <4 x i32> [[Y]], i64 1 2094; 
CHECK-NEXT: [[TMP41:%.*]] = ashr i32 [[TMP39]], 31 2095; CHECK-NEXT: [[TMP42:%.*]] = ashr i32 [[TMP40]], 31 2096; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]] 2097; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]] 2098; CHECK-NEXT: [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]] 2099; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]] 2100; CHECK-NEXT: [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float 2101; CHECK-NEXT: [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]]) 2102; CHECK-NEXT: [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000 2103; CHECK-NEXT: [[TMP50:%.*]] = fptoui float [[TMP49]] to i32 2104; CHECK-NEXT: [[TMP51:%.*]] = sub i32 0, [[TMP46]] 2105; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]] 2106; CHECK-NEXT: [[TMP53:%.*]] = zext i32 [[TMP50]] to i64 2107; CHECK-NEXT: [[TMP54:%.*]] = zext i32 [[TMP52]] to i64 2108; CHECK-NEXT: [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]] 2109; CHECK-NEXT: [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32 2110; CHECK-NEXT: [[TMP57:%.*]] = lshr i64 [[TMP55]], 32 2111; CHECK-NEXT: [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32 2112; CHECK-NEXT: [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]] 2113; CHECK-NEXT: [[TMP60:%.*]] = zext i32 [[TMP45]] to i64 2114; CHECK-NEXT: [[TMP61:%.*]] = zext i32 [[TMP59]] to i64 2115; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]] 2116; CHECK-NEXT: [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32 2117; CHECK-NEXT: [[TMP64:%.*]] = lshr i64 [[TMP62]], 32 2118; CHECK-NEXT: [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32 2119; CHECK-NEXT: [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]] 2120; CHECK-NEXT: [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]] 2121; CHECK-NEXT: [[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]] 2122; CHECK-NEXT: [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]] 2123; CHECK-NEXT: [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]] 2124; CHECK-NEXT: [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]] 2125; CHECK-NEXT: 
[[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]] 2126; CHECK-NEXT: [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]] 2127; CHECK-NEXT: [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]] 2128; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]] 2129; CHECK-NEXT: [[TMP76:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP75]], i64 1 2130; CHECK-NEXT: [[TMP77:%.*]] = extractelement <4 x i32> [[X]], i64 2 2131; CHECK-NEXT: [[TMP78:%.*]] = extractelement <4 x i32> [[Y]], i64 2 2132; CHECK-NEXT: [[TMP79:%.*]] = ashr i32 [[TMP77]], 31 2133; CHECK-NEXT: [[TMP80:%.*]] = ashr i32 [[TMP78]], 31 2134; CHECK-NEXT: [[TMP81:%.*]] = add i32 [[TMP77]], [[TMP79]] 2135; CHECK-NEXT: [[TMP82:%.*]] = add i32 [[TMP78]], [[TMP80]] 2136; CHECK-NEXT: [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP79]] 2137; CHECK-NEXT: [[TMP84:%.*]] = xor i32 [[TMP82]], [[TMP80]] 2138; CHECK-NEXT: [[TMP85:%.*]] = uitofp i32 [[TMP84]] to float 2139; CHECK-NEXT: [[TMP86:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP85]]) 2140; CHECK-NEXT: [[TMP87:%.*]] = fmul fast float [[TMP86]], 0x41EFFFFFC0000000 2141; CHECK-NEXT: [[TMP88:%.*]] = fptoui float [[TMP87]] to i32 2142; CHECK-NEXT: [[TMP89:%.*]] = sub i32 0, [[TMP84]] 2143; CHECK-NEXT: [[TMP90:%.*]] = mul i32 [[TMP89]], [[TMP88]] 2144; CHECK-NEXT: [[TMP91:%.*]] = zext i32 [[TMP88]] to i64 2145; CHECK-NEXT: [[TMP92:%.*]] = zext i32 [[TMP90]] to i64 2146; CHECK-NEXT: [[TMP93:%.*]] = mul i64 [[TMP91]], [[TMP92]] 2147; CHECK-NEXT: [[TMP94:%.*]] = trunc i64 [[TMP93]] to i32 2148; CHECK-NEXT: [[TMP95:%.*]] = lshr i64 [[TMP93]], 32 2149; CHECK-NEXT: [[TMP96:%.*]] = trunc i64 [[TMP95]] to i32 2150; CHECK-NEXT: [[TMP97:%.*]] = add i32 [[TMP88]], [[TMP96]] 2151; CHECK-NEXT: [[TMP98:%.*]] = zext i32 [[TMP83]] to i64 2152; CHECK-NEXT: [[TMP99:%.*]] = zext i32 [[TMP97]] to i64 2153; CHECK-NEXT: [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]] 2154; CHECK-NEXT: [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32 2155; CHECK-NEXT: [[TMP102:%.*]] = lshr i64 [[TMP100]], 32 
2156; CHECK-NEXT: [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32 2157; CHECK-NEXT: [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP84]] 2158; CHECK-NEXT: [[TMP105:%.*]] = sub i32 [[TMP83]], [[TMP104]] 2159; CHECK-NEXT: [[TMP106:%.*]] = icmp uge i32 [[TMP105]], [[TMP84]] 2160; CHECK-NEXT: [[TMP107:%.*]] = sub i32 [[TMP105]], [[TMP84]] 2161; CHECK-NEXT: [[TMP108:%.*]] = select i1 [[TMP106]], i32 [[TMP107]], i32 [[TMP105]] 2162; CHECK-NEXT: [[TMP109:%.*]] = icmp uge i32 [[TMP108]], [[TMP84]] 2163; CHECK-NEXT: [[TMP110:%.*]] = sub i32 [[TMP108]], [[TMP84]] 2164; CHECK-NEXT: [[TMP111:%.*]] = select i1 [[TMP109]], i32 [[TMP110]], i32 [[TMP108]] 2165; CHECK-NEXT: [[TMP112:%.*]] = xor i32 [[TMP111]], [[TMP79]] 2166; CHECK-NEXT: [[TMP113:%.*]] = sub i32 [[TMP112]], [[TMP79]] 2167; CHECK-NEXT: [[TMP114:%.*]] = insertelement <4 x i32> [[TMP76]], i32 [[TMP113]], i64 2 2168; CHECK-NEXT: [[TMP115:%.*]] = extractelement <4 x i32> [[X]], i64 3 2169; CHECK-NEXT: [[TMP116:%.*]] = extractelement <4 x i32> [[Y]], i64 3 2170; CHECK-NEXT: [[TMP117:%.*]] = ashr i32 [[TMP115]], 31 2171; CHECK-NEXT: [[TMP118:%.*]] = ashr i32 [[TMP116]], 31 2172; CHECK-NEXT: [[TMP119:%.*]] = add i32 [[TMP115]], [[TMP117]] 2173; CHECK-NEXT: [[TMP120:%.*]] = add i32 [[TMP116]], [[TMP118]] 2174; CHECK-NEXT: [[TMP121:%.*]] = xor i32 [[TMP119]], [[TMP117]] 2175; CHECK-NEXT: [[TMP122:%.*]] = xor i32 [[TMP120]], [[TMP118]] 2176; CHECK-NEXT: [[TMP123:%.*]] = uitofp i32 [[TMP122]] to float 2177; CHECK-NEXT: [[TMP124:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP123]]) 2178; CHECK-NEXT: [[TMP125:%.*]] = fmul fast float [[TMP124]], 0x41EFFFFFC0000000 2179; CHECK-NEXT: [[TMP126:%.*]] = fptoui float [[TMP125]] to i32 2180; CHECK-NEXT: [[TMP127:%.*]] = sub i32 0, [[TMP122]] 2181; CHECK-NEXT: [[TMP128:%.*]] = mul i32 [[TMP127]], [[TMP126]] 2182; CHECK-NEXT: [[TMP129:%.*]] = zext i32 [[TMP126]] to i64 2183; CHECK-NEXT: [[TMP130:%.*]] = zext i32 [[TMP128]] to i64 2184; CHECK-NEXT: [[TMP131:%.*]] = mul i64 [[TMP129]], 
[[TMP130]] 2185; CHECK-NEXT: [[TMP132:%.*]] = trunc i64 [[TMP131]] to i32 2186; CHECK-NEXT: [[TMP133:%.*]] = lshr i64 [[TMP131]], 32 2187; CHECK-NEXT: [[TMP134:%.*]] = trunc i64 [[TMP133]] to i32 2188; CHECK-NEXT: [[TMP135:%.*]] = add i32 [[TMP126]], [[TMP134]] 2189; CHECK-NEXT: [[TMP136:%.*]] = zext i32 [[TMP121]] to i64 2190; CHECK-NEXT: [[TMP137:%.*]] = zext i32 [[TMP135]] to i64 2191; CHECK-NEXT: [[TMP138:%.*]] = mul i64 [[TMP136]], [[TMP137]] 2192; CHECK-NEXT: [[TMP139:%.*]] = trunc i64 [[TMP138]] to i32 2193; CHECK-NEXT: [[TMP140:%.*]] = lshr i64 [[TMP138]], 32 2194; CHECK-NEXT: [[TMP141:%.*]] = trunc i64 [[TMP140]] to i32 2195; CHECK-NEXT: [[TMP142:%.*]] = mul i32 [[TMP141]], [[TMP122]] 2196; CHECK-NEXT: [[TMP143:%.*]] = sub i32 [[TMP121]], [[TMP142]] 2197; CHECK-NEXT: [[TMP144:%.*]] = icmp uge i32 [[TMP143]], [[TMP122]] 2198; CHECK-NEXT: [[TMP145:%.*]] = sub i32 [[TMP143]], [[TMP122]] 2199; CHECK-NEXT: [[TMP146:%.*]] = select i1 [[TMP144]], i32 [[TMP145]], i32 [[TMP143]] 2200; CHECK-NEXT: [[TMP147:%.*]] = icmp uge i32 [[TMP146]], [[TMP122]] 2201; CHECK-NEXT: [[TMP148:%.*]] = sub i32 [[TMP146]], [[TMP122]] 2202; CHECK-NEXT: [[TMP149:%.*]] = select i1 [[TMP147]], i32 [[TMP148]], i32 [[TMP146]] 2203; CHECK-NEXT: [[TMP150:%.*]] = xor i32 [[TMP149]], [[TMP117]] 2204; CHECK-NEXT: [[TMP151:%.*]] = sub i32 [[TMP150]], [[TMP117]] 2205; CHECK-NEXT: [[TMP152:%.*]] = insertelement <4 x i32> [[TMP114]], i32 [[TMP151]], i64 3 2206; CHECK-NEXT: store <4 x i32> [[TMP152]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 2207; CHECK-NEXT: ret void 2208; 2209; GFX6-LABEL: srem_v4i32: 2210; GFX6: ; %bb.0: 2211; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 2212; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2213; GFX6-NEXT: s_mov_b32 s3, 0xf000 2214; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2215; GFX6-NEXT: s_ashr_i32 s2, s8, 31 2216; GFX6-NEXT: s_add_i32 s8, s8, s2 2217; GFX6-NEXT: s_xor_b32 s8, s8, s2 2218; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 2219; GFX6-NEXT: s_ashr_i32 s13, s9, 31 
2220; GFX6-NEXT: s_add_i32 s9, s9, s13 2221; GFX6-NEXT: s_xor_b32 s9, s9, s13 2222; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 2223; GFX6-NEXT: s_sub_i32 s14, 0, s8 2224; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 2225; GFX6-NEXT: s_ashr_i32 s12, s4, 31 2226; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 2227; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 2228; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 2229; GFX6-NEXT: s_add_i32 s4, s4, s12 2230; GFX6-NEXT: s_xor_b32 s4, s4, s12 2231; GFX6-NEXT: v_mul_lo_u32 v2, s14, v0 2232; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 2233; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 2234; GFX6-NEXT: s_sub_i32 s14, 0, s9 2235; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 2236; GFX6-NEXT: s_ashr_i32 s13, s5, 31 2237; GFX6-NEXT: s_add_i32 s5, s5, s13 2238; GFX6-NEXT: s_xor_b32 s5, s5, s13 2239; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 2240; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 2241; GFX6-NEXT: v_mul_lo_u32 v2, s14, v1 2242; GFX6-NEXT: s_mov_b32 s2, -1 2243; GFX6-NEXT: v_mul_lo_u32 v0, v0, s8 2244; GFX6-NEXT: v_mul_hi_u32 v2, v1, v2 2245; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 2246; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 2247; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 2248; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 2249; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 2250; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 2251; GFX6-NEXT: s_ashr_i32 s4, s10, 31 2252; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 2253; GFX6-NEXT: s_add_i32 s8, s10, s4 2254; GFX6-NEXT: s_xor_b32 s4, s8, s4 2255; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s4 2256; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 2257; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 2258; GFX6-NEXT: v_xor_b32_e32 v0, s12, v0 2259; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 2260; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9 2261; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 2262; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 2263; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 2264; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 2265; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s9, v1 2266; 
GFX6-NEXT: s_sub_i32 s5, 0, s4 2267; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 2268; GFX6-NEXT: v_mul_lo_u32 v4, s5, v2 2269; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 2270; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s9, v1 2271; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 2272; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 2273; GFX6-NEXT: v_mul_hi_u32 v3, v2, v4 2274; GFX6-NEXT: s_ashr_i32 s8, s11, 31 2275; GFX6-NEXT: s_add_i32 s9, s11, s8 2276; GFX6-NEXT: s_ashr_i32 s5, s6, 31 2277; GFX6-NEXT: s_xor_b32 s8, s9, s8 2278; GFX6-NEXT: s_add_i32 s6, s6, s5 2279; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 2280; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s8 2281; GFX6-NEXT: s_xor_b32 s6, s6, s5 2282; GFX6-NEXT: v_mul_hi_u32 v2, s6, v2 2283; GFX6-NEXT: v_xor_b32_e32 v1, s13, v1 2284; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 2285; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s13, v1 2286; GFX6-NEXT: v_mul_lo_u32 v2, v2, s4 2287; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 2288; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 2289; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 2290; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s4, v2 2291; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 2292; GFX6-NEXT: s_sub_i32 s6, 0, s8 2293; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 2294; GFX6-NEXT: v_mul_lo_u32 v4, s6, v3 2295; GFX6-NEXT: s_ashr_i32 s6, s7, 31 2296; GFX6-NEXT: s_add_i32 s7, s7, s6 2297; GFX6-NEXT: s_xor_b32 s7, s7, s6 2298; GFX6-NEXT: v_mul_hi_u32 v4, v3, v4 2299; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s4, v2 2300; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 2301; GFX6-NEXT: v_mul_hi_u32 v3, s7, v3 2302; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 2303; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 2304; GFX6-NEXT: v_xor_b32_e32 v2, s5, v2 2305; GFX6-NEXT: v_mul_lo_u32 v3, v3, s8 2306; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s5, v2 2307; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s7, v3 2308; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s8, v3 2309; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 2310; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 2311; GFX6-NEXT: 
v_subrev_i32_e32 v4, vcc, s8, v3 2312; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 2313; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 2314; GFX6-NEXT: v_xor_b32_e32 v3, s6, v3 2315; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s6, v3 2316; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 2317; GFX6-NEXT: s_endpgm 2318; 2319; GFX9-LABEL: srem_v4i32: 2320; GFX9: ; %bb.0: 2321; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 2322; GFX9-NEXT: v_mov_b32_e32 v4, 0 2323; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2324; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2325; GFX9-NEXT: s_ashr_i32 s2, s8, 31 2326; GFX9-NEXT: s_add_i32 s8, s8, s2 2327; GFX9-NEXT: s_xor_b32 s2, s8, s2 2328; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 2329; GFX9-NEXT: s_ashr_i32 s3, s9, 31 2330; GFX9-NEXT: s_add_i32 s8, s9, s3 2331; GFX9-NEXT: s_sub_i32 s12, 0, s2 2332; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 2333; GFX9-NEXT: s_xor_b32 s3, s8, s3 2334; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 2335; GFX9-NEXT: s_ashr_i32 s8, s4, 31 2336; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 2337; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 2338; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 2339; GFX9-NEXT: s_add_i32 s4, s4, s8 2340; GFX9-NEXT: s_xor_b32 s4, s4, s8 2341; GFX9-NEXT: v_mul_lo_u32 v2, s12, v0 2342; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 2343; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 2344; GFX9-NEXT: s_sub_i32 s12, 0, s3 2345; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 2346; GFX9-NEXT: s_ashr_i32 s9, s5, 31 2347; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1 2348; GFX9-NEXT: s_ashr_i32 s12, s10, 31 2349; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 2350; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 2351; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 2352; GFX9-NEXT: s_add_i32 s5, s5, s9 2353; GFX9-NEXT: s_xor_b32 s5, s5, s9 2354; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2 2355; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 2356; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 2357; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 2358; GFX9-NEXT: v_subrev_u32_e32 v2, s2, v0 2359; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 2360; 
GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 2361; GFX9-NEXT: v_subrev_u32_e32 v2, s2, v0 2362; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 2363; GFX9-NEXT: s_add_i32 s2, s10, s12 2364; GFX9-NEXT: s_xor_b32 s2, s2, s12 2365; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 2366; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 2367; GFX9-NEXT: v_mul_lo_u32 v1, v1, s3 2368; GFX9-NEXT: v_xor_b32_e32 v0, s8, v0 2369; GFX9-NEXT: v_subrev_u32_e32 v0, s8, v0 2370; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 2371; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 2372; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v1 2373; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 2374; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 2375; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 2376; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 2377; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v1 2378; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 2379; GFX9-NEXT: s_sub_i32 s3, 0, s2 2380; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 2381; GFX9-NEXT: v_mul_lo_u32 v3, s3, v2 2382; GFX9-NEXT: s_ashr_i32 s3, s11, 31 2383; GFX9-NEXT: s_add_i32 s4, s11, s3 2384; GFX9-NEXT: s_xor_b32 s3, s4, s3 2385; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s3 2386; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 2387; GFX9-NEXT: s_ashr_i32 s4, s6, 31 2388; GFX9-NEXT: s_add_i32 s5, s6, s4 2389; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v5 2390; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 2391; GFX9-NEXT: s_xor_b32 s5, s5, s4 2392; GFX9-NEXT: v_mul_hi_u32 v2, s5, v2 2393; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v5 2394; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 2395; GFX9-NEXT: s_sub_i32 s6, 0, s3 2396; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 2397; GFX9-NEXT: v_xor_b32_e32 v1, s9, v1 2398; GFX9-NEXT: v_mul_lo_u32 v5, s6, v3 2399; GFX9-NEXT: v_subrev_u32_e32 v1, s9, v1 2400; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 2401; GFX9-NEXT: s_ashr_i32 s5, s7, 31 2402; GFX9-NEXT: v_mul_hi_u32 v5, v3, v5 2403; GFX9-NEXT: s_add_i32 s6, s7, s5 2404; GFX9-NEXT: s_xor_b32 s6, s6, s5 2405; GFX9-NEXT: v_subrev_u32_e32 v6, s2, v2 2406; GFX9-NEXT: v_add_u32_e32 v3, v3, 
v5 2407; GFX9-NEXT: v_mul_hi_u32 v3, s6, v3 2408; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 2409; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 2410; GFX9-NEXT: v_subrev_u32_e32 v5, s2, v2 2411; GFX9-NEXT: v_mul_lo_u32 v3, v3, s3 2412; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 2413; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 2414; GFX9-NEXT: v_xor_b32_e32 v2, s4, v2 2415; GFX9-NEXT: v_sub_u32_e32 v3, s6, v3 2416; GFX9-NEXT: v_subrev_u32_e32 v5, s3, v3 2417; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 2418; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 2419; GFX9-NEXT: v_subrev_u32_e32 v5, s3, v3 2420; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 2421; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 2422; GFX9-NEXT: v_xor_b32_e32 v3, s5, v3 2423; GFX9-NEXT: v_subrev_u32_e32 v2, s4, v2 2424; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v3 2425; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 2426; GFX9-NEXT: s_endpgm 2427 %r = srem <4 x i32> %x, %y 2428 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 2429 ret void 2430} 2431 2432define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 2433; CHECK-LABEL: @udiv_v4i16( 2434; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 2435; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 2436; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 2437; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 2438; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 2439; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 2440; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 2441; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 2442; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 2443; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 2444; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 2445; CHECK-NEXT: [[TMP12:%.*]] = 
fptoui float [[TMP9]] to i32 2446; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 2447; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 2448; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 2449; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 2450; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 2451; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 65535 2452; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16 2453; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i16> undef, i16 [[TMP19]], i64 0 2454; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i16> [[X]], i64 1 2455; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i16> [[Y]], i64 1 2456; CHECK-NEXT: [[TMP23:%.*]] = zext i16 [[TMP21]] to i32 2457; CHECK-NEXT: [[TMP24:%.*]] = zext i16 [[TMP22]] to i32 2458; CHECK-NEXT: [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float 2459; CHECK-NEXT: [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float 2460; CHECK-NEXT: [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]]) 2461; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]] 2462; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]]) 2463; CHECK-NEXT: [[TMP30:%.*]] = fneg fast float [[TMP29]] 2464; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]]) 2465; CHECK-NEXT: [[TMP32:%.*]] = fptoui float [[TMP29]] to i32 2466; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]]) 2467; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]]) 2468; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]] 2469; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0 2470; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]] 2471; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 65535 2472; CHECK-NEXT: [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16 2473; CHECK-NEXT: 
[[TMP40:%.*]] = insertelement <4 x i16> [[TMP20]], i16 [[TMP39]], i64 1 2474; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i16> [[X]], i64 2 2475; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i16> [[Y]], i64 2 2476; CHECK-NEXT: [[TMP43:%.*]] = zext i16 [[TMP41]] to i32 2477; CHECK-NEXT: [[TMP44:%.*]] = zext i16 [[TMP42]] to i32 2478; CHECK-NEXT: [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float 2479; CHECK-NEXT: [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float 2480; CHECK-NEXT: [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]]) 2481; CHECK-NEXT: [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]] 2482; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]]) 2483; CHECK-NEXT: [[TMP50:%.*]] = fneg fast float [[TMP49]] 2484; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]]) 2485; CHECK-NEXT: [[TMP52:%.*]] = fptoui float [[TMP49]] to i32 2486; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]]) 2487; CHECK-NEXT: [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]]) 2488; CHECK-NEXT: [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]] 2489; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0 2490; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]] 2491; CHECK-NEXT: [[TMP58:%.*]] = and i32 [[TMP57]], 65535 2492; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16 2493; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i16> [[TMP40]], i16 [[TMP59]], i64 2 2494; CHECK-NEXT: [[TMP61:%.*]] = extractelement <4 x i16> [[X]], i64 3 2495; CHECK-NEXT: [[TMP62:%.*]] = extractelement <4 x i16> [[Y]], i64 3 2496; CHECK-NEXT: [[TMP63:%.*]] = zext i16 [[TMP61]] to i32 2497; CHECK-NEXT: [[TMP64:%.*]] = zext i16 [[TMP62]] to i32 2498; CHECK-NEXT: [[TMP65:%.*]] = uitofp i32 [[TMP63]] to float 2499; CHECK-NEXT: [[TMP66:%.*]] = uitofp i32 [[TMP64]] to float 2500; CHECK-NEXT: [[TMP67:%.*]] = call fast float 
@llvm.amdgcn.rcp.f32(float [[TMP66]]) 2501; CHECK-NEXT: [[TMP68:%.*]] = fmul fast float [[TMP65]], [[TMP67]] 2502; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.trunc.f32(float [[TMP68]]) 2503; CHECK-NEXT: [[TMP70:%.*]] = fneg fast float [[TMP69]] 2504; CHECK-NEXT: [[TMP71:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP70]], float [[TMP66]], float [[TMP65]]) 2505; CHECK-NEXT: [[TMP72:%.*]] = fptoui float [[TMP69]] to i32 2506; CHECK-NEXT: [[TMP73:%.*]] = call fast float @llvm.fabs.f32(float [[TMP71]]) 2507; CHECK-NEXT: [[TMP74:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 2508; CHECK-NEXT: [[TMP75:%.*]] = fcmp fast oge float [[TMP73]], [[TMP74]] 2509; CHECK-NEXT: [[TMP76:%.*]] = select i1 [[TMP75]], i32 1, i32 0 2510; CHECK-NEXT: [[TMP77:%.*]] = add i32 [[TMP72]], [[TMP76]] 2511; CHECK-NEXT: [[TMP78:%.*]] = and i32 [[TMP77]], 65535 2512; CHECK-NEXT: [[TMP79:%.*]] = trunc i32 [[TMP78]] to i16 2513; CHECK-NEXT: [[TMP80:%.*]] = insertelement <4 x i16> [[TMP60]], i16 [[TMP79]], i64 3 2514; CHECK-NEXT: store <4 x i16> [[TMP80]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 2515; CHECK-NEXT: ret void 2516; 2517; GFX6-LABEL: udiv_v4i16: 2518; GFX6: ; %bb.0: 2519; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb 2520; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2521; GFX6-NEXT: s_mov_b32 s3, 0xf000 2522; GFX6-NEXT: s_mov_b32 s2, -1 2523; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2524; GFX6-NEXT: s_and_b32 s9, s6, 0xffff 2525; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 2526; GFX6-NEXT: s_lshr_b32 s6, s6, 16 2527; GFX6-NEXT: s_and_b32 s8, s4, 0xffff 2528; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 2529; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s8 2530; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 2531; GFX6-NEXT: s_lshr_b32 s4, s4, 16 2532; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 2533; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 2534; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 2535; GFX6-NEXT: v_trunc_f32_e32 v3, v3 2536; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 2537; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, 
v0 2538; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 2539; GFX6-NEXT: v_trunc_f32_e32 v1, v1 2540; GFX6-NEXT: s_and_b32 s4, s7, 0xffff 2541; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 2542; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v4 2543; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 2544; GFX6-NEXT: s_and_b32 s4, s5, 0xffff 2545; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc 2546; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 2547; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 2548; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 2549; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 2550; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v1, vcc 2551; GFX6-NEXT: v_mul_f32_e32 v1, v5, v6 2552; GFX6-NEXT: v_trunc_f32_e32 v1, v1 2553; GFX6-NEXT: s_lshr_b32 s4, s7, 16 2554; GFX6-NEXT: v_mad_f32 v3, -v1, v4, v5 2555; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 2556; GFX6-NEXT: s_lshr_b32 s4, s5, 16 2557; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s4 2558; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 2559; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 2560; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 2561; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 2562; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2563; GFX6-NEXT: v_mul_f32_e32 v3, v6, v7 2564; GFX6-NEXT: v_trunc_f32_e32 v3, v3 2565; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v3 2566; GFX6-NEXT: v_mad_f32 v3, -v3, v5, v6 2567; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 2568; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 2569; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 2570; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 2571; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 2572; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 2573; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 2574; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2575; GFX6-NEXT: s_endpgm 2576; 2577; GFX9-LABEL: udiv_v4i16: 2578; GFX9: ; %bb.0: 2579; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 2580; GFX9-NEXT: v_mov_b32_e32 v6, 0 2581; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2582; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2583; GFX9-NEXT: s_and_b32 s3, s6, 0xffff 2584; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 2585; 
GFX9-NEXT: s_and_b32 s2, s4, 0xffff 2586; GFX9-NEXT: s_lshr_b32 s6, s6, 16 2587; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 2588; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 2589; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 2590; GFX9-NEXT: s_lshr_b32 s4, s4, 16 2591; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 2592; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 2593; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 2594; GFX9-NEXT: v_trunc_f32_e32 v4, v4 2595; GFX9-NEXT: s_and_b32 s2, s7, 0xffff 2596; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 2597; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 2598; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2 2599; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 2600; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 2601; GFX9-NEXT: s_and_b32 s2, s5, 0xffff 2602; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc 2603; GFX9-NEXT: v_trunc_f32_e32 v2, v5 2604; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2 2605; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4 2606; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 2607; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 2608; GFX9-NEXT: s_lshr_b32 s2, s7, 16 2609; GFX9-NEXT: v_mul_f32_e32 v1, v5, v7 2610; GFX9-NEXT: v_trunc_f32_e32 v1, v1 2611; GFX9-NEXT: v_mad_f32 v3, -v1, v4, v5 2612; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2 2613; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 2614; GFX9-NEXT: s_lshr_b32 s2, s5, 16 2615; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s2 2616; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5 2617; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc 2618; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 2619; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 2620; GFX9-NEXT: v_mul_f32_e32 v3, v7, v8 2621; GFX9-NEXT: v_trunc_f32_e32 v3, v3 2622; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3 2623; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 2624; GFX9-NEXT: v_mad_f32 v3, -v3, v5, v7 2625; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 2626; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 2627; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 2628; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 2629; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 2630; GFX9-NEXT: v_lshl_or_b32 v0, 
v2, 16, v0 2631; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] 2632; GFX9-NEXT: s_endpgm 2633 %r = udiv <4 x i16> %x, %y 2634 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 2635 ret void 2636} 2637 2638define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 2639; CHECK-LABEL: @urem_v4i16( 2640; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 2641; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 2642; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 2643; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 2644; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 2645; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 2646; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 2647; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 2648; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 2649; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 2650; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 2651; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 2652; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 2653; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 2654; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 2655; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 2656; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 2657; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]] 2658; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]] 2659; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 65535 2660; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16 2661; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i16> undef, i16 [[TMP21]], i64 0 2662; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i16> [[X]], i64 1 2663; CHECK-NEXT: 
[[TMP24:%.*]] = extractelement <4 x i16> [[Y]], i64 1 2664; CHECK-NEXT: [[TMP25:%.*]] = zext i16 [[TMP23]] to i32 2665; CHECK-NEXT: [[TMP26:%.*]] = zext i16 [[TMP24]] to i32 2666; CHECK-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float 2667; CHECK-NEXT: [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float 2668; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]]) 2669; CHECK-NEXT: [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]] 2670; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]]) 2671; CHECK-NEXT: [[TMP32:%.*]] = fneg fast float [[TMP31]] 2672; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]]) 2673; CHECK-NEXT: [[TMP34:%.*]] = fptoui float [[TMP31]] to i32 2674; CHECK-NEXT: [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 2675; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]]) 2676; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]] 2677; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0 2678; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]] 2679; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]] 2680; CHECK-NEXT: [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]] 2681; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], 65535 2682; CHECK-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16 2683; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i16> [[TMP22]], i16 [[TMP43]], i64 1 2684; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i16> [[X]], i64 2 2685; CHECK-NEXT: [[TMP46:%.*]] = extractelement <4 x i16> [[Y]], i64 2 2686; CHECK-NEXT: [[TMP47:%.*]] = zext i16 [[TMP45]] to i32 2687; CHECK-NEXT: [[TMP48:%.*]] = zext i16 [[TMP46]] to i32 2688; CHECK-NEXT: [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float 2689; CHECK-NEXT: [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float 2690; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]]) 2691; CHECK-NEXT: 
[[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]] 2692; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]]) 2693; CHECK-NEXT: [[TMP54:%.*]] = fneg fast float [[TMP53]] 2694; CHECK-NEXT: [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]]) 2695; CHECK-NEXT: [[TMP56:%.*]] = fptoui float [[TMP53]] to i32 2696; CHECK-NEXT: [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]]) 2697; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]]) 2698; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]] 2699; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0 2700; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]] 2701; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]] 2702; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]] 2703; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 65535 2704; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16 2705; CHECK-NEXT: [[TMP66:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[TMP65]], i64 2 2706; CHECK-NEXT: [[TMP67:%.*]] = extractelement <4 x i16> [[X]], i64 3 2707; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x i16> [[Y]], i64 3 2708; CHECK-NEXT: [[TMP69:%.*]] = zext i16 [[TMP67]] to i32 2709; CHECK-NEXT: [[TMP70:%.*]] = zext i16 [[TMP68]] to i32 2710; CHECK-NEXT: [[TMP71:%.*]] = uitofp i32 [[TMP69]] to float 2711; CHECK-NEXT: [[TMP72:%.*]] = uitofp i32 [[TMP70]] to float 2712; CHECK-NEXT: [[TMP73:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP72]]) 2713; CHECK-NEXT: [[TMP74:%.*]] = fmul fast float [[TMP71]], [[TMP73]] 2714; CHECK-NEXT: [[TMP75:%.*]] = call fast float @llvm.trunc.f32(float [[TMP74]]) 2715; CHECK-NEXT: [[TMP76:%.*]] = fneg fast float [[TMP75]] 2716; CHECK-NEXT: [[TMP77:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP76]], float [[TMP72]], float [[TMP71]]) 2717; CHECK-NEXT: [[TMP78:%.*]] = fptoui float [[TMP75]] to i32 2718; CHECK-NEXT: 
[[TMP79:%.*]] = call fast float @llvm.fabs.f32(float [[TMP77]]) 2719; CHECK-NEXT: [[TMP80:%.*]] = call fast float @llvm.fabs.f32(float [[TMP72]]) 2720; CHECK-NEXT: [[TMP81:%.*]] = fcmp fast oge float [[TMP79]], [[TMP80]] 2721; CHECK-NEXT: [[TMP82:%.*]] = select i1 [[TMP81]], i32 1, i32 0 2722; CHECK-NEXT: [[TMP83:%.*]] = add i32 [[TMP78]], [[TMP82]] 2723; CHECK-NEXT: [[TMP84:%.*]] = mul i32 [[TMP83]], [[TMP70]] 2724; CHECK-NEXT: [[TMP85:%.*]] = sub i32 [[TMP69]], [[TMP84]] 2725; CHECK-NEXT: [[TMP86:%.*]] = and i32 [[TMP85]], 65535 2726; CHECK-NEXT: [[TMP87:%.*]] = trunc i32 [[TMP86]] to i16 2727; CHECK-NEXT: [[TMP88:%.*]] = insertelement <4 x i16> [[TMP66]], i16 [[TMP87]], i64 3 2728; CHECK-NEXT: store <4 x i16> [[TMP88]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 2729; CHECK-NEXT: ret void 2730; 2731; GFX6-LABEL: urem_v4i16: 2732; GFX6: ; %bb.0: 2733; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb 2734; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2735; GFX6-NEXT: s_mov_b32 s3, 0xf000 2736; GFX6-NEXT: s_mov_b32 s2, -1 2737; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2738; GFX6-NEXT: s_and_b32 s8, s6, 0xffff 2739; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 2740; GFX6-NEXT: v_mov_b32_e32 v4, s6 2741; GFX6-NEXT: v_alignbit_b32 v4, s7, v4, 16 2742; GFX6-NEXT: s_and_b32 s8, s4, 0xffff 2743; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v4 2744; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s8 2745; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 2746; GFX6-NEXT: v_cvt_f32_u32_e32 v5, v5 2747; GFX6-NEXT: v_mov_b32_e32 v1, s4 2748; GFX6-NEXT: v_alignbit_b32 v1, s5, v1, 16 2749; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v1 2750; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 2751; GFX6-NEXT: v_cvt_f32_u32_e32 v6, v6 2752; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 2753; GFX6-NEXT: v_trunc_f32_e32 v3, v3 2754; GFX6-NEXT: v_mad_f32 v2, -v3, v0, v2 2755; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 2756; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 2757; GFX6-NEXT: v_mul_f32_e32 v2, v6, v7 2758; GFX6-NEXT: v_trunc_f32_e32 v2, v2 2759; GFX6-NEXT: 
v_addc_u32_e32 v0, vcc, 0, v3, vcc 2760; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 2761; GFX6-NEXT: v_mad_f32 v2, -v2, v5, v6 2762; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 2763; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v5 2764; GFX6-NEXT: s_and_b32 s6, s7, 0xffff 2765; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc 2766; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s6 2767; GFX6-NEXT: s_and_b32 s6, s5, 0xffff 2768; GFX6-NEXT: v_mul_lo_u32 v2, v2, v4 2769; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s6 2770; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 2771; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 2772; GFX6-NEXT: s_lshr_b32 s4, s7, 16 2773; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v1, v2 2774; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 2775; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 2776; GFX6-NEXT: s_lshr_b32 s6, s5, 16 2777; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s6 2778; GFX6-NEXT: v_trunc_f32_e32 v1, v1 2779; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 2780; GFX6-NEXT: v_mad_f32 v4, -v1, v3, v4 2781; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 2782; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v3 2783; GFX6-NEXT: v_mul_f32_e32 v3, v6, v7 2784; GFX6-NEXT: v_trunc_f32_e32 v3, v3 2785; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v3 2786; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2787; GFX6-NEXT: v_mad_f32 v3, -v3, v5, v6 2788; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 2789; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 2790; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 2791; GFX6-NEXT: v_mul_lo_u32 v3, v3, s4 2792; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 2793; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 2794; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 2795; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s6, v3 2796; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 2797; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 2798; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 2799; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 2800; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 2801; GFX6-NEXT: s_endpgm 2802; 2803; GFX9-LABEL: urem_v4i16: 2804; GFX9: ; %bb.0: 2805; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 2806; 
GFX9-NEXT: v_mov_b32_e32 v6, 0 2807; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2808; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2809; GFX9-NEXT: s_and_b32 s3, s6, 0xffff 2810; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 2811; GFX9-NEXT: s_and_b32 s2, s4, 0xffff 2812; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 2813; GFX9-NEXT: s_lshr_b32 s6, s6, 16 2814; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 2815; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 2816; GFX9-NEXT: s_lshr_b32 s4, s4, 16 2817; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 2818; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 2819; GFX9-NEXT: v_trunc_f32_e32 v4, v4 2820; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 2821; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 2822; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 2823; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 2824; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc 2825; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 2826; GFX9-NEXT: s_and_b32 s3, s7, 0xffff 2827; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s3 2828; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 2829; GFX9-NEXT: v_trunc_f32_e32 v2, v5 2830; GFX9-NEXT: s_and_b32 s8, s5, 0xffff 2831; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 2832; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 2833; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s8 2834; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4 2835; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 2836; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc 2837; GFX9-NEXT: v_mul_f32_e32 v2, v5, v7 2838; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 2839; GFX9-NEXT: v_trunc_f32_e32 v2, v2 2840; GFX9-NEXT: s_lshr_b32 s6, s7, 16 2841; GFX9-NEXT: v_mad_f32 v3, -v2, v4, v5 2842; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s6 2843; GFX9-NEXT: s_lshr_b32 s5, s5, 16 2844; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s5 2845; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 2846; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5 2847; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 2848; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 2849; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc 2850; GFX9-NEXT: v_mul_f32_e32 v3, v7, v8 2851; GFX9-NEXT: v_trunc_f32_e32 v3, v3 2852; GFX9-NEXT: 
v_cvt_u32_f32_e32 v4, v3 2853; GFX9-NEXT: v_mad_f32 v3, -v3, v5, v7 2854; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 2855; GFX9-NEXT: v_mul_lo_u32 v2, v2, s3 2856; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 2857; GFX9-NEXT: v_mul_lo_u32 v3, v3, s6 2858; GFX9-NEXT: v_sub_u32_e32 v4, s4, v1 2859; GFX9-NEXT: v_sub_u32_e32 v1, s8, v2 2860; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 2861; GFX9-NEXT: v_sub_u32_e32 v2, s5, v3 2862; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 2863; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1 2864; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 2865; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] 2866; GFX9-NEXT: s_endpgm 2867 %r = urem <4 x i16> %x, %y 2868 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 2869 ret void 2870} 2871 2872define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 2873; CHECK-LABEL: @sdiv_v4i16( 2874; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 2875; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 2876; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 2877; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 2878; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 2879; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 2880; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 2881; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 2882; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 2883; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 2884; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 2885; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 2886; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 2887; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 2888; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 2889; CHECK-NEXT: [[TMP16:%.*]] = call fast float 
@llvm.fabs.f32(float [[TMP14]]) 2890; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 2891; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 2892; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 2893; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 2894; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 16 2895; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 16 2896; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16 2897; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x i16> undef, i16 [[TMP23]], i64 0 2898; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[X]], i64 1 2899; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[Y]], i64 1 2900; CHECK-NEXT: [[TMP27:%.*]] = sext i16 [[TMP25]] to i32 2901; CHECK-NEXT: [[TMP28:%.*]] = sext i16 [[TMP26]] to i32 2902; CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]] 2903; CHECK-NEXT: [[TMP30:%.*]] = ashr i32 [[TMP29]], 30 2904; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 1 2905; CHECK-NEXT: [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float 2906; CHECK-NEXT: [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float 2907; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 2908; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]] 2909; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]]) 2910; CHECK-NEXT: [[TMP37:%.*]] = fneg fast float [[TMP36]] 2911; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]]) 2912; CHECK-NEXT: [[TMP39:%.*]] = fptosi float [[TMP36]] to i32 2913; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]]) 2914; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 2915; CHECK-NEXT: [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]] 2916; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0 2917; CHECK-NEXT: [[TMP44:%.*]] = add 
i32 [[TMP39]], [[TMP43]] 2918; CHECK-NEXT: [[TMP45:%.*]] = shl i32 [[TMP44]], 16 2919; CHECK-NEXT: [[TMP46:%.*]] = ashr i32 [[TMP45]], 16 2920; CHECK-NEXT: [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16 2921; CHECK-NEXT: [[TMP48:%.*]] = insertelement <4 x i16> [[TMP24]], i16 [[TMP47]], i64 1 2922; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i16> [[X]], i64 2 2923; CHECK-NEXT: [[TMP50:%.*]] = extractelement <4 x i16> [[Y]], i64 2 2924; CHECK-NEXT: [[TMP51:%.*]] = sext i16 [[TMP49]] to i32 2925; CHECK-NEXT: [[TMP52:%.*]] = sext i16 [[TMP50]] to i32 2926; CHECK-NEXT: [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]] 2927; CHECK-NEXT: [[TMP54:%.*]] = ashr i32 [[TMP53]], 30 2928; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 1 2929; CHECK-NEXT: [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float 2930; CHECK-NEXT: [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float 2931; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]]) 2932; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]] 2933; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]]) 2934; CHECK-NEXT: [[TMP61:%.*]] = fneg fast float [[TMP60]] 2935; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]]) 2936; CHECK-NEXT: [[TMP63:%.*]] = fptosi float [[TMP60]] to i32 2937; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]]) 2938; CHECK-NEXT: [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]]) 2939; CHECK-NEXT: [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]] 2940; CHECK-NEXT: [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0 2941; CHECK-NEXT: [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]] 2942; CHECK-NEXT: [[TMP69:%.*]] = shl i32 [[TMP68]], 16 2943; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 16 2944; CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16 2945; CHECK-NEXT: [[TMP72:%.*]] = insertelement <4 x i16> [[TMP48]], i16 [[TMP71]], i64 2 2946; 
CHECK-NEXT: [[TMP73:%.*]] = extractelement <4 x i16> [[X]], i64 3 2947; CHECK-NEXT: [[TMP74:%.*]] = extractelement <4 x i16> [[Y]], i64 3 2948; CHECK-NEXT: [[TMP75:%.*]] = sext i16 [[TMP73]] to i32 2949; CHECK-NEXT: [[TMP76:%.*]] = sext i16 [[TMP74]] to i32 2950; CHECK-NEXT: [[TMP77:%.*]] = xor i32 [[TMP75]], [[TMP76]] 2951; CHECK-NEXT: [[TMP78:%.*]] = ashr i32 [[TMP77]], 30 2952; CHECK-NEXT: [[TMP79:%.*]] = or i32 [[TMP78]], 1 2953; CHECK-NEXT: [[TMP80:%.*]] = sitofp i32 [[TMP75]] to float 2954; CHECK-NEXT: [[TMP81:%.*]] = sitofp i32 [[TMP76]] to float 2955; CHECK-NEXT: [[TMP82:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP81]]) 2956; CHECK-NEXT: [[TMP83:%.*]] = fmul fast float [[TMP80]], [[TMP82]] 2957; CHECK-NEXT: [[TMP84:%.*]] = call fast float @llvm.trunc.f32(float [[TMP83]]) 2958; CHECK-NEXT: [[TMP85:%.*]] = fneg fast float [[TMP84]] 2959; CHECK-NEXT: [[TMP86:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP85]], float [[TMP81]], float [[TMP80]]) 2960; CHECK-NEXT: [[TMP87:%.*]] = fptosi float [[TMP84]] to i32 2961; CHECK-NEXT: [[TMP88:%.*]] = call fast float @llvm.fabs.f32(float [[TMP86]]) 2962; CHECK-NEXT: [[TMP89:%.*]] = call fast float @llvm.fabs.f32(float [[TMP81]]) 2963; CHECK-NEXT: [[TMP90:%.*]] = fcmp fast oge float [[TMP88]], [[TMP89]] 2964; CHECK-NEXT: [[TMP91:%.*]] = select i1 [[TMP90]], i32 [[TMP79]], i32 0 2965; CHECK-NEXT: [[TMP92:%.*]] = add i32 [[TMP87]], [[TMP91]] 2966; CHECK-NEXT: [[TMP93:%.*]] = shl i32 [[TMP92]], 16 2967; CHECK-NEXT: [[TMP94:%.*]] = ashr i32 [[TMP93]], 16 2968; CHECK-NEXT: [[TMP95:%.*]] = trunc i32 [[TMP94]] to i16 2969; CHECK-NEXT: [[TMP96:%.*]] = insertelement <4 x i16> [[TMP72]], i16 [[TMP95]], i64 3 2970; CHECK-NEXT: store <4 x i16> [[TMP96]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 2971; CHECK-NEXT: ret void 2972; 2973; GFX6-LABEL: sdiv_v4i16: 2974; GFX6: ; %bb.0: 2975; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb 2976; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2977; GFX6-NEXT: s_mov_b32 
s3, 0xf000 2978; GFX6-NEXT: s_mov_b32 s2, -1 2979; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2980; GFX6-NEXT: s_sext_i32_i16 s8, s6 2981; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s8 2982; GFX6-NEXT: s_sext_i32_i16 s9, s4 2983; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9 2984; GFX6-NEXT: s_xor_b32 s8, s9, s8 2985; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 2986; GFX6-NEXT: s_ashr_i32 s6, s6, 16 2987; GFX6-NEXT: s_ashr_i32 s8, s8, 30 2988; GFX6-NEXT: s_or_b32 s8, s8, 1 2989; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 2990; GFX6-NEXT: v_trunc_f32_e32 v2, v2 2991; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 2992; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 2993; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 2994; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s6 2995; GFX6-NEXT: v_mov_b32_e32 v3, s8 2996; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 2997; GFX6-NEXT: s_ashr_i32 s4, s4, 16 2998; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 2999; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 3000; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 3001; GFX6-NEXT: s_xor_b32 s4, s4, s6 3002; GFX6-NEXT: s_ashr_i32 s4, s4, 30 3003; GFX6-NEXT: s_or_b32 s4, s4, 1 3004; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 3005; GFX6-NEXT: v_trunc_f32_e32 v3, v3 3006; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 3007; GFX6-NEXT: v_mov_b32_e32 v4, s4 3008; GFX6-NEXT: s_sext_i32_i16 s4, s7 3009; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 3010; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| 3011; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 3012; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc 3013; GFX6-NEXT: s_sext_i32_i16 s6, s5 3014; GFX6-NEXT: v_add_i32_e32 v3, vcc, v1, v3 3015; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s6 3016; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 3017; GFX6-NEXT: s_xor_b32 s4, s6, s4 3018; GFX6-NEXT: s_ashr_i32 s4, s4, 30 3019; GFX6-NEXT: s_or_b32 s4, s4, 1 3020; GFX6-NEXT: v_mul_f32_e32 v4, v1, v4 3021; GFX6-NEXT: v_trunc_f32_e32 v4, v4 3022; GFX6-NEXT: v_mad_f32 v1, -v4, v2, v1 3023; GFX6-NEXT: v_mov_b32_e32 v5, s4 3024; GFX6-NEXT: s_ashr_i32 s4, s7, 16 3025; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 3026; 
GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2| 3027; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 3028; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc 3029; GFX6-NEXT: s_ashr_i32 s5, s5, 16 3030; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 3031; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s5 3032; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 3033; GFX6-NEXT: s_xor_b32 s4, s5, s4 3034; GFX6-NEXT: s_ashr_i32 s4, s4, 30 3035; GFX6-NEXT: s_or_b32 s4, s4, 1 3036; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 3037; GFX6-NEXT: v_trunc_f32_e32 v5, v5 3038; GFX6-NEXT: v_mad_f32 v4, -v5, v2, v4 3039; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 3040; GFX6-NEXT: v_mov_b32_e32 v6, s4 3041; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| 3042; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc 3043; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 3044; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 3045; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 3046; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 3047; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 3048; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 3049; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 3050; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3051; GFX6-NEXT: s_endpgm 3052; 3053; GFX9-LABEL: sdiv_v4i16: 3054; GFX9: ; %bb.0: 3055; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 3056; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 3057; GFX9-NEXT: v_mov_b32_e32 v2, 0 3058; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3059; GFX9-NEXT: s_sext_i32_i16 s0, s6 3060; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 3061; GFX9-NEXT: s_sext_i32_i16 s1, s4 3062; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s1 3063; GFX9-NEXT: s_xor_b32 s0, s1, s0 3064; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 3065; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3066; GFX9-NEXT: s_or_b32 s8, s0, 1 3067; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 3068; GFX9-NEXT: v_trunc_f32_e32 v3, v3 3069; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 3070; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 3071; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 3072; GFX9-NEXT: s_cselect_b32 s0, s8, 0 3073; GFX9-NEXT: s_ashr_i32 s1, s6, 16 3074; 
GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 3075; GFX9-NEXT: s_ashr_i32 s4, s4, 16 3076; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s4 3077; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 3078; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 3079; GFX9-NEXT: v_add_u32_e32 v3, s0, v3 3080; GFX9-NEXT: v_mul_f32_e32 v4, v1, v4 3081; GFX9-NEXT: s_xor_b32 s0, s4, s1 3082; GFX9-NEXT: v_trunc_f32_e32 v4, v4 3083; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3084; GFX9-NEXT: v_mad_f32 v1, -v4, v0, v1 3085; GFX9-NEXT: s_or_b32 s4, s0, 1 3086; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 3087; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 3088; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 3089; GFX9-NEXT: s_sext_i32_i16 s1, s7 3090; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 3091; GFX9-NEXT: s_cselect_b32 s0, s4, 0 3092; GFX9-NEXT: v_add_u32_e32 v4, s0, v4 3093; GFX9-NEXT: s_sext_i32_i16 s0, s5 3094; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s0 3095; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 3096; GFX9-NEXT: s_xor_b32 s0, s0, s1 3097; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3098; GFX9-NEXT: s_or_b32 s4, s0, 1 3099; GFX9-NEXT: v_mul_f32_e32 v5, v1, v5 3100; GFX9-NEXT: v_trunc_f32_e32 v5, v5 3101; GFX9-NEXT: v_mad_f32 v1, -v5, v0, v1 3102; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 3103; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 3104; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 3105; GFX9-NEXT: s_cselect_b32 s0, s4, 0 3106; GFX9-NEXT: s_ashr_i32 s1, s7, 16 3107; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 3108; GFX9-NEXT: v_add_u32_e32 v1, s0, v5 3109; GFX9-NEXT: s_ashr_i32 s0, s5, 16 3110; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 3111; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v0 3112; GFX9-NEXT: s_xor_b32 s0, s0, s1 3113; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3114; GFX9-NEXT: s_or_b32 s4, s0, 1 3115; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 3116; GFX9-NEXT: v_trunc_f32_e32 v6, v6 3117; GFX9-NEXT: v_mad_f32 v5, -v6, v0, v5 3118; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 3119; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v0| 3120; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 3121; GFX9-NEXT: s_cselect_b32 
s0, s4, 0 3122; GFX9-NEXT: v_add_u32_e32 v0, s0, v6 3123; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 3124; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1 3125; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v3 3126; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 3127; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 3128; GFX9-NEXT: s_endpgm 3129 %r = sdiv <4 x i16> %x, %y 3130 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 3131 ret void 3132} 3133 3134define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 3135; CHECK-LABEL: @srem_v4i16( 3136; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 3137; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 3138; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 3139; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 3140; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 3141; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 3142; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 3143; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 3144; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 3145; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 3146; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 3147; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 3148; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 3149; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 3150; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 3151; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 3152; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 3153; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 3154; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 3155; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 3156; 
CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]] 3157; CHECK-NEXT: [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]] 3158; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 16 3159; CHECK-NEXT: [[TMP24:%.*]] = ashr i32 [[TMP23]], 16 3160; CHECK-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16 3161; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i16> undef, i16 [[TMP25]], i64 0 3162; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[X]], i64 1 3163; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i16> [[Y]], i64 1 3164; CHECK-NEXT: [[TMP29:%.*]] = sext i16 [[TMP27]] to i32 3165; CHECK-NEXT: [[TMP30:%.*]] = sext i16 [[TMP28]] to i32 3166; CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]] 3167; CHECK-NEXT: [[TMP32:%.*]] = ashr i32 [[TMP31]], 30 3168; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP32]], 1 3169; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float 3170; CHECK-NEXT: [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float 3171; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 3172; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]] 3173; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]]) 3174; CHECK-NEXT: [[TMP39:%.*]] = fneg fast float [[TMP38]] 3175; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]]) 3176; CHECK-NEXT: [[TMP41:%.*]] = fptosi float [[TMP38]] to i32 3177; CHECK-NEXT: [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]]) 3178; CHECK-NEXT: [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]]) 3179; CHECK-NEXT: [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]] 3180; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0 3181; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]] 3182; CHECK-NEXT: [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]] 3183; CHECK-NEXT: [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]] 3184; CHECK-NEXT: [[TMP49:%.*]] = shl i32 [[TMP48]], 16 
3185; CHECK-NEXT: [[TMP50:%.*]] = ashr i32 [[TMP49]], 16 3186; CHECK-NEXT: [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16 3187; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i16> [[TMP26]], i16 [[TMP51]], i64 1 3188; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i16> [[X]], i64 2 3189; CHECK-NEXT: [[TMP54:%.*]] = extractelement <4 x i16> [[Y]], i64 2 3190; CHECK-NEXT: [[TMP55:%.*]] = sext i16 [[TMP53]] to i32 3191; CHECK-NEXT: [[TMP56:%.*]] = sext i16 [[TMP54]] to i32 3192; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]] 3193; CHECK-NEXT: [[TMP58:%.*]] = ashr i32 [[TMP57]], 30 3194; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 1 3195; CHECK-NEXT: [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float 3196; CHECK-NEXT: [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float 3197; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]]) 3198; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]] 3199; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]]) 3200; CHECK-NEXT: [[TMP65:%.*]] = fneg fast float [[TMP64]] 3201; CHECK-NEXT: [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]]) 3202; CHECK-NEXT: [[TMP67:%.*]] = fptosi float [[TMP64]] to i32 3203; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 3204; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]]) 3205; CHECK-NEXT: [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]] 3206; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0 3207; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]] 3208; CHECK-NEXT: [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]] 3209; CHECK-NEXT: [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]] 3210; CHECK-NEXT: [[TMP75:%.*]] = shl i32 [[TMP74]], 16 3211; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 [[TMP75]], 16 3212; CHECK-NEXT: [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16 3213; CHECK-NEXT: [[TMP78:%.*]] = insertelement <4 x 
i16> [[TMP52]], i16 [[TMP77]], i64 2 3214; CHECK-NEXT: [[TMP79:%.*]] = extractelement <4 x i16> [[X]], i64 3 3215; CHECK-NEXT: [[TMP80:%.*]] = extractelement <4 x i16> [[Y]], i64 3 3216; CHECK-NEXT: [[TMP81:%.*]] = sext i16 [[TMP79]] to i32 3217; CHECK-NEXT: [[TMP82:%.*]] = sext i16 [[TMP80]] to i32 3218; CHECK-NEXT: [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP82]] 3219; CHECK-NEXT: [[TMP84:%.*]] = ashr i32 [[TMP83]], 30 3220; CHECK-NEXT: [[TMP85:%.*]] = or i32 [[TMP84]], 1 3221; CHECK-NEXT: [[TMP86:%.*]] = sitofp i32 [[TMP81]] to float 3222; CHECK-NEXT: [[TMP87:%.*]] = sitofp i32 [[TMP82]] to float 3223; CHECK-NEXT: [[TMP88:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP87]]) 3224; CHECK-NEXT: [[TMP89:%.*]] = fmul fast float [[TMP86]], [[TMP88]] 3225; CHECK-NEXT: [[TMP90:%.*]] = call fast float @llvm.trunc.f32(float [[TMP89]]) 3226; CHECK-NEXT: [[TMP91:%.*]] = fneg fast float [[TMP90]] 3227; CHECK-NEXT: [[TMP92:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP91]], float [[TMP87]], float [[TMP86]]) 3228; CHECK-NEXT: [[TMP93:%.*]] = fptosi float [[TMP90]] to i32 3229; CHECK-NEXT: [[TMP94:%.*]] = call fast float @llvm.fabs.f32(float [[TMP92]]) 3230; CHECK-NEXT: [[TMP95:%.*]] = call fast float @llvm.fabs.f32(float [[TMP87]]) 3231; CHECK-NEXT: [[TMP96:%.*]] = fcmp fast oge float [[TMP94]], [[TMP95]] 3232; CHECK-NEXT: [[TMP97:%.*]] = select i1 [[TMP96]], i32 [[TMP85]], i32 0 3233; CHECK-NEXT: [[TMP98:%.*]] = add i32 [[TMP93]], [[TMP97]] 3234; CHECK-NEXT: [[TMP99:%.*]] = mul i32 [[TMP98]], [[TMP82]] 3235; CHECK-NEXT: [[TMP100:%.*]] = sub i32 [[TMP81]], [[TMP99]] 3236; CHECK-NEXT: [[TMP101:%.*]] = shl i32 [[TMP100]], 16 3237; CHECK-NEXT: [[TMP102:%.*]] = ashr i32 [[TMP101]], 16 3238; CHECK-NEXT: [[TMP103:%.*]] = trunc i32 [[TMP102]] to i16 3239; CHECK-NEXT: [[TMP104:%.*]] = insertelement <4 x i16> [[TMP78]], i16 [[TMP103]], i64 3 3240; CHECK-NEXT: store <4 x i16> [[TMP104]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 3241; CHECK-NEXT: ret void 3242; 
3243; GFX6-LABEL: srem_v4i16: 3244; GFX6: ; %bb.0: 3245; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb 3246; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3247; GFX6-NEXT: s_mov_b32 s3, 0xf000 3248; GFX6-NEXT: s_mov_b32 s2, -1 3249; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3250; GFX6-NEXT: s_sext_i32_i16 s8, s6 3251; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s8 3252; GFX6-NEXT: s_sext_i32_i16 s9, s4 3253; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9 3254; GFX6-NEXT: s_xor_b32 s8, s9, s8 3255; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 3256; GFX6-NEXT: s_ashr_i32 s8, s8, 30 3257; GFX6-NEXT: s_or_b32 s8, s8, 1 3258; GFX6-NEXT: v_mov_b32_e32 v3, s8 3259; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 3260; GFX6-NEXT: v_trunc_f32_e32 v2, v2 3261; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 3262; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 3263; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 3264; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 3265; GFX6-NEXT: v_mov_b32_e32 v1, s4 3266; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 3267; GFX6-NEXT: v_mov_b32_e32 v2, s6 3268; GFX6-NEXT: v_alignbit_b32 v2, s7, v2, 16 3269; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16 3270; GFX6-NEXT: v_cvt_f32_i32_e32 v4, v3 3271; GFX6-NEXT: v_alignbit_b32 v1, s5, v1, 16 3272; GFX6-NEXT: v_bfe_i32 v5, v1, 0, 16 3273; GFX6-NEXT: v_cvt_f32_i32_e32 v6, v5 3274; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v4 3275; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 3276; GFX6-NEXT: v_xor_b32_e32 v3, v5, v3 3277; GFX6-NEXT: v_ashrrev_i32_e32 v3, 30, v3 3278; GFX6-NEXT: v_mul_f32_e32 v5, v6, v7 3279; GFX6-NEXT: v_trunc_f32_e32 v5, v5 3280; GFX6-NEXT: v_mad_f32 v6, -v5, v4, v6 3281; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 3282; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 3283; GFX6-NEXT: v_or_b32_e32 v3, 1, v3 3284; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, |v4| 3285; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc 3286; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 3287; GFX6-NEXT: s_sext_i32_i16 s4, s7 3288; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 3289; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s4 3290; GFX6-NEXT: s_sext_i32_i16 
s6, s5 3291; GFX6-NEXT: s_xor_b32 s4, s6, s4 3292; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 3293; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s6 3294; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v3 3295; GFX6-NEXT: s_ashr_i32 s4, s4, 30 3296; GFX6-NEXT: s_or_b32 s4, s4, 1 3297; GFX6-NEXT: v_mov_b32_e32 v5, s4 3298; GFX6-NEXT: v_mul_f32_e32 v4, v2, v4 3299; GFX6-NEXT: v_trunc_f32_e32 v4, v4 3300; GFX6-NEXT: v_mad_f32 v2, -v4, v3, v2 3301; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 3302; GFX6-NEXT: s_ashr_i32 s4, s7, 16 3303; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v3| 3304; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s4 3305; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc 3306; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 3307; GFX6-NEXT: v_mul_lo_u32 v2, v2, s7 3308; GFX6-NEXT: s_lshr_b32 s6, s7, 16 3309; GFX6-NEXT: s_ashr_i32 s7, s5, 16 3310; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s7 3311; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 3312; GFX6-NEXT: s_xor_b32 s4, s7, s4 3313; GFX6-NEXT: s_ashr_i32 s4, s4, 30 3314; GFX6-NEXT: s_or_b32 s4, s4, 1 3315; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 3316; GFX6-NEXT: v_trunc_f32_e32 v5, v5 3317; GFX6-NEXT: v_mad_f32 v4, -v5, v3, v4 3318; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 3319; GFX6-NEXT: v_mov_b32_e32 v6, s4 3320; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v3| 3321; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc 3322; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 3323; GFX6-NEXT: v_mul_lo_u32 v3, v3, s6 3324; GFX6-NEXT: s_lshr_b32 s4, s5, 16 3325; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v2 3326; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3 3327; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 3328; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 3329; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 3330; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 3331; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 3332; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 3333; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3334; GFX6-NEXT: s_endpgm 3335; 3336; GFX9-LABEL: srem_v4i16: 3337; GFX9: ; %bb.0: 3338; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 3339; 
GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 3340; GFX9-NEXT: v_mov_b32_e32 v2, 0 3341; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3342; GFX9-NEXT: s_sext_i32_i16 s8, s6 3343; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s8 3344; GFX9-NEXT: s_sext_i32_i16 s9, s4 3345; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s9 3346; GFX9-NEXT: s_xor_b32 s0, s9, s8 3347; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 3348; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3349; GFX9-NEXT: s_or_b32 s10, s0, 1 3350; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 3351; GFX9-NEXT: v_trunc_f32_e32 v3, v3 3352; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 3353; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 3354; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 3355; GFX9-NEXT: s_cselect_b32 s0, s10, 0 3356; GFX9-NEXT: s_ashr_i32 s6, s6, 16 3357; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 3358; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s6 3359; GFX9-NEXT: s_ashr_i32 s4, s4, 16 3360; GFX9-NEXT: v_add_u32_e32 v1, s0, v3 3361; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s4 3362; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 3363; GFX9-NEXT: s_xor_b32 s0, s4, s6 3364; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3365; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8 3366; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 3367; GFX9-NEXT: v_trunc_f32_e32 v4, v4 3368; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 3369; GFX9-NEXT: s_or_b32 s8, s0, 1 3370; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v0| 3371; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 3372; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 3373; GFX9-NEXT: s_cselect_b32 s0, s8, 0 3374; GFX9-NEXT: s_sext_i32_i16 s8, s7 3375; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s8 3376; GFX9-NEXT: v_add_u32_e32 v0, s0, v4 3377; GFX9-NEXT: v_mul_lo_u32 v0, v0, s6 3378; GFX9-NEXT: s_sext_i32_i16 s6, s5 3379; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s6 3380; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 3381; GFX9-NEXT: s_xor_b32 s0, s6, s8 3382; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3383; GFX9-NEXT: s_or_b32 s10, s0, 1 3384; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 3385; GFX9-NEXT: v_trunc_f32_e32 v5, v5 3386; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 3387; 
GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| 3388; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 3389; GFX9-NEXT: s_cselect_b32 s0, s10, 0 3390; GFX9-NEXT: s_ashr_i32 s7, s7, 16 3391; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 3392; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s7 3393; GFX9-NEXT: s_ashr_i32 s5, s5, 16 3394; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 3395; GFX9-NEXT: v_add_u32_e32 v3, s0, v5 3396; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s5 3397; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 3398; GFX9-NEXT: s_xor_b32 s0, s5, s7 3399; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3400; GFX9-NEXT: v_mul_lo_u32 v3, v3, s8 3401; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 3402; GFX9-NEXT: v_trunc_f32_e32 v6, v6 3403; GFX9-NEXT: v_mad_f32 v5, -v6, v4, v5 3404; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 3405; GFX9-NEXT: s_or_b32 s8, s0, 1 3406; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4| 3407; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 3408; GFX9-NEXT: s_cselect_b32 s0, s8, 0 3409; GFX9-NEXT: v_add_u32_e32 v4, s0, v6 3410; GFX9-NEXT: v_mul_lo_u32 v4, v4, s7 3411; GFX9-NEXT: v_sub_u32_e32 v5, s9, v1 3412; GFX9-NEXT: v_sub_u32_e32 v1, s6, v3 3413; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 3414; GFX9-NEXT: v_sub_u32_e32 v3, s5, v4 3415; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 3416; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v5 3417; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v3 3418; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 3419; GFX9-NEXT: s_endpgm 3420 %r = srem <4 x i16> %x, %y 3421 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 3422 ret void 3423} 3424 3425define amdgpu_kernel void @udiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 3426; CHECK-LABEL: @udiv_i3( 3427; CHECK-NEXT: [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32 3428; CHECK-NEXT: [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32 3429; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 3430; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 3431; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 3432; CHECK-NEXT: [[TMP6:%.*]] = fmul fast 
float [[TMP3]], [[TMP5]] 3433; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 3434; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 3435; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 3436; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 3437; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 3438; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 3439; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 3440; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 3441; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 3442; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 7 3443; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i3 3444; CHECK-NEXT: store i3 [[TMP17]], i3 addrspace(1)* [[OUT:%.*]], align 1 3445; CHECK-NEXT: ret void 3446; 3447; GFX6-LABEL: udiv_i3: 3448; GFX6: ; %bb.0: 3449; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 3450; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3451; GFX6-NEXT: s_mov_b32 s3, 0xf000 3452; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3453; GFX6-NEXT: s_bfe_u32 s2, s4, 0x30008 3454; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 3455; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 3456; GFX6-NEXT: s_and_b32 s4, s4, 7 3457; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 3458; GFX6-NEXT: s_mov_b32 s2, -1 3459; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 3460; GFX6-NEXT: v_trunc_f32_e32 v1, v1 3461; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 3462; GFX6-NEXT: v_mad_f32 v1, -v1, v0, v2 3463; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3464; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 3465; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 3466; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 3467; GFX6-NEXT: s_endpgm 3468; 3469; GFX9-LABEL: udiv_i3: 3470; GFX9: ; %bb.0: 3471; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 3472; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 3473; GFX9-NEXT: v_mov_b32_e32 v2, 0 3474; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) 3475; GFX9-NEXT: s_bfe_u32 s0, s4, 0x30008 3476; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 3477; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 3478; GFX9-NEXT: s_and_b32 s0, s4, 7 3479; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 3480; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 3481; GFX9-NEXT: v_trunc_f32_e32 v1, v1 3482; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v1 3483; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v3 3484; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3485; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc 3486; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 3487; GFX9-NEXT: global_store_byte v2, v0, s[2:3] 3488; GFX9-NEXT: s_endpgm 3489 %r = udiv i3 %x, %y 3490 store i3 %r, i3 addrspace(1)* %out 3491 ret void 3492} 3493 3494define amdgpu_kernel void @urem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 3495; CHECK-LABEL: @urem_i3( 3496; CHECK-NEXT: [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32 3497; CHECK-NEXT: [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32 3498; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 3499; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 3500; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 3501; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 3502; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 3503; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 3504; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 3505; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 3506; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 3507; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 3508; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 3509; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 3510; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 3511; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]] 3512; CHECK-NEXT: 
[[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]] 3513; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 7 3514; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i3 3515; CHECK-NEXT: store i3 [[TMP19]], i3 addrspace(1)* [[OUT:%.*]], align 1 3516; CHECK-NEXT: ret void 3517; 3518; GFX6-LABEL: urem_i3: 3519; GFX6: ; %bb.0: 3520; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 3521; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3522; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3523; GFX6-NEXT: s_bfe_u32 s2, s4, 0x30008 3524; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 3525; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 3526; GFX6-NEXT: s_and_b32 s3, s4, 7 3527; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s3 3528; GFX6-NEXT: s_lshr_b32 s2, s4, 8 3529; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 3530; GFX6-NEXT: v_trunc_f32_e32 v1, v1 3531; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 3532; GFX6-NEXT: v_mad_f32 v1, -v1, v0, v2 3533; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3534; GFX6-NEXT: s_mov_b32 s3, 0xf000 3535; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 3536; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 3537; GFX6-NEXT: s_mov_b32 s2, -1 3538; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 3539; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 3540; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 3541; GFX6-NEXT: s_endpgm 3542; 3543; GFX9-LABEL: urem_i3: 3544; GFX9: ; %bb.0: 3545; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 3546; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3547; GFX9-NEXT: s_bfe_u32 s3, s2, 0x30008 3548; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 3549; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 3550; GFX9-NEXT: s_and_b32 s4, s2, 7 3551; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 3552; GFX9-NEXT: s_lshr_b32 s3, s2, 8 3553; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1 3554; GFX9-NEXT: v_trunc_f32_e32 v1, v1 3555; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v1 3556; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v2 3557; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3558; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3559; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 3560; GFX9-NEXT: v_mul_lo_u32 v0, 
v0, s3 3561; GFX9-NEXT: v_mov_b32_e32 v1, 0 3562; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 3563; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 3564; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3565; GFX9-NEXT: global_store_byte v1, v0, s[0:1] 3566; GFX9-NEXT: s_endpgm 3567 %r = urem i3 %x, %y 3568 store i3 %r, i3 addrspace(1)* %out 3569 ret void 3570} 3571 3572define amdgpu_kernel void @sdiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 3573; CHECK-LABEL: @sdiv_i3( 3574; CHECK-NEXT: [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32 3575; CHECK-NEXT: [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32 3576; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 3577; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 3578; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 3579; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 3580; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 3581; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 3582; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 3583; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 3584; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 3585; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 3586; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 3587; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 3588; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 3589; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 3590; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 3591; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 3592; CHECK-NEXT: [[TMP19:%.*]] = shl i32 [[TMP18]], 29 3593; CHECK-NEXT: [[TMP20:%.*]] = ashr i32 [[TMP19]], 29 3594; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i3 3595; CHECK-NEXT: store i3 [[TMP21]], i3 addrspace(1)* [[OUT:%.*]], align 1 3596; CHECK-NEXT: ret void 3597; 3598; 
GFX6-LABEL: sdiv_i3: 3599; GFX6: ; %bb.0: 3600; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 3601; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3602; GFX6-NEXT: s_mov_b32 s3, 0xf000 3603; GFX6-NEXT: s_mov_b32 s2, -1 3604; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3605; GFX6-NEXT: s_bfe_i32 s5, s4, 0x30008 3606; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 3607; GFX6-NEXT: s_bfe_i32 s4, s4, 0x30000 3608; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 3609; GFX6-NEXT: s_xor_b32 s4, s4, s5 3610; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 3611; GFX6-NEXT: s_ashr_i32 s4, s4, 30 3612; GFX6-NEXT: s_or_b32 s4, s4, 1 3613; GFX6-NEXT: v_mov_b32_e32 v3, s4 3614; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 3615; GFX6-NEXT: v_trunc_f32_e32 v2, v2 3616; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 3617; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 3618; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 3619; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 3620; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 3621; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 3622; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 3623; GFX6-NEXT: s_endpgm 3624; 3625; GFX9-LABEL: sdiv_i3: 3626; GFX9: ; %bb.0: 3627; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 3628; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 3629; GFX9-NEXT: v_mov_b32_e32 v1, 0 3630; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3631; GFX9-NEXT: s_bfe_i32 s0, s4, 0x30008 3632; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 3633; GFX9-NEXT: s_bfe_i32 s1, s4, 0x30000 3634; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 3635; GFX9-NEXT: s_xor_b32 s0, s1, s0 3636; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 3637; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3638; GFX9-NEXT: s_or_b32 s4, s0, 1 3639; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 3640; GFX9-NEXT: v_trunc_f32_e32 v3, v3 3641; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 3642; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 3643; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 3644; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 3645; GFX9-NEXT: s_cselect_b32 s0, s4, 0 3646; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 3647; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 
3648; GFX9-NEXT: global_store_byte v1, v0, s[2:3] 3649; GFX9-NEXT: s_endpgm 3650 %r = sdiv i3 %x, %y 3651 store i3 %r, i3 addrspace(1)* %out 3652 ret void 3653} 3654 3655define amdgpu_kernel void @srem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 3656; CHECK-LABEL: @srem_i3( 3657; CHECK-NEXT: [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32 3658; CHECK-NEXT: [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32 3659; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 3660; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 3661; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 3662; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 3663; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 3664; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 3665; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 3666; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 3667; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 3668; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 3669; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 3670; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 3671; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 3672; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 3673; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 3674; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 3675; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]] 3676; CHECK-NEXT: [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]] 3677; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 29 3678; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 29 3679; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i3 3680; CHECK-NEXT: store i3 [[TMP23]], i3 addrspace(1)* [[OUT:%.*]], align 1 3681; CHECK-NEXT: ret void 3682; 3683; GFX6-LABEL: srem_i3: 3684; GFX6: ; %bb.0: 
3685; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 3686; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3687; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3688; GFX6-NEXT: s_bfe_i32 s2, s4, 0x30008 3689; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s2 3690; GFX6-NEXT: s_bfe_i32 s5, s4, 0x30000 3691; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 3692; GFX6-NEXT: s_xor_b32 s2, s5, s2 3693; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 3694; GFX6-NEXT: s_ashr_i32 s2, s2, 30 3695; GFX6-NEXT: s_or_b32 s2, s2, 1 3696; GFX6-NEXT: v_mov_b32_e32 v3, s2 3697; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 3698; GFX6-NEXT: v_trunc_f32_e32 v2, v2 3699; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 3700; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 3701; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 3702; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 3703; GFX6-NEXT: s_lshr_b32 s3, s4, 8 3704; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 3705; GFX6-NEXT: v_mul_lo_u32 v0, v0, s3 3706; GFX6-NEXT: s_mov_b32 s3, 0xf000 3707; GFX6-NEXT: s_mov_b32 s2, -1 3708; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 3709; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 3710; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 3711; GFX6-NEXT: s_endpgm 3712; 3713; GFX9-LABEL: srem_i3: 3714; GFX9: ; %bb.0: 3715; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 3716; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3717; GFX9-NEXT: s_bfe_i32 s2, s4, 0x30008 3718; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 3719; GFX9-NEXT: s_bfe_i32 s3, s4, 0x30000 3720; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s3 3721; GFX9-NEXT: s_xor_b32 s2, s3, s2 3722; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 3723; GFX9-NEXT: s_ashr_i32 s2, s2, 30 3724; GFX9-NEXT: s_lshr_b32 s5, s4, 8 3725; GFX9-NEXT: s_or_b32 s6, s2, 1 3726; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 3727; GFX9-NEXT: v_trunc_f32_e32 v2, v2 3728; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 3729; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 3730; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| 3731; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec 3732; GFX9-NEXT: s_cselect_b32 s2, s6, 0 3733; GFX9-NEXT: v_add_u32_e32 v0, s2, v2 3734; 
GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 3735; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3736; GFX9-NEXT: v_mov_b32_e32 v1, 0 3737; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 3738; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 3739; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3740; GFX9-NEXT: global_store_byte v1, v0, s[0:1] 3741; GFX9-NEXT: s_endpgm 3742 %r = srem i3 %x, %y 3743 store i3 %r, i3 addrspace(1)* %out 3744 ret void 3745} 3746 3747define amdgpu_kernel void @udiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 3748; CHECK-LABEL: @udiv_v3i16( 3749; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 3750; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 3751; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 3752; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 3753; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 3754; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 3755; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 3756; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 3757; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 3758; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 3759; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 3760; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 3761; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 3762; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 3763; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 3764; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 3765; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 3766; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 65535 3767; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16 3768; CHECK-NEXT: [[TMP20:%.*]] = insertelement <3 x i16> undef, i16 [[TMP19]], i64 0 3769; 
CHECK-NEXT: [[TMP21:%.*]] = extractelement <3 x i16> [[X]], i64 1 3770; CHECK-NEXT: [[TMP22:%.*]] = extractelement <3 x i16> [[Y]], i64 1 3771; CHECK-NEXT: [[TMP23:%.*]] = zext i16 [[TMP21]] to i32 3772; CHECK-NEXT: [[TMP24:%.*]] = zext i16 [[TMP22]] to i32 3773; CHECK-NEXT: [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float 3774; CHECK-NEXT: [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float 3775; CHECK-NEXT: [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]]) 3776; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]] 3777; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]]) 3778; CHECK-NEXT: [[TMP30:%.*]] = fneg fast float [[TMP29]] 3779; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]]) 3780; CHECK-NEXT: [[TMP32:%.*]] = fptoui float [[TMP29]] to i32 3781; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]]) 3782; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]]) 3783; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]] 3784; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0 3785; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]] 3786; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 65535 3787; CHECK-NEXT: [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16 3788; CHECK-NEXT: [[TMP40:%.*]] = insertelement <3 x i16> [[TMP20]], i16 [[TMP39]], i64 1 3789; CHECK-NEXT: [[TMP41:%.*]] = extractelement <3 x i16> [[X]], i64 2 3790; CHECK-NEXT: [[TMP42:%.*]] = extractelement <3 x i16> [[Y]], i64 2 3791; CHECK-NEXT: [[TMP43:%.*]] = zext i16 [[TMP41]] to i32 3792; CHECK-NEXT: [[TMP44:%.*]] = zext i16 [[TMP42]] to i32 3793; CHECK-NEXT: [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float 3794; CHECK-NEXT: [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float 3795; CHECK-NEXT: [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]]) 3796; CHECK-NEXT: [[TMP48:%.*]] = fmul fast float [[TMP45]], 
[[TMP47]] 3797; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]]) 3798; CHECK-NEXT: [[TMP50:%.*]] = fneg fast float [[TMP49]] 3799; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]]) 3800; CHECK-NEXT: [[TMP52:%.*]] = fptoui float [[TMP49]] to i32 3801; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]]) 3802; CHECK-NEXT: [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]]) 3803; CHECK-NEXT: [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]] 3804; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0 3805; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]] 3806; CHECK-NEXT: [[TMP58:%.*]] = and i32 [[TMP57]], 65535 3807; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16 3808; CHECK-NEXT: [[TMP60:%.*]] = insertelement <3 x i16> [[TMP40]], i16 [[TMP59]], i64 2 3809; CHECK-NEXT: store <3 x i16> [[TMP60]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 3810; CHECK-NEXT: ret void 3811; 3812; GFX6-LABEL: udiv_v3i16: 3813; GFX6: ; %bb.0: 3814; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb 3815; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3816; GFX6-NEXT: s_mov_b32 s3, 0xf000 3817; GFX6-NEXT: s_mov_b32 s2, -1 3818; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3819; GFX6-NEXT: s_and_b32 s9, s6, 0xffff 3820; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 3821; GFX6-NEXT: s_lshr_b32 s6, s6, 16 3822; GFX6-NEXT: s_and_b32 s8, s4, 0xffff 3823; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 3824; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s8 3825; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 3826; GFX6-NEXT: s_lshr_b32 s4, s4, 16 3827; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 3828; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 3829; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 3830; GFX6-NEXT: v_trunc_f32_e32 v3, v3 3831; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 3832; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3833; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 3834; GFX6-NEXT: v_trunc_f32_e32 v1, v1 3835; 
GFX6-NEXT: s_and_b32 s4, s7, 0xffff 3836; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 3837; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v4 3838; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 3839; GFX6-NEXT: s_and_b32 s4, s5, 0xffff 3840; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc 3841; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 3842; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 3843; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 3844; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 3845; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 3846; GFX6-NEXT: v_mul_f32_e32 v2, v5, v6 3847; GFX6-NEXT: v_trunc_f32_e32 v2, v2 3848; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 3849; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3850; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v5 3851; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 3852; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc 3853; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 3854; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 3855; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 3856; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 3857; GFX6-NEXT: s_endpgm 3858; 3859; GFX9-LABEL: udiv_v3i16: 3860; GFX9: ; %bb.0: 3861; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 3862; GFX9-NEXT: v_mov_b32_e32 v6, 0 3863; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3864; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3865; GFX9-NEXT: s_and_b32 s3, s6, 0xffff 3866; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 3867; GFX9-NEXT: s_and_b32 s2, s4, 0xffff 3868; GFX9-NEXT: s_lshr_b32 s6, s6, 16 3869; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 3870; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 3871; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 3872; GFX9-NEXT: s_lshr_b32 s4, s4, 16 3873; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 3874; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 3875; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 3876; GFX9-NEXT: v_trunc_f32_e32 v4, v4 3877; GFX9-NEXT: s_and_b32 s2, s7, 0xffff 3878; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 3879; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 3880; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2 3881; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 3882; GFX9-NEXT: 
v_cmp_ge_f32_e64 vcc, |v2|, v0 3883; GFX9-NEXT: v_trunc_f32_e32 v2, v5 3884; GFX9-NEXT: s_and_b32 s2, s5, 0xffff 3885; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc 3886; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 3887; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 3888; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2 3889; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4 3890; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 3891; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc 3892; GFX9-NEXT: v_mul_f32_e32 v2, v5, v7 3893; GFX9-NEXT: v_trunc_f32_e32 v2, v2 3894; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v2 3895; GFX9-NEXT: v_mad_f32 v2, -v2, v4, v5 3896; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 3897; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 3898; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc 3899; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 3900; GFX9-NEXT: global_store_short v6, v2, s[0:1] offset:4 3901; GFX9-NEXT: global_store_dword v6, v0, s[0:1] 3902; GFX9-NEXT: s_endpgm 3903 %r = udiv <3 x i16> %x, %y 3904 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 3905 ret void 3906} 3907 3908define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 3909; CHECK-LABEL: @urem_v3i16( 3910; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 3911; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 3912; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 3913; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 3914; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 3915; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 3916; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 3917; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 3918; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 3919; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 3920; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 
3921; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 3922; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 3923; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 3924; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 3925; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 3926; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 3927; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]] 3928; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]] 3929; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 65535 3930; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16 3931; CHECK-NEXT: [[TMP22:%.*]] = insertelement <3 x i16> undef, i16 [[TMP21]], i64 0 3932; CHECK-NEXT: [[TMP23:%.*]] = extractelement <3 x i16> [[X]], i64 1 3933; CHECK-NEXT: [[TMP24:%.*]] = extractelement <3 x i16> [[Y]], i64 1 3934; CHECK-NEXT: [[TMP25:%.*]] = zext i16 [[TMP23]] to i32 3935; CHECK-NEXT: [[TMP26:%.*]] = zext i16 [[TMP24]] to i32 3936; CHECK-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float 3937; CHECK-NEXT: [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float 3938; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]]) 3939; CHECK-NEXT: [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]] 3940; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]]) 3941; CHECK-NEXT: [[TMP32:%.*]] = fneg fast float [[TMP31]] 3942; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]]) 3943; CHECK-NEXT: [[TMP34:%.*]] = fptoui float [[TMP31]] to i32 3944; CHECK-NEXT: [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 3945; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]]) 3946; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]] 3947; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0 3948; CHECK-NEXT: [[TMP39:%.*]] = add i32 
[[TMP34]], [[TMP38]] 3949; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]] 3950; CHECK-NEXT: [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]] 3951; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], 65535 3952; CHECK-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16 3953; CHECK-NEXT: [[TMP44:%.*]] = insertelement <3 x i16> [[TMP22]], i16 [[TMP43]], i64 1 3954; CHECK-NEXT: [[TMP45:%.*]] = extractelement <3 x i16> [[X]], i64 2 3955; CHECK-NEXT: [[TMP46:%.*]] = extractelement <3 x i16> [[Y]], i64 2 3956; CHECK-NEXT: [[TMP47:%.*]] = zext i16 [[TMP45]] to i32 3957; CHECK-NEXT: [[TMP48:%.*]] = zext i16 [[TMP46]] to i32 3958; CHECK-NEXT: [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float 3959; CHECK-NEXT: [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float 3960; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]]) 3961; CHECK-NEXT: [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]] 3962; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]]) 3963; CHECK-NEXT: [[TMP54:%.*]] = fneg fast float [[TMP53]] 3964; CHECK-NEXT: [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]]) 3965; CHECK-NEXT: [[TMP56:%.*]] = fptoui float [[TMP53]] to i32 3966; CHECK-NEXT: [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]]) 3967; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]]) 3968; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]] 3969; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0 3970; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]] 3971; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]] 3972; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]] 3973; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 65535 3974; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16 3975; CHECK-NEXT: [[TMP66:%.*]] = insertelement <3 x i16> [[TMP44]], i16 [[TMP65]], i64 2 3976; CHECK-NEXT: store <3 x i16> [[TMP66]], <3 x 
i16> addrspace(1)* [[OUT:%.*]], align 8 3977; CHECK-NEXT: ret void 3978; 3979; GFX6-LABEL: urem_v3i16: 3980; GFX6: ; %bb.0: 3981; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb 3982; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3983; GFX6-NEXT: s_mov_b32 s3, 0xf000 3984; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3985; GFX6-NEXT: s_and_b32 s8, s6, 0xffff 3986; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s8 3987; GFX6-NEXT: s_and_b32 s2, s4, 0xffff 3988; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s2 3989; GFX6-NEXT: v_mov_b32_e32 v2, s6 3990; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 3991; GFX6-NEXT: v_alignbit_b32 v2, s7, v2, 16 3992; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v2 3993; GFX6-NEXT: v_cvt_f32_u32_e32 v5, v5 3994; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 3995; GFX6-NEXT: v_trunc_f32_e32 v4, v4 3996; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v4 3997; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 3998; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 3999; GFX6-NEXT: v_mov_b32_e32 v0, s4 4000; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v6, vcc 4001; GFX6-NEXT: v_alignbit_b32 v0, s5, v0, 16 4002; GFX6-NEXT: v_mul_lo_u32 v1, v1, s6 4003; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v0 4004; GFX6-NEXT: v_cvt_f32_u32_e32 v3, v3 4005; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 4006; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v1 4007; GFX6-NEXT: s_and_b32 s4, s7, 0xffff 4008; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s4 4009; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 4010; GFX6-NEXT: v_trunc_f32_e32 v4, v4 4011; GFX6-NEXT: v_mad_f32 v3, -v4, v5, v3 4012; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 4013; GFX6-NEXT: s_and_b32 s4, s5, 0xffff 4014; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s4 4015; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v6 4016; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 4017; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 4018; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 4019; GFX6-NEXT: v_mul_f32_e32 v3, v7, v8 4020; GFX6-NEXT: v_trunc_f32_e32 v3, v3 4021; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v3 4022; GFX6-NEXT: v_mad_f32 v3, -v3, v6, v7 4023; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v6 
4024; GFX6-NEXT: s_mov_b32 s2, -1 4025; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 4026; GFX6-NEXT: v_mul_lo_u32 v3, v3, s7 4027; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 4028; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 4029; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v3 4030; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 4031; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 4032; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 4033; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 4034; GFX6-NEXT: s_endpgm 4035; 4036; GFX9-LABEL: urem_v3i16: 4037; GFX9: ; %bb.0: 4038; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 4039; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4040; GFX9-NEXT: s_and_b32 s3, s6, 0xffff 4041; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 4042; GFX9-NEXT: s_and_b32 s2, s4, 0xffff 4043; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 4044; GFX9-NEXT: s_lshr_b32 s6, s6, 16 4045; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 4046; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 4047; GFX9-NEXT: s_lshr_b32 s4, s4, 16 4048; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 4049; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 4050; GFX9-NEXT: v_trunc_f32_e32 v4, v4 4051; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 4052; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v4 4053; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2 4054; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 4055; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5 4056; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc 4057; GFX9-NEXT: v_trunc_f32_e32 v5, v5 4058; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 4059; GFX9-NEXT: s_and_b32 s3, s7, 0xffff 4060; GFX9-NEXT: v_mad_f32 v2, -v5, v1, v3 4061; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s3 4062; GFX9-NEXT: s_and_b32 s5, s5, 0xffff 4063; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v5 4064; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s5 4065; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 4066; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v1 4067; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc 4068; GFX9-NEXT: v_mul_f32_e32 v2, v5, v6 4069; GFX9-NEXT: v_trunc_f32_e32 v2, v2 4070; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 4071; GFX9-NEXT: 
v_mad_f32 v2, -v2, v3, v5 4072; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 4073; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4074; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v4, vcc 4075; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 4076; GFX9-NEXT: v_mul_lo_u32 v2, v2, s3 4077; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 4078; GFX9-NEXT: v_mov_b32_e32 v3, 0 4079; GFX9-NEXT: v_sub_u32_e32 v1, s4, v1 4080; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 4081; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 4082; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 4083; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4084; GFX9-NEXT: global_store_short v3, v2, s[0:1] offset:4 4085; GFX9-NEXT: global_store_dword v3, v0, s[0:1] 4086; GFX9-NEXT: s_endpgm 4087 %r = urem <3 x i16> %x, %y 4088 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 4089 ret void 4090} 4091 4092define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 4093; CHECK-LABEL: @sdiv_v3i16( 4094; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 4095; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 4096; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 4097; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 4098; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 4099; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 4100; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 4101; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 4102; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 4103; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 4104; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 4105; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 4106; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 4107; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 4108; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 4109; 
CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 4110; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 4111; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 4112; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 4113; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 4114; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 16 4115; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 16 4116; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16 4117; CHECK-NEXT: [[TMP24:%.*]] = insertelement <3 x i16> undef, i16 [[TMP23]], i64 0 4118; CHECK-NEXT: [[TMP25:%.*]] = extractelement <3 x i16> [[X]], i64 1 4119; CHECK-NEXT: [[TMP26:%.*]] = extractelement <3 x i16> [[Y]], i64 1 4120; CHECK-NEXT: [[TMP27:%.*]] = sext i16 [[TMP25]] to i32 4121; CHECK-NEXT: [[TMP28:%.*]] = sext i16 [[TMP26]] to i32 4122; CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]] 4123; CHECK-NEXT: [[TMP30:%.*]] = ashr i32 [[TMP29]], 30 4124; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 1 4125; CHECK-NEXT: [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float 4126; CHECK-NEXT: [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float 4127; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 4128; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]] 4129; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]]) 4130; CHECK-NEXT: [[TMP37:%.*]] = fneg fast float [[TMP36]] 4131; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]]) 4132; CHECK-NEXT: [[TMP39:%.*]] = fptosi float [[TMP36]] to i32 4133; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]]) 4134; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 4135; CHECK-NEXT: [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]] 4136; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], 
i32 0 4137; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]] 4138; CHECK-NEXT: [[TMP45:%.*]] = shl i32 [[TMP44]], 16 4139; CHECK-NEXT: [[TMP46:%.*]] = ashr i32 [[TMP45]], 16 4140; CHECK-NEXT: [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16 4141; CHECK-NEXT: [[TMP48:%.*]] = insertelement <3 x i16> [[TMP24]], i16 [[TMP47]], i64 1 4142; CHECK-NEXT: [[TMP49:%.*]] = extractelement <3 x i16> [[X]], i64 2 4143; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x i16> [[Y]], i64 2 4144; CHECK-NEXT: [[TMP51:%.*]] = sext i16 [[TMP49]] to i32 4145; CHECK-NEXT: [[TMP52:%.*]] = sext i16 [[TMP50]] to i32 4146; CHECK-NEXT: [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]] 4147; CHECK-NEXT: [[TMP54:%.*]] = ashr i32 [[TMP53]], 30 4148; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 1 4149; CHECK-NEXT: [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float 4150; CHECK-NEXT: [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float 4151; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]]) 4152; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]] 4153; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]]) 4154; CHECK-NEXT: [[TMP61:%.*]] = fneg fast float [[TMP60]] 4155; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]]) 4156; CHECK-NEXT: [[TMP63:%.*]] = fptosi float [[TMP60]] to i32 4157; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]]) 4158; CHECK-NEXT: [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]]) 4159; CHECK-NEXT: [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]] 4160; CHECK-NEXT: [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0 4161; CHECK-NEXT: [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]] 4162; CHECK-NEXT: [[TMP69:%.*]] = shl i32 [[TMP68]], 16 4163; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 16 4164; CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16 4165; CHECK-NEXT: [[TMP72:%.*]] = insertelement <3 x 
i16> [[TMP48]], i16 [[TMP71]], i64 2 4166; CHECK-NEXT: store <3 x i16> [[TMP72]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 4167; CHECK-NEXT: ret void 4168; 4169; GFX6-LABEL: sdiv_v3i16: 4170; GFX6: ; %bb.0: 4171; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb 4172; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4173; GFX6-NEXT: s_mov_b32 s3, 0xf000 4174; GFX6-NEXT: s_mov_b32 s2, -1 4175; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4176; GFX6-NEXT: s_sext_i32_i16 s8, s6 4177; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s8 4178; GFX6-NEXT: s_sext_i32_i16 s9, s4 4179; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9 4180; GFX6-NEXT: s_xor_b32 s8, s9, s8 4181; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 4182; GFX6-NEXT: s_ashr_i32 s6, s6, 16 4183; GFX6-NEXT: s_ashr_i32 s8, s8, 30 4184; GFX6-NEXT: s_or_b32 s8, s8, 1 4185; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 4186; GFX6-NEXT: v_trunc_f32_e32 v2, v2 4187; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 4188; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 4189; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 4190; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s6 4191; GFX6-NEXT: v_mov_b32_e32 v3, s8 4192; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 4193; GFX6-NEXT: s_ashr_i32 s4, s4, 16 4194; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 4195; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 4196; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 4197; GFX6-NEXT: s_xor_b32 s4, s4, s6 4198; GFX6-NEXT: s_ashr_i32 s4, s4, 30 4199; GFX6-NEXT: s_or_b32 s4, s4, 1 4200; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 4201; GFX6-NEXT: v_trunc_f32_e32 v3, v3 4202; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 4203; GFX6-NEXT: v_mov_b32_e32 v4, s4 4204; GFX6-NEXT: s_sext_i32_i16 s4, s7 4205; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 4206; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| 4207; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 4208; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc 4209; GFX6-NEXT: s_sext_i32_i16 s5, s5 4210; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 4211; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s5 4212; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 4213; GFX6-NEXT: s_xor_b32 
s4, s5, s4 4214; GFX6-NEXT: s_ashr_i32 s4, s4, 30 4215; GFX6-NEXT: s_or_b32 s4, s4, 1 4216; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 4217; GFX6-NEXT: v_trunc_f32_e32 v4, v4 4218; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 4219; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 4220; GFX6-NEXT: v_mov_b32_e32 v5, s4 4221; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| 4222; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc 4223; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 4224; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 4225; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 4226; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 4227; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 4228; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 4229; GFX6-NEXT: s_endpgm 4230; 4231; GFX9-LABEL: sdiv_v3i16: 4232; GFX9: ; %bb.0: 4233; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 4234; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4235; GFX9-NEXT: v_mov_b32_e32 v1, 0 4236; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4237; GFX9-NEXT: s_sext_i32_i16 s0, s6 4238; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 4239; GFX9-NEXT: s_sext_i32_i16 s1, s4 4240; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 4241; GFX9-NEXT: s_xor_b32 s0, s1, s0 4242; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 4243; GFX9-NEXT: s_ashr_i32 s0, s0, 30 4244; GFX9-NEXT: s_or_b32 s8, s0, 1 4245; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 4246; GFX9-NEXT: v_trunc_f32_e32 v3, v3 4247; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 4248; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 4249; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 4250; GFX9-NEXT: s_cselect_b32 s0, s8, 0 4251; GFX9-NEXT: s_ashr_i32 s1, s6, 16 4252; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 4253; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 4254; GFX9-NEXT: s_ashr_i32 s4, s4, 16 4255; GFX9-NEXT: v_add_u32_e32 v2, s0, v3 4256; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s4 4257; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 4258; GFX9-NEXT: s_xor_b32 s0, s4, s1 4259; GFX9-NEXT: s_ashr_i32 s0, s0, 30 4260; GFX9-NEXT: s_or_b32 s4, s0, 1 4261; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 4262; 
GFX9-NEXT: v_trunc_f32_e32 v4, v4 4263; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 4264; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v0| 4265; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 4266; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 4267; GFX9-NEXT: s_sext_i32_i16 s1, s7 4268; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 4269; GFX9-NEXT: s_cselect_b32 s0, s4, 0 4270; GFX9-NEXT: v_add_u32_e32 v3, s0, v4 4271; GFX9-NEXT: s_sext_i32_i16 s0, s5 4272; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s0 4273; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 4274; GFX9-NEXT: s_xor_b32 s0, s0, s1 4275; GFX9-NEXT: s_ashr_i32 s0, s0, 30 4276; GFX9-NEXT: s_or_b32 s4, s0, 1 4277; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 4278; GFX9-NEXT: v_trunc_f32_e32 v5, v5 4279; GFX9-NEXT: v_mad_f32 v4, -v5, v0, v4 4280; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 4281; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v0| 4282; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 4283; GFX9-NEXT: s_cselect_b32 s0, s4, 0 4284; GFX9-NEXT: v_add_u32_e32 v0, s0, v5 4285; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 4286; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 4287; GFX9-NEXT: global_store_short v1, v0, s[2:3] offset:4 4288; GFX9-NEXT: global_store_dword v1, v2, s[2:3] 4289; GFX9-NEXT: s_endpgm 4290 %r = sdiv <3 x i16> %x, %y 4291 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 4292 ret void 4293} 4294 4295define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 4296; CHECK-LABEL: @srem_v3i16( 4297; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 4298; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 4299; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 4300; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 4301; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 4302; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 4303; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 4304; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 4305; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to 
float 4306; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 4307; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 4308; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 4309; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 4310; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 4311; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 4312; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 4313; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 4314; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 4315; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 4316; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 4317; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]] 4318; CHECK-NEXT: [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]] 4319; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 16 4320; CHECK-NEXT: [[TMP24:%.*]] = ashr i32 [[TMP23]], 16 4321; CHECK-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16 4322; CHECK-NEXT: [[TMP26:%.*]] = insertelement <3 x i16> undef, i16 [[TMP25]], i64 0 4323; CHECK-NEXT: [[TMP27:%.*]] = extractelement <3 x i16> [[X]], i64 1 4324; CHECK-NEXT: [[TMP28:%.*]] = extractelement <3 x i16> [[Y]], i64 1 4325; CHECK-NEXT: [[TMP29:%.*]] = sext i16 [[TMP27]] to i32 4326; CHECK-NEXT: [[TMP30:%.*]] = sext i16 [[TMP28]] to i32 4327; CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]] 4328; CHECK-NEXT: [[TMP32:%.*]] = ashr i32 [[TMP31]], 30 4329; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP32]], 1 4330; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float 4331; CHECK-NEXT: [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float 4332; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 4333; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]] 4334; CHECK-NEXT: 
[[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]]) 4335; CHECK-NEXT: [[TMP39:%.*]] = fneg fast float [[TMP38]] 4336; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]]) 4337; CHECK-NEXT: [[TMP41:%.*]] = fptosi float [[TMP38]] to i32 4338; CHECK-NEXT: [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]]) 4339; CHECK-NEXT: [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]]) 4340; CHECK-NEXT: [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]] 4341; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0 4342; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]] 4343; CHECK-NEXT: [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]] 4344; CHECK-NEXT: [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]] 4345; CHECK-NEXT: [[TMP49:%.*]] = shl i32 [[TMP48]], 16 4346; CHECK-NEXT: [[TMP50:%.*]] = ashr i32 [[TMP49]], 16 4347; CHECK-NEXT: [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16 4348; CHECK-NEXT: [[TMP52:%.*]] = insertelement <3 x i16> [[TMP26]], i16 [[TMP51]], i64 1 4349; CHECK-NEXT: [[TMP53:%.*]] = extractelement <3 x i16> [[X]], i64 2 4350; CHECK-NEXT: [[TMP54:%.*]] = extractelement <3 x i16> [[Y]], i64 2 4351; CHECK-NEXT: [[TMP55:%.*]] = sext i16 [[TMP53]] to i32 4352; CHECK-NEXT: [[TMP56:%.*]] = sext i16 [[TMP54]] to i32 4353; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]] 4354; CHECK-NEXT: [[TMP58:%.*]] = ashr i32 [[TMP57]], 30 4355; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 1 4356; CHECK-NEXT: [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float 4357; CHECK-NEXT: [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float 4358; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]]) 4359; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]] 4360; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]]) 4361; CHECK-NEXT: [[TMP65:%.*]] = fneg fast float [[TMP64]] 4362; CHECK-NEXT: [[TMP66:%.*]] = call fast 
float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]]) 4363; CHECK-NEXT: [[TMP67:%.*]] = fptosi float [[TMP64]] to i32 4364; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 4365; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]]) 4366; CHECK-NEXT: [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]] 4367; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0 4368; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]] 4369; CHECK-NEXT: [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]] 4370; CHECK-NEXT: [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]] 4371; CHECK-NEXT: [[TMP75:%.*]] = shl i32 [[TMP74]], 16 4372; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 [[TMP75]], 16 4373; CHECK-NEXT: [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16 4374; CHECK-NEXT: [[TMP78:%.*]] = insertelement <3 x i16> [[TMP52]], i16 [[TMP77]], i64 2 4375; CHECK-NEXT: store <3 x i16> [[TMP78]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 4376; CHECK-NEXT: ret void 4377; 4378; GFX6-LABEL: srem_v3i16: 4379; GFX6: ; %bb.0: 4380; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb 4381; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4382; GFX6-NEXT: s_mov_b32 s3, 0xf000 4383; GFX6-NEXT: s_mov_b32 s2, -1 4384; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4385; GFX6-NEXT: s_sext_i32_i16 s8, s6 4386; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s8 4387; GFX6-NEXT: s_sext_i32_i16 s9, s4 4388; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9 4389; GFX6-NEXT: s_xor_b32 s8, s9, s8 4390; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 4391; GFX6-NEXT: s_ashr_i32 s8, s8, 30 4392; GFX6-NEXT: s_or_b32 s8, s8, 1 4393; GFX6-NEXT: v_mov_b32_e32 v3, s8 4394; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 4395; GFX6-NEXT: v_trunc_f32_e32 v2, v2 4396; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 4397; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 4398; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 4399; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 4400; GFX6-NEXT: v_mov_b32_e32 v1, s4 4401; GFX6-NEXT: v_add_i32_e32 v0, 
vcc, v0, v2 4402; GFX6-NEXT: v_mov_b32_e32 v2, s6 4403; GFX6-NEXT: v_alignbit_b32 v2, s7, v2, 16 4404; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16 4405; GFX6-NEXT: v_cvt_f32_i32_e32 v4, v3 4406; GFX6-NEXT: v_alignbit_b32 v1, s5, v1, 16 4407; GFX6-NEXT: v_bfe_i32 v5, v1, 0, 16 4408; GFX6-NEXT: v_cvt_f32_i32_e32 v6, v5 4409; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v4 4410; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 4411; GFX6-NEXT: v_xor_b32_e32 v3, v5, v3 4412; GFX6-NEXT: v_ashrrev_i32_e32 v3, 30, v3 4413; GFX6-NEXT: v_mul_f32_e32 v5, v6, v7 4414; GFX6-NEXT: v_trunc_f32_e32 v5, v5 4415; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 4416; GFX6-NEXT: v_mad_f32 v6, -v5, v4, v6 4417; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 4418; GFX6-NEXT: s_sext_i32_i16 s4, s7 4419; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, |v4| 4420; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s4 4421; GFX6-NEXT: v_or_b32_e32 v3, 1, v3 4422; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc 4423; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 4424; GFX6-NEXT: s_sext_i32_i16 s6, s5 4425; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 4426; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s6 4427; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v4 4428; GFX6-NEXT: s_xor_b32 s4, s6, s4 4429; GFX6-NEXT: s_ashr_i32 s4, s4, 30 4430; GFX6-NEXT: s_or_b32 s4, s4, 1 4431; GFX6-NEXT: v_mul_f32_e32 v5, v3, v5 4432; GFX6-NEXT: v_trunc_f32_e32 v5, v5 4433; GFX6-NEXT: v_mad_f32 v3, -v5, v4, v3 4434; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 4435; GFX6-NEXT: v_mov_b32_e32 v6, s4 4436; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v4| 4437; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc 4438; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 4439; GFX6-NEXT: v_mul_lo_u32 v3, v3, s7 4440; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 4441; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 4442; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v3 4443; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 4444; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 4445; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 4446; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 4447; GFX6-NEXT: 
s_endpgm 4448; 4449; GFX9-LABEL: srem_v3i16: 4450; GFX9: ; %bb.0: 4451; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 4452; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4453; GFX9-NEXT: s_sext_i32_i16 s8, s6 4454; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s8 4455; GFX9-NEXT: s_sext_i32_i16 s9, s4 4456; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s9 4457; GFX9-NEXT: s_xor_b32 s2, s9, s8 4458; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 4459; GFX9-NEXT: s_ashr_i32 s2, s2, 30 4460; GFX9-NEXT: s_or_b32 s10, s2, 1 4461; GFX9-NEXT: s_sext_i32_i16 s7, s7 4462; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 4463; GFX9-NEXT: v_trunc_f32_e32 v2, v2 4464; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 4465; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| 4466; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec 4467; GFX9-NEXT: s_cselect_b32 s2, s10, 0 4468; GFX9-NEXT: s_ashr_i32 s6, s6, 16 4469; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 4470; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s6 4471; GFX9-NEXT: s_ashr_i32 s4, s4, 16 4472; GFX9-NEXT: s_sext_i32_i16 s5, s5 4473; GFX9-NEXT: v_add_u32_e32 v1, s2, v2 4474; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s4 4475; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 4476; GFX9-NEXT: s_xor_b32 s2, s4, s6 4477; GFX9-NEXT: s_ashr_i32 s2, s2, 30 4478; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8 4479; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 4480; GFX9-NEXT: v_trunc_f32_e32 v3, v3 4481; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 4482; GFX9-NEXT: s_or_b32 s8, s2, 1 4483; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 4484; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v0| 4485; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s7 4486; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec 4487; GFX9-NEXT: s_cselect_b32 s2, s8, 0 4488; GFX9-NEXT: v_add_u32_e32 v0, s2, v3 4489; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s5 4490; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2 4491; GFX9-NEXT: s_xor_b32 s2, s5, s7 4492; GFX9-NEXT: s_ashr_i32 s2, s2, 30 4493; GFX9-NEXT: v_mul_lo_u32 v0, v0, s6 4494; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 4495; GFX9-NEXT: v_trunc_f32_e32 v4, v4 4496; GFX9-NEXT: v_mad_f32 v3, -v4, v2, v3 4497; 
GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 4498; GFX9-NEXT: s_or_b32 s6, s2, 1 4499; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v3|, |v2| 4500; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec 4501; GFX9-NEXT: s_cselect_b32 s2, s6, 0 4502; GFX9-NEXT: v_add_u32_e32 v2, s2, v4 4503; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4504; GFX9-NEXT: v_mul_lo_u32 v2, v2, s7 4505; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 4506; GFX9-NEXT: v_mov_b32_e32 v3, 0 4507; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 4508; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 4509; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 4510; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 4511; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4512; GFX9-NEXT: global_store_short v3, v2, s[0:1] offset:4 4513; GFX9-NEXT: global_store_dword v3, v0, s[0:1] 4514; GFX9-NEXT: s_endpgm 4515 %r = srem <3 x i16> %x, %y 4516 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 4517 ret void 4518} 4519 4520define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 4521; CHECK-LABEL: @udiv_v3i15( 4522; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 4523; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 4524; CHECK-NEXT: [[TMP3:%.*]] = zext i15 [[TMP1]] to i32 4525; CHECK-NEXT: [[TMP4:%.*]] = zext i15 [[TMP2]] to i32 4526; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 4527; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 4528; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 4529; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 4530; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 4531; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 4532; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 4533; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 4534; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 4535; CHECK-NEXT: 
[[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 4536; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 4537; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 4538; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 4539; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 32767 4540; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i15 4541; CHECK-NEXT: [[TMP20:%.*]] = insertelement <3 x i15> undef, i15 [[TMP19]], i64 0 4542; CHECK-NEXT: [[TMP21:%.*]] = extractelement <3 x i15> [[X]], i64 1 4543; CHECK-NEXT: [[TMP22:%.*]] = extractelement <3 x i15> [[Y]], i64 1 4544; CHECK-NEXT: [[TMP23:%.*]] = zext i15 [[TMP21]] to i32 4545; CHECK-NEXT: [[TMP24:%.*]] = zext i15 [[TMP22]] to i32 4546; CHECK-NEXT: [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float 4547; CHECK-NEXT: [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float 4548; CHECK-NEXT: [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]]) 4549; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]] 4550; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]]) 4551; CHECK-NEXT: [[TMP30:%.*]] = fneg fast float [[TMP29]] 4552; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]]) 4553; CHECK-NEXT: [[TMP32:%.*]] = fptoui float [[TMP29]] to i32 4554; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]]) 4555; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]]) 4556; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]] 4557; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0 4558; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]] 4559; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 32767 4560; CHECK-NEXT: [[TMP39:%.*]] = trunc i32 [[TMP38]] to i15 4561; CHECK-NEXT: [[TMP40:%.*]] = insertelement <3 x i15> [[TMP20]], i15 [[TMP39]], i64 1 4562; CHECK-NEXT: [[TMP41:%.*]] = extractelement <3 x i15> 
[[X]], i64 2 4563; CHECK-NEXT: [[TMP42:%.*]] = extractelement <3 x i15> [[Y]], i64 2 4564; CHECK-NEXT: [[TMP43:%.*]] = zext i15 [[TMP41]] to i32 4565; CHECK-NEXT: [[TMP44:%.*]] = zext i15 [[TMP42]] to i32 4566; CHECK-NEXT: [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float 4567; CHECK-NEXT: [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float 4568; CHECK-NEXT: [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]]) 4569; CHECK-NEXT: [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]] 4570; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]]) 4571; CHECK-NEXT: [[TMP50:%.*]] = fneg fast float [[TMP49]] 4572; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]]) 4573; CHECK-NEXT: [[TMP52:%.*]] = fptoui float [[TMP49]] to i32 4574; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]]) 4575; CHECK-NEXT: [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]]) 4576; CHECK-NEXT: [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]] 4577; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0 4578; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]] 4579; CHECK-NEXT: [[TMP58:%.*]] = and i32 [[TMP57]], 32767 4580; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i15 4581; CHECK-NEXT: [[TMP60:%.*]] = insertelement <3 x i15> [[TMP40]], i15 [[TMP59]], i64 2 4582; CHECK-NEXT: store <3 x i15> [[TMP60]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 4583; CHECK-NEXT: ret void 4584; 4585; GFX6-LABEL: udiv_v3i15: 4586; GFX6: ; %bb.0: 4587; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 4588; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4589; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 4590; GFX6-NEXT: s_mov_b32 s7, 0xf000 4591; GFX6-NEXT: s_mov_b32 s6, -1 4592; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4593; GFX6-NEXT: v_mov_b32_e32 v0, s2 4594; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 4595; GFX6-NEXT: s_and_b32 s8, s0, 0x7fff 4596; GFX6-NEXT: 
v_cvt_f32_u32_e32 v1, s8 4597; GFX6-NEXT: s_and_b32 s3, s2, 0x7fff 4598; GFX6-NEXT: v_mov_b32_e32 v2, s0 4599; GFX6-NEXT: s_bfe_u32 s0, s0, 0xf000f 4600; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s3 4601; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 4602; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s0 4603; GFX6-NEXT: s_bfe_u32 s2, s2, 0xf000f 4604; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 4605; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 4606; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s2 4607; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 4608; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 4609; GFX6-NEXT: v_trunc_f32_e32 v4, v4 4610; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 4611; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 4612; GFX6-NEXT: v_cvt_f32_u32_e32 v2, v2 4613; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 4614; GFX6-NEXT: v_mul_f32_e32 v1, v6, v7 4615; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0 4616; GFX6-NEXT: v_trunc_f32_e32 v1, v1 4617; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 4618; GFX6-NEXT: v_mad_f32 v4, -v1, v5, v6 4619; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 4620; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0 4621; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v2 4622; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v5 4623; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 4624; GFX6-NEXT: v_mul_f32_e32 v1, v0, v6 4625; GFX6-NEXT: v_trunc_f32_e32 v1, v1 4626; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v1 4627; GFX6-NEXT: v_mad_f32 v0, -v1, v2, v0 4628; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v2 4629; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v3 4630; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc 4631; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v4 4632; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 4633; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 4634; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 4635; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 4636; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 4637; GFX6-NEXT: s_waitcnt expcnt(0) 4638; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 4639; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 4640; GFX6-NEXT: s_endpgm 4641; 4642; GFX9-LABEL: 
udiv_v3i15: 4643; GFX9: ; %bb.0: 4644; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 4645; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 4646; GFX9-NEXT: v_mov_b32_e32 v2, 0 4647; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 4648; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4649; GFX9-NEXT: v_mov_b32_e32 v0, s2 4650; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 4651; GFX9-NEXT: s_and_b32 s6, s2, 0x7fff 4652; GFX9-NEXT: s_and_b32 s3, s0, 0x7fff 4653; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 4654; GFX9-NEXT: v_mov_b32_e32 v3, s0 4655; GFX9-NEXT: s_bfe_u32 s0, s0, 0xf000f 4656; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s6 4657; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 4658; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 4659; GFX9-NEXT: s_bfe_u32 s2, s2, 0xf000f 4660; GFX9-NEXT: v_alignbit_b32 v3, s1, v3, 30 4661; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 4662; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s2 4663; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 4664; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 4665; GFX9-NEXT: v_trunc_f32_e32 v5, v5 4666; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4 4667; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 4668; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v3 4669; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 4670; GFX9-NEXT: v_mul_f32_e32 v1, v7, v8 4671; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 4672; GFX9-NEXT: v_trunc_f32_e32 v1, v1 4673; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc 4674; GFX9-NEXT: v_mad_f32 v5, -v1, v6, v7 4675; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 4676; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 4677; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v3 4678; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v6 4679; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc 4680; GFX9-NEXT: v_mul_f32_e32 v1, v0, v7 4681; GFX9-NEXT: v_trunc_f32_e32 v1, v1 4682; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v1 4683; GFX9-NEXT: v_mad_f32 v0, -v1, v3, v0 4684; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v3 4685; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v4 4686; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc 4687; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v5 4688; 
GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 4689; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 4690; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 4691; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 4692; GFX9-NEXT: global_store_dword v2, v0, s[4:5] 4693; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 4694; GFX9-NEXT: global_store_short v2, v0, s[4:5] offset:4 4695; GFX9-NEXT: s_endpgm 4696 %r = udiv <3 x i15> %x, %y 4697 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 4698 ret void 4699} 4700 4701define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 4702; CHECK-LABEL: @urem_v3i15( 4703; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 4704; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 4705; CHECK-NEXT: [[TMP3:%.*]] = zext i15 [[TMP1]] to i32 4706; CHECK-NEXT: [[TMP4:%.*]] = zext i15 [[TMP2]] to i32 4707; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 4708; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 4709; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 4710; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 4711; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 4712; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 4713; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 4714; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 4715; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 4716; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 4717; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 4718; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 4719; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 4720; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]] 4721; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]] 4722; CHECK-NEXT: [[TMP20:%.*]] = 
and i32 [[TMP19]], 32767 4723; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i15 4724; CHECK-NEXT: [[TMP22:%.*]] = insertelement <3 x i15> undef, i15 [[TMP21]], i64 0 4725; CHECK-NEXT: [[TMP23:%.*]] = extractelement <3 x i15> [[X]], i64 1 4726; CHECK-NEXT: [[TMP24:%.*]] = extractelement <3 x i15> [[Y]], i64 1 4727; CHECK-NEXT: [[TMP25:%.*]] = zext i15 [[TMP23]] to i32 4728; CHECK-NEXT: [[TMP26:%.*]] = zext i15 [[TMP24]] to i32 4729; CHECK-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float 4730; CHECK-NEXT: [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float 4731; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]]) 4732; CHECK-NEXT: [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]] 4733; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]]) 4734; CHECK-NEXT: [[TMP32:%.*]] = fneg fast float [[TMP31]] 4735; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]]) 4736; CHECK-NEXT: [[TMP34:%.*]] = fptoui float [[TMP31]] to i32 4737; CHECK-NEXT: [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 4738; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]]) 4739; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]] 4740; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0 4741; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]] 4742; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]] 4743; CHECK-NEXT: [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]] 4744; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], 32767 4745; CHECK-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i15 4746; CHECK-NEXT: [[TMP44:%.*]] = insertelement <3 x i15> [[TMP22]], i15 [[TMP43]], i64 1 4747; CHECK-NEXT: [[TMP45:%.*]] = extractelement <3 x i15> [[X]], i64 2 4748; CHECK-NEXT: [[TMP46:%.*]] = extractelement <3 x i15> [[Y]], i64 2 4749; CHECK-NEXT: [[TMP47:%.*]] = zext i15 [[TMP45]] to i32 4750; CHECK-NEXT: [[TMP48:%.*]] 
= zext i15 [[TMP46]] to i32 4751; CHECK-NEXT: [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float 4752; CHECK-NEXT: [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float 4753; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]]) 4754; CHECK-NEXT: [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]] 4755; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]]) 4756; CHECK-NEXT: [[TMP54:%.*]] = fneg fast float [[TMP53]] 4757; CHECK-NEXT: [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]]) 4758; CHECK-NEXT: [[TMP56:%.*]] = fptoui float [[TMP53]] to i32 4759; CHECK-NEXT: [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]]) 4760; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]]) 4761; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]] 4762; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0 4763; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]] 4764; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]] 4765; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]] 4766; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 32767 4767; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i15 4768; CHECK-NEXT: [[TMP66:%.*]] = insertelement <3 x i15> [[TMP44]], i15 [[TMP65]], i64 2 4769; CHECK-NEXT: store <3 x i15> [[TMP66]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 4770; CHECK-NEXT: ret void 4771; 4772; GFX6-LABEL: urem_v3i15: 4773; GFX6: ; %bb.0: 4774; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 4775; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4776; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 4777; GFX6-NEXT: s_mov_b32 s7, 0xf000 4778; GFX6-NEXT: s_mov_b32 s6, -1 4779; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4780; GFX6-NEXT: s_and_b32 s8, s2, 0x7fff 4781; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s8 4782; GFX6-NEXT: s_and_b32 s9, s0, 0x7fff 4783; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 4784; GFX6-NEXT: v_mov_b32_e32 v2, 
s0 4785; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 4786; GFX6-NEXT: s_bfe_u32 s1, s0, 0xf000f 4787; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 4788; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s1 4789; GFX6-NEXT: s_bfe_u32 s9, s2, 0xf000f 4790; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 4791; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 4792; GFX6-NEXT: v_trunc_f32_e32 v4, v4 4793; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 4794; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 4795; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 4796; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s9 4797; GFX6-NEXT: v_mov_b32_e32 v0, s2 4798; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc 4799; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 4800; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 4801; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 4802; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0 4803; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s2, v1 4804; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 4805; GFX6-NEXT: v_cvt_f32_u32_e32 v4, v2 4806; GFX6-NEXT: v_cvt_f32_u32_e32 v7, v0 4807; GFX6-NEXT: v_trunc_f32_e32 v1, v1 4808; GFX6-NEXT: v_mad_f32 v3, -v1, v5, v3 4809; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v4 4810; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 4811; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 4812; GFX6-NEXT: s_lshr_b32 s0, s0, 15 4813; GFX6-NEXT: v_mul_f32_e32 v3, v7, v8 4814; GFX6-NEXT: v_trunc_f32_e32 v3, v3 4815; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v3 4816; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 4817; GFX6-NEXT: v_mad_f32 v3, -v3, v4, v7 4818; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 4819; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 4820; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 4821; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 4822; GFX6-NEXT: s_lshr_b32 s3, s2, 15 4823; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s3, v1 4824; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, v2, v0 4825; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v3 4826; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 4827; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v6 4828; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 4829; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 4830; 
GFX6-NEXT: v_or_b32_e32 v0, v2, v0 4831; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 4832; GFX6-NEXT: s_waitcnt expcnt(0) 4833; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 4834; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 4835; GFX6-NEXT: s_endpgm 4836; 4837; GFX9-LABEL: urem_v3i15: 4838; GFX9: ; %bb.0: 4839; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 4840; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 4841; GFX9-NEXT: v_mov_b32_e32 v2, 0 4842; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 4843; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4844; GFX9-NEXT: s_and_b32 s6, s2, 0x7fff 4845; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s6 4846; GFX9-NEXT: v_mov_b32_e32 v0, s2 4847; GFX9-NEXT: s_and_b32 s7, s0, 0x7fff 4848; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 4849; GFX9-NEXT: s_bfe_u32 s6, s0, 0xf000f 4850; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s6 4851; GFX9-NEXT: v_mov_b32_e32 v3, s0 4852; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 4853; GFX9-NEXT: v_alignbit_b32 v3, s1, v3, 30 4854; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 4855; GFX9-NEXT: s_bfe_u32 s3, s2, 0xf000f 4856; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 4857; GFX9-NEXT: v_trunc_f32_e32 v5, v5 4858; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4 4859; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 4860; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 4861; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 4862; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s3 4863; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 4864; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc 4865; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v3 4866; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 4867; GFX9-NEXT: v_mul_f32_e32 v4, v7, v8 4868; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v0 4869; GFX9-NEXT: v_rcp_iflag_f32_e32 v9, v5 4870; GFX9-NEXT: v_trunc_f32_e32 v4, v4 4871; GFX9-NEXT: v_mad_f32 v7, -v4, v6, v7 4872; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 4873; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, v6 4874; GFX9-NEXT: v_mul_f32_e32 v6, v8, v9 4875; GFX9-NEXT: v_trunc_f32_e32 v6, v6 4876; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v6 4877; GFX9-NEXT: 
v_addc_co_u32_e32 v4, vcc, 0, v4, vcc 4878; GFX9-NEXT: v_mad_f32 v6, -v6, v5, v8 4879; GFX9-NEXT: s_lshr_b32 s1, s0, 15 4880; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 4881; GFX9-NEXT: v_mul_lo_u32 v4, v4, s1 4882; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc 4883; GFX9-NEXT: v_mul_lo_u32 v1, v1, s0 4884; GFX9-NEXT: v_mul_lo_u32 v3, v5, v3 4885; GFX9-NEXT: s_lshr_b32 s0, s2, 15 4886; GFX9-NEXT: v_sub_u32_e32 v4, s0, v4 4887; GFX9-NEXT: v_sub_u32_e32 v5, s2, v1 4888; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 4889; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v4 4890; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 4891; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v5 4892; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 4893; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 4894; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 4895; GFX9-NEXT: global_store_dword v2, v0, s[4:5] 4896; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 4897; GFX9-NEXT: global_store_short v2, v0, s[4:5] offset:4 4898; GFX9-NEXT: s_endpgm 4899 %r = urem <3 x i15> %x, %y 4900 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 4901 ret void 4902} 4903 4904define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 4905; CHECK-LABEL: @sdiv_v3i15( 4906; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 4907; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 4908; CHECK-NEXT: [[TMP3:%.*]] = sext i15 [[TMP1]] to i32 4909; CHECK-NEXT: [[TMP4:%.*]] = sext i15 [[TMP2]] to i32 4910; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 4911; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 4912; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 4913; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 4914; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 4915; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 4916; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 4917; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float 
[[TMP11]]) 4918; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 4919; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 4920; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 4921; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 4922; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 4923; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 4924; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 4925; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 4926; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 17 4927; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 17 4928; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i15 4929; CHECK-NEXT: [[TMP24:%.*]] = insertelement <3 x i15> undef, i15 [[TMP23]], i64 0 4930; CHECK-NEXT: [[TMP25:%.*]] = extractelement <3 x i15> [[X]], i64 1 4931; CHECK-NEXT: [[TMP26:%.*]] = extractelement <3 x i15> [[Y]], i64 1 4932; CHECK-NEXT: [[TMP27:%.*]] = sext i15 [[TMP25]] to i32 4933; CHECK-NEXT: [[TMP28:%.*]] = sext i15 [[TMP26]] to i32 4934; CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]] 4935; CHECK-NEXT: [[TMP30:%.*]] = ashr i32 [[TMP29]], 30 4936; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 1 4937; CHECK-NEXT: [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float 4938; CHECK-NEXT: [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float 4939; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 4940; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]] 4941; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]]) 4942; CHECK-NEXT: [[TMP37:%.*]] = fneg fast float [[TMP36]] 4943; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]]) 4944; CHECK-NEXT: [[TMP39:%.*]] = fptosi float [[TMP36]] to i32 4945; CHECK-NEXT: [[TMP40:%.*]] = call fast 
float @llvm.fabs.f32(float [[TMP38]]) 4946; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 4947; CHECK-NEXT: [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]] 4948; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0 4949; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]] 4950; CHECK-NEXT: [[TMP45:%.*]] = shl i32 [[TMP44]], 17 4951; CHECK-NEXT: [[TMP46:%.*]] = ashr i32 [[TMP45]], 17 4952; CHECK-NEXT: [[TMP47:%.*]] = trunc i32 [[TMP46]] to i15 4953; CHECK-NEXT: [[TMP48:%.*]] = insertelement <3 x i15> [[TMP24]], i15 [[TMP47]], i64 1 4954; CHECK-NEXT: [[TMP49:%.*]] = extractelement <3 x i15> [[X]], i64 2 4955; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x i15> [[Y]], i64 2 4956; CHECK-NEXT: [[TMP51:%.*]] = sext i15 [[TMP49]] to i32 4957; CHECK-NEXT: [[TMP52:%.*]] = sext i15 [[TMP50]] to i32 4958; CHECK-NEXT: [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]] 4959; CHECK-NEXT: [[TMP54:%.*]] = ashr i32 [[TMP53]], 30 4960; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 1 4961; CHECK-NEXT: [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float 4962; CHECK-NEXT: [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float 4963; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]]) 4964; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]] 4965; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]]) 4966; CHECK-NEXT: [[TMP61:%.*]] = fneg fast float [[TMP60]] 4967; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]]) 4968; CHECK-NEXT: [[TMP63:%.*]] = fptosi float [[TMP60]] to i32 4969; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]]) 4970; CHECK-NEXT: [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]]) 4971; CHECK-NEXT: [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]] 4972; CHECK-NEXT: [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0 4973; CHECK-NEXT: 
[[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]] 4974; CHECK-NEXT: [[TMP69:%.*]] = shl i32 [[TMP68]], 17 4975; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 17 4976; CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i15 4977; CHECK-NEXT: [[TMP72:%.*]] = insertelement <3 x i15> [[TMP48]], i15 [[TMP71]], i64 2 4978; CHECK-NEXT: store <3 x i15> [[TMP72]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 4979; CHECK-NEXT: ret void 4980; 4981; GFX6-LABEL: sdiv_v3i15: 4982; GFX6: ; %bb.0: 4983; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 4984; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4985; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 4986; GFX6-NEXT: s_mov_b32 s7, 0xf000 4987; GFX6-NEXT: s_mov_b32 s6, -1 4988; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4989; GFX6-NEXT: v_mov_b32_e32 v0, s2 4990; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 4991; GFX6-NEXT: s_bfe_i32 s3, s0, 0xf0000 4992; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s3 4993; GFX6-NEXT: v_mov_b32_e32 v1, s0 4994; GFX6-NEXT: v_alignbit_b32 v1, s1, v1, 30 4995; GFX6-NEXT: s_bfe_i32 s1, s2, 0xf0000 4996; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s1 4997; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 4998; GFX6-NEXT: s_xor_b32 s1, s1, s3 4999; GFX6-NEXT: s_bfe_i32 s0, s0, 0xf000f 5000; GFX6-NEXT: s_ashr_i32 s1, s1, 30 5001; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 5002; GFX6-NEXT: v_trunc_f32_e32 v4, v4 5003; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 5004; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 5005; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| 5006; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s0 5007; GFX6-NEXT: s_or_b32 s1, s1, 1 5008; GFX6-NEXT: v_mov_b32_e32 v5, s1 5009; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc 5010; GFX6-NEXT: s_bfe_i32 s1, s2, 0xf000f 5011; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 5012; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s1 5013; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 5014; GFX6-NEXT: s_xor_b32 s0, s1, s0 5015; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 15 5016; GFX6-NEXT: s_ashr_i32 s0, s0, 30 5017; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 5018; GFX6-NEXT: 
v_trunc_f32_e32 v5, v5 5019; GFX6-NEXT: v_mad_f32 v4, -v5, v3, v4 5020; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 5021; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v3| 5022; GFX6-NEXT: v_cvt_f32_i32_e32 v4, v1 5023; GFX6-NEXT: s_or_b32 s0, s0, 1 5024; GFX6-NEXT: v_mov_b32_e32 v6, s0 5025; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc 5026; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15 5027; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 5028; GFX6-NEXT: v_cvt_f32_i32_e32 v5, v0 5029; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 5030; GFX6-NEXT: v_xor_b32_e32 v0, v0, v1 5031; GFX6-NEXT: v_ashrrev_i32_e32 v0, 30, v0 5032; GFX6-NEXT: v_or_b32_e32 v0, 1, v0 5033; GFX6-NEXT: v_mul_f32_e32 v1, v5, v6 5034; GFX6-NEXT: v_trunc_f32_e32 v1, v1 5035; GFX6-NEXT: v_mad_f32 v5, -v1, v4, v5 5036; GFX6-NEXT: v_cvt_i32_f32_e32 v1, v1 5037; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v4| 5038; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 5039; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v3 5040; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 5041; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 5042; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 5043; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 5044; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 5045; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 5046; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 5047; GFX6-NEXT: s_waitcnt expcnt(0) 5048; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 5049; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 5050; GFX6-NEXT: s_endpgm 5051; 5052; GFX9-LABEL: sdiv_v3i15: 5053; GFX9: ; %bb.0: 5054; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 5055; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 5056; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 5057; GFX9-NEXT: v_mov_b32_e32 v2, 0 5058; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5059; GFX9-NEXT: s_bfe_i32 s1, s2, 0xf0000 5060; GFX9-NEXT: s_bfe_i32 s0, s4, 0xf0000 5061; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 5062; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s1 5063; GFX9-NEXT: s_xor_b32 s0, s1, s0 5064; GFX9-NEXT: v_mov_b32_e32 v0, s2 5065; GFX9-NEXT: 
v_rcp_iflag_f32_e32 v5, v3 5066; GFX9-NEXT: s_ashr_i32 s0, s0, 30 5067; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 5068; GFX9-NEXT: s_or_b32 s3, s0, 1 5069; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 5070; GFX9-NEXT: v_trunc_f32_e32 v5, v5 5071; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 5072; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| 5073; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 5074; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 5075; GFX9-NEXT: s_cselect_b32 s0, s3, 0 5076; GFX9-NEXT: s_bfe_i32 s1, s4, 0xf000f 5077; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s1 5078; GFX9-NEXT: v_add_u32_e32 v4, s0, v5 5079; GFX9-NEXT: s_bfe_i32 s0, s2, 0xf000f 5080; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 5081; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 5082; GFX9-NEXT: v_mov_b32_e32 v1, s4 5083; GFX9-NEXT: v_alignbit_b32 v1, s5, v1, 30 5084; GFX9-NEXT: s_xor_b32 s0, s0, s1 5085; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 5086; GFX9-NEXT: v_trunc_f32_e32 v6, v6 5087; GFX9-NEXT: s_ashr_i32 s0, s0, 30 5088; GFX9-NEXT: v_mad_f32 v5, -v6, v3, v5 5089; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 5090; GFX9-NEXT: s_or_b32 s2, s0, 1 5091; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 5092; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v3| 5093; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v1 5094; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 5095; GFX9-NEXT: s_cselect_b32 s0, s2, 0 5096; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 15 5097; GFX9-NEXT: v_add_u32_e32 v5, s0, v6 5098; GFX9-NEXT: v_cvt_f32_i32_e32 v6, v0 5099; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v3 5100; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1 5101; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v0 5102; GFX9-NEXT: v_or_b32_e32 v0, 1, v0 5103; GFX9-NEXT: v_mul_f32_e32 v1, v6, v7 5104; GFX9-NEXT: v_trunc_f32_e32 v1, v1 5105; GFX9-NEXT: v_cvt_i32_f32_e32 v7, v1 5106; GFX9-NEXT: v_mad_f32 v1, -v1, v3, v6 5107; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v3| 5108; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 5109; GFX9-NEXT: v_add_u32_e32 v0, v7, v0 5110; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v4 5111; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, 
v5 5112; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 5113; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 5114; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 5115; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 5116; GFX9-NEXT: global_store_dword v2, v0, s[6:7] 5117; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 5118; GFX9-NEXT: global_store_short v2, v0, s[6:7] offset:4 5119; GFX9-NEXT: s_endpgm 5120 %r = sdiv <3 x i15> %x, %y 5121 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 5122 ret void 5123} 5124 5125define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 5126; CHECK-LABEL: @srem_v3i15( 5127; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 5128; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 5129; CHECK-NEXT: [[TMP3:%.*]] = sext i15 [[TMP1]] to i32 5130; CHECK-NEXT: [[TMP4:%.*]] = sext i15 [[TMP2]] to i32 5131; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 5132; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 5133; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 5134; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 5135; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 5136; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 5137; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 5138; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 5139; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 5140; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 5141; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 5142; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 5143; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 5144; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 5145; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 5146; CHECK-NEXT: [[TMP20:%.*]] 
= add i32 [[TMP15]], [[TMP19]] 5147; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]] 5148; CHECK-NEXT: [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]] 5149; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 17 5150; CHECK-NEXT: [[TMP24:%.*]] = ashr i32 [[TMP23]], 17 5151; CHECK-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i15 5152; CHECK-NEXT: [[TMP26:%.*]] = insertelement <3 x i15> undef, i15 [[TMP25]], i64 0 5153; CHECK-NEXT: [[TMP27:%.*]] = extractelement <3 x i15> [[X]], i64 1 5154; CHECK-NEXT: [[TMP28:%.*]] = extractelement <3 x i15> [[Y]], i64 1 5155; CHECK-NEXT: [[TMP29:%.*]] = sext i15 [[TMP27]] to i32 5156; CHECK-NEXT: [[TMP30:%.*]] = sext i15 [[TMP28]] to i32 5157; CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]] 5158; CHECK-NEXT: [[TMP32:%.*]] = ashr i32 [[TMP31]], 30 5159; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP32]], 1 5160; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float 5161; CHECK-NEXT: [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float 5162; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 5163; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]] 5164; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]]) 5165; CHECK-NEXT: [[TMP39:%.*]] = fneg fast float [[TMP38]] 5166; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]]) 5167; CHECK-NEXT: [[TMP41:%.*]] = fptosi float [[TMP38]] to i32 5168; CHECK-NEXT: [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]]) 5169; CHECK-NEXT: [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]]) 5170; CHECK-NEXT: [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]] 5171; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0 5172; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]] 5173; CHECK-NEXT: [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]] 5174; CHECK-NEXT: [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]] 5175; CHECK-NEXT: 
[[TMP49:%.*]] = shl i32 [[TMP48]], 17 5176; CHECK-NEXT: [[TMP50:%.*]] = ashr i32 [[TMP49]], 17 5177; CHECK-NEXT: [[TMP51:%.*]] = trunc i32 [[TMP50]] to i15 5178; CHECK-NEXT: [[TMP52:%.*]] = insertelement <3 x i15> [[TMP26]], i15 [[TMP51]], i64 1 5179; CHECK-NEXT: [[TMP53:%.*]] = extractelement <3 x i15> [[X]], i64 2 5180; CHECK-NEXT: [[TMP54:%.*]] = extractelement <3 x i15> [[Y]], i64 2 5181; CHECK-NEXT: [[TMP55:%.*]] = sext i15 [[TMP53]] to i32 5182; CHECK-NEXT: [[TMP56:%.*]] = sext i15 [[TMP54]] to i32 5183; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]] 5184; CHECK-NEXT: [[TMP58:%.*]] = ashr i32 [[TMP57]], 30 5185; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 1 5186; CHECK-NEXT: [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float 5187; CHECK-NEXT: [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float 5188; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]]) 5189; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]] 5190; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]]) 5191; CHECK-NEXT: [[TMP65:%.*]] = fneg fast float [[TMP64]] 5192; CHECK-NEXT: [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]]) 5193; CHECK-NEXT: [[TMP67:%.*]] = fptosi float [[TMP64]] to i32 5194; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 5195; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]]) 5196; CHECK-NEXT: [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]] 5197; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0 5198; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]] 5199; CHECK-NEXT: [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]] 5200; CHECK-NEXT: [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]] 5201; CHECK-NEXT: [[TMP75:%.*]] = shl i32 [[TMP74]], 17 5202; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 [[TMP75]], 17 5203; CHECK-NEXT: [[TMP77:%.*]] = trunc i32 [[TMP76]] to i15 5204; 
CHECK-NEXT: [[TMP78:%.*]] = insertelement <3 x i15> [[TMP52]], i15 [[TMP77]], i64 2 5205; CHECK-NEXT: store <3 x i15> [[TMP78]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 5206; CHECK-NEXT: ret void 5207; 5208; GFX6-LABEL: srem_v3i15: 5209; GFX6: ; %bb.0: 5210; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 5211; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5212; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 5213; GFX6-NEXT: s_mov_b32 s7, 0xf000 5214; GFX6-NEXT: s_mov_b32 s6, -1 5215; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5216; GFX6-NEXT: s_bfe_i32 s9, s2, 0xf0000 5217; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s9 5218; GFX6-NEXT: v_mov_b32_e32 v2, s0 5219; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 5220; GFX6-NEXT: s_bfe_i32 s1, s0, 0xf0000 5221; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s1 5222; GFX6-NEXT: s_xor_b32 s1, s9, s1 5223; GFX6-NEXT: s_ashr_i32 s1, s1, 30 5224; GFX6-NEXT: s_or_b32 s1, s1, 1 5225; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 5226; GFX6-NEXT: v_mov_b32_e32 v7, s1 5227; GFX6-NEXT: s_lshr_b32 s8, s0, 15 5228; GFX6-NEXT: s_bfe_i32 s1, s2, 0xf000f 5229; GFX6-NEXT: v_mul_f32_e32 v6, v5, v6 5230; GFX6-NEXT: v_trunc_f32_e32 v6, v6 5231; GFX6-NEXT: v_mad_f32 v5, -v6, v4, v5 5232; GFX6-NEXT: v_cvt_i32_f32_e32 v6, v6 5233; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v4| 5234; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v7, vcc 5235; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v2 5236; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 5237; GFX6-NEXT: v_mul_lo_u32 v4, v4, s0 5238; GFX6-NEXT: s_bfe_i32 s0, s0, 0xf000f 5239; GFX6-NEXT: v_cvt_f32_i32_e32 v5, s0 5240; GFX6-NEXT: v_cvt_f32_i32_e32 v6, s1 5241; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s2, v4 5242; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 5243; GFX6-NEXT: s_xor_b32 s0, s1, s0 5244; GFX6-NEXT: v_bfe_i32 v2, v2, 0, 15 5245; GFX6-NEXT: s_ashr_i32 s0, s0, 30 5246; GFX6-NEXT: v_mul_f32_e32 v7, v6, v7 5247; GFX6-NEXT: v_trunc_f32_e32 v7, v7 5248; GFX6-NEXT: v_mad_f32 v6, -v7, v5, v6 5249; GFX6-NEXT: v_cvt_i32_f32_e32 v7, v7 5250; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, 
|v6|, |v5| 5251; GFX6-NEXT: v_cvt_f32_i32_e32 v6, v2 5252; GFX6-NEXT: v_mov_b32_e32 v0, s2 5253; GFX6-NEXT: s_or_b32 s0, s0, 1 5254; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 5255; GFX6-NEXT: v_mov_b32_e32 v8, s0 5256; GFX6-NEXT: v_and_b32_e32 v1, 0x7fff, v0 5257; GFX6-NEXT: v_cndmask_b32_e32 v5, 0, v8, vcc 5258; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15 5259; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v7 5260; GFX6-NEXT: v_cvt_f32_i32_e32 v7, v0 5261; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v6 5262; GFX6-NEXT: v_xor_b32_e32 v0, v0, v2 5263; GFX6-NEXT: v_ashrrev_i32_e32 v0, 30, v0 5264; GFX6-NEXT: v_or_b32_e32 v0, 1, v0 5265; GFX6-NEXT: v_mul_f32_e32 v2, v7, v8 5266; GFX6-NEXT: v_trunc_f32_e32 v2, v2 5267; GFX6-NEXT: v_mad_f32 v7, -v2, v6, v7 5268; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 5269; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6| 5270; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 5271; GFX6-NEXT: v_mul_lo_u32 v5, v5, s8 5272; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 5273; GFX6-NEXT: v_mul_lo_u32 v0, v0, v3 5274; GFX6-NEXT: s_lshr_b32 s3, s2, 15 5275; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s3, v5 5276; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, v0, v1 5277; GFX6-NEXT: v_and_b32_e32 v2, 0x7fff, v2 5278; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 5279; GFX6-NEXT: v_and_b32_e32 v3, 0x7fff, v4 5280; GFX6-NEXT: v_lshlrev_b32_e32 v2, 15, v2 5281; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 5282; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 5283; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 5284; GFX6-NEXT: s_waitcnt expcnt(0) 5285; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 5286; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 5287; GFX6-NEXT: s_endpgm 5288; 5289; GFX9-LABEL: srem_v3i15: 5290; GFX9: ; %bb.0: 5291; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 5292; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 5293; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 5294; GFX9-NEXT: v_mov_b32_e32 v2, 0 5295; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5296; GFX9-NEXT: s_bfe_i32 s1, s2, 0xf0000 5297; GFX9-NEXT: 
v_cvt_f32_i32_e32 v5, s1 5298; GFX9-NEXT: s_bfe_i32 s0, s6, 0xf0000 5299; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s0 5300; GFX9-NEXT: s_xor_b32 s0, s1, s0 5301; GFX9-NEXT: v_mov_b32_e32 v0, s2 5302; GFX9-NEXT: v_mov_b32_e32 v1, s6 5303; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 5304; GFX9-NEXT: s_ashr_i32 s0, s0, 30 5305; GFX9-NEXT: s_lshr_b32 s8, s2, 15 5306; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 5307; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 5308; GFX9-NEXT: v_trunc_f32_e32 v6, v6 5309; GFX9-NEXT: v_mad_f32 v5, -v6, v4, v5 5310; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 5311; GFX9-NEXT: v_alignbit_b32 v1, s7, v1, 30 5312; GFX9-NEXT: s_lshr_b32 s3, s6, 15 5313; GFX9-NEXT: s_or_b32 s7, s0, 1 5314; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4| 5315; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 5316; GFX9-NEXT: s_cselect_b32 s0, s7, 0 5317; GFX9-NEXT: v_add_u32_e32 v4, s0, v6 5318; GFX9-NEXT: s_bfe_i32 s0, s6, 0xf000f 5319; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 5320; GFX9-NEXT: s_bfe_i32 s1, s2, 0xf000f 5321; GFX9-NEXT: v_cvt_f32_i32_e32 v6, s1 5322; GFX9-NEXT: s_xor_b32 s0, s1, s0 5323; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 5324; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v1 5325; GFX9-NEXT: s_ashr_i32 s0, s0, 30 5326; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 5327; GFX9-NEXT: v_mul_f32_e32 v7, v6, v7 5328; GFX9-NEXT: v_trunc_f32_e32 v7, v7 5329; GFX9-NEXT: v_mad_f32 v6, -v7, v5, v6 5330; GFX9-NEXT: v_cvt_i32_f32_e32 v7, v7 5331; GFX9-NEXT: v_mul_lo_u32 v4, v4, s6 5332; GFX9-NEXT: s_or_b32 s6, s0, 1 5333; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v6|, |v5| 5334; GFX9-NEXT: v_cvt_f32_i32_e32 v6, v1 5335; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 5336; GFX9-NEXT: s_cselect_b32 s0, s6, 0 5337; GFX9-NEXT: v_add_u32_e32 v5, s0, v7 5338; GFX9-NEXT: v_bfe_i32 v7, v0, 0, 15 5339; GFX9-NEXT: v_cvt_f32_i32_e32 v8, v7 5340; GFX9-NEXT: v_rcp_iflag_f32_e32 v9, v6 5341; GFX9-NEXT: v_xor_b32_e32 v1, v7, v1 5342; GFX9-NEXT: v_ashrrev_i32_e32 v1, 30, v1 5343; GFX9-NEXT: v_or_b32_e32 v1, 1, v1 5344; GFX9-NEXT: v_mul_f32_e32 
v7, v8, v9 5345; GFX9-NEXT: v_trunc_f32_e32 v7, v7 5346; GFX9-NEXT: v_cvt_i32_f32_e32 v9, v7 5347; GFX9-NEXT: v_mad_f32 v7, -v7, v6, v8 5348; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6| 5349; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc 5350; GFX9-NEXT: v_mul_lo_u32 v5, v5, s3 5351; GFX9-NEXT: v_add_u32_e32 v1, v9, v1 5352; GFX9-NEXT: v_mul_lo_u32 v1, v1, v3 5353; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0 5354; GFX9-NEXT: v_sub_u32_e32 v3, s2, v4 5355; GFX9-NEXT: v_sub_u32_e32 v4, s8, v5 5356; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1 5357; GFX9-NEXT: v_and_b32_e32 v4, 0x7fff, v4 5358; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 5359; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3 5360; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 5361; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 5362; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 5363; GFX9-NEXT: global_store_dword v2, v0, s[4:5] 5364; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 5365; GFX9-NEXT: global_store_short v2, v0, s[4:5] offset:4 5366; GFX9-NEXT: s_endpgm 5367 %r = srem <3 x i15> %x, %y 5368 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 5369 ret void 5370} 5371 5372define amdgpu_kernel void @udiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 5373; CHECK-LABEL: @udiv_i32_oddk_denom( 5374; CHECK-NEXT: [[R:%.*]] = udiv i32 [[X:%.*]], 1235195 5375; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 5376; CHECK-NEXT: ret void 5377; 5378; GFX6-LABEL: udiv_i32_oddk_denom: 5379; GFX6: ; %bb.0: 5380; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 5381; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5382; GFX6-NEXT: v_mov_b32_e32 v0, 0xb2a50881 5383; GFX6-NEXT: s_mov_b32 s3, 0xf000 5384; GFX6-NEXT: s_mov_b32 s2, -1 5385; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5386; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 5387; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v0 5388; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 5389; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 5390; GFX6-NEXT: v_lshrrev_b32_e32 v0, 20, v0 5391; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 5392; GFX6-NEXT: 
s_endpgm 5393; 5394; GFX9-LABEL: udiv_i32_oddk_denom: 5395; GFX9: ; %bb.0: 5396; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 5397; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5398; GFX9-NEXT: v_mov_b32_e32 v0, 0 5399; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5400; GFX9-NEXT: s_mul_hi_u32 s0, s4, 0xb2a50881 5401; GFX9-NEXT: s_sub_i32 s1, s4, s0 5402; GFX9-NEXT: s_lshr_b32 s1, s1, 1 5403; GFX9-NEXT: s_add_i32 s1, s1, s0 5404; GFX9-NEXT: s_lshr_b32 s0, s1, 20 5405; GFX9-NEXT: v_mov_b32_e32 v1, s0 5406; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 5407; GFX9-NEXT: s_endpgm 5408 %r = udiv i32 %x, 1235195 5409 store i32 %r, i32 addrspace(1)* %out 5410 ret void 5411} 5412 5413define amdgpu_kernel void @udiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 5414; CHECK-LABEL: @udiv_i32_pow2k_denom( 5415; CHECK-NEXT: [[R:%.*]] = udiv i32 [[X:%.*]], 4096 5416; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 5417; CHECK-NEXT: ret void 5418; 5419; GFX6-LABEL: udiv_i32_pow2k_denom: 5420; GFX6: ; %bb.0: 5421; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 5422; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5423; GFX6-NEXT: s_mov_b32 s3, 0xf000 5424; GFX6-NEXT: s_mov_b32 s2, -1 5425; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5426; GFX6-NEXT: s_lshr_b32 s4, s4, 12 5427; GFX6-NEXT: v_mov_b32_e32 v0, s4 5428; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 5429; GFX6-NEXT: s_endpgm 5430; 5431; GFX9-LABEL: udiv_i32_pow2k_denom: 5432; GFX9: ; %bb.0: 5433; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 5434; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5435; GFX9-NEXT: v_mov_b32_e32 v0, 0 5436; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5437; GFX9-NEXT: s_lshr_b32 s0, s4, 12 5438; GFX9-NEXT: v_mov_b32_e32 v1, s0 5439; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 5440; GFX9-NEXT: s_endpgm 5441 %r = udiv i32 %x, 4096 5442 store i32 %r, i32 addrspace(1)* %out 5443 ret void 5444} 5445 5446define amdgpu_kernel void @udiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 5447; CHECK-LABEL: 
@udiv_i32_pow2_shl_denom( 5448; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 5449; CHECK-NEXT: [[R:%.*]] = udiv i32 [[X:%.*]], [[SHL_Y]] 5450; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 5451; CHECK-NEXT: ret void 5452; 5453; GFX6-LABEL: udiv_i32_pow2_shl_denom: 5454; GFX6: ; %bb.0: 5455; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 5456; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5457; GFX6-NEXT: s_mov_b32 s3, 0xf000 5458; GFX6-NEXT: s_mov_b32 s2, -1 5459; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5460; GFX6-NEXT: s_add_i32 s5, s5, 12 5461; GFX6-NEXT: s_lshr_b32 s4, s4, s5 5462; GFX6-NEXT: v_mov_b32_e32 v0, s4 5463; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 5464; GFX6-NEXT: s_endpgm 5465; 5466; GFX9-LABEL: udiv_i32_pow2_shl_denom: 5467; GFX9: ; %bb.0: 5468; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 5469; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 5470; GFX9-NEXT: v_mov_b32_e32 v0, 0 5471; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5472; GFX9-NEXT: s_add_i32 s0, s3, 12 5473; GFX9-NEXT: s_lshr_b32 s0, s2, s0 5474; GFX9-NEXT: v_mov_b32_e32 v1, s0 5475; GFX9-NEXT: global_store_dword v0, v1, s[4:5] 5476; GFX9-NEXT: s_endpgm 5477 %shl.y = shl i32 4096, %y 5478 %r = udiv i32 %x, %shl.y 5479 store i32 %r, i32 addrspace(1)* %out 5480 ret void 5481} 5482 5483define amdgpu_kernel void @udiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 5484; CHECK-LABEL: @udiv_v2i32_pow2k_denom( 5485; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 5486; CHECK-NEXT: [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096 5487; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 5488; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 5489; CHECK-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP4]], 4096 5490; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 5491; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 5492; CHECK-NEXT: ret void 5493; 5494; 
GFX6-LABEL: udiv_v2i32_pow2k_denom: 5495; GFX6: ; %bb.0: 5496; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 5497; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5498; GFX6-NEXT: s_mov_b32 s3, 0xf000 5499; GFX6-NEXT: s_mov_b32 s2, -1 5500; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5501; GFX6-NEXT: s_lshr_b32 s4, s4, 12 5502; GFX6-NEXT: s_lshr_b32 s5, s5, 12 5503; GFX6-NEXT: v_mov_b32_e32 v0, s4 5504; GFX6-NEXT: v_mov_b32_e32 v1, s5 5505; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5506; GFX6-NEXT: s_endpgm 5507; 5508; GFX9-LABEL: udiv_v2i32_pow2k_denom: 5509; GFX9: ; %bb.0: 5510; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 5511; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 5512; GFX9-NEXT: v_mov_b32_e32 v2, 0 5513; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5514; GFX9-NEXT: s_lshr_b32 s0, s2, 12 5515; GFX9-NEXT: s_lshr_b32 s1, s3, 12 5516; GFX9-NEXT: v_mov_b32_e32 v0, s0 5517; GFX9-NEXT: v_mov_b32_e32 v1, s1 5518; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 5519; GFX9-NEXT: s_endpgm 5520 %r = udiv <2 x i32> %x, <i32 4096, i32 4096> 5521 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 5522 ret void 5523} 5524 5525define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 5526; CHECK-LABEL: @udiv_v2i32_mixed_pow2k_denom( 5527; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 5528; CHECK-NEXT: [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096 5529; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 5530; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 5531; CHECK-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP4]], 4095 5532; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 5533; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 5534; CHECK-NEXT: ret void 5535; 5536; GFX6-LABEL: udiv_v2i32_mixed_pow2k_denom: 5537; GFX6: ; %bb.0: 5538; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 5539; GFX6-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x9 5540; GFX6-NEXT: v_mov_b32_e32 v0, 0x100101 5541; GFX6-NEXT: s_mov_b32 s3, 0xf000 5542; GFX6-NEXT: s_mov_b32 s2, -1 5543; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5544; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 5545; GFX6-NEXT: s_lshr_b32 s4, s4, 12 5546; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v0 5547; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 5548; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 5549; GFX6-NEXT: v_lshrrev_b32_e32 v1, 11, v0 5550; GFX6-NEXT: v_mov_b32_e32 v0, s4 5551; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5552; GFX6-NEXT: s_endpgm 5553; 5554; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom: 5555; GFX9: ; %bb.0: 5556; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 5557; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 5558; GFX9-NEXT: v_mov_b32_e32 v2, 0 5559; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5560; GFX9-NEXT: s_mul_hi_u32 s1, s3, 0x100101 5561; GFX9-NEXT: s_lshr_b32 s0, s2, 12 5562; GFX9-NEXT: s_sub_i32 s2, s3, s1 5563; GFX9-NEXT: s_lshr_b32 s2, s2, 1 5564; GFX9-NEXT: s_add_i32 s2, s2, s1 5565; GFX9-NEXT: s_lshr_b32 s1, s2, 11 5566; GFX9-NEXT: v_mov_b32_e32 v0, s0 5567; GFX9-NEXT: v_mov_b32_e32 v1, s1 5568; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 5569; GFX9-NEXT: s_endpgm 5570 %r = udiv <2 x i32> %x, <i32 4096, i32 4095> 5571 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 5572 ret void 5573} 5574 5575define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 5576; CHECK-LABEL: @udiv_v2i32_pow2_shl_denom( 5577; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 5578; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 5579; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 5580; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 5581; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 5582; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 5583; CHECK-NEXT: [[TMP6:%.*]] = fptoui 
float [[TMP5]] to i32 5584; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 5585; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 5586; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 5587; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 5588; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 5589; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 5590; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 5591; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 5592; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 5593; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 5594; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 5595; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 5596; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 5597; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 5598; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 5599; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 5600; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 5601; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 5602; CHECK-NEXT: [[TMP25:%.*]] = add i32 [[TMP21]], 1 5603; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]] 5604; CHECK-NEXT: [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]] 5605; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]] 5606; CHECK-NEXT: [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]] 5607; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP26]], 1 5608; CHECK-NEXT: [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]] 5609; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x i32> undef, i32 [[TMP31]], i64 0 5610; CHECK-NEXT: [[TMP33:%.*]] = extractelement <2 x i32> [[X]], i64 1 5611; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 5612; CHECK-NEXT: [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float 5613; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 5614; 
CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000 5615; CHECK-NEXT: [[TMP38:%.*]] = fptoui float [[TMP37]] to i32 5616; CHECK-NEXT: [[TMP39:%.*]] = sub i32 0, [[TMP34]] 5617; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]] 5618; CHECK-NEXT: [[TMP41:%.*]] = zext i32 [[TMP38]] to i64 5619; CHECK-NEXT: [[TMP42:%.*]] = zext i32 [[TMP40]] to i64 5620; CHECK-NEXT: [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]] 5621; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 5622; CHECK-NEXT: [[TMP45:%.*]] = lshr i64 [[TMP43]], 32 5623; CHECK-NEXT: [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32 5624; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]] 5625; CHECK-NEXT: [[TMP48:%.*]] = zext i32 [[TMP33]] to i64 5626; CHECK-NEXT: [[TMP49:%.*]] = zext i32 [[TMP47]] to i64 5627; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]] 5628; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 5629; CHECK-NEXT: [[TMP52:%.*]] = lshr i64 [[TMP50]], 32 5630; CHECK-NEXT: [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32 5631; CHECK-NEXT: [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]] 5632; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]] 5633; CHECK-NEXT: [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]] 5634; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP53]], 1 5635; CHECK-NEXT: [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]] 5636; CHECK-NEXT: [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]] 5637; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]] 5638; CHECK-NEXT: [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]] 5639; CHECK-NEXT: [[TMP62:%.*]] = add i32 [[TMP58]], 1 5640; CHECK-NEXT: [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]] 5641; CHECK-NEXT: [[TMP64:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP63]], i64 1 5642; CHECK-NEXT: store <2 x i32> [[TMP64]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 5643; CHECK-NEXT: ret void 5644; 5645; GFX6-LABEL: udiv_v2i32_pow2_shl_denom: 5646; GFX6: 
; %bb.0: 5647; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb 5648; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 5649; GFX6-NEXT: s_mov_b32 s11, 0xf000 5650; GFX6-NEXT: s_mov_b32 s10, -1 5651; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5652; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s6 5653; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 5654; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s7 5655; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 5656; GFX6-NEXT: s_sub_i32 s0, 0, s2 5657; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 5658; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 5659; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 5660; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 5661; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 5662; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 5663; GFX6-NEXT: v_mul_lo_u32 v2, s0, v0 5664; GFX6-NEXT: s_sub_i32 s0, 0, s3 5665; GFX6-NEXT: v_mul_lo_u32 v3, s0, v1 5666; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 5667; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 5668; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 5669; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 5670; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 5671; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 5672; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 5673; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 5674; GFX6-NEXT: v_mul_lo_u32 v4, v1, s3 5675; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s4, v2 5676; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 5677; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] 5678; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v2 5679; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 5680; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 5681; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 5682; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 5683; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v4 5684; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 5685; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v2 5686; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 5687; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s3, v2 5688; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 5689; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 5690; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 
5691; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 5692; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 5693; GFX6-NEXT: s_endpgm 5694; 5695; GFX9-LABEL: udiv_v2i32_pow2_shl_denom: 5696; GFX9: ; %bb.0: 5697; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 5698; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5699; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s6 5700; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s7 5701; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 5702; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 5703; GFX9-NEXT: s_sub_i32 s2, 0, s6 5704; GFX9-NEXT: s_sub_i32 s3, 0, s7 5705; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 5706; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 5707; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 5708; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 5709; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 5710; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 5711; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 5712; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1 5713; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5714; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 5715; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 5716; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 5717; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 5718; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 5719; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 5720; GFX9-NEXT: v_mov_b32_e32 v2, 0 5721; GFX9-NEXT: v_mul_lo_u32 v3, v0, s6 5722; GFX9-NEXT: v_mul_lo_u32 v4, v1, s7 5723; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 5724; GFX9-NEXT: v_add_u32_e32 v6, 1, v1 5725; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 5726; GFX9-NEXT: v_sub_u32_e32 v4, s5, v4 5727; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 5728; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc 5729; GFX9-NEXT: v_subrev_u32_e32 v5, s6, v3 5730; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v4 5731; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[0:1] 5732; GFX9-NEXT: v_subrev_u32_e32 v6, s7, v4 5733; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 5734; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 5735; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[0:1] 5736; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 5737; GFX9-NEXT: 
v_cndmask_b32_e32 v0, v0, v5, vcc 5738; GFX9-NEXT: v_add_u32_e32 v3, 1, v1 5739; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v4 5740; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 5741; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5742; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 5743; GFX9-NEXT: s_endpgm 5744 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 5745 %r = udiv <2 x i32> %x, %shl.y 5746 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 5747 ret void 5748} 5749 5750define amdgpu_kernel void @urem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 5751; CHECK-LABEL: @urem_i32_oddk_denom( 5752; CHECK-NEXT: [[R:%.*]] = urem i32 [[X:%.*]], 1235195 5753; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 5754; CHECK-NEXT: ret void 5755; 5756; GFX6-LABEL: urem_i32_oddk_denom: 5757; GFX6: ; %bb.0: 5758; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 5759; GFX6-NEXT: v_mov_b32_e32 v0, 0xb2a50881 5760; GFX6-NEXT: s_mov_b32 s2, 0x12d8fb 5761; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5762; GFX6-NEXT: s_mov_b32 s3, 0xf000 5763; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5764; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 5765; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v0 5766; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 5767; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 5768; GFX6-NEXT: v_lshrrev_b32_e32 v0, 20, v0 5769; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 5770; GFX6-NEXT: s_mov_b32 s2, -1 5771; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 5772; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 5773; GFX6-NEXT: s_endpgm 5774; 5775; GFX9-LABEL: urem_i32_oddk_denom: 5776; GFX9: ; %bb.0: 5777; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 5778; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5779; GFX9-NEXT: v_mov_b32_e32 v0, 0 5780; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5781; GFX9-NEXT: s_mul_hi_u32 s0, s4, 0xb2a50881 5782; GFX9-NEXT: s_sub_i32 s1, s4, s0 5783; GFX9-NEXT: s_lshr_b32 s1, s1, 1 5784; GFX9-NEXT: s_add_i32 s1, s1, s0 5785; GFX9-NEXT: s_lshr_b32 s0, s1, 20 5786; GFX9-NEXT: s_mul_i32 s0, s0, 0x12d8fb 5787; 
GFX9-NEXT: s_sub_i32 s0, s4, s0 5788; GFX9-NEXT: v_mov_b32_e32 v1, s0 5789; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 5790; GFX9-NEXT: s_endpgm 5791 %r = urem i32 %x, 1235195 5792 store i32 %r, i32 addrspace(1)* %out 5793 ret void 5794} 5795 5796define amdgpu_kernel void @urem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 5797; CHECK-LABEL: @urem_i32_pow2k_denom( 5798; CHECK-NEXT: [[R:%.*]] = urem i32 [[X:%.*]], 4096 5799; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 5800; CHECK-NEXT: ret void 5801; 5802; GFX6-LABEL: urem_i32_pow2k_denom: 5803; GFX6: ; %bb.0: 5804; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 5805; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5806; GFX6-NEXT: s_mov_b32 s3, 0xf000 5807; GFX6-NEXT: s_mov_b32 s2, -1 5808; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5809; GFX6-NEXT: s_and_b32 s4, s4, 0xfff 5810; GFX6-NEXT: v_mov_b32_e32 v0, s4 5811; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 5812; GFX6-NEXT: s_endpgm 5813; 5814; GFX9-LABEL: urem_i32_pow2k_denom: 5815; GFX9: ; %bb.0: 5816; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 5817; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5818; GFX9-NEXT: v_mov_b32_e32 v0, 0 5819; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5820; GFX9-NEXT: s_and_b32 s0, s4, 0xfff 5821; GFX9-NEXT: v_mov_b32_e32 v1, s0 5822; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 5823; GFX9-NEXT: s_endpgm 5824 %r = urem i32 %x, 4096 5825 store i32 %r, i32 addrspace(1)* %out 5826 ret void 5827} 5828 5829define amdgpu_kernel void @urem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 5830; CHECK-LABEL: @urem_i32_pow2_shl_denom( 5831; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 5832; CHECK-NEXT: [[R:%.*]] = urem i32 [[X:%.*]], [[SHL_Y]] 5833; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 5834; CHECK-NEXT: ret void 5835; 5836; GFX6-LABEL: urem_i32_pow2_shl_denom: 5837; GFX6: ; %bb.0: 5838; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 5839; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5840; 
GFX6-NEXT: s_mov_b32 s3, 0xf000 5841; GFX6-NEXT: s_mov_b32 s2, -1 5842; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5843; GFX6-NEXT: s_lshl_b32 s5, 0x1000, s5 5844; GFX6-NEXT: s_add_i32 s5, s5, -1 5845; GFX6-NEXT: s_and_b32 s4, s4, s5 5846; GFX6-NEXT: v_mov_b32_e32 v0, s4 5847; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 5848; GFX6-NEXT: s_endpgm 5849; 5850; GFX9-LABEL: urem_i32_pow2_shl_denom: 5851; GFX9: ; %bb.0: 5852; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 5853; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 5854; GFX9-NEXT: v_mov_b32_e32 v0, 0 5855; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5856; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s3 5857; GFX9-NEXT: s_add_i32 s0, s0, -1 5858; GFX9-NEXT: s_and_b32 s0, s2, s0 5859; GFX9-NEXT: v_mov_b32_e32 v1, s0 5860; GFX9-NEXT: global_store_dword v0, v1, s[4:5] 5861; GFX9-NEXT: s_endpgm 5862 %shl.y = shl i32 4096, %y 5863 %r = urem i32 %x, %shl.y 5864 store i32 %r, i32 addrspace(1)* %out 5865 ret void 5866} 5867 5868define amdgpu_kernel void @urem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 5869; CHECK-LABEL: @urem_v2i32_pow2k_denom( 5870; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 5871; CHECK-NEXT: [[TMP2:%.*]] = urem i32 [[TMP1]], 4096 5872; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 5873; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 5874; CHECK-NEXT: [[TMP5:%.*]] = urem i32 [[TMP4]], 4096 5875; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 5876; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 5877; CHECK-NEXT: ret void 5878; 5879; GFX6-LABEL: urem_v2i32_pow2k_denom: 5880; GFX6: ; %bb.0: 5881; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 5882; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5883; GFX6-NEXT: s_mov_b32 s3, 0xf000 5884; GFX6-NEXT: s_mov_b32 s2, -1 5885; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5886; GFX6-NEXT: s_and_b32 s4, s4, 0xfff 5887; GFX6-NEXT: s_and_b32 
s5, s5, 0xfff 5888; GFX6-NEXT: v_mov_b32_e32 v0, s4 5889; GFX6-NEXT: v_mov_b32_e32 v1, s5 5890; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 5891; GFX6-NEXT: s_endpgm 5892; 5893; GFX9-LABEL: urem_v2i32_pow2k_denom: 5894; GFX9: ; %bb.0: 5895; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 5896; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 5897; GFX9-NEXT: v_mov_b32_e32 v2, 0 5898; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5899; GFX9-NEXT: s_and_b32 s0, s2, 0xfff 5900; GFX9-NEXT: s_and_b32 s1, s3, 0xfff 5901; GFX9-NEXT: v_mov_b32_e32 v0, s0 5902; GFX9-NEXT: v_mov_b32_e32 v1, s1 5903; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 5904; GFX9-NEXT: s_endpgm 5905 %r = urem <2 x i32> %x, <i32 4096, i32 4096> 5906 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 5907 ret void 5908} 5909 5910define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 5911; CHECK-LABEL: @urem_v2i32_pow2_shl_denom( 5912; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 5913; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 5914; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 5915; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 5916; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 5917; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 5918; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 5919; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 5920; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 5921; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 5922; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 5923; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 5924; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 5925; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 5926; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 5927; CHECK-NEXT: [[TMP15:%.*]] = add i32 
[[TMP6]], [[TMP14]] 5928; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 5929; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 5930; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 5931; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 5932; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 5933; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 5934; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 5935; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 5936; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 5937; CHECK-NEXT: [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]] 5938; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]] 5939; CHECK-NEXT: [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]] 5940; CHECK-NEXT: [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]] 5941; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]] 5942; CHECK-NEXT: [[TMP30:%.*]] = insertelement <2 x i32> undef, i32 [[TMP29]], i64 0 5943; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x i32> [[X]], i64 1 5944; CHECK-NEXT: [[TMP32:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 5945; CHECK-NEXT: [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float 5946; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 5947; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000 5948; CHECK-NEXT: [[TMP36:%.*]] = fptoui float [[TMP35]] to i32 5949; CHECK-NEXT: [[TMP37:%.*]] = sub i32 0, [[TMP32]] 5950; CHECK-NEXT: [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]] 5951; CHECK-NEXT: [[TMP39:%.*]] = zext i32 [[TMP36]] to i64 5952; CHECK-NEXT: [[TMP40:%.*]] = zext i32 [[TMP38]] to i64 5953; CHECK-NEXT: [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]] 5954; CHECK-NEXT: [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32 5955; CHECK-NEXT: [[TMP43:%.*]] = lshr i64 [[TMP41]], 32 5956; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 5957; CHECK-NEXT: [[TMP45:%.*]] = add i32 [[TMP36]], 
[[TMP44]] 5958; CHECK-NEXT: [[TMP46:%.*]] = zext i32 [[TMP31]] to i64 5959; CHECK-NEXT: [[TMP47:%.*]] = zext i32 [[TMP45]] to i64 5960; CHECK-NEXT: [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]] 5961; CHECK-NEXT: [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32 5962; CHECK-NEXT: [[TMP50:%.*]] = lshr i64 [[TMP48]], 32 5963; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 5964; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]] 5965; CHECK-NEXT: [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]] 5966; CHECK-NEXT: [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]] 5967; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]] 5968; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]] 5969; CHECK-NEXT: [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]] 5970; CHECK-NEXT: [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]] 5971; CHECK-NEXT: [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]] 5972; CHECK-NEXT: [[TMP60:%.*]] = insertelement <2 x i32> [[TMP30]], i32 [[TMP59]], i64 1 5973; CHECK-NEXT: store <2 x i32> [[TMP60]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 5974; CHECK-NEXT: ret void 5975; 5976; GFX6-LABEL: urem_v2i32_pow2_shl_denom: 5977; GFX6: ; %bb.0: 5978; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb 5979; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5980; GFX6-NEXT: s_mov_b32 s3, 0xf000 5981; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5982; GFX6-NEXT: s_lshl_b32 s6, 0x1000, s6 5983; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 5984; GFX6-NEXT: s_lshl_b32 s7, 0x1000, s7 5985; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7 5986; GFX6-NEXT: s_sub_i32 s2, 0, s6 5987; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 5988; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 5989; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 5990; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 5991; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 5992; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 5993; GFX6-NEXT: v_mul_lo_u32 v2, s2, v0 5994; GFX6-NEXT: s_sub_i32 s2, 0, s7 5995; GFX6-NEXT: v_mul_lo_u32 v3, s2, v1 5996; 
GFX6-NEXT: s_mov_b32 s2, -1 5997; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 5998; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 5999; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 6000; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 6001; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 6002; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 6003; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 6004; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 6005; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 6006; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s6, v0 6007; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 6008; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 6009; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6010; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s6, v0 6011; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 6012; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6013; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s7, v1 6014; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 6015; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 6016; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s7, v1 6017; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 6018; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 6019; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6020; GFX6-NEXT: s_endpgm 6021; 6022; GFX9-LABEL: urem_v2i32_pow2_shl_denom: 6023; GFX9: ; %bb.0: 6024; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 6025; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6026; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s6 6027; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s7 6028; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 6029; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 6030; GFX9-NEXT: s_sub_i32 s6, 0, s3 6031; GFX9-NEXT: s_sub_i32 s7, 0, s2 6032; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 6033; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 6034; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6035; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 6036; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 6037; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 6038; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 6039; GFX9-NEXT: v_mul_lo_u32 v2, s6, v0 6040; GFX9-NEXT: v_mul_lo_u32 v3, s7, v1 6041; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 6042; 
GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 6043; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 6044; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 6045; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 6046; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 6047; GFX9-NEXT: v_mov_b32_e32 v2, 0 6048; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 6049; GFX9-NEXT: v_mul_lo_u32 v1, v1, s2 6050; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 6051; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 6052; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v0 6053; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 6054; GFX9-NEXT: v_subrev_u32_e32 v4, s2, v1 6055; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 6056; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 6057; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 6058; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v0 6059; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 6060; GFX9-NEXT: v_subrev_u32_e32 v4, s2, v1 6061; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 6062; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 6063; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 6064; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6065; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 6066; GFX9-NEXT: s_endpgm 6067 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 6068 %r = urem <2 x i32> %x, %shl.y 6069 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 6070 ret void 6071} 6072 6073define amdgpu_kernel void @sdiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 6074; CHECK-LABEL: @sdiv_i32_oddk_denom( 6075; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], 1235195 6076; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 6077; CHECK-NEXT: ret void 6078; 6079; GFX6-LABEL: sdiv_i32_oddk_denom: 6080; GFX6: ; %bb.0: 6081; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 6082; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 6083; GFX6-NEXT: v_mov_b32_e32 v0, 0xd9528441 6084; GFX6-NEXT: s_mov_b32 s3, 0xf000 6085; GFX6-NEXT: s_mov_b32 s2, -1 6086; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6087; GFX6-NEXT: v_mul_hi_i32 v0, s4, v0 6088; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0 6089; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0 
6090; GFX6-NEXT: v_ashrrev_i32_e32 v0, 20, v0 6091; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 6092; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 6093; GFX6-NEXT: s_endpgm 6094; 6095; GFX9-LABEL: sdiv_i32_oddk_denom: 6096; GFX9: ; %bb.0: 6097; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 6098; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6099; GFX9-NEXT: v_mov_b32_e32 v0, 0 6100; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6101; GFX9-NEXT: s_mul_hi_i32 s0, s4, 0xd9528441 6102; GFX9-NEXT: s_add_i32 s0, s0, s4 6103; GFX9-NEXT: s_lshr_b32 s1, s0, 31 6104; GFX9-NEXT: s_ashr_i32 s0, s0, 20 6105; GFX9-NEXT: s_add_i32 s0, s0, s1 6106; GFX9-NEXT: v_mov_b32_e32 v1, s0 6107; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 6108; GFX9-NEXT: s_endpgm 6109 %r = sdiv i32 %x, 1235195 6110 store i32 %r, i32 addrspace(1)* %out 6111 ret void 6112} 6113 6114define amdgpu_kernel void @sdiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 6115; CHECK-LABEL: @sdiv_i32_pow2k_denom( 6116; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], 4096 6117; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 6118; CHECK-NEXT: ret void 6119; 6120; GFX6-LABEL: sdiv_i32_pow2k_denom: 6121; GFX6: ; %bb.0: 6122; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 6123; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 6124; GFX6-NEXT: s_mov_b32 s3, 0xf000 6125; GFX6-NEXT: s_mov_b32 s2, -1 6126; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6127; GFX6-NEXT: s_ashr_i32 s5, s4, 31 6128; GFX6-NEXT: s_lshr_b32 s5, s5, 20 6129; GFX6-NEXT: s_add_i32 s4, s4, s5 6130; GFX6-NEXT: s_ashr_i32 s4, s4, 12 6131; GFX6-NEXT: v_mov_b32_e32 v0, s4 6132; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 6133; GFX6-NEXT: s_endpgm 6134; 6135; GFX9-LABEL: sdiv_i32_pow2k_denom: 6136; GFX9: ; %bb.0: 6137; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 6138; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6139; GFX9-NEXT: v_mov_b32_e32 v0, 0 6140; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6141; GFX9-NEXT: s_ashr_i32 s0, s4, 31 6142; GFX9-NEXT: s_lshr_b32 s0, s0, 20 6143; 
GFX9-NEXT: s_add_i32 s4, s4, s0 6144; GFX9-NEXT: s_ashr_i32 s0, s4, 12 6145; GFX9-NEXT: v_mov_b32_e32 v1, s0 6146; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 6147; GFX9-NEXT: s_endpgm 6148 %r = sdiv i32 %x, 4096 6149 store i32 %r, i32 addrspace(1)* %out 6150 ret void 6151} 6152 6153define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 6154; CHECK-LABEL: @sdiv_i32_pow2_shl_denom( 6155; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 6156; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], [[SHL_Y]] 6157; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 6158; CHECK-NEXT: ret void 6159; 6160; GFX6-LABEL: sdiv_i32_pow2_shl_denom: 6161; GFX6: ; %bb.0: 6162; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 6163; GFX6-NEXT: s_mov_b32 s7, 0xf000 6164; GFX6-NEXT: s_mov_b32 s6, -1 6165; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6166; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 6167; GFX6-NEXT: s_ashr_i32 s8, s3, 31 6168; GFX6-NEXT: s_add_i32 s3, s3, s8 6169; GFX6-NEXT: s_xor_b32 s3, s3, s8 6170; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 6171; GFX6-NEXT: s_sub_i32 s4, 0, s3 6172; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 6173; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 6174; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 6175; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 6176; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 6177; GFX6-NEXT: s_ashr_i32 s0, s2, 31 6178; GFX6-NEXT: s_add_i32 s1, s2, s0 6179; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 6180; GFX6-NEXT: s_xor_b32 s1, s1, s0 6181; GFX6-NEXT: s_xor_b32 s2, s0, s8 6182; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 6183; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 6184; GFX6-NEXT: v_mul_lo_u32 v1, v0, s3 6185; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 6186; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 6187; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1 6188; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 6189; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 6190; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 6191; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 
v2, s[0:1] 6192; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 6193; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 6194; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 6195; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 6196; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6197; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 6198; GFX6-NEXT: s_endpgm 6199; 6200; GFX9-LABEL: sdiv_i32_pow2_shl_denom: 6201; GFX9: ; %bb.0: 6202; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 6203; GFX9-NEXT: v_mov_b32_e32 v2, 0 6204; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6205; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6206; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 6207; GFX9-NEXT: s_ashr_i32 s4, s3, 31 6208; GFX9-NEXT: s_add_i32 s3, s3, s4 6209; GFX9-NEXT: s_xor_b32 s3, s3, s4 6210; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 6211; GFX9-NEXT: s_sub_i32 s5, 0, s3 6212; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 6213; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 6214; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 6215; GFX9-NEXT: v_mul_lo_u32 v1, s5, v0 6216; GFX9-NEXT: s_ashr_i32 s5, s2, 31 6217; GFX9-NEXT: s_add_i32 s2, s2, s5 6218; GFX9-NEXT: s_xor_b32 s2, s2, s5 6219; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 6220; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 6221; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 6222; GFX9-NEXT: v_mul_lo_u32 v1, v0, s3 6223; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 6224; GFX9-NEXT: v_sub_u32_e32 v1, s2, v1 6225; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 6226; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 6227; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v1 6228; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 6229; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 6230; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 6231; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 6232; GFX9-NEXT: s_xor_b32 s2, s5, s4 6233; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 6234; GFX9-NEXT: v_subrev_u32_e32 v0, s2, v0 6235; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 6236; GFX9-NEXT: s_endpgm 6237 %shl.y = shl i32 4096, %y 6238 %r = sdiv i32 %x, %shl.y 6239 store i32 %r, i32 addrspace(1)* %out 6240 ret void 
6241} 6242 6243define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 6244; CHECK-LABEL: @sdiv_v2i32_pow2k_denom( 6245; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 6246; CHECK-NEXT: [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096 6247; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 6248; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 6249; CHECK-NEXT: [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4096 6250; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 6251; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 6252; CHECK-NEXT: ret void 6253; 6254; GFX6-LABEL: sdiv_v2i32_pow2k_denom: 6255; GFX6: ; %bb.0: 6256; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 6257; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 6258; GFX6-NEXT: s_mov_b32 s3, 0xf000 6259; GFX6-NEXT: s_mov_b32 s2, -1 6260; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6261; GFX6-NEXT: s_ashr_i32 s6, s4, 31 6262; GFX6-NEXT: s_ashr_i32 s7, s5, 31 6263; GFX6-NEXT: s_lshr_b32 s6, s6, 20 6264; GFX6-NEXT: s_add_i32 s4, s4, s6 6265; GFX6-NEXT: s_lshr_b32 s6, s7, 20 6266; GFX6-NEXT: s_add_i32 s5, s5, s6 6267; GFX6-NEXT: s_ashr_i32 s4, s4, 12 6268; GFX6-NEXT: s_ashr_i32 s5, s5, 12 6269; GFX6-NEXT: v_mov_b32_e32 v0, s4 6270; GFX6-NEXT: v_mov_b32_e32 v1, s5 6271; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6272; GFX6-NEXT: s_endpgm 6273; 6274; GFX9-LABEL: sdiv_v2i32_pow2k_denom: 6275; GFX9: ; %bb.0: 6276; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 6277; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 6278; GFX9-NEXT: v_mov_b32_e32 v2, 0 6279; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6280; GFX9-NEXT: s_ashr_i32 s0, s2, 31 6281; GFX9-NEXT: s_ashr_i32 s1, s3, 31 6282; GFX9-NEXT: s_lshr_b32 s0, s0, 20 6283; GFX9-NEXT: s_lshr_b32 s1, s1, 20 6284; GFX9-NEXT: s_add_i32 s0, s2, s0 6285; GFX9-NEXT: s_add_i32 s1, s3, s1 6286; GFX9-NEXT: s_ashr_i32 s0, s0, 12 6287; GFX9-NEXT: 
s_ashr_i32 s1, s1, 12 6288; GFX9-NEXT: v_mov_b32_e32 v0, s0 6289; GFX9-NEXT: v_mov_b32_e32 v1, s1 6290; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 6291; GFX9-NEXT: s_endpgm 6292 %r = sdiv <2 x i32> %x, <i32 4096, i32 4096> 6293 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 6294 ret void 6295} 6296 6297define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 6298; CHECK-LABEL: @ssdiv_v2i32_mixed_pow2k_denom( 6299; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 6300; CHECK-NEXT: [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096 6301; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 6302; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 6303; CHECK-NEXT: [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4095 6304; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 6305; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 6306; CHECK-NEXT: ret void 6307; 6308; GFX6-LABEL: ssdiv_v2i32_mixed_pow2k_denom: 6309; GFX6: ; %bb.0: 6310; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 6311; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 6312; GFX6-NEXT: v_mov_b32_e32 v0, 0x80080081 6313; GFX6-NEXT: s_mov_b32 s3, 0xf000 6314; GFX6-NEXT: s_mov_b32 s2, -1 6315; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6316; GFX6-NEXT: v_mul_hi_i32 v0, s5, v0 6317; GFX6-NEXT: s_ashr_i32 s6, s4, 31 6318; GFX6-NEXT: s_lshr_b32 s6, s6, 20 6319; GFX6-NEXT: s_add_i32 s4, s4, s6 6320; GFX6-NEXT: v_add_i32_e32 v0, vcc, s5, v0 6321; GFX6-NEXT: s_ashr_i32 s4, s4, 12 6322; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0 6323; GFX6-NEXT: v_ashrrev_i32_e32 v0, 11, v0 6324; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v0 6325; GFX6-NEXT: v_mov_b32_e32 v0, s4 6326; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6327; GFX6-NEXT: s_endpgm 6328; 6329; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom: 6330; GFX9: ; %bb.0: 6331; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 6332; 
GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 6333; GFX9-NEXT: v_mov_b32_e32 v2, 0 6334; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6335; GFX9-NEXT: s_ashr_i32 s0, s2, 31 6336; GFX9-NEXT: s_mul_hi_i32 s1, s3, 0x80080081 6337; GFX9-NEXT: s_lshr_b32 s0, s0, 20 6338; GFX9-NEXT: s_add_i32 s1, s1, s3 6339; GFX9-NEXT: s_add_i32 s0, s2, s0 6340; GFX9-NEXT: s_lshr_b32 s2, s1, 31 6341; GFX9-NEXT: s_ashr_i32 s1, s1, 11 6342; GFX9-NEXT: s_ashr_i32 s0, s0, 12 6343; GFX9-NEXT: s_add_i32 s1, s1, s2 6344; GFX9-NEXT: v_mov_b32_e32 v0, s0 6345; GFX9-NEXT: v_mov_b32_e32 v1, s1 6346; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 6347; GFX9-NEXT: s_endpgm 6348 %r = sdiv <2 x i32> %x, <i32 4096, i32 4095> 6349 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 6350 ret void 6351} 6352 6353define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 6354; CHECK-LABEL: @sdiv_v2i32_pow2_shl_denom( 6355; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 6356; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 6357; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 6358; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 6359; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 6360; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 6361; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]] 6362; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]] 6363; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]] 6364; CHECK-NEXT: [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]] 6365; CHECK-NEXT: [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float 6366; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]]) 6367; CHECK-NEXT: [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000 6368; CHECK-NEXT: [[TMP13:%.*]] = fptoui float [[TMP12]] to i32 6369; CHECK-NEXT: [[TMP14:%.*]] = sub i32 0, [[TMP9]] 6370; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]] 6371; 
CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP13]] to i64 6372; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 6373; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 6374; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 6375; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 6376; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 6377; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]] 6378; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP8]] to i64 6379; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[TMP22]] to i64 6380; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]] 6381; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32 6382; CHECK-NEXT: [[TMP27:%.*]] = lshr i64 [[TMP25]], 32 6383; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 6384; CHECK-NEXT: [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]] 6385; CHECK-NEXT: [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]] 6386; CHECK-NEXT: [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]] 6387; CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP28]], 1 6388; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]] 6389; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]] 6390; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]] 6391; CHECK-NEXT: [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]] 6392; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP33]], 1 6393; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]] 6394; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]] 6395; CHECK-NEXT: [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]] 6396; CHECK-NEXT: [[TMP41:%.*]] = insertelement <2 x i32> undef, i32 [[TMP40]], i64 0 6397; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x i32> [[X]], i64 1 6398; CHECK-NEXT: [[TMP43:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 6399; CHECK-NEXT: [[TMP44:%.*]] = ashr i32 [[TMP42]], 31 6400; CHECK-NEXT: [[TMP45:%.*]] = ashr i32 [[TMP43]], 31 6401; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]] 
6402; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]] 6403; CHECK-NEXT: [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]] 6404; CHECK-NEXT: [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]] 6405; CHECK-NEXT: [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]] 6406; CHECK-NEXT: [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float 6407; CHECK-NEXT: [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]]) 6408; CHECK-NEXT: [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000 6409; CHECK-NEXT: [[TMP54:%.*]] = fptoui float [[TMP53]] to i32 6410; CHECK-NEXT: [[TMP55:%.*]] = sub i32 0, [[TMP50]] 6411; CHECK-NEXT: [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]] 6412; CHECK-NEXT: [[TMP57:%.*]] = zext i32 [[TMP54]] to i64 6413; CHECK-NEXT: [[TMP58:%.*]] = zext i32 [[TMP56]] to i64 6414; CHECK-NEXT: [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]] 6415; CHECK-NEXT: [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32 6416; CHECK-NEXT: [[TMP61:%.*]] = lshr i64 [[TMP59]], 32 6417; CHECK-NEXT: [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32 6418; CHECK-NEXT: [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]] 6419; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[TMP49]] to i64 6420; CHECK-NEXT: [[TMP65:%.*]] = zext i32 [[TMP63]] to i64 6421; CHECK-NEXT: [[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]] 6422; CHECK-NEXT: [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32 6423; CHECK-NEXT: [[TMP68:%.*]] = lshr i64 [[TMP66]], 32 6424; CHECK-NEXT: [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32 6425; CHECK-NEXT: [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]] 6426; CHECK-NEXT: [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]] 6427; CHECK-NEXT: [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]] 6428; CHECK-NEXT: [[TMP73:%.*]] = add i32 [[TMP69]], 1 6429; CHECK-NEXT: [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]] 6430; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]] 6431; CHECK-NEXT: [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]] 6432; CHECK-NEXT: [[TMP77:%.*]] = icmp uge i32 [[TMP76]], 
[[TMP50]] 6433; CHECK-NEXT: [[TMP78:%.*]] = add i32 [[TMP74]], 1 6434; CHECK-NEXT: [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]] 6435; CHECK-NEXT: [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]] 6436; CHECK-NEXT: [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]] 6437; CHECK-NEXT: [[TMP82:%.*]] = insertelement <2 x i32> [[TMP41]], i32 [[TMP81]], i64 1 6438; CHECK-NEXT: store <2 x i32> [[TMP82]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 6439; CHECK-NEXT: ret void 6440; 6441; GFX6-LABEL: sdiv_v2i32_pow2_shl_denom: 6442; GFX6: ; %bb.0: 6443; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xb 6444; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 6445; GFX6-NEXT: s_mov_b32 s7, 0xf000 6446; GFX6-NEXT: s_mov_b32 s6, -1 6447; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6448; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s10 6449; GFX6-NEXT: s_ashr_i32 s1, s0, 31 6450; GFX6-NEXT: s_add_i32 s0, s0, s1 6451; GFX6-NEXT: s_xor_b32 s2, s0, s1 6452; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 6453; GFX6-NEXT: s_lshl_b32 s0, 0x1000, s11 6454; GFX6-NEXT: s_ashr_i32 s3, s0, 31 6455; GFX6-NEXT: s_add_i32 s0, s0, s3 6456; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 6457; GFX6-NEXT: s_sub_i32 s11, 0, s2 6458; GFX6-NEXT: s_xor_b32 s10, s0, s3 6459; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s10 6460; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 6461; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 6462; GFX6-NEXT: s_ashr_i32 s0, s8, 31 6463; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 6464; GFX6-NEXT: s_add_i32 s8, s8, s0 6465; GFX6-NEXT: v_mul_lo_u32 v2, s11, v0 6466; GFX6-NEXT: s_xor_b32 s8, s8, s0 6467; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 6468; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 6469; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 6470; GFX6-NEXT: s_xor_b32 s11, s0, s1 6471; GFX6-NEXT: s_sub_i32 s0, 0, s10 6472; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 6473; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0 6474; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 6475; GFX6-NEXT: v_mul_lo_u32 v3, v0, s2 6476; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v0 6477; GFX6-NEXT: 
v_mul_hi_u32 v2, v1, v2 6478; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s8, v3 6479; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v3 6480; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] 6481; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s2, v3 6482; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] 6483; GFX6-NEXT: s_ashr_i32 s0, s9, 31 6484; GFX6-NEXT: s_add_i32 s1, s9, s0 6485; GFX6-NEXT: s_xor_b32 s1, s1, s0 6486; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 6487; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 6488; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v0 6489; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 6490; GFX6-NEXT: v_mul_lo_u32 v2, v1, s10 6491; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 6492; GFX6-NEXT: s_xor_b32 s2, s0, s3 6493; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 6494; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 6495; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2 6496; GFX6-NEXT: v_xor_b32_e32 v0, s11, v0 6497; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 6498; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s10, v2 6499; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s11, v0 6500; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 6501; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 6502; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 6503; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 6504; GFX6-NEXT: v_xor_b32_e32 v1, s2, v1 6505; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s2, v1 6506; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 6507; GFX6-NEXT: s_endpgm 6508; 6509; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom: 6510; GFX9: ; %bb.0: 6511; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 6512; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6513; GFX9-NEXT: v_mov_b32_e32 v2, 0 6514; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6515; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s6 6516; GFX9-NEXT: s_ashr_i32 s1, s0, 31 6517; GFX9-NEXT: s_add_i32 s0, s0, s1 6518; GFX9-NEXT: s_xor_b32 s0, s0, s1 6519; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 6520; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s7 6521; GFX9-NEXT: s_ashr_i32 s8, s6, 31 6522; GFX9-NEXT: s_add_i32 s6, s6, s8 
6523; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 6524; GFX9-NEXT: s_xor_b32 s6, s6, s8 6525; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 6526; GFX9-NEXT: s_sub_i32 s10, 0, s0 6527; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 6528; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 6529; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 6530; GFX9-NEXT: s_ashr_i32 s7, s4, 31 6531; GFX9-NEXT: s_add_i32 s4, s4, s7 6532; GFX9-NEXT: v_mul_lo_u32 v3, s10, v0 6533; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 6534; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 6535; GFX9-NEXT: s_sub_i32 s10, 0, s6 6536; GFX9-NEXT: v_mul_hi_u32 v3, v0, v3 6537; GFX9-NEXT: s_xor_b32 s4, s4, s7 6538; GFX9-NEXT: v_mul_lo_u32 v4, s10, v1 6539; GFX9-NEXT: s_ashr_i32 s9, s5, 31 6540; GFX9-NEXT: v_add_u32_e32 v0, v0, v3 6541; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 6542; GFX9-NEXT: v_mul_hi_u32 v3, v1, v4 6543; GFX9-NEXT: s_add_i32 s5, s5, s9 6544; GFX9-NEXT: s_xor_b32 s5, s5, s9 6545; GFX9-NEXT: v_mul_lo_u32 v4, v0, s0 6546; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 6547; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 6548; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 6549; GFX9-NEXT: v_sub_u32_e32 v4, s4, v4 6550; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v4 6551; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 6552; GFX9-NEXT: v_subrev_u32_e32 v3, s0, v4 6553; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc 6554; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v3 6555; GFX9-NEXT: v_mul_lo_u32 v3, v1, s6 6556; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 6557; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 6558; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 6559; GFX9-NEXT: v_sub_u32_e32 v3, s5, v3 6560; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 6561; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 6562; GFX9-NEXT: v_subrev_u32_e32 v4, s6, v3 6563; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 6564; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 6565; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 6566; GFX9-NEXT: s_xor_b32 s1, s7, s1 6567; GFX9-NEXT: s_xor_b32 s0, s9, s8 6568; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 6569; GFX9-NEXT: 
v_xor_b32_e32 v0, s1, v0 6570; GFX9-NEXT: v_xor_b32_e32 v1, s0, v1 6571; GFX9-NEXT: v_subrev_u32_e32 v0, s1, v0 6572; GFX9-NEXT: v_subrev_u32_e32 v1, s0, v1 6573; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 6574; GFX9-NEXT: s_endpgm 6575 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 6576 %r = sdiv <2 x i32> %x, %shl.y 6577 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 6578 ret void 6579} 6580 6581define amdgpu_kernel void @srem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 6582; CHECK-LABEL: @srem_i32_oddk_denom( 6583; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], 1235195 6584; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 6585; CHECK-NEXT: ret void 6586; 6587; GFX6-LABEL: srem_i32_oddk_denom: 6588; GFX6: ; %bb.0: 6589; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 6590; GFX6-NEXT: v_mov_b32_e32 v0, 0xd9528441 6591; GFX6-NEXT: s_mov_b32 s2, 0x12d8fb 6592; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 6593; GFX6-NEXT: s_mov_b32 s3, 0xf000 6594; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6595; GFX6-NEXT: v_mul_hi_i32 v0, s4, v0 6596; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0 6597; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0 6598; GFX6-NEXT: v_ashrrev_i32_e32 v0, 20, v0 6599; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 6600; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 6601; GFX6-NEXT: s_mov_b32 s2, -1 6602; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 6603; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 6604; GFX6-NEXT: s_endpgm 6605; 6606; GFX9-LABEL: srem_i32_oddk_denom: 6607; GFX9: ; %bb.0: 6608; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 6609; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6610; GFX9-NEXT: v_mov_b32_e32 v0, 0 6611; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6612; GFX9-NEXT: s_mul_hi_i32 s0, s4, 0xd9528441 6613; GFX9-NEXT: s_add_i32 s0, s0, s4 6614; GFX9-NEXT: s_lshr_b32 s1, s0, 31 6615; GFX9-NEXT: s_ashr_i32 s0, s0, 20 6616; GFX9-NEXT: s_add_i32 s0, s0, s1 6617; GFX9-NEXT: s_mul_i32 s0, s0, 0x12d8fb 6618; GFX9-NEXT: s_sub_i32 s0, s4, s0 6619; GFX9-NEXT: 
v_mov_b32_e32 v1, s0 6620; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 6621; GFX9-NEXT: s_endpgm 6622 %r = srem i32 %x, 1235195 6623 store i32 %r, i32 addrspace(1)* %out 6624 ret void 6625} 6626 6627define amdgpu_kernel void @srem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 6628; CHECK-LABEL: @srem_i32_pow2k_denom( 6629; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], 4096 6630; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 6631; CHECK-NEXT: ret void 6632; 6633; GFX6-LABEL: srem_i32_pow2k_denom: 6634; GFX6: ; %bb.0: 6635; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 6636; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 6637; GFX6-NEXT: s_mov_b32 s3, 0xf000 6638; GFX6-NEXT: s_mov_b32 s2, -1 6639; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6640; GFX6-NEXT: s_ashr_i32 s5, s4, 31 6641; GFX6-NEXT: s_lshr_b32 s5, s5, 20 6642; GFX6-NEXT: s_add_i32 s5, s4, s5 6643; GFX6-NEXT: s_and_b32 s5, s5, 0xfffff000 6644; GFX6-NEXT: s_sub_i32 s4, s4, s5 6645; GFX6-NEXT: v_mov_b32_e32 v0, s4 6646; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 6647; GFX6-NEXT: s_endpgm 6648; 6649; GFX9-LABEL: srem_i32_pow2k_denom: 6650; GFX9: ; %bb.0: 6651; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 6652; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6653; GFX9-NEXT: v_mov_b32_e32 v0, 0 6654; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6655; GFX9-NEXT: s_ashr_i32 s0, s4, 31 6656; GFX9-NEXT: s_lshr_b32 s0, s0, 20 6657; GFX9-NEXT: s_add_i32 s0, s4, s0 6658; GFX9-NEXT: s_and_b32 s0, s0, 0xfffff000 6659; GFX9-NEXT: s_sub_i32 s0, s4, s0 6660; GFX9-NEXT: v_mov_b32_e32 v1, s0 6661; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 6662; GFX9-NEXT: s_endpgm 6663 %r = srem i32 %x, 4096 6664 store i32 %r, i32 addrspace(1)* %out 6665 ret void 6666} 6667 6668define amdgpu_kernel void @srem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 6669; CHECK-LABEL: @srem_i32_pow2_shl_denom( 6670; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 6671; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], [[SHL_Y]] 6672; 
CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 6673; CHECK-NEXT: ret void 6674; 6675; GFX6-LABEL: srem_i32_pow2_shl_denom: 6676; GFX6: ; %bb.0: 6677; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 6678; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 6679; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6680; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 6681; GFX6-NEXT: s_ashr_i32 s4, s3, 31 6682; GFX6-NEXT: s_add_i32 s3, s3, s4 6683; GFX6-NEXT: s_xor_b32 s4, s3, s4 6684; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s4 6685; GFX6-NEXT: s_sub_i32 s3, 0, s4 6686; GFX6-NEXT: s_ashr_i32 s5, s2, 31 6687; GFX6-NEXT: s_add_i32 s2, s2, s5 6688; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 6689; GFX6-NEXT: s_xor_b32 s6, s2, s5 6690; GFX6-NEXT: s_mov_b32 s2, -1 6691; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 6692; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 6693; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 6694; GFX6-NEXT: s_mov_b32 s3, 0xf000 6695; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 6696; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 6697; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 6698; GFX6-NEXT: v_mul_lo_u32 v0, v0, s4 6699; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 6700; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0 6701; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 6702; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 6703; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0 6704; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 6705; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 6706; GFX6-NEXT: v_xor_b32_e32 v0, s5, v0 6707; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s5, v0 6708; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 6709; GFX6-NEXT: s_endpgm 6710; 6711; GFX9-LABEL: srem_i32_pow2_shl_denom: 6712; GFX9: ; %bb.0: 6713; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 6714; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6715; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 6716; GFX9-NEXT: s_ashr_i32 s4, s3, 31 6717; GFX9-NEXT: s_add_i32 s3, s3, s4 6718; GFX9-NEXT: s_xor_b32 s3, s3, s4 6719; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 6720; GFX9-NEXT: s_sub_i32 s4, 0, s3 6721; 
GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6722; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 6723; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 6724; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 6725; GFX9-NEXT: v_mul_lo_u32 v1, s4, v0 6726; GFX9-NEXT: s_ashr_i32 s4, s2, 31 6727; GFX9-NEXT: s_add_i32 s2, s2, s4 6728; GFX9-NEXT: s_xor_b32 s2, s2, s4 6729; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 6730; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 6731; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 6732; GFX9-NEXT: v_mov_b32_e32 v1, 0 6733; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 6734; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 6735; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v0 6736; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 6737; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6738; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v0 6739; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 6740; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6741; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 6742; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 6743; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6744; GFX9-NEXT: global_store_dword v1, v0, s[0:1] 6745; GFX9-NEXT: s_endpgm 6746 %shl.y = shl i32 4096, %y 6747 %r = srem i32 %x, %shl.y 6748 store i32 %r, i32 addrspace(1)* %out 6749 ret void 6750} 6751 6752define amdgpu_kernel void @srem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 6753; CHECK-LABEL: @srem_v2i32_pow2k_denom( 6754; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 6755; CHECK-NEXT: [[TMP2:%.*]] = srem i32 [[TMP1]], 4096 6756; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 6757; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 6758; CHECK-NEXT: [[TMP5:%.*]] = srem i32 [[TMP4]], 4096 6759; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 6760; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 6761; CHECK-NEXT: ret void 6762; 6763; GFX6-LABEL: srem_v2i32_pow2k_denom: 6764; GFX6: ; %bb.0: 6765; GFX6-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0xb 6766; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 6767; GFX6-NEXT: s_mov_b32 s3, 0xf000 6768; GFX6-NEXT: s_mov_b32 s2, -1 6769; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6770; GFX6-NEXT: s_ashr_i32 s6, s4, 31 6771; GFX6-NEXT: s_lshr_b32 s6, s6, 20 6772; GFX6-NEXT: s_add_i32 s6, s4, s6 6773; GFX6-NEXT: s_ashr_i32 s7, s5, 31 6774; GFX6-NEXT: s_and_b32 s6, s6, 0xfffff000 6775; GFX6-NEXT: s_sub_i32 s4, s4, s6 6776; GFX6-NEXT: s_lshr_b32 s6, s7, 20 6777; GFX6-NEXT: s_add_i32 s6, s5, s6 6778; GFX6-NEXT: s_and_b32 s6, s6, 0xfffff000 6779; GFX6-NEXT: s_sub_i32 s5, s5, s6 6780; GFX6-NEXT: v_mov_b32_e32 v0, s4 6781; GFX6-NEXT: v_mov_b32_e32 v1, s5 6782; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6783; GFX6-NEXT: s_endpgm 6784; 6785; GFX9-LABEL: srem_v2i32_pow2k_denom: 6786; GFX9: ; %bb.0: 6787; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 6788; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 6789; GFX9-NEXT: v_mov_b32_e32 v2, 0 6790; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6791; GFX9-NEXT: s_ashr_i32 s0, s2, 31 6792; GFX9-NEXT: s_ashr_i32 s1, s3, 31 6793; GFX9-NEXT: s_lshr_b32 s0, s0, 20 6794; GFX9-NEXT: s_lshr_b32 s1, s1, 20 6795; GFX9-NEXT: s_add_i32 s0, s2, s0 6796; GFX9-NEXT: s_add_i32 s1, s3, s1 6797; GFX9-NEXT: s_and_b32 s0, s0, 0xfffff000 6798; GFX9-NEXT: s_and_b32 s1, s1, 0xfffff000 6799; GFX9-NEXT: s_sub_i32 s0, s2, s0 6800; GFX9-NEXT: s_sub_i32 s1, s3, s1 6801; GFX9-NEXT: v_mov_b32_e32 v0, s0 6802; GFX9-NEXT: v_mov_b32_e32 v1, s1 6803; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 6804; GFX9-NEXT: s_endpgm 6805 %r = srem <2 x i32> %x, <i32 4096, i32 4096> 6806 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 6807 ret void 6808} 6809 6810define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 6811; CHECK-LABEL: @srem_v2i32_pow2_shl_denom( 6812; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 6813; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 6814; 
CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 6815; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 6816; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 6817; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]] 6818; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]] 6819; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]] 6820; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]] 6821; CHECK-NEXT: [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float 6822; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 6823; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000 6824; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP11]] to i32 6825; CHECK-NEXT: [[TMP13:%.*]] = sub i32 0, [[TMP8]] 6826; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]] 6827; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP12]] to i64 6828; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 6829; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]] 6830; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 6831; CHECK-NEXT: [[TMP19:%.*]] = lshr i64 [[TMP17]], 32 6832; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 6833; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]] 6834; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP7]] to i64 6835; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP21]] to i64 6836; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]] 6837; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32 6838; CHECK-NEXT: [[TMP26:%.*]] = lshr i64 [[TMP24]], 32 6839; CHECK-NEXT: [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32 6840; CHECK-NEXT: [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]] 6841; CHECK-NEXT: [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]] 6842; CHECK-NEXT: [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]] 6843; CHECK-NEXT: [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]] 6844; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP30]], i32 [[TMP31]], i32 [[TMP29]] 6845; CHECK-NEXT: [[TMP33:%.*]] = icmp uge i32 
[[TMP32]], [[TMP8]] 6846; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]] 6847; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]] 6848; CHECK-NEXT: [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]] 6849; CHECK-NEXT: [[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]] 6850; CHECK-NEXT: [[TMP38:%.*]] = insertelement <2 x i32> undef, i32 [[TMP37]], i64 0 6851; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i32> [[X]], i64 1 6852; CHECK-NEXT: [[TMP40:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 6853; CHECK-NEXT: [[TMP41:%.*]] = ashr i32 [[TMP39]], 31 6854; CHECK-NEXT: [[TMP42:%.*]] = ashr i32 [[TMP40]], 31 6855; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]] 6856; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]] 6857; CHECK-NEXT: [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]] 6858; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]] 6859; CHECK-NEXT: [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float 6860; CHECK-NEXT: [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]]) 6861; CHECK-NEXT: [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000 6862; CHECK-NEXT: [[TMP50:%.*]] = fptoui float [[TMP49]] to i32 6863; CHECK-NEXT: [[TMP51:%.*]] = sub i32 0, [[TMP46]] 6864; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]] 6865; CHECK-NEXT: [[TMP53:%.*]] = zext i32 [[TMP50]] to i64 6866; CHECK-NEXT: [[TMP54:%.*]] = zext i32 [[TMP52]] to i64 6867; CHECK-NEXT: [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]] 6868; CHECK-NEXT: [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32 6869; CHECK-NEXT: [[TMP57:%.*]] = lshr i64 [[TMP55]], 32 6870; CHECK-NEXT: [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32 6871; CHECK-NEXT: [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]] 6872; CHECK-NEXT: [[TMP60:%.*]] = zext i32 [[TMP45]] to i64 6873; CHECK-NEXT: [[TMP61:%.*]] = zext i32 [[TMP59]] to i64 6874; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]] 6875; CHECK-NEXT: [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32 6876; CHECK-NEXT: 
[[TMP64:%.*]] = lshr i64 [[TMP62]], 32 6877; CHECK-NEXT: [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32 6878; CHECK-NEXT: [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]] 6879; CHECK-NEXT: [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]] 6880; CHECK-NEXT: [[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]] 6881; CHECK-NEXT: [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]] 6882; CHECK-NEXT: [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]] 6883; CHECK-NEXT: [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]] 6884; CHECK-NEXT: [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]] 6885; CHECK-NEXT: [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]] 6886; CHECK-NEXT: [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]] 6887; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]] 6888; CHECK-NEXT: [[TMP76:%.*]] = insertelement <2 x i32> [[TMP38]], i32 [[TMP75]], i64 1 6889; CHECK-NEXT: store <2 x i32> [[TMP76]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 6890; CHECK-NEXT: ret void 6891; 6892; GFX6-LABEL: srem_v2i32_pow2_shl_denom: 6893; GFX6: ; %bb.0: 6894; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb 6895; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 6896; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6897; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s6 6898; GFX6-NEXT: s_ashr_i32 s3, s2, 31 6899; GFX6-NEXT: s_add_i32 s2, s2, s3 6900; GFX6-NEXT: s_xor_b32 s6, s2, s3 6901; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 6902; GFX6-NEXT: s_lshl_b32 s7, 0x1000, s7 6903; GFX6-NEXT: s_ashr_i32 s8, s7, 31 6904; GFX6-NEXT: s_add_i32 s7, s7, s8 6905; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 6906; GFX6-NEXT: s_xor_b32 s7, s7, s8 6907; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7 6908; GFX6-NEXT: s_sub_i32 s9, 0, s6 6909; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 6910; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 6911; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 6912; GFX6-NEXT: s_ashr_i32 s8, s4, 31 6913; GFX6-NEXT: s_add_i32 s4, s4, s8 6914; GFX6-NEXT: v_mul_lo_u32 v2, s9, v0 6915; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 6916; 
GFX6-NEXT: s_xor_b32 s4, s4, s8 6917; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 6918; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 6919; GFX6-NEXT: s_sub_i32 s9, 0, s7 6920; GFX6-NEXT: s_mov_b32 s3, 0xf000 6921; GFX6-NEXT: s_mov_b32 s2, -1 6922; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 6923; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 6924; GFX6-NEXT: v_mul_lo_u32 v2, s9, v1 6925; GFX6-NEXT: s_ashr_i32 s9, s5, 31 6926; GFX6-NEXT: s_add_i32 s5, s5, s9 6927; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 6928; GFX6-NEXT: v_mul_hi_u32 v2, v1, v2 6929; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 6930; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s6, v0 6931; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 6932; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 6933; GFX6-NEXT: s_xor_b32 s4, s5, s9 6934; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 6935; GFX6-NEXT: v_mul_hi_u32 v1, s4, v1 6936; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s6, v0 6937; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 6938; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 6939; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 6940; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 6941; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 6942; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v1 6943; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s7, v1 6944; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 6945; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 6946; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s7, v1 6947; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 6948; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 6949; GFX6-NEXT: v_xor_b32_e32 v1, s9, v1 6950; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s9, v1 6951; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 6952; GFX6-NEXT: s_endpgm 6953; 6954; GFX9-LABEL: srem_v2i32_pow2_shl_denom: 6955; GFX9: ; %bb.0: 6956; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 6957; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6958; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s6 6959; GFX9-NEXT: s_ashr_i32 s6, s3, 31 6960; GFX9-NEXT: s_add_i32 s3, s3, s6 6961; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s7 6962; GFX9-NEXT: s_xor_b32 s3, s3, s6 
6963; GFX9-NEXT: s_ashr_i32 s7, s2, 31 6964; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 6965; GFX9-NEXT: s_add_i32 s2, s2, s7 6966; GFX9-NEXT: s_xor_b32 s2, s2, s7 6967; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 6968; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 6969; GFX9-NEXT: s_sub_i32 s8, 0, s3 6970; GFX9-NEXT: s_ashr_i32 s6, s4, 31 6971; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 6972; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 6973; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 6974; GFX9-NEXT: s_add_i32 s4, s4, s6 6975; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 6976; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 6977; GFX9-NEXT: v_mul_lo_u32 v2, s8, v0 6978; GFX9-NEXT: s_sub_i32 s8, 0, s2 6979; GFX9-NEXT: s_xor_b32 s4, s4, s6 6980; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1 6981; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 6982; GFX9-NEXT: s_ashr_i32 s7, s5, 31 6983; GFX9-NEXT: s_add_i32 s5, s5, s7 6984; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 6985; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 6986; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 6987; GFX9-NEXT: s_xor_b32 s5, s5, s7 6988; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 6989; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 6990; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 6991; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6992; GFX9-NEXT: v_mov_b32_e32 v2, 0 6993; GFX9-NEXT: v_mul_lo_u32 v1, v1, s2 6994; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 6995; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v0 6996; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 6997; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 6998; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v0 6999; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 7000; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 7001; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 7002; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v1 7003; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 7004; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 7005; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v1 7006; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 7007; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 7008; GFX9-NEXT: v_xor_b32_e32 v0, s6, v0 7009; GFX9-NEXT: 
v_xor_b32_e32 v1, s7, v1 7010; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 7011; GFX9-NEXT: v_subrev_u32_e32 v1, s7, v1 7012; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7013; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 7014; GFX9-NEXT: s_endpgm 7015 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 7016 %r = srem <2 x i32> %x, %shl.y 7017 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 7018 ret void 7019} 7020 ; Unsigned i64 division by the odd constant 1235195949943: the opt run expects the udiv to stay in the IR, while both llc runs pin the long v_mul_lo/v_mul_hi expansion emitted for it. 7021define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 7022; CHECK-LABEL: @udiv_i64_oddk_denom( 7023; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], 1235195949943 7024; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 7025; CHECK-NEXT: ret void 7026; 7027; GFX6-LABEL: udiv_i64_oddk_denom: 7028; GFX6: ; %bb.0: 7029; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f176a73 7030; GFX6-NEXT: v_mov_b32_e32 v1, 0x4f800000 7031; GFX6-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 7032; GFX6-NEXT: v_rcp_f32_e32 v0, v0 7033; GFX6-NEXT: s_movk_i32 s4, 0xfee0 7034; GFX6-NEXT: s_mov_b32 s5, 0x68958c89 7035; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 7036; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 7037; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 7038; GFX6-NEXT: v_trunc_f32_e32 v1, v1 7039; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 7040; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 7041; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 7042; GFX6-NEXT: s_movk_i32 s8, 0x11f 7043; GFX6-NEXT: s_mov_b32 s9, 0x976a7377 7044; GFX6-NEXT: v_mul_lo_u32 v2, v0, s4 7045; GFX6-NEXT: v_mul_hi_u32 v3, v0, s5 7046; GFX6-NEXT: v_mul_lo_u32 v4, v1, s5 7047; GFX6-NEXT: v_mul_lo_u32 v5, v0, s5 7048; GFX6-NEXT: s_mov_b32 s7, 0xf000 7049; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7050; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 7051; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 7052; GFX6-NEXT: v_mul_hi_u32 v4, v0, v5 7053; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 7054; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 7055; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 7056; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 7057; GFX6-NEXT: 
v_addc_u32_e32 v4, vcc, 0, v6, vcc 7058; GFX6-NEXT: v_mul_lo_u32 v6, v1, v5 7059; GFX6-NEXT: v_mul_hi_u32 v5, v1, v5 7060; GFX6-NEXT: s_mov_b32 s6, -1 7061; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 7062; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc 7063; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc 7064; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7065; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 7066; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 7067; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 7068; GFX6-NEXT: v_mul_lo_u32 v2, v0, s4 7069; GFX6-NEXT: v_mul_hi_u32 v3, v0, s5 7070; GFX6-NEXT: v_mul_lo_u32 v4, v1, s5 7071; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7072; GFX6-NEXT: s_mov_b32 s4, s0 7073; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7074; GFX6-NEXT: v_mul_lo_u32 v3, v0, s5 7075; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 7076; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 7077; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 7078; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 7079; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 7080; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 7081; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 7082; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 7083; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 7084; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 7085; GFX6-NEXT: s_mov_b32 s5, s1 7086; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 7087; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc 7088; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc 7089; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7090; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 7091; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 7092; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 7093; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 7094; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 7095; GFX6-NEXT: v_mul_hi_u32 v4, s2, v1 7096; GFX6-NEXT: v_mul_hi_u32 v5, s3, v1 7097; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1 7098; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7099; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 7100; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 7101; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 7102; 
GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 7103; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 7104; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc 7105; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 7106; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 7107; GFX6-NEXT: v_mul_lo_u32 v2, v0, s8 7108; GFX6-NEXT: v_mul_hi_u32 v3, v0, s9 7109; GFX6-NEXT: v_mul_lo_u32 v4, v1, s9 7110; GFX6-NEXT: v_mov_b32_e32 v5, 0x11f 7111; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7112; GFX6-NEXT: v_mul_lo_u32 v3, v0, s9 7113; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 7114; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s3, v2 7115; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s2, v3 7116; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc 7117; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s9, v3 7118; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] 7119; GFX6-NEXT: s_movk_i32 s2, 0x11e 7120; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s2, v4 7121; GFX6-NEXT: s_mov_b32 s9, 0x976a7376 7122; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 7123; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s9, v5 7124; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 7125; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s8, v4 7126; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] 7127; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 7128; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] 7129; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 1, v0 7130; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] 7131; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 7132; GFX6-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] 7133; GFX6-NEXT: v_mov_b32_e32 v6, s3 7134; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc 7135; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s2, v2 7136; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 7137; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s9, v3 7138; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 7139; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s8, v2 7140; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc 7141; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 7142; GFX6-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] 7143; 
GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 7144; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 7145; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 7146; GFX6-NEXT: s_endpgm 7147; 7148; GFX9-LABEL: udiv_i64_oddk_denom: 7149; GFX9: ; %bb.0: 7150; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f176a73 7151; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000 7152; GFX9-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 7153; GFX9-NEXT: v_rcp_f32_e32 v0, v0 7154; GFX9-NEXT: s_movk_i32 s2, 0xfee0 7155; GFX9-NEXT: s_mov_b32 s3, 0x68958c89 7156; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 7157; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 7158; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 7159; GFX9-NEXT: v_trunc_f32_e32 v1, v1 7160; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 7161; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 7162; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 7163; GFX9-NEXT: v_mul_lo_u32 v2, v0, s2 7164; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 7165; GFX9-NEXT: v_mul_lo_u32 v5, v1, s3 7166; GFX9-NEXT: v_mul_lo_u32 v4, v0, s3 7167; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 7168; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 7169; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 7170; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 7171; GFX9-NEXT: v_mul_hi_u32 v6, v0, v2 7172; GFX9-NEXT: v_mul_hi_u32 v7, v1, v2 7173; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 7174; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 7175; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc 7176; GFX9-NEXT: v_mul_lo_u32 v6, v1, v4 7177; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 7178; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 7179; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc 7180; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc 7181; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 7182; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 7183; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 7184; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 7185; GFX9-NEXT: v_mul_lo_u32 v2, v0, s2 7186; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 7187; GFX9-NEXT: v_mul_lo_u32 v4, v1, s3 7188; GFX9-NEXT: 
v_mul_lo_u32 v5, v0, s3 7189; GFX9-NEXT: s_movk_i32 s2, 0x11f 7190; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 7191; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 7192; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 7193; GFX9-NEXT: v_mul_hi_u32 v4, v0, v5 7194; GFX9-NEXT: v_mul_hi_u32 v6, v0, v2 7195; GFX9-NEXT: v_mul_hi_u32 v7, v1, v2 7196; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 7197; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 7198; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc 7199; GFX9-NEXT: v_mul_lo_u32 v6, v1, v5 7200; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5 7201; GFX9-NEXT: s_mov_b32 s3, 0x976a7377 7202; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 7203; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v5, vcc 7204; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc 7205; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 7206; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 7207; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 7208; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 7209; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7210; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 7211; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 7212; GFX9-NEXT: v_mul_hi_u32 v4, s6, v1 7213; GFX9-NEXT: v_mul_hi_u32 v5, s7, v1 7214; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 7215; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 7216; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 7217; GFX9-NEXT: v_mul_lo_u32 v4, s7, v0 7218; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 7219; GFX9-NEXT: v_mov_b32_e32 v6, 0x11f 7220; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 7221; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 7222; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc 7223; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 7224; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc 7225; GFX9-NEXT: v_mul_lo_u32 v2, v0, s2 7226; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 7227; GFX9-NEXT: v_mul_lo_u32 v4, v1, s3 7228; GFX9-NEXT: v_mov_b32_e32 v5, 0 7229; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 7230; GFX9-NEXT: v_mul_lo_u32 v3, v0, s3 7231; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 7232; GFX9-NEXT: 
v_sub_u32_e32 v4, s7, v2 7233; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s6, v3 7234; GFX9-NEXT: v_subb_co_u32_e64 v4, s[0:1], v4, v6, vcc 7235; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s3, v3 7236; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1] 7237; GFX9-NEXT: s_movk_i32 s3, 0x11e 7238; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s3, v4 7239; GFX9-NEXT: s_mov_b32 s6, 0x976a7376 7240; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 7241; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s6, v6 7242; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 7243; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v4 7244; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v6, s[0:1] 7245; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], 2, v0 7246; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1] 7247; GFX9-NEXT: v_add_co_u32_e64 v8, s[0:1], 1, v0 7248; GFX9-NEXT: v_addc_co_u32_e64 v9, s[0:1], 0, v1, s[0:1] 7249; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 7250; GFX9-NEXT: v_cndmask_b32_e64 v4, v9, v7, s[0:1] 7251; GFX9-NEXT: v_mov_b32_e32 v7, s7 7252; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v7, v2, vcc 7253; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s3, v2 7254; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 7255; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s6, v3 7256; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 7257; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 7258; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc 7259; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 7260; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, v6, s[0:1] 7261; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 7262; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 7263; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] 7264; GFX9-NEXT: s_endpgm 7265 %r = udiv i64 %x, 1235195949943 7266 store i64 %r, i64 addrspace(1)* %out 7267 ret void 7268} 7269 ; Unsigned i64 division by the constant 4096: the opt run expects the udiv to stay in the IR, and both llc runs expect it to be selected as a single s_lshr_b64 by 12. 7270define amdgpu_kernel void @udiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 7271; CHECK-LABEL: @udiv_i64_pow2k_denom( 7272; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], 4096 7273; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 
7274; CHECK-NEXT: ret void 7275; 7276; GFX6-LABEL: udiv_i64_pow2k_denom: 7277; GFX6: ; %bb.0: 7278; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 7279; GFX6-NEXT: s_mov_b32 s7, 0xf000 7280; GFX6-NEXT: s_mov_b32 s6, -1 7281; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7282; GFX6-NEXT: s_mov_b32 s4, s0 7283; GFX6-NEXT: s_mov_b32 s5, s1 7284; GFX6-NEXT: s_lshr_b64 s[0:1], s[2:3], 12 7285; GFX6-NEXT: v_mov_b32_e32 v0, s0 7286; GFX6-NEXT: v_mov_b32_e32 v1, s1 7287; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 7288; GFX6-NEXT: s_endpgm 7289; 7290; GFX9-LABEL: udiv_i64_pow2k_denom: 7291; GFX9: ; %bb.0: 7292; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 7293; GFX9-NEXT: v_mov_b32_e32 v2, 0 7294; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7295; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 12 7296; GFX9-NEXT: v_mov_b32_e32 v0, s2 7297; GFX9-NEXT: v_mov_b32_e32 v1, s3 7298; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 7299; GFX9-NEXT: s_endpgm 7300 %r = udiv i64 %x, 4096 7301 store i64 %r, i64 addrspace(1)* %out 7302 ret void 7303} 7304 ; Unsigned i64 division by (4096 << %y): the opt run expects the shl and udiv to stay in the IR, and both llc runs expect an s_add_i32 of 12 to the shift amount followed by one s_lshr_b64. 7305define amdgpu_kernel void @udiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 7306; CHECK-LABEL: @udiv_i64_pow2_shl_denom( 7307; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 7308; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], [[SHL_Y]] 7309; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 7310; CHECK-NEXT: ret void 7311; 7312; GFX6-LABEL: udiv_i64_pow2_shl_denom: 7313; GFX6: ; %bb.0: 7314; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 7315; GFX6-NEXT: s_load_dword s8, s[0:1], 0xd 7316; GFX6-NEXT: s_mov_b32 s3, 0xf000 7317; GFX6-NEXT: s_mov_b32 s2, -1 7318; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7319; GFX6-NEXT: s_mov_b32 s0, s4 7320; GFX6-NEXT: s_add_i32 s8, s8, 12 7321; GFX6-NEXT: s_mov_b32 s1, s5 7322; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], s8 7323; GFX6-NEXT: v_mov_b32_e32 v0, s4 7324; GFX6-NEXT: v_mov_b32_e32 v1, s5 7325; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 7326; GFX6-NEXT: s_endpgm 7327; 7328; 
GFX9-LABEL: udiv_i64_pow2_shl_denom: 7329; GFX9: ; %bb.0: 7330; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 7331; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 7332; GFX9-NEXT: v_mov_b32_e32 v2, 0 7333; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7334; GFX9-NEXT: s_add_i32 s2, s2, 12 7335; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], s2 7336; GFX9-NEXT: v_mov_b32_e32 v0, s0 7337; GFX9-NEXT: v_mov_b32_e32 v1, s1 7338; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 7339; GFX9-NEXT: s_endpgm 7340 %shl.y = shl i64 4096, %y 7341 %r = udiv i64 %x, %shl.y 7342 store i64 %r, i64 addrspace(1)* %out 7343 ret void 7344} 7345 ; <2 x i64> unsigned division by a splat of 4096: the opt run expects the vector udiv to be split into one i64 udiv per element, and both llc runs expect one s_lshr_b64 by 12 per element. 7346define amdgpu_kernel void @udiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 7347; CHECK-LABEL: @udiv_v2i64_pow2k_denom( 7348; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 7349; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096 7350; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 7351; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 7352; CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[TMP4]], 4096 7353; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 7354; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 7355; CHECK-NEXT: ret void 7356; 7357; GFX6-LABEL: udiv_v2i64_pow2k_denom: 7358; GFX6: ; %bb.0: 7359; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 7360; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 7361; GFX6-NEXT: s_mov_b32 s3, 0xf000 7362; GFX6-NEXT: s_mov_b32 s2, -1 7363; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7364; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], 12 7365; GFX6-NEXT: s_lshr_b64 s[6:7], s[6:7], 12 7366; GFX6-NEXT: v_mov_b32_e32 v0, s4 7367; GFX6-NEXT: v_mov_b32_e32 v1, s5 7368; GFX6-NEXT: v_mov_b32_e32 v2, s6 7369; GFX6-NEXT: v_mov_b32_e32 v3, s7 7370; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 7371; GFX6-NEXT: s_endpgm 7372; 7373; GFX9-LABEL: udiv_v2i64_pow2k_denom: 7374; GFX9: ; %bb.0: 7375; GFX9-NEXT: 
s_load_dwordx4 s[4:7], s[0:1], 0x34 7376; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7377; GFX9-NEXT: v_mov_b32_e32 v4, 0 7378; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7379; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 12 7380; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], 12 7381; GFX9-NEXT: v_mov_b32_e32 v0, s0 7382; GFX9-NEXT: v_mov_b32_e32 v1, s1 7383; GFX9-NEXT: v_mov_b32_e32 v2, s4 7384; GFX9-NEXT: v_mov_b32_e32 v3, s5 7385; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 7386; GFX9-NEXT: s_endpgm 7387 %r = udiv <2 x i64> %x, <i64 4096, i64 4096> 7388 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 7389 ret void 7390} 7391 ; <2 x i64> unsigned division by <4096, 4095>: the opt run expects one i64 udiv per element; in both llc runs element 0 becomes an s_lshr_b64 by 12 and element 1 gets the long v_mul_lo/v_mul_hi expansion. 7392define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 7393; CHECK-LABEL: @udiv_v2i64_mixed_pow2k_denom( 7394; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 7395; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096 7396; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 7397; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 7398; CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[TMP4]], 4095 7399; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 7400; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 7401; CHECK-NEXT: ret void 7402; 7403; GFX6-LABEL: udiv_v2i64_mixed_pow2k_denom: 7404; GFX6: ; %bb.0: 7405; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f800000 7406; GFX6-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 7407; GFX6-NEXT: v_rcp_f32_e32 v0, v0 7408; GFX6-NEXT: s_movk_i32 s6, 0xf001 7409; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 7410; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 7411; GFX6-NEXT: s_mov_b32 s7, 0xf000 7412; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 7413; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 7414; GFX6-NEXT: v_trunc_f32_e32 v1, v1 7415; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 7416; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 7417; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 7418; 
GFX6-NEXT: s_waitcnt lgkmcnt(0) 7419; GFX6-NEXT: s_lshr_b64 s[8:9], s[0:1], 12 7420; GFX6-NEXT: s_movk_i32 s0, 0xfff 7421; GFX6-NEXT: v_mul_hi_u32 v2, v0, s6 7422; GFX6-NEXT: v_mul_lo_u32 v4, v1, s6 7423; GFX6-NEXT: v_mul_lo_u32 v3, v0, s6 7424; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 7425; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 7426; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 7427; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 7428; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 7429; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 7430; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 7431; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 7432; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 7433; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 7434; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 7435; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 7436; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc 7437; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc 7438; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7439; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 7440; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 7441; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 7442; GFX6-NEXT: v_mul_hi_u32 v2, v0, s6 7443; GFX6-NEXT: v_mul_lo_u32 v3, v1, s6 7444; GFX6-NEXT: v_mul_lo_u32 v4, v0, s6 7445; GFX6-NEXT: s_mov_b32 s6, -1 7446; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 7447; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 7448; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 7449; GFX6-NEXT: v_mul_hi_u32 v5, v0, v4 7450; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 7451; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 7452; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 7453; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 7454; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 7455; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 7456; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 7457; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 7458; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc 7459; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc 7460; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7461; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 7462; GFX6-NEXT: v_add_i32_e32 v0, 
vcc, v0, v2 7463; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 7464; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 7465; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 7466; GFX6-NEXT: v_mul_hi_u32 v4, s2, v1 7467; GFX6-NEXT: v_mul_hi_u32 v5, s3, v1 7468; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1 7469; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7470; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 7471; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 7472; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 7473; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 7474; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 7475; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc 7476; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 7477; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 7478; GFX6-NEXT: v_mul_lo_u32 v4, v1, s0 7479; GFX6-NEXT: v_mul_hi_u32 v5, v0, s0 7480; GFX6-NEXT: v_add_i32_e32 v2, vcc, 2, v0 7481; GFX6-NEXT: v_mul_lo_u32 v8, v0, s0 7482; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 7483; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v0 7484; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc 7485; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 7486; GFX6-NEXT: v_mov_b32_e32 v5, s3 7487; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s2, v8 7488; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc 7489; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s0, v8 7490; GFX6-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v4, vcc 7491; GFX6-NEXT: s_movk_i32 s0, 0xffe 7492; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s0, v5 7493; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 7494; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 7495; GFX6-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc 7496; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v8 7497; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 7498; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 7499; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 7500; GFX6-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[0:1] 7501; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc 7502; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 7503; GFX6-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[0:1] 7504; GFX6-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc 7505; 
GFX6-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[0:1] 7506; GFX6-NEXT: v_mov_b32_e32 v0, s8 7507; GFX6-NEXT: v_mov_b32_e32 v1, s9 7508; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 7509; GFX6-NEXT: s_endpgm 7510; 7511; GFX9-LABEL: udiv_v2i64_mixed_pow2k_denom: 7512; GFX9: ; %bb.0: 7513; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f800000 7514; GFX9-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 7515; GFX9-NEXT: v_rcp_f32_e32 v0, v0 7516; GFX9-NEXT: s_movk_i32 s2, 0xf001 7517; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 7518; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 7519; GFX9-NEXT: v_trunc_f32_e32 v1, v1 7520; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 7521; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 7522; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 7523; GFX9-NEXT: v_mul_hi_u32 v2, v0, s2 7524; GFX9-NEXT: v_mul_lo_u32 v4, v1, s2 7525; GFX9-NEXT: v_mul_lo_u32 v3, v0, s2 7526; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 7527; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 7528; GFX9-NEXT: v_mul_hi_u32 v5, v0, v3 7529; GFX9-NEXT: v_mul_lo_u32 v4, v0, v2 7530; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 7531; GFX9-NEXT: v_mul_lo_u32 v6, v1, v3 7532; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 7533; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 7534; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 7535; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc 7536; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 7537; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 7538; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc 7539; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v8, vcc 7540; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 7541; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 7542; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 7543; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 7544; GFX9-NEXT: v_mul_hi_u32 v2, v0, s2 7545; GFX9-NEXT: v_mul_lo_u32 v3, v1, s2 7546; GFX9-NEXT: v_mul_lo_u32 v5, v0, s2 7547; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7548; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 7549; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 7550; GFX9-NEXT: 
v_add_u32_e32 v2, v2, v3 7551; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 7552; GFX9-NEXT: v_mul_hi_u32 v6, v0, v5 7553; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 7554; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 7555; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 7556; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v6, v3 7557; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc 7558; GFX9-NEXT: v_mul_lo_u32 v7, v1, v5 7559; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5 7560; GFX9-NEXT: s_movk_i32 s0, 0xfff 7561; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7562; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], 12 7563; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 7564; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc 7565; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v8, vcc 7566; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 7567; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc 7568; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 7569; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 7570; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 7571; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 7572; GFX9-NEXT: v_mul_hi_u32 v5, s6, v1 7573; GFX9-NEXT: v_mul_hi_u32 v6, s7, v1 7574; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 7575; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 7576; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc 7577; GFX9-NEXT: v_mul_lo_u32 v5, s7, v0 7578; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 7579; GFX9-NEXT: v_mov_b32_e32 v4, 0 7580; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 7581; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 7582; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v6, vcc 7583; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 7584; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc 7585; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 2, v0 7586; GFX9-NEXT: v_mul_lo_u32 v5, v1, s0 7587; GFX9-NEXT: v_mul_hi_u32 v6, v0, s0 7588; GFX9-NEXT: v_mul_lo_u32 v9, v0, s0 7589; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc 7590; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0 7591; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc 7592; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 7593; GFX9-NEXT: 
v_mov_b32_e32 v6, s7 7594; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s6, v9 7595; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc 7596; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s0, v9 7597; GFX9-NEXT: v_subbrev_co_u32_e32 v10, vcc, 0, v5, vcc 7598; GFX9-NEXT: s_movk_i32 s0, 0xffe 7599; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s0, v6 7600; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 7601; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 7602; GFX9-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc 7603; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v9 7604; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 7605; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 7606; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 7607; GFX9-NEXT: v_cndmask_b32_e64 v5, -1, v6, s[0:1] 7608; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc 7609; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 7610; GFX9-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[0:1] 7611; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc 7612; GFX9-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[0:1] 7613; GFX9-NEXT: v_mov_b32_e32 v0, s4 7614; GFX9-NEXT: v_mov_b32_e32 v1, s5 7615; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 7616; GFX9-NEXT: s_endpgm 7617 %r = udiv <2 x i64> %x, <i64 4096, i64 4095> 7618 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 7619 ret void 7620} 7621; Unsigned <2 x i64> divide by (splat 4096 << %y). The opt checks expect one scalar udiv per element; the GFX6 and GFX9 checks expect s_lshr_b64 of each element by (y + 12). 7622define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 7623; CHECK-LABEL: @udiv_v2i64_pow2_shl_denom( 7624; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 7625; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 7626; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 7627; CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[TMP1]], [[TMP2]] 7628; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 7629; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 7630; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 7631; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 
[[TMP5]], [[TMP6]] 7632; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 7633; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 7634; CHECK-NEXT: ret void 7635; 7636; GFX6-LABEL: udiv_v2i64_pow2_shl_denom: 7637; GFX6: ; %bb.0: 7638; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 7639; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 7640; GFX6-NEXT: s_mov_b32 s3, 0xf000 7641; GFX6-NEXT: s_mov_b32 s2, -1 7642; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7643; GFX6-NEXT: s_add_i32 s8, s8, 12 7644; GFX6-NEXT: s_add_i32 s9, s10, 12 7645; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 7646; GFX6-NEXT: s_lshr_b64 s[6:7], s[6:7], s9 7647; GFX6-NEXT: v_mov_b32_e32 v0, s4 7648; GFX6-NEXT: v_mov_b32_e32 v1, s5 7649; GFX6-NEXT: v_mov_b32_e32 v2, s6 7650; GFX6-NEXT: v_mov_b32_e32 v3, s7 7651; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 7652; GFX6-NEXT: s_endpgm 7653; 7654; GFX9-LABEL: udiv_v2i64_pow2_shl_denom: 7655; GFX9: ; %bb.0: 7656; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 7657; GFX9-NEXT: v_mov_b32_e32 v4, 0 7658; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 7659; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7660; GFX9-NEXT: s_add_i32 s2, s8, 12 7661; GFX9-NEXT: s_add_i32 s8, s10, 12 7662; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], s2 7663; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], s8 7664; GFX9-NEXT: v_mov_b32_e32 v0, s2 7665; GFX9-NEXT: v_mov_b32_e32 v1, s3 7666; GFX9-NEXT: v_mov_b32_e32 v2, s4 7667; GFX9-NEXT: v_mov_b32_e32 v3, s5 7668; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 7669; GFX9-NEXT: s_endpgm 7670 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 7671 %r = udiv <2 x i64> %x, %shl.y 7672 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 7673 ret void 7674} 7675; Unsigned i64 remainder by the odd constant 1235195393993. The opt checks expect the urem to be left in place; the GFX6 and GFX9 checks expect an expansion seeded by v_rcp_f32 and built from v_mul_lo_u32/v_mul_hi_u32 with carry adds. 7676define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 7677; CHECK-LABEL: @urem_i64_oddk_denom( 7678; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], 1235195393993 7679; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* 
[[OUT:%.*]], align 4 7680; CHECK-NEXT: ret void 7681; 7682; GFX6-LABEL: urem_i64_oddk_denom: 7683; GFX6: ; %bb.0: 7684; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f1761f8 7685; GFX6-NEXT: v_mov_b32_e32 v1, 0x4f800000 7686; GFX6-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 7687; GFX6-NEXT: v_rcp_f32_e32 v0, v0 7688; GFX6-NEXT: s_movk_i32 s2, 0xfee0 7689; GFX6-NEXT: s_mov_b32 s3, 0x689e0837 7690; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 7691; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 7692; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 7693; GFX6-NEXT: v_trunc_f32_e32 v1, v1 7694; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 7695; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 7696; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 7697; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7698; GFX6-NEXT: s_mov_b32 s8, s4 7699; GFX6-NEXT: s_movk_i32 s4, 0x11f 7700; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 7701; GFX6-NEXT: v_mul_hi_u32 v3, v0, s3 7702; GFX6-NEXT: v_mul_lo_u32 v4, v1, s3 7703; GFX6-NEXT: v_mul_lo_u32 v5, v0, s3 7704; GFX6-NEXT: s_mov_b32 s12, 0x9761f7c9 7705; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7706; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 7707; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 7708; GFX6-NEXT: v_mul_hi_u32 v4, v0, v5 7709; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 7710; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 7711; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 7712; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 7713; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc 7714; GFX6-NEXT: v_mul_lo_u32 v6, v1, v5 7715; GFX6-NEXT: v_mul_hi_u32 v5, v1, v5 7716; GFX6-NEXT: s_mov_b32 s9, s5 7717; GFX6-NEXT: s_movk_i32 s5, 0x11e 7718; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 7719; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc 7720; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc 7721; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7722; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 7723; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 7724; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 7725; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 7726; GFX6-NEXT: v_mul_hi_u32 v3, 
v0, s3 7727; GFX6-NEXT: v_mul_lo_u32 v4, v1, s3 7728; GFX6-NEXT: s_mov_b32 s11, 0xf000 7729; GFX6-NEXT: s_mov_b32 s10, -1 7730; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7731; GFX6-NEXT: v_mul_lo_u32 v3, v0, s3 7732; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 7733; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 7734; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 7735; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 7736; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 7737; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 7738; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 7739; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 7740; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 7741; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 7742; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 7743; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc 7744; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc 7745; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7746; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 7747; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 7748; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 7749; GFX6-NEXT: v_mul_lo_u32 v2, s6, v1 7750; GFX6-NEXT: v_mul_hi_u32 v3, s6, v0 7751; GFX6-NEXT: v_mul_hi_u32 v4, s6, v1 7752; GFX6-NEXT: v_mul_hi_u32 v5, s7, v1 7753; GFX6-NEXT: v_mul_lo_u32 v1, s7, v1 7754; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7755; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 7756; GFX6-NEXT: v_mul_lo_u32 v4, s7, v0 7757; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 7758; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 7759; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 7760; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc 7761; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 7762; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 7763; GFX6-NEXT: v_mul_lo_u32 v2, v0, s4 7764; GFX6-NEXT: v_mul_hi_u32 v3, v0, s12 7765; GFX6-NEXT: v_mul_lo_u32 v1, v1, s12 7766; GFX6-NEXT: v_mul_lo_u32 v0, v0, s12 7767; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7768; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 7769; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s7, v1 7770; GFX6-NEXT: v_mov_b32_e32 v3, 0x11f 7771; GFX6-NEXT: 
v_sub_i32_e32 v0, vcc, s6, v0 7772; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc 7773; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s12, v0 7774; GFX6-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] 7775; GFX6-NEXT: v_cmp_lt_u32_e64 s[2:3], s5, v5 7776; GFX6-NEXT: s_mov_b32 s6, 0x9761f7c8 7777; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] 7778; GFX6-NEXT: v_cmp_lt_u32_e64 s[2:3], s6, v4 7779; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] 7780; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 7781; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, v5 7782; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s12, v4 7783; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] 7784; GFX6-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] 7785; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 7786; GFX6-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] 7787; GFX6-NEXT: v_mov_b32_e32 v5, s7 7788; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc 7789; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s5, v1 7790; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 7791; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s6, v0 7792; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 7793; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s4, v1 7794; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 7795; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 7796; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 7797; GFX6-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] 7798; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 7799; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 7800; GFX6-NEXT: s_endpgm 7801; 7802; GFX9-LABEL: urem_i64_oddk_denom: 7803; GFX9: ; %bb.0: 7804; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f1761f8 7805; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000 7806; GFX9-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 7807; GFX9-NEXT: v_rcp_f32_e32 v0, v0 7808; GFX9-NEXT: s_movk_i32 s2, 0xfee0 7809; GFX9-NEXT: s_mov_b32 s3, 0x689e0837 7810; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 7811; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 7812; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 7813; GFX9-NEXT: 
v_trunc_f32_e32 v1, v1 7814; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 7815; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 7816; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 7817; GFX9-NEXT: s_movk_i32 s8, 0x11f 7818; GFX9-NEXT: s_mov_b32 s9, 0x9761f7c9 7819; GFX9-NEXT: v_mul_lo_u32 v2, v0, s2 7820; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 7821; GFX9-NEXT: v_mul_lo_u32 v5, v1, s3 7822; GFX9-NEXT: v_mul_lo_u32 v4, v0, s3 7823; GFX9-NEXT: s_mov_b32 s10, 0x9761f7c8 7824; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 7825; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 7826; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 7827; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 7828; GFX9-NEXT: v_mul_hi_u32 v6, v0, v2 7829; GFX9-NEXT: v_mul_hi_u32 v7, v1, v2 7830; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 7831; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 7832; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc 7833; GFX9-NEXT: v_mul_lo_u32 v6, v1, v4 7834; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 7835; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 7836; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc 7837; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc 7838; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 7839; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 7840; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 7841; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 7842; GFX9-NEXT: v_mul_lo_u32 v2, v0, s2 7843; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 7844; GFX9-NEXT: v_mul_lo_u32 v4, v1, s3 7845; GFX9-NEXT: v_mul_lo_u32 v5, v0, s3 7846; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 7847; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 7848; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 7849; GFX9-NEXT: v_mul_hi_u32 v4, v0, v5 7850; GFX9-NEXT: v_mul_hi_u32 v6, v0, v2 7851; GFX9-NEXT: v_mul_hi_u32 v7, v1, v2 7852; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 7853; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 7854; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc 7855; GFX9-NEXT: v_mul_lo_u32 v6, v1, v5 7856; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5 7857; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 7858; GFX9-NEXT: 
v_addc_co_u32_e32 v3, vcc, v4, v5, vcc 7859; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc 7860; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 7861; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 7862; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 7863; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 7864; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7865; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 7866; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 7867; GFX9-NEXT: v_mul_hi_u32 v4, s6, v1 7868; GFX9-NEXT: v_mul_hi_u32 v5, s7, v1 7869; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 7870; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 7871; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 7872; GFX9-NEXT: v_mul_lo_u32 v4, s7, v0 7873; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 7874; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 7875; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 7876; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc 7877; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 7878; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc 7879; GFX9-NEXT: v_mul_lo_u32 v2, v0, s8 7880; GFX9-NEXT: v_mul_hi_u32 v3, v0, s9 7881; GFX9-NEXT: v_mul_lo_u32 v1, v1, s9 7882; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9 7883; GFX9-NEXT: v_mov_b32_e32 v4, 0 7884; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 7885; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 7886; GFX9-NEXT: v_sub_u32_e32 v2, s7, v1 7887; GFX9-NEXT: v_mov_b32_e32 v3, 0x11f 7888; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 7889; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, vcc 7890; GFX9-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s9, v0 7891; GFX9-NEXT: v_subbrev_co_u32_e64 v6, s[2:3], 0, v2, s[0:1] 7892; GFX9-NEXT: s_movk_i32 s6, 0x11e 7893; GFX9-NEXT: v_cmp_lt_u32_e64 s[2:3], s6, v6 7894; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 7895; GFX9-NEXT: v_cmp_lt_u32_e64 s[2:3], s10, v5 7896; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1] 7897; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] 7898; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s8, v6 7899; GFX9-NEXT: v_subrev_co_u32_e64 v3, 
s[0:1], s9, v5 7900; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] 7901; GFX9-NEXT: v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1] 7902; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 7903; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[0:1] 7904; GFX9-NEXT: v_mov_b32_e32 v6, s7 7905; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v6, v1, vcc 7906; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s6, v1 7907; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 7908; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s10, v0 7909; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 7910; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s8, v1 7911; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc 7912; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 7913; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 7914; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v3, s[0:1] 7915; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 7916; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] 7917; GFX9-NEXT: s_endpgm 7918 %r = urem i64 %x, 1235195393993 7919 store i64 %r, i64 addrspace(1)* %out 7920 ret void 7921} 7922; Unsigned i64 remainder by the constant 4096. The opt checks expect the urem to be left in place; the GFX6 and GFX9 checks expect s_and_b32 of the low dword with 0xfff and a zero high dword. 7923define amdgpu_kernel void @urem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 7924; CHECK-LABEL: @urem_i64_pow2k_denom( 7925; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], 4096 7926; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 7927; CHECK-NEXT: ret void 7928; 7929; GFX6-LABEL: urem_i64_pow2k_denom: 7930; GFX6: ; %bb.0: 7931; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 7932; GFX6-NEXT: s_mov_b32 s7, 0xf000 7933; GFX6-NEXT: s_mov_b32 s6, -1 7934; GFX6-NEXT: v_mov_b32_e32 v1, 0 7935; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7936; GFX6-NEXT: s_mov_b32 s4, s0 7937; GFX6-NEXT: s_and_b32 s0, s2, 0xfff 7938; GFX6-NEXT: s_mov_b32 s5, s1 7939; GFX6-NEXT: v_mov_b32_e32 v0, s0 7940; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 7941; GFX6-NEXT: s_endpgm 7942; 7943; GFX9-LABEL: urem_i64_pow2k_denom: 7944; GFX9: ; %bb.0: 7945; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 7946; GFX9-NEXT: v_mov_b32_e32 v1, 0 7947; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7948; 
GFX9-NEXT: s_and_b32 s2, s2, 0xfff 7949; GFX9-NEXT: v_mov_b32_e32 v0, s2 7950; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] 7951; GFX9-NEXT: s_endpgm 7952 %r = urem i64 %x, 4096 7953 store i64 %r, i64 addrspace(1)* %out 7954 ret void 7955} 7956; Unsigned i64 remainder by (4096 << %y). The GFX6 and GFX9 checks expect the mask form x & ((0x1000 << y) - 1), built from s_lshl_b64, s_add_u32/s_addc_u32 with -1, and s_and_b64. 7957define amdgpu_kernel void @urem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 7958; CHECK-LABEL: @urem_i64_pow2_shl_denom( 7959; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 7960; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], [[SHL_Y]] 7961; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 7962; CHECK-NEXT: ret void 7963; 7964; GFX6-LABEL: urem_i64_pow2_shl_denom: 7965; GFX6: ; %bb.0: 7966; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 7967; GFX6-NEXT: s_load_dword s8, s[0:1], 0xd 7968; GFX6-NEXT: s_mov_b32 s3, 0xf000 7969; GFX6-NEXT: s_mov_b32 s2, -1 7970; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7971; GFX6-NEXT: s_mov_b32 s0, s4 7972; GFX6-NEXT: s_mov_b32 s1, s5 7973; GFX6-NEXT: s_mov_b64 s[4:5], 0x1000 7974; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s8 7975; GFX6-NEXT: s_add_u32 s4, s4, -1 7976; GFX6-NEXT: s_addc_u32 s5, s5, -1 7977; GFX6-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] 7978; GFX6-NEXT: v_mov_b32_e32 v0, s4 7979; GFX6-NEXT: v_mov_b32_e32 v1, s5 7980; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 7981; GFX6-NEXT: s_endpgm 7982; 7983; GFX9-LABEL: urem_i64_pow2_shl_denom: 7984; GFX9: ; %bb.0: 7985; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 7986; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 7987; GFX9-NEXT: s_mov_b64 s[0:1], 0x1000 7988; GFX9-NEXT: v_mov_b32_e32 v2, 0 7989; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7990; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 7991; GFX9-NEXT: s_add_u32 s0, s0, -1 7992; GFX9-NEXT: s_addc_u32 s1, s1, -1 7993; GFX9-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1] 7994; GFX9-NEXT: v_mov_b32_e32 v0, s0 7995; GFX9-NEXT: v_mov_b32_e32 v1, s1 7996; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 7997; GFX9-NEXT: s_endpgm 7998 %shl.y = shl i64 4096, %y 7999 %r = 
urem i64 %x, %shl.y 8000 store i64 %r, i64 addrspace(1)* %out 8001 ret void 8002} 8003; Unsigned <2 x i64> remainder by splat 4096. The opt checks expect one scalar urem per element; the GFX6 and GFX9 checks expect s_and_b32 with 0xfff on each low dword and zero high dwords. 8004define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 8005; CHECK-LABEL: @urem_v2i64_pow2k_denom( 8006; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 8007; CHECK-NEXT: [[TMP2:%.*]] = urem i64 [[TMP1]], 4096 8008; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 8009; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 8010; CHECK-NEXT: [[TMP5:%.*]] = urem i64 [[TMP4]], 4096 8011; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 8012; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 8013; CHECK-NEXT: ret void 8014; 8015; GFX6-LABEL: urem_v2i64_pow2k_denom: 8016; GFX6: ; %bb.0: 8017; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 8018; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 8019; GFX6-NEXT: v_mov_b32_e32 v1, 0 8020; GFX6-NEXT: s_mov_b32 s3, 0xf000 8021; GFX6-NEXT: s_mov_b32 s2, -1 8022; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8023; GFX6-NEXT: s_and_b32 s4, s4, 0xfff 8024; GFX6-NEXT: s_and_b32 s5, s6, 0xfff 8025; GFX6-NEXT: v_mov_b32_e32 v0, s4 8026; GFX6-NEXT: v_mov_b32_e32 v2, s5 8027; GFX6-NEXT: v_mov_b32_e32 v3, v1 8028; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 8029; GFX6-NEXT: s_endpgm 8030; 8031; GFX9-LABEL: urem_v2i64_pow2k_denom: 8032; GFX9: ; %bb.0: 8033; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 8034; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8035; GFX9-NEXT: v_mov_b32_e32 v1, 0 8036; GFX9-NEXT: v_mov_b32_e32 v3, v1 8037; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8038; GFX9-NEXT: s_and_b32 s0, s4, 0xfff 8039; GFX9-NEXT: s_and_b32 s1, s6, 0xfff 8040; GFX9-NEXT: v_mov_b32_e32 v0, s0 8041; GFX9-NEXT: v_mov_b32_e32 v2, s1 8042; GFX9-NEXT: global_store_dwordx4 v1, v[0:3], s[2:3] 8043; GFX9-NEXT: s_endpgm 8044 %r = urem <2 x i64> %x, <i64 4096, i64 4096> 8045 store <2 x i64> %r, <2 x i64> 
addrspace(1)* %out 8046 ret void 8047} 8048; Unsigned <2 x i64> remainder by (splat 4096 << %y). The opt checks expect one scalar urem per element; the GFX6 and GFX9 checks expect, per element, s_lshl_b64 of 0x1000, a 64-bit add of -1, and s_and_b64 with x. 8049define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 8050; CHECK-LABEL: @urem_v2i64_pow2_shl_denom( 8051; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 8052; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 8053; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 8054; CHECK-NEXT: [[TMP3:%.*]] = urem i64 [[TMP1]], [[TMP2]] 8055; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 8056; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 8057; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 8058; CHECK-NEXT: [[TMP7:%.*]] = urem i64 [[TMP5]], [[TMP6]] 8059; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 8060; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 8061; CHECK-NEXT: ret void 8062; 8063; GFX6-LABEL: urem_v2i64_pow2_shl_denom: 8064; GFX6: ; %bb.0: 8065; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 8066; GFX6-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0xd 8067; GFX6-NEXT: s_mov_b64 s[12:13], 0x1000 8068; GFX6-NEXT: s_mov_b32 s11, 0xf000 8069; GFX6-NEXT: s_mov_b32 s10, -1 8070; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8071; GFX6-NEXT: s_lshl_b64 s[6:7], s[12:13], s6 8072; GFX6-NEXT: s_lshl_b64 s[4:5], s[12:13], s4 8073; GFX6-NEXT: s_add_u32 s4, s4, -1 8074; GFX6-NEXT: s_addc_u32 s5, s5, -1 8075; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5] 8076; GFX6-NEXT: s_add_u32 s4, s6, -1 8077; GFX6-NEXT: s_addc_u32 s5, s7, -1 8078; GFX6-NEXT: s_and_b64 s[2:3], s[2:3], s[4:5] 8079; GFX6-NEXT: v_mov_b32_e32 v0, s0 8080; GFX6-NEXT: v_mov_b32_e32 v1, s1 8081; GFX6-NEXT: v_mov_b32_e32 v2, s2 8082; GFX6-NEXT: v_mov_b32_e32 v3, s3 8083; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 8084; GFX6-NEXT: s_endpgm 8085; 8086; GFX9-LABEL: urem_v2i64_pow2_shl_denom: 8087; GFX9: ; %bb.0: 8088; 
GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 8089; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 8090; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 8091; GFX9-NEXT: v_mov_b32_e32 v4, 0 8092; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8093; GFX9-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 8094; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 8095; GFX9-NEXT: s_add_u32 s2, s2, -1 8096; GFX9-NEXT: s_addc_u32 s3, s3, -1 8097; GFX9-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3] 8098; GFX9-NEXT: s_add_u32 s4, s10, -1 8099; GFX9-NEXT: s_addc_u32 s5, s11, -1 8100; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] 8101; GFX9-NEXT: v_mov_b32_e32 v0, s2 8102; GFX9-NEXT: v_mov_b32_e32 v1, s3 8103; GFX9-NEXT: v_mov_b32_e32 v2, s4 8104; GFX9-NEXT: v_mov_b32_e32 v3, s5 8105; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 8106; GFX9-NEXT: s_endpgm 8107 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 8108 %r = urem <2 x i64> %x, %shl.y 8109 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 8110 ret void 8111} 8112; Signed i64 divide by the odd constant 1235195. The opt checks expect the sdiv to be left in place; the GFX6 and GFX9 checks expect the sign to be taken with s_ashr_i32 by 31 and folded in with add/xor, a v_rcp_f32-seeded multiply expansion, and a final xor/subtract with that sign before the store. 8113define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 8114; CHECK-LABEL: @sdiv_i64_oddk_denom( 8115; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], 1235195 8116; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 8117; CHECK-NEXT: ret void 8118; 8119; GFX6-LABEL: sdiv_i64_oddk_denom: 8120; GFX6: ; %bb.0: 8121; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f800000 8122; GFX6-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 8123; GFX6-NEXT: v_rcp_f32_e32 v0, v0 8124; GFX6-NEXT: s_mov_b32 s5, 0xffed2705 8125; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 8126; GFX6-NEXT: s_mov_b32 s7, 0xf000 8127; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 8128; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 8129; GFX6-NEXT: v_trunc_f32_e32 v1, v1 8130; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 8131; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 8132; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 8133; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8134; GFX6-NEXT: s_ashr_i32 s8, s3, 31 8135; GFX6-NEXT: s_add_u32 s2, s2, s8 8136; GFX6-NEXT: 
v_mul_lo_u32 v2, v1, s5 8137; GFX6-NEXT: v_mul_hi_u32 v3, v0, s5 8138; GFX6-NEXT: v_mul_lo_u32 v4, v0, s5 8139; GFX6-NEXT: s_mov_b32 s9, s8 8140; GFX6-NEXT: s_addc_u32 s3, s3, s8 8141; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8142; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 8143; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 8144; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 8145; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 8146; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 8147; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 8148; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 8149; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 8150; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 8151; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 8152; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[8:9] 8153; GFX6-NEXT: s_mov_b32 s4, s0 8154; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 8155; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc 8156; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc 8157; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8158; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 8159; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 8160; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 8161; GFX6-NEXT: v_mul_lo_u32 v2, v1, s5 8162; GFX6-NEXT: v_mul_hi_u32 v3, v0, s5 8163; GFX6-NEXT: s_mov_b32 s0, 0x12d8fb 8164; GFX6-NEXT: s_mov_b32 s6, -1 8165; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8166; GFX6-NEXT: v_mul_lo_u32 v3, v0, s5 8167; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 8168; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 8169; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 8170; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 8171; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 8172; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 8173; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 8174; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 8175; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc 8176; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 8177; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 8178; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc 8179; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc 8180; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8181; GFX6-NEXT: 
v_addc_u32_e32 v3, vcc, 0, v4, vcc 8182; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 8183; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 8184; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 8185; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 8186; GFX6-NEXT: v_mul_hi_u32 v4, s2, v1 8187; GFX6-NEXT: v_mul_hi_u32 v5, s3, v1 8188; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1 8189; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8190; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 8191; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 8192; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 8193; GFX6-NEXT: s_mov_b32 s5, s1 8194; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 8195; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 8196; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc 8197; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 8198; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 8199; GFX6-NEXT: v_mul_lo_u32 v4, v1, s0 8200; GFX6-NEXT: v_mul_hi_u32 v5, v0, s0 8201; GFX6-NEXT: v_add_i32_e32 v2, vcc, 2, v0 8202; GFX6-NEXT: v_mul_lo_u32 v8, v0, s0 8203; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 8204; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v0 8205; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc 8206; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 8207; GFX6-NEXT: v_mov_b32_e32 v5, s3 8208; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s2, v8 8209; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc 8210; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s0, v8 8211; GFX6-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v4, vcc 8212; GFX6-NEXT: s_mov_b32 s0, 0x12d8fa 8213; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s0, v5 8214; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 8215; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 8216; GFX6-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc 8217; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v8 8218; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 8219; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 8220; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 8221; GFX6-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[0:1] 8222; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 8223; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 
8224; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc 8225; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 8226; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 8227; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 8228; GFX6-NEXT: v_xor_b32_e32 v1, s8, v1 8229; GFX6-NEXT: v_mov_b32_e32 v2, s8 8230; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 8231; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 8232; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 8233; GFX6-NEXT: s_endpgm 8234; 8235; GFX9-LABEL: sdiv_i64_oddk_denom: 8236; GFX9: ; %bb.0: 8237; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f800000 8238; GFX9-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 8239; GFX9-NEXT: v_rcp_f32_e32 v0, v0 8240; GFX9-NEXT: s_mov_b32 s2, 0xffed2705 8241; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 8242; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 8243; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 8244; GFX9-NEXT: v_trunc_f32_e32 v1, v1 8245; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 8246; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 8247; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 8248; GFX9-NEXT: v_mul_lo_u32 v2, v1, s2 8249; GFX9-NEXT: v_mul_hi_u32 v3, v0, s2 8250; GFX9-NEXT: v_mul_lo_u32 v4, v0, s2 8251; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 8252; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 8253; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 8254; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 8255; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 8256; GFX9-NEXT: v_mul_lo_u32 v5, v1, v4 8257; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 8258; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 8259; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 8260; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc 8261; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 8262; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 8263; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc 8264; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v8, vcc 8265; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 8266; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 8267; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 8268; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 
v1, v3, vcc 8269; GFX9-NEXT: v_mul_lo_u32 v2, v1, s2 8270; GFX9-NEXT: v_mul_hi_u32 v3, v0, s2 8271; GFX9-NEXT: v_mul_lo_u32 v4, v0, s2 8272; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8273; GFX9-NEXT: s_ashr_i32 s2, s7, 31 8274; GFX9-NEXT: s_add_u32 s0, s6, s2 8275; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 8276; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 8277; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 8278; GFX9-NEXT: v_mul_hi_u32 v7, v0, v4 8279; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 8280; GFX9-NEXT: v_mul_hi_u32 v5, v1, v4 8281; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 8282; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 8283; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 8284; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc 8285; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 8286; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v6, v4 8287; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v5, vcc 8288; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 8289; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 8290; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 8291; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 8292; GFX9-NEXT: s_mov_b32 s3, s2 8293; GFX9-NEXT: s_addc_u32 s1, s7, s2 8294; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 8295; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] 8296; GFX9-NEXT: v_mul_lo_u32 v2, s0, v1 8297; GFX9-NEXT: v_mul_hi_u32 v3, s0, v0 8298; GFX9-NEXT: v_mul_hi_u32 v5, s0, v1 8299; GFX9-NEXT: v_mul_hi_u32 v6, s1, v1 8300; GFX9-NEXT: v_mul_lo_u32 v1, s1, v1 8301; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 8302; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc 8303; GFX9-NEXT: v_mul_lo_u32 v5, s1, v0 8304; GFX9-NEXT: v_mul_hi_u32 v0, s1, v0 8305; GFX9-NEXT: s_mov_b32 s3, 0x12d8fb 8306; GFX9-NEXT: v_mov_b32_e32 v4, 0 8307; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 8308; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 8309; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v6, vcc 8310; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 8311; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc 8312; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 
2, v0 8313; GFX9-NEXT: v_mul_lo_u32 v5, v1, s3 8314; GFX9-NEXT: v_mul_hi_u32 v6, v0, s3 8315; GFX9-NEXT: v_mul_lo_u32 v9, v0, s3 8316; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc 8317; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0 8318; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc 8319; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 8320; GFX9-NEXT: v_mov_b32_e32 v6, s1 8321; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s0, v9 8322; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc 8323; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s3, v9 8324; GFX9-NEXT: v_subbrev_co_u32_e32 v10, vcc, 0, v5, vcc 8325; GFX9-NEXT: s_mov_b32 s0, 0x12d8fa 8326; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s0, v6 8327; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 8328; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 8329; GFX9-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc 8330; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v9 8331; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 8332; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 8333; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 8334; GFX9-NEXT: v_cndmask_b32_e64 v5, -1, v6, s[0:1] 8335; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 8336; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc 8337; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc 8338; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 8339; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 8340; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 8341; GFX9-NEXT: v_xor_b32_e32 v1, s2, v1 8342; GFX9-NEXT: v_mov_b32_e32 v2, s2 8343; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s2, v0 8344; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc 8345; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] 8346; GFX9-NEXT: s_endpgm 8347 %r = sdiv i64 %x, 1235195 8348 store i64 %r, i64 addrspace(1)* %out 8349 ret void 8350} 8351; Signed i64 divide by the constant 4096. The opt checks expect the sdiv to be left in place; the GFX6 and GFX9 checks expect a bias of (sign word logically shifted right by 20) added with carry to x, followed by s_ashr_i64 by 12. 8352define amdgpu_kernel void @sdiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 8353; CHECK-LABEL: @sdiv_i64_pow2k_denom( 8354; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], 4096 8355; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 
8356; CHECK-NEXT: ret void 8357; 8358; GFX6-LABEL: sdiv_i64_pow2k_denom: 8359; GFX6: ; %bb.0: 8360; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 8361; GFX6-NEXT: s_mov_b32 s7, 0xf000 8362; GFX6-NEXT: s_mov_b32 s6, -1 8363; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8364; GFX6-NEXT: s_mov_b32 s4, s0 8365; GFX6-NEXT: s_ashr_i32 s0, s3, 31 8366; GFX6-NEXT: s_lshr_b32 s0, s0, 20 8367; GFX6-NEXT: s_add_u32 s0, s2, s0 8368; GFX6-NEXT: s_mov_b32 s5, s1 8369; GFX6-NEXT: s_addc_u32 s1, s3, 0 8370; GFX6-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 8371; GFX6-NEXT: v_mov_b32_e32 v0, s0 8372; GFX6-NEXT: v_mov_b32_e32 v1, s1 8373; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 8374; GFX6-NEXT: s_endpgm 8375; 8376; GFX9-LABEL: sdiv_i64_pow2k_denom: 8377; GFX9: ; %bb.0: 8378; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 8379; GFX9-NEXT: v_mov_b32_e32 v2, 0 8380; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8381; GFX9-NEXT: s_ashr_i32 s4, s3, 31 8382; GFX9-NEXT: s_lshr_b32 s4, s4, 20 8383; GFX9-NEXT: s_add_u32 s2, s2, s4 8384; GFX9-NEXT: s_addc_u32 s3, s3, 0 8385; GFX9-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 8386; GFX9-NEXT: v_mov_b32_e32 v0, s2 8387; GFX9-NEXT: v_mov_b32_e32 v1, s3 8388; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 8389; GFX9-NEXT: s_endpgm 8390 %r = sdiv i64 %x, 4096 8391 store i64 %r, i64 addrspace(1)* %out 8392 ret void 8393} 8394 8395define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 8396; CHECK-LABEL: @sdiv_i64_pow2_shl_denom( 8397; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 8398; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], [[SHL_Y]] 8399; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 8400; CHECK-NEXT: ret void 8401; 8402; GFX6-LABEL: sdiv_i64_pow2_shl_denom: 8403; GFX6: ; %bb.0: 8404; GFX6-NEXT: s_load_dword s4, s[0:1], 0xd 8405; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000 8406; GFX6-NEXT: s_mov_b32 s7, 0xf000 8407; GFX6-NEXT: s_mov_b32 s6, -1 8408; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8409; GFX6-NEXT: s_lshl_b64 
s[2:3], s[2:3], s4 8410; GFX6-NEXT: s_ashr_i32 s8, s3, 31 8411; GFX6-NEXT: s_add_u32 s2, s2, s8 8412; GFX6-NEXT: s_mov_b32 s9, s8 8413; GFX6-NEXT: s_addc_u32 s3, s3, s8 8414; GFX6-NEXT: s_xor_b64 s[10:11], s[2:3], s[8:9] 8415; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s10 8416; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s11 8417; GFX6-NEXT: s_sub_u32 s4, 0, s10 8418; GFX6-NEXT: s_subb_u32 s5, 0, s11 8419; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 8420; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 8421; GFX6-NEXT: v_rcp_f32_e32 v0, v0 8422; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8423; GFX6-NEXT: s_ashr_i32 s12, s3, 31 8424; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 8425; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 8426; GFX6-NEXT: v_trunc_f32_e32 v1, v1 8427; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 8428; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 8429; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 8430; GFX6-NEXT: s_add_u32 s2, s2, s12 8431; GFX6-NEXT: s_mov_b32 s13, s12 8432; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 8433; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 8434; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0 8435; GFX6-NEXT: v_mul_lo_u32 v4, s4, v0 8436; GFX6-NEXT: s_addc_u32 s3, s3, s12 8437; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8438; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 8439; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 8440; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 8441; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 8442; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 8443; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 8444; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 8445; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 8446; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 8447; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 8448; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[12:13] 8449; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 8450; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc 8451; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc 8452; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8453; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 8454; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 8455; 
GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 8456; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 8457; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 8458; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0 8459; GFX6-NEXT: s_mov_b32 s5, s1 8460; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8461; GFX6-NEXT: v_mul_lo_u32 v3, s4, v0 8462; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 8463; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 8464; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 8465; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 8466; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 8467; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 8468; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 8469; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 8470; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc 8471; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 8472; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 8473; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc 8474; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc 8475; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8476; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 8477; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 8478; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 8479; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 8480; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 8481; GFX6-NEXT: v_mul_hi_u32 v4, s2, v1 8482; GFX6-NEXT: v_mul_hi_u32 v5, s3, v1 8483; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1 8484; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8485; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 8486; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 8487; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 8488; GFX6-NEXT: s_mov_b32 s4, s0 8489; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 8490; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 8491; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc 8492; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 8493; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 8494; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 8495; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 8496; GFX6-NEXT: v_mul_lo_u32 v4, s11, v0 8497; GFX6-NEXT: v_mov_b32_e32 v5, s11 8498; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8499; GFX6-NEXT: v_mul_lo_u32 v3, s10, 
v0 8500; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 8501; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s3, v2 8502; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s2, v3 8503; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc 8504; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s10, v3 8505; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] 8506; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v4 8507; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 8508; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v5 8509; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 8510; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v4 8511; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] 8512; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 8513; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] 8514; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 1, v0 8515; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] 8516; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 8517; GFX6-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] 8518; GFX6-NEXT: v_mov_b32_e32 v6, s3 8519; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc 8520; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v2 8521; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 8522; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 8523; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 8524; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s11, v2 8525; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc 8526; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 8527; GFX6-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] 8528; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 8529; GFX6-NEXT: s_xor_b64 s[0:1], s[12:13], s[8:9] 8530; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 8531; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 8532; GFX6-NEXT: v_xor_b32_e32 v1, s1, v1 8533; GFX6-NEXT: v_mov_b32_e32 v2, s1 8534; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 8535; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 8536; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 8537; GFX6-NEXT: s_endpgm 8538; 8539; GFX9-LABEL: sdiv_i64_pow2_shl_denom: 8540; GFX9: ; %bb.0: 8541; GFX9-NEXT: s_load_dword s4, 
s[0:1], 0x34 8542; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 8543; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8544; GFX9-NEXT: s_lshl_b64 s[4:5], s[2:3], s4 8545; GFX9-NEXT: s_ashr_i32 s2, s5, 31 8546; GFX9-NEXT: s_add_u32 s4, s4, s2 8547; GFX9-NEXT: s_mov_b32 s3, s2 8548; GFX9-NEXT: s_addc_u32 s5, s5, s2 8549; GFX9-NEXT: s_xor_b64 s[8:9], s[4:5], s[2:3] 8550; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 8551; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 8552; GFX9-NEXT: s_sub_u32 s10, 0, s8 8553; GFX9-NEXT: s_subb_u32 s4, 0, s9 8554; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 8555; GFX9-NEXT: v_rcp_f32_e32 v0, v0 8556; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 8557; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 8558; GFX9-NEXT: v_trunc_f32_e32 v1, v1 8559; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 8560; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 8561; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 8562; GFX9-NEXT: v_mul_lo_u32 v2, s10, v1 8563; GFX9-NEXT: v_mul_hi_u32 v3, s10, v0 8564; GFX9-NEXT: v_mul_lo_u32 v5, s4, v0 8565; GFX9-NEXT: v_mul_lo_u32 v4, s10, v0 8566; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 8567; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 8568; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 8569; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 8570; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 8571; GFX9-NEXT: v_mul_hi_u32 v6, v1, v4 8572; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 8573; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 8574; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 8575; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc 8576; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 8577; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 8578; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v6, vcc 8579; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v8, vcc 8580; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 8581; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 8582; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 8583; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 8584; GFX9-NEXT: v_mul_lo_u32 v2, s10, v1 8585; GFX9-NEXT: v_mul_hi_u32 v3, s10, v0 8586; GFX9-NEXT: v_mul_lo_u32 v4, s4, 
v0 8587; GFX9-NEXT: v_mul_lo_u32 v5, s10, v0 8588; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 8589; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 8590; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 8591; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 8592; GFX9-NEXT: v_mul_hi_u32 v7, v0, v5 8593; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 8594; GFX9-NEXT: v_mul_hi_u32 v4, v1, v5 8595; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 8596; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 8597; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 8598; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc 8599; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 8600; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 8601; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc 8602; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 8603; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 8604; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8605; GFX9-NEXT: s_ashr_i32 s10, s7, 31 8606; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 8607; GFX9-NEXT: s_add_u32 s0, s6, s10 8608; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 8609; GFX9-NEXT: s_mov_b32 s11, s10 8610; GFX9-NEXT: s_addc_u32 s1, s7, s10 8611; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 8612; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] 8613; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 8614; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 8615; GFX9-NEXT: v_mul_hi_u32 v4, s6, v1 8616; GFX9-NEXT: v_mul_hi_u32 v5, s7, v1 8617; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 8618; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 8619; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 8620; GFX9-NEXT: v_mul_lo_u32 v4, s7, v0 8621; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 8622; GFX9-NEXT: v_mov_b32_e32 v6, s9 8623; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 8624; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 8625; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc 8626; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 8627; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc 8628; GFX9-NEXT: v_mul_lo_u32 v2, s8, v1 8629; GFX9-NEXT: v_mul_hi_u32 v3, s8, v0 8630; GFX9-NEXT: v_mul_lo_u32 
v4, s9, v0 8631; GFX9-NEXT: v_mov_b32_e32 v5, 0 8632; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 8633; GFX9-NEXT: v_mul_lo_u32 v3, s8, v0 8634; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 8635; GFX9-NEXT: v_sub_u32_e32 v4, s7, v2 8636; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s6, v3 8637; GFX9-NEXT: v_subb_co_u32_e64 v4, s[0:1], v4, v6, vcc 8638; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s8, v3 8639; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1] 8640; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v4 8641; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 8642; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v6 8643; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 8644; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v4 8645; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v6, s[0:1] 8646; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], 2, v0 8647; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1] 8648; GFX9-NEXT: v_add_co_u32_e64 v8, s[0:1], 1, v0 8649; GFX9-NEXT: v_addc_co_u32_e64 v9, s[0:1], 0, v1, s[0:1] 8650; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 8651; GFX9-NEXT: v_cndmask_b32_e64 v4, v9, v7, s[0:1] 8652; GFX9-NEXT: v_mov_b32_e32 v7, s7 8653; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v7, v2, vcc 8654; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 8655; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 8656; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 8657; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 8658; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s9, v2 8659; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc 8660; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 8661; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, v6, s[0:1] 8662; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 8663; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], s[2:3] 8664; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 8665; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 8666; GFX9-NEXT: v_xor_b32_e32 v1, s1, v1 8667; GFX9-NEXT: v_mov_b32_e32 v2, s1 8668; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 8669; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc 8670; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], 
s[4:5] 8671; GFX9-NEXT: s_endpgm 8672 %shl.y = shl i64 4096, %y 8673 %r = sdiv i64 %x, %shl.y 8674 store i64 %r, i64 addrspace(1)* %out 8675 ret void 8676} 8677 8678define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 8679; CHECK-LABEL: @sdiv_v2i64_pow2k_denom( 8680; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 8681; CHECK-NEXT: [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096 8682; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 8683; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 8684; CHECK-NEXT: [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4096 8685; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 8686; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 8687; CHECK-NEXT: ret void 8688; 8689; GFX6-LABEL: sdiv_v2i64_pow2k_denom: 8690; GFX6: ; %bb.0: 8691; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 8692; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 8693; GFX6-NEXT: s_mov_b32 s3, 0xf000 8694; GFX6-NEXT: s_mov_b32 s2, -1 8695; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8696; GFX6-NEXT: s_ashr_i32 s8, s5, 31 8697; GFX6-NEXT: s_lshr_b32 s8, s8, 20 8698; GFX6-NEXT: s_add_u32 s4, s4, s8 8699; GFX6-NEXT: s_addc_u32 s5, s5, 0 8700; GFX6-NEXT: s_ashr_i32 s8, s7, 31 8701; GFX6-NEXT: s_ashr_i64 s[4:5], s[4:5], 12 8702; GFX6-NEXT: s_lshr_b32 s8, s8, 20 8703; GFX6-NEXT: s_add_u32 s6, s6, s8 8704; GFX6-NEXT: s_addc_u32 s7, s7, 0 8705; GFX6-NEXT: s_ashr_i64 s[6:7], s[6:7], 12 8706; GFX6-NEXT: v_mov_b32_e32 v0, s4 8707; GFX6-NEXT: v_mov_b32_e32 v1, s5 8708; GFX6-NEXT: v_mov_b32_e32 v2, s6 8709; GFX6-NEXT: v_mov_b32_e32 v3, s7 8710; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 8711; GFX6-NEXT: s_endpgm 8712; 8713; GFX9-LABEL: sdiv_v2i64_pow2k_denom: 8714; GFX9: ; %bb.0: 8715; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 8716; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8717; GFX9-NEXT: v_mov_b32_e32 v4, 0 8718; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) 8719; GFX9-NEXT: s_ashr_i32 s0, s5, 31 8720; GFX9-NEXT: s_lshr_b32 s0, s0, 20 8721; GFX9-NEXT: s_add_u32 s0, s4, s0 8722; GFX9-NEXT: s_addc_u32 s1, s5, 0 8723; GFX9-NEXT: s_ashr_i32 s4, s7, 31 8724; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 8725; GFX9-NEXT: s_lshr_b32 s4, s4, 20 8726; GFX9-NEXT: s_add_u32 s4, s6, s4 8727; GFX9-NEXT: s_addc_u32 s5, s7, 0 8728; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 12 8729; GFX9-NEXT: v_mov_b32_e32 v0, s0 8730; GFX9-NEXT: v_mov_b32_e32 v1, s1 8731; GFX9-NEXT: v_mov_b32_e32 v2, s4 8732; GFX9-NEXT: v_mov_b32_e32 v3, s5 8733; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 8734; GFX9-NEXT: s_endpgm 8735 %r = sdiv <2 x i64> %x, <i64 4096, i64 4096> 8736 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 8737 ret void 8738} 8739 8740define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 8741; CHECK-LABEL: @ssdiv_v2i64_mixed_pow2k_denom( 8742; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 8743; CHECK-NEXT: [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096 8744; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 8745; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 8746; CHECK-NEXT: [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4095 8747; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 8748; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 8749; CHECK-NEXT: ret void 8750; 8751; GFX6-LABEL: ssdiv_v2i64_mixed_pow2k_denom: 8752; GFX6: ; %bb.0: 8753; GFX6-NEXT: v_mov_b32_e32 v0, 0x457ff000 8754; GFX6-NEXT: v_mov_b32_e32 v1, 0x4f800000 8755; GFX6-NEXT: v_mac_f32_e32 v0, 0, v1 8756; GFX6-NEXT: v_rcp_f32_e32 v0, v0 8757; GFX6-NEXT: s_movk_i32 s6, 0xf001 8758; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 8759; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 8760; GFX6-NEXT: s_mov_b32 s7, 0xf000 8761; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 8762; GFX6-NEXT: 
v_mul_f32_e32 v1, 0x2f800000, v0 8763; GFX6-NEXT: v_trunc_f32_e32 v1, v1 8764; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 8765; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 8766; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 8767; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8768; GFX6-NEXT: s_ashr_i32 s8, s1, 31 8769; GFX6-NEXT: s_lshr_b32 s8, s8, 20 8770; GFX6-NEXT: v_mul_hi_u32 v2, v0, s6 8771; GFX6-NEXT: v_mul_lo_u32 v3, v1, s6 8772; GFX6-NEXT: s_add_u32 s0, s0, s8 8773; GFX6-NEXT: s_addc_u32 s1, s1, 0 8774; GFX6-NEXT: s_ashr_i64 s[8:9], s[0:1], 12 8775; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 8776; GFX6-NEXT: v_mul_lo_u32 v3, v0, s6 8777; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 8778; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 8779; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 8780; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 8781; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 8782; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 8783; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 8784; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 8785; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 8786; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 8787; GFX6-NEXT: s_ashr_i32 s10, s3, 31 8788; GFX6-NEXT: s_add_u32 s0, s2, s10 8789; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 8790; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc 8791; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc 8792; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8793; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 8794; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 8795; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 8796; GFX6-NEXT: v_mul_lo_u32 v2, v1, s6 8797; GFX6-NEXT: v_mul_hi_u32 v3, v0, s6 8798; GFX6-NEXT: s_mov_b32 s11, s10 8799; GFX6-NEXT: s_addc_u32 s1, s3, s10 8800; GFX6-NEXT: s_xor_b64 s[0:1], s[0:1], s[10:11] 8801; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8802; GFX6-NEXT: v_mul_lo_u32 v3, v0, s6 8803; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 8804; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 8805; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 8806; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 8807; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 8808; 
GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 8809; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 8810; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 8811; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc 8812; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 8813; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 8814; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc 8815; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc 8816; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8817; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 8818; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 8819; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 8820; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 8821; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 8822; GFX6-NEXT: v_mul_hi_u32 v4, s0, v1 8823; GFX6-NEXT: v_mul_hi_u32 v5, s1, v1 8824; GFX6-NEXT: v_mul_lo_u32 v1, s1, v1 8825; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8826; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 8827; GFX6-NEXT: v_mul_lo_u32 v4, s1, v0 8828; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 8829; GFX6-NEXT: s_movk_i32 s2, 0xfff 8830; GFX6-NEXT: s_mov_b32 s6, -1 8831; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 8832; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 8833; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc 8834; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 8835; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 8836; GFX6-NEXT: v_mul_lo_u32 v4, v1, s2 8837; GFX6-NEXT: v_mul_hi_u32 v5, v0, s2 8838; GFX6-NEXT: v_add_i32_e32 v2, vcc, 2, v0 8839; GFX6-NEXT: v_mul_lo_u32 v8, v0, s2 8840; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 8841; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v0 8842; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc 8843; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 8844; GFX6-NEXT: v_mov_b32_e32 v5, s1 8845; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s0, v8 8846; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc 8847; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s2, v8 8848; GFX6-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v4, vcc 8849; GFX6-NEXT: s_movk_i32 s0, 0xffe 8850; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s0, v5 8851; GFX6-NEXT: 
v_cndmask_b32_e64 v5, 0, -1, vcc 8852; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 8853; GFX6-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc 8854; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v8 8855; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 8856; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 8857; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 8858; GFX6-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[0:1] 8859; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 8860; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 8861; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc 8862; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 8863; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 8864; GFX6-NEXT: v_xor_b32_e32 v0, s10, v0 8865; GFX6-NEXT: v_xor_b32_e32 v1, s10, v1 8866; GFX6-NEXT: v_mov_b32_e32 v3, s10 8867; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s10, v0 8868; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc 8869; GFX6-NEXT: v_mov_b32_e32 v0, s8 8870; GFX6-NEXT: v_mov_b32_e32 v1, s9 8871; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 8872; GFX6-NEXT: s_endpgm 8873; 8874; GFX9-LABEL: ssdiv_v2i64_mixed_pow2k_denom: 8875; GFX9: ; %bb.0: 8876; GFX9-NEXT: v_mov_b32_e32 v0, 0x457ff000 8877; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000 8878; GFX9-NEXT: v_mac_f32_e32 v0, 0, v1 8879; GFX9-NEXT: v_rcp_f32_e32 v0, v0 8880; GFX9-NEXT: s_movk_i32 s8, 0xf001 8881; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 8882; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8883; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 8884; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 8885; GFX9-NEXT: v_trunc_f32_e32 v1, v1 8886; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 8887; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 8888; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 8889; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8890; GFX9-NEXT: s_ashr_i32 s0, s5, 31 8891; GFX9-NEXT: s_lshr_b32 s0, s0, 20 8892; GFX9-NEXT: v_mul_hi_u32 v2, v0, s8 8893; GFX9-NEXT: v_mul_lo_u32 v3, v1, s8 8894; GFX9-NEXT: v_mul_lo_u32 v4, v0, s8 8895; GFX9-NEXT: s_add_u32 s0, s4, s0 8896; GFX9-NEXT: s_addc_u32 
s1, s5, 0 8897; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 8898; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 8899; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 8900; GFX9-NEXT: v_mul_hi_u32 v5, v0, v4 8901; GFX9-NEXT: v_mul_hi_u32 v6, v0, v2 8902; GFX9-NEXT: v_mul_hi_u32 v7, v1, v2 8903; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 8904; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 8905; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc 8906; GFX9-NEXT: v_mul_lo_u32 v6, v1, v4 8907; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 8908; GFX9-NEXT: s_ashr_i64 s[4:5], s[0:1], 12 8909; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 8910; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc 8911; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc 8912; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 8913; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 8914; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 8915; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 8916; GFX9-NEXT: v_mul_lo_u32 v2, v1, s8 8917; GFX9-NEXT: v_mul_hi_u32 v3, v0, s8 8918; GFX9-NEXT: v_mul_lo_u32 v4, v0, s8 8919; GFX9-NEXT: s_ashr_i32 s8, s7, 31 8920; GFX9-NEXT: s_add_u32 s0, s6, s8 8921; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 8922; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 8923; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 8924; GFX9-NEXT: v_mul_hi_u32 v7, v0, v4 8925; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 8926; GFX9-NEXT: v_mul_hi_u32 v5, v1, v4 8927; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 8928; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 8929; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 8930; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc 8931; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 8932; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v6, v4 8933; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v5, vcc 8934; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 8935; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 8936; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 8937; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 8938; GFX9-NEXT: s_mov_b32 s9, s8 8939; GFX9-NEXT: s_addc_u32 s1, s7, s8 8940; GFX9-NEXT: 
v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 8941; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], s[8:9] 8942; GFX9-NEXT: v_mul_lo_u32 v2, s0, v1 8943; GFX9-NEXT: v_mul_hi_u32 v3, s0, v0 8944; GFX9-NEXT: v_mul_hi_u32 v5, s0, v1 8945; GFX9-NEXT: v_mul_hi_u32 v6, s1, v1 8946; GFX9-NEXT: v_mul_lo_u32 v1, s1, v1 8947; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 8948; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc 8949; GFX9-NEXT: v_mul_lo_u32 v5, s1, v0 8950; GFX9-NEXT: v_mul_hi_u32 v0, s1, v0 8951; GFX9-NEXT: s_movk_i32 s6, 0xfff 8952; GFX9-NEXT: v_mov_b32_e32 v4, 0 8953; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 8954; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 8955; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v6, vcc 8956; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 8957; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc 8958; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 2, v0 8959; GFX9-NEXT: v_mul_lo_u32 v5, v1, s6 8960; GFX9-NEXT: v_mul_hi_u32 v6, v0, s6 8961; GFX9-NEXT: v_mul_lo_u32 v9, v0, s6 8962; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc 8963; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0 8964; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc 8965; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 8966; GFX9-NEXT: v_mov_b32_e32 v6, s1 8967; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s0, v9 8968; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc 8969; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v9 8970; GFX9-NEXT: v_subbrev_co_u32_e32 v10, vcc, 0, v5, vcc 8971; GFX9-NEXT: s_movk_i32 s0, 0xffe 8972; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s0, v6 8973; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 8974; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 8975; GFX9-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc 8976; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v9 8977; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 8978; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 8979; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 8980; GFX9-NEXT: v_cndmask_b32_e64 v5, -1, v6, s[0:1] 8981; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 8982; GFX9-NEXT: 
v_cndmask_b32_e32 v2, v7, v2, vcc 8983; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc 8984; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 8985; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 8986; GFX9-NEXT: v_xor_b32_e32 v0, s8, v0 8987; GFX9-NEXT: v_xor_b32_e32 v1, s8, v1 8988; GFX9-NEXT: v_mov_b32_e32 v3, s8 8989; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s8, v0 8990; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc 8991; GFX9-NEXT: v_mov_b32_e32 v0, s4 8992; GFX9-NEXT: v_mov_b32_e32 v1, s5 8993; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 8994; GFX9-NEXT: s_endpgm 8995 %r = sdiv <2 x i64> %x, <i64 4096, i64 4095> 8996 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 8997 ret void 8998} 8999 9000define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 9001; CHECK-LABEL: @sdiv_v2i64_pow2_shl_denom( 9002; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 9003; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 9004; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 9005; CHECK-NEXT: [[TMP3:%.*]] = sdiv i64 [[TMP1]], [[TMP2]] 9006; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 9007; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 9008; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 9009; CHECK-NEXT: [[TMP7:%.*]] = sdiv i64 [[TMP5]], [[TMP6]] 9010; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 9011; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 9012; CHECK-NEXT: ret void 9013; 9014; GFX6-LABEL: sdiv_v2i64_pow2_shl_denom: 9015; GFX6: ; %bb.0: 9016; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 9017; GFX6-NEXT: s_mov_b64 s[12:13], 0x1000 9018; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9019; GFX6-NEXT: s_lshl_b64 s[8:9], s[12:13], s8 9020; GFX6-NEXT: s_lshl_b64 s[2:3], s[12:13], s10 9021; GFX6-NEXT: s_ashr_i32 
s14, s9, 31 9022; GFX6-NEXT: s_add_u32 s8, s8, s14 9023; GFX6-NEXT: s_mov_b32 s15, s14 9024; GFX6-NEXT: s_addc_u32 s9, s9, s14 9025; GFX6-NEXT: s_xor_b64 s[12:13], s[8:9], s[14:15] 9026; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s12 9027; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s13 9028; GFX6-NEXT: s_sub_u32 s10, 0, s12 9029; GFX6-NEXT: s_subb_u32 s11, 0, s13 9030; GFX6-NEXT: s_ashr_i32 s16, s5, 31 9031; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 9032; GFX6-NEXT: v_rcp_f32_e32 v0, v0 9033; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 9034; GFX6-NEXT: s_add_u32 s0, s4, s16 9035; GFX6-NEXT: s_mov_b32 s17, s16 9036; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 9037; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 9038; GFX6-NEXT: v_trunc_f32_e32 v1, v1 9039; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 9040; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 9041; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 9042; GFX6-NEXT: s_addc_u32 s1, s5, s16 9043; GFX6-NEXT: s_xor_b64 s[4:5], s[0:1], s[16:17] 9044; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 9045; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 9046; GFX6-NEXT: v_mul_lo_u32 v5, s11, v0 9047; GFX6-NEXT: v_mul_lo_u32 v4, s10, v0 9048; GFX6-NEXT: s_xor_b64 s[14:15], s[16:17], s[14:15] 9049; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9050; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 9051; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 9052; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 9053; GFX6-NEXT: v_mul_hi_u32 v7, v0, v2 9054; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 9055; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 9056; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 9057; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc 9058; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 9059; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 9060; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 9061; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc 9062; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc 9063; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9064; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 9065; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 9066; GFX6-NEXT: 
v_addc_u32_e32 v1, vcc, v1, v3, vcc 9067; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 9068; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 9069; GFX6-NEXT: v_mul_lo_u32 v4, s11, v0 9070; GFX6-NEXT: s_mov_b32 s11, 0xf000 9071; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9072; GFX6-NEXT: v_mul_lo_u32 v3, s10, v0 9073; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 9074; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 9075; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 9076; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 9077; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 9078; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 9079; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 9080; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 9081; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc 9082; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 9083; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 9084; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc 9085; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc 9086; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9087; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 9088; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 9089; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 9090; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 9091; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 9092; GFX6-NEXT: v_mul_hi_u32 v4, s4, v1 9093; GFX6-NEXT: v_mul_hi_u32 v5, s5, v1 9094; GFX6-NEXT: v_mul_lo_u32 v1, s5, v1 9095; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9096; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 9097; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0 9098; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 9099; GFX6-NEXT: s_mov_b32 s10, -1 9100; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 9101; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 9102; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc 9103; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 9104; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 9105; GFX6-NEXT: v_mul_lo_u32 v2, s12, v1 9106; GFX6-NEXT: v_mul_hi_u32 v3, s12, v0 9107; GFX6-NEXT: v_mul_lo_u32 v4, s13, v0 9108; GFX6-NEXT: v_mov_b32_e32 v5, s13 9109; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9110; GFX6-NEXT: v_mul_lo_u32 v3, s12, v0 
9111; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 9112; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s5, v2 9113; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v3 9114; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc 9115; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s12, v3 9116; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] 9117; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4 9118; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 9119; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v5 9120; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 9121; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v4 9122; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] 9123; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 9124; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] 9125; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 1, v0 9126; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] 9127; GFX6-NEXT: s_ashr_i32 s4, s3, 31 9128; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 9129; GFX6-NEXT: s_add_u32 s2, s2, s4 9130; GFX6-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] 9131; GFX6-NEXT: v_mov_b32_e32 v6, s5 9132; GFX6-NEXT: s_mov_b32 s5, s4 9133; GFX6-NEXT: s_addc_u32 s3, s3, s4 9134; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] 9135; GFX6-NEXT: v_cvt_f32_u32_e32 v8, s2 9136; GFX6-NEXT: v_cvt_f32_u32_e32 v9, s3 9137; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc 9138; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 9139; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 9140; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 9141; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 9142; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s13, v2 9143; GFX6-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 9144; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc 9145; GFX6-NEXT: v_rcp_f32_e32 v3, v8 9146; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 9147; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 9148; GFX6-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] 9149; GFX6-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 9150; GFX6-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 9151; GFX6-NEXT: v_trunc_f32_e32 v4, v4 9152; GFX6-NEXT: 
v_mac_f32_e32 v3, 0xcf800000, v4 9153; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 9154; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 9155; GFX6-NEXT: s_sub_u32 s0, 0, s2 9156; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 9157; GFX6-NEXT: v_mul_hi_u32 v2, s0, v3 9158; GFX6-NEXT: v_mul_lo_u32 v5, s0, v4 9159; GFX6-NEXT: s_subb_u32 s1, 0, s3 9160; GFX6-NEXT: v_mul_lo_u32 v6, s1, v3 9161; GFX6-NEXT: s_ashr_i32 s12, s7, 31 9162; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 9163; GFX6-NEXT: v_mul_lo_u32 v5, s0, v3 9164; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6 9165; GFX6-NEXT: v_mul_lo_u32 v6, v3, v2 9166; GFX6-NEXT: v_mul_hi_u32 v7, v3, v5 9167; GFX6-NEXT: v_mul_hi_u32 v8, v3, v2 9168; GFX6-NEXT: v_mul_hi_u32 v9, v4, v2 9169; GFX6-NEXT: v_mul_lo_u32 v2, v4, v2 9170; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 9171; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc 9172; GFX6-NEXT: v_mul_lo_u32 v8, v4, v5 9173; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5 9174; GFX6-NEXT: s_mov_b32 s13, s12 9175; GFX6-NEXT: v_xor_b32_e32 v0, s14, v0 9176; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 9177; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc 9178; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc 9179; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 9180; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 9181; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9182; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc 9183; GFX6-NEXT: v_mul_lo_u32 v4, s0, v3 9184; GFX6-NEXT: v_mul_hi_u32 v5, s0, v2 9185; GFX6-NEXT: v_mul_lo_u32 v6, s1, v2 9186; GFX6-NEXT: v_xor_b32_e32 v1, s15, v1 9187; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 9188; GFX6-NEXT: v_mul_lo_u32 v5, s0, v2 9189; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 9190; GFX6-NEXT: v_mul_lo_u32 v8, v2, v4 9191; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5 9192; GFX6-NEXT: v_mul_hi_u32 v10, v2, v4 9193; GFX6-NEXT: v_mul_hi_u32 v7, v3, v5 9194; GFX6-NEXT: v_mul_lo_u32 v5, v3, v5 9195; GFX6-NEXT: v_mul_hi_u32 v6, v3, v4 9196; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 9197; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, 
v10, vcc 9198; GFX6-NEXT: v_mul_lo_u32 v4, v3, v4 9199; GFX6-NEXT: v_add_i32_e32 v5, vcc, v8, v5 9200; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v7, vcc 9201; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc 9202; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 9203; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 9204; GFX6-NEXT: s_add_u32 s0, s6, s12 9205; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 9206; GFX6-NEXT: s_addc_u32 s1, s7, s12 9207; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc 9208; GFX6-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13] 9209; GFX6-NEXT: v_mul_lo_u32 v4, s6, v3 9210; GFX6-NEXT: v_mul_hi_u32 v5, s6, v2 9211; GFX6-NEXT: v_mul_hi_u32 v7, s6, v3 9212; GFX6-NEXT: v_mul_hi_u32 v8, s7, v3 9213; GFX6-NEXT: v_mul_lo_u32 v3, s7, v3 9214; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 9215; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc 9216; GFX6-NEXT: v_mul_lo_u32 v7, s7, v2 9217; GFX6-NEXT: v_mul_hi_u32 v2, s7, v2 9218; GFX6-NEXT: v_mov_b32_e32 v6, s15 9219; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v7 9220; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v2, vcc 9221; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc 9222; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 9223; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 9224; GFX6-NEXT: v_mul_lo_u32 v4, s2, v3 9225; GFX6-NEXT: v_mul_hi_u32 v5, s2, v2 9226; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s14, v0 9227; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc 9228; GFX6-NEXT: v_mul_lo_u32 v6, s3, v2 9229; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 9230; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 9231; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 9232; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s7, v4 9233; GFX6-NEXT: v_mov_b32_e32 v7, s3 9234; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s6, v5 9235; GFX6-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v7, vcc 9236; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s2, v5 9237; GFX6-NEXT: v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1] 9238; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v6 9239; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] 9240; GFX6-NEXT: 
v_cmp_le_u32_e64 s[0:1], s2, v7 9241; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 9242; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v6 9243; GFX6-NEXT: v_cndmask_b32_e64 v6, v8, v7, s[0:1] 9244; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 2, v2 9245; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1] 9246; GFX6-NEXT: v_add_i32_e64 v9, s[0:1], 1, v2 9247; GFX6-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] 9248; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 9249; GFX6-NEXT: v_cndmask_b32_e64 v6, v10, v8, s[0:1] 9250; GFX6-NEXT: v_mov_b32_e32 v8, s7 9251; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v8, v4, vcc 9252; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v4 9253; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 9254; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v5 9255; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 9256; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s3, v4 9257; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc 9258; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 9259; GFX6-NEXT: v_cndmask_b32_e64 v4, v9, v7, s[0:1] 9260; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 9261; GFX6-NEXT: s_xor_b64 s[0:1], s[12:13], s[4:5] 9262; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 9263; GFX6-NEXT: v_xor_b32_e32 v2, s0, v2 9264; GFX6-NEXT: v_xor_b32_e32 v3, s1, v3 9265; GFX6-NEXT: v_mov_b32_e32 v4, s1 9266; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s0, v2 9267; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc 9268; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9269; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 9270; GFX6-NEXT: s_endpgm 9271; 9272; GFX9-LABEL: sdiv_v2i64_pow2_shl_denom: 9273; GFX9: ; %bb.0: 9274; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 9275; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 9276; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9277; GFX9-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 9278; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 9279; GFX9-NEXT: s_ashr_i32 s12, s3, 31 9280; GFX9-NEXT: s_add_u32 s2, s2, s12 9281; GFX9-NEXT: s_mov_b32 s13, s12 9282; GFX9-NEXT: s_addc_u32 s3, s3, s12 9283; GFX9-NEXT: s_xor_b64 s[8:9], s[2:3], s[12:13] 
9284; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 9285; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 9286; GFX9-NEXT: s_sub_u32 s2, 0, s8 9287; GFX9-NEXT: s_subb_u32 s3, 0, s9 9288; GFX9-NEXT: s_ashr_i32 s14, s5, 31 9289; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 9290; GFX9-NEXT: v_rcp_f32_e32 v0, v0 9291; GFX9-NEXT: s_mov_b32 s15, s14 9292; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 9293; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 9294; GFX9-NEXT: v_trunc_f32_e32 v1, v1 9295; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 9296; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 9297; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 9298; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 9299; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 9300; GFX9-NEXT: v_mul_lo_u32 v5, s3, v0 9301; GFX9-NEXT: v_mul_lo_u32 v4, s2, v0 9302; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 9303; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 9304; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 9305; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 9306; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 9307; GFX9-NEXT: v_mul_hi_u32 v6, v1, v4 9308; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 9309; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 9310; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 9311; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc 9312; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 9313; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 9314; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v6, vcc 9315; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v8, vcc 9316; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 9317; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 9318; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 9319; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 9320; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 9321; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 9322; GFX9-NEXT: v_mul_lo_u32 v4, s3, v0 9323; GFX9-NEXT: v_mul_lo_u32 v5, s2, v0 9324; GFX9-NEXT: s_add_u32 s2, s4, s14 9325; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 9326; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 9327; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 9328; GFX9-NEXT: v_mul_hi_u32 v7, v0, v5 9329; GFX9-NEXT: 
v_mul_hi_u32 v8, v0, v2 9330; GFX9-NEXT: v_mul_hi_u32 v4, v1, v5 9331; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 9332; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 9333; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 9334; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc 9335; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 9336; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 9337; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc 9338; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 9339; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 9340; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 9341; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 9342; GFX9-NEXT: s_addc_u32 s3, s5, s14 9343; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 9344; GFX9-NEXT: s_xor_b64 s[4:5], s[2:3], s[14:15] 9345; GFX9-NEXT: v_mul_lo_u32 v2, s4, v1 9346; GFX9-NEXT: v_mul_hi_u32 v3, s4, v0 9347; GFX9-NEXT: v_mul_hi_u32 v4, s4, v1 9348; GFX9-NEXT: v_mul_hi_u32 v5, s5, v1 9349; GFX9-NEXT: v_mul_lo_u32 v1, s5, v1 9350; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 9351; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 9352; GFX9-NEXT: v_mul_lo_u32 v4, s5, v0 9353; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0 9354; GFX9-NEXT: v_mov_b32_e32 v6, s9 9355; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 9356; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 9357; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 9358; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc 9359; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v0, v1 9360; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc 9361; GFX9-NEXT: v_mul_lo_u32 v3, s8, v2 9362; GFX9-NEXT: v_mul_hi_u32 v4, s8, v1 9363; GFX9-NEXT: v_mul_lo_u32 v5, s9, v1 9364; GFX9-NEXT: s_xor_b64 s[12:13], s[14:15], s[12:13] 9365; GFX9-NEXT: v_mov_b32_e32 v0, 0 9366; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 9367; GFX9-NEXT: v_mul_lo_u32 v4, s8, v1 9368; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 9369; GFX9-NEXT: v_sub_u32_e32 v5, s5, v3 9370; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, s4, v4 9371; GFX9-NEXT: v_subb_co_u32_e64 v5, s[0:1], v5, v6, vcc 
9372; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s8, v4 9373; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1] 9374; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v5 9375; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 9376; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v6 9377; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 9378; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v5 9379; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v6, s[0:1] 9380; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], 2, v1 9381; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v2, s[0:1] 9382; GFX9-NEXT: v_add_co_u32_e64 v8, s[0:1], 1, v1 9383; GFX9-NEXT: v_addc_co_u32_e64 v9, s[0:1], 0, v2, s[0:1] 9384; GFX9-NEXT: s_ashr_i32 s4, s11, 31 9385; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 9386; GFX9-NEXT: s_add_u32 s10, s10, s4 9387; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v7, s[0:1] 9388; GFX9-NEXT: v_mov_b32_e32 v7, s5 9389; GFX9-NEXT: s_mov_b32 s5, s4 9390; GFX9-NEXT: s_addc_u32 s11, s11, s4 9391; GFX9-NEXT: s_xor_b64 s[10:11], s[10:11], s[4:5] 9392; GFX9-NEXT: v_cvt_f32_u32_e32 v9, s10 9393; GFX9-NEXT: v_cvt_f32_u32_e32 v10, s11 9394; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc 9395; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 9396; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 9397; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v4 9398; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc 9399; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s9, v3 9400; GFX9-NEXT: v_mac_f32_e32 v9, 0x4f800000, v10 9401; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v4, vcc 9402; GFX9-NEXT: v_rcp_f32_e32 v4, v9 9403; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 9404; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 9405; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v6, s[0:1] 9406; GFX9-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 9407; GFX9-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 9408; GFX9-NEXT: v_trunc_f32_e32 v5, v5 9409; GFX9-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 9410; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 9411; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 9412; GFX9-NEXT: s_sub_u32 s0, 0, s10 9413; 
GFX9-NEXT: s_subb_u32 s1, 0, s11 9414; GFX9-NEXT: v_mul_hi_u32 v6, s0, v4 9415; GFX9-NEXT: v_mul_lo_u32 v7, s0, v5 9416; GFX9-NEXT: v_mul_lo_u32 v8, s1, v4 9417; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 9418; GFX9-NEXT: v_mul_lo_u32 v3, s0, v4 9419; GFX9-NEXT: v_add_u32_e32 v6, v6, v7 9420; GFX9-NEXT: v_add_u32_e32 v6, v6, v8 9421; GFX9-NEXT: v_mul_lo_u32 v7, v4, v6 9422; GFX9-NEXT: v_mul_hi_u32 v8, v4, v3 9423; GFX9-NEXT: v_mul_hi_u32 v9, v4, v6 9424; GFX9-NEXT: v_mul_hi_u32 v10, v5, v6 9425; GFX9-NEXT: v_mul_lo_u32 v6, v5, v6 9426; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 9427; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc 9428; GFX9-NEXT: v_mul_lo_u32 v9, v5, v3 9429; GFX9-NEXT: v_mul_hi_u32 v3, v5, v3 9430; GFX9-NEXT: s_ashr_i32 s8, s7, 31 9431; GFX9-NEXT: s_mov_b32 s9, s8 9432; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 9433; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc 9434; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v10, vcc 9435; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 9436; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc 9437; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 9438; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v5, v6, vcc 9439; GFX9-NEXT: v_mul_lo_u32 v5, s0, v4 9440; GFX9-NEXT: v_mul_hi_u32 v6, s0, v3 9441; GFX9-NEXT: v_mul_lo_u32 v7, s1, v3 9442; GFX9-NEXT: v_mul_lo_u32 v8, s0, v3 9443; GFX9-NEXT: s_add_u32 s0, s6, s8 9444; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 9445; GFX9-NEXT: v_add_u32_e32 v5, v5, v7 9446; GFX9-NEXT: v_mul_lo_u32 v9, v3, v5 9447; GFX9-NEXT: v_mul_hi_u32 v10, v3, v8 9448; GFX9-NEXT: v_mul_hi_u32 v11, v3, v5 9449; GFX9-NEXT: v_mul_hi_u32 v7, v4, v8 9450; GFX9-NEXT: v_mul_lo_u32 v8, v4, v8 9451; GFX9-NEXT: v_mul_hi_u32 v6, v4, v5 9452; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 9453; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, 0, v11, vcc 9454; GFX9-NEXT: v_mul_lo_u32 v5, v4, v5 9455; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 9456; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v10, v7, vcc 9457; GFX9-NEXT: v_addc_co_u32_e32 v6, 
vcc, 0, v6, vcc 9458; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 9459; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc 9460; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 9461; GFX9-NEXT: s_addc_u32 s1, s7, s8 9462; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc 9463; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[8:9] 9464; GFX9-NEXT: v_mul_lo_u32 v5, s6, v4 9465; GFX9-NEXT: v_mul_hi_u32 v6, s6, v3 9466; GFX9-NEXT: v_mul_hi_u32 v8, s6, v4 9467; GFX9-NEXT: v_mul_hi_u32 v9, s7, v4 9468; GFX9-NEXT: v_mul_lo_u32 v4, s7, v4 9469; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 9470; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc 9471; GFX9-NEXT: v_mul_lo_u32 v8, s7, v3 9472; GFX9-NEXT: v_mul_hi_u32 v3, s7, v3 9473; GFX9-NEXT: v_xor_b32_e32 v1, s12, v1 9474; GFX9-NEXT: v_xor_b32_e32 v2, s13, v2 9475; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 9476; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v3, vcc 9477; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v9, vcc 9478; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 9479; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc 9480; GFX9-NEXT: v_mul_lo_u32 v5, s10, v4 9481; GFX9-NEXT: v_mul_hi_u32 v6, s10, v3 9482; GFX9-NEXT: v_mul_lo_u32 v8, s11, v3 9483; GFX9-NEXT: v_mov_b32_e32 v7, s13 9484; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s12, v1 9485; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 9486; GFX9-NEXT: v_mul_lo_u32 v6, s10, v3 9487; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v7, vcc 9488; GFX9-NEXT: v_add_u32_e32 v5, v5, v8 9489; GFX9-NEXT: v_sub_u32_e32 v7, s7, v5 9490; GFX9-NEXT: v_mov_b32_e32 v8, s11 9491; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, s6, v6 9492; GFX9-NEXT: v_subb_co_u32_e64 v7, s[0:1], v7, v8, vcc 9493; GFX9-NEXT: v_subrev_co_u32_e64 v8, s[0:1], s10, v6 9494; GFX9-NEXT: v_subbrev_co_u32_e64 v7, s[0:1], 0, v7, s[0:1] 9495; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v7 9496; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] 9497; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v8 9498; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] 9499; GFX9-NEXT: 
v_cmp_eq_u32_e64 s[0:1], s11, v7 9500; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v8, s[0:1] 9501; GFX9-NEXT: v_add_co_u32_e64 v8, s[0:1], 2, v3 9502; GFX9-NEXT: v_addc_co_u32_e64 v9, s[0:1], 0, v4, s[0:1] 9503; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v3 9504; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v4, s[0:1] 9505; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 9506; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v9, s[0:1] 9507; GFX9-NEXT: v_mov_b32_e32 v9, s7 9508; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v9, v5, vcc 9509; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v5 9510; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc 9511; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v6 9512; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 9513; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s11, v5 9514; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc 9515; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 9516; GFX9-NEXT: v_cndmask_b32_e64 v5, v10, v8, s[0:1] 9517; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 9518; GFX9-NEXT: s_xor_b64 s[0:1], s[8:9], s[4:5] 9519; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc 9520; GFX9-NEXT: v_xor_b32_e32 v3, s0, v3 9521; GFX9-NEXT: v_xor_b32_e32 v4, s1, v4 9522; GFX9-NEXT: v_mov_b32_e32 v5, s1 9523; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s0, v3 9524; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v5, vcc 9525; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9526; GFX9-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] 9527; GFX9-NEXT: s_endpgm 9528 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 9529 %r = sdiv <2 x i64> %x, %shl.y 9530 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 9531 ret void 9532} 9533 9534define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 9535; CHECK-LABEL: @srem_i64_oddk_denom( 9536; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], 1235195 9537; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 9538; CHECK-NEXT: ret void 9539; 9540; GFX6-LABEL: srem_i64_oddk_denom: 9541; GFX6: ; %bb.0: 9542; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f800000 9543; GFX6-NEXT: 
v_madak_f32 v0, 0, v0, 0x4996c7d8 9544; GFX6-NEXT: v_rcp_f32_e32 v0, v0 9545; GFX6-NEXT: s_mov_b32 s4, 0xffed2705 9546; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 9547; GFX6-NEXT: s_mov_b32 s7, 0xf000 9548; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 9549; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 9550; GFX6-NEXT: v_trunc_f32_e32 v1, v1 9551; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 9552; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 9553; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 9554; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9555; GFX6-NEXT: s_ashr_i32 s8, s3, 31 9556; GFX6-NEXT: s_add_u32 s2, s2, s8 9557; GFX6-NEXT: v_mul_lo_u32 v2, v1, s4 9558; GFX6-NEXT: v_mul_hi_u32 v3, v0, s4 9559; GFX6-NEXT: v_mul_lo_u32 v4, v0, s4 9560; GFX6-NEXT: s_mov_b32 s9, s8 9561; GFX6-NEXT: s_addc_u32 s3, s3, s8 9562; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9563; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 9564; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 9565; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 9566; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 9567; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 9568; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 9569; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 9570; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 9571; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 9572; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 9573; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[8:9] 9574; GFX6-NEXT: s_mov_b32 s5, s1 9575; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 9576; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc 9577; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc 9578; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9579; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 9580; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 9581; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 9582; GFX6-NEXT: v_mul_lo_u32 v2, v1, s4 9583; GFX6-NEXT: v_mul_hi_u32 v3, v0, s4 9584; GFX6-NEXT: s_mov_b32 s6, -1 9585; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9586; GFX6-NEXT: v_mul_lo_u32 v3, v0, s4 9587; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 9588; GFX6-NEXT: v_mul_lo_u32 v6, v0, 
v2 9589; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 9590; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 9591; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 9592; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 9593; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 9594; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 9595; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc 9596; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 9597; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 9598; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc 9599; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc 9600; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9601; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 9602; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 9603; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 9604; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 9605; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 9606; GFX6-NEXT: v_mul_hi_u32 v4, s2, v1 9607; GFX6-NEXT: v_mul_hi_u32 v5, s3, v1 9608; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1 9609; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9610; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 9611; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 9612; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 9613; GFX6-NEXT: s_mov_b32 s4, s0 9614; GFX6-NEXT: s_mov_b32 s0, 0x12d8fb 9615; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 9616; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 9617; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc 9618; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 9619; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 9620; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 9621; GFX6-NEXT: v_mul_hi_u32 v2, v0, s0 9622; GFX6-NEXT: v_mul_lo_u32 v0, v0, s0 9623; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 9624; GFX6-NEXT: v_mov_b32_e32 v2, s3 9625; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 9626; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 9627; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s0, v0 9628; GFX6-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v1, vcc 9629; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s0, v2 9630; GFX6-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v3, vcc 9631; GFX6-NEXT: s_mov_b32 s0, 0x12d8fa 9632; GFX6-NEXT: 
v_cmp_lt_u32_e32 vcc, s0, v2 9633; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 9634; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 9635; GFX6-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc 9636; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 9637; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v0 9638; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 9639; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 9640; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 9641; GFX6-NEXT: v_cndmask_b32_e64 v5, -1, v5, s[0:1] 9642; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 9643; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 9644; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 9645; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 9646; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 9647; GFX6-NEXT: v_xor_b32_e32 v1, s8, v1 9648; GFX6-NEXT: v_mov_b32_e32 v2, s8 9649; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 9650; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 9651; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 9652; GFX6-NEXT: s_endpgm 9653; 9654; GFX9-LABEL: srem_i64_oddk_denom: 9655; GFX9: ; %bb.0: 9656; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f800000 9657; GFX9-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 9658; GFX9-NEXT: v_rcp_f32_e32 v0, v0 9659; GFX9-NEXT: s_mov_b32 s2, 0xffed2705 9660; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 9661; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 9662; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 9663; GFX9-NEXT: v_trunc_f32_e32 v1, v1 9664; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 9665; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 9666; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 9667; GFX9-NEXT: v_mul_lo_u32 v2, v1, s2 9668; GFX9-NEXT: v_mul_hi_u32 v3, v0, s2 9669; GFX9-NEXT: v_mul_lo_u32 v4, v0, s2 9670; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 9671; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 9672; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 9673; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 9674; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 9675; GFX9-NEXT: v_mul_lo_u32 v5, v1, v4 9676; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 9677; GFX9-NEXT: 
v_mul_hi_u32 v8, v1, v2 9678; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 9679; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc 9680; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 9681; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 9682; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc 9683; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v8, vcc 9684; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 9685; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 9686; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 9687; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 9688; GFX9-NEXT: v_mul_lo_u32 v2, v1, s2 9689; GFX9-NEXT: v_mul_hi_u32 v3, v0, s2 9690; GFX9-NEXT: v_mul_lo_u32 v4, v0, s2 9691; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9692; GFX9-NEXT: s_ashr_i32 s2, s7, 31 9693; GFX9-NEXT: s_add_u32 s0, s6, s2 9694; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 9695; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 9696; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 9697; GFX9-NEXT: v_mul_hi_u32 v7, v0, v4 9698; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 9699; GFX9-NEXT: v_mul_hi_u32 v5, v1, v4 9700; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 9701; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 9702; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 9703; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc 9704; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 9705; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v6, v4 9706; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v5, vcc 9707; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 9708; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 9709; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 9710; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 9711; GFX9-NEXT: s_mov_b32 s3, s2 9712; GFX9-NEXT: s_addc_u32 s1, s7, s2 9713; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 9714; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] 9715; GFX9-NEXT: v_mul_lo_u32 v2, s0, v1 9716; GFX9-NEXT: v_mul_hi_u32 v3, s0, v0 9717; GFX9-NEXT: v_mul_hi_u32 v4, s0, v1 9718; GFX9-NEXT: v_mul_hi_u32 v5, s1, v1 9719; GFX9-NEXT: v_mul_lo_u32 v1, s1, v1 9720; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 
v3, v2 9721; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 9722; GFX9-NEXT: v_mul_lo_u32 v4, s1, v0 9723; GFX9-NEXT: v_mul_hi_u32 v0, s1, v0 9724; GFX9-NEXT: s_mov_b32 s3, 0x12d8fb 9725; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 9726; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 9727; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc 9728; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 9729; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc 9730; GFX9-NEXT: v_mul_lo_u32 v1, v1, s3 9731; GFX9-NEXT: v_mul_hi_u32 v2, v0, s3 9732; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 9733; GFX9-NEXT: v_mov_b32_e32 v3, 0 9734; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 9735; GFX9-NEXT: v_mov_b32_e32 v2, s1 9736; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 9737; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 9738; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s3, v0 9739; GFX9-NEXT: v_subbrev_co_u32_e32 v4, vcc, 0, v1, vcc 9740; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s3, v2 9741; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v4, vcc 9742; GFX9-NEXT: s_mov_b32 s0, 0x12d8fa 9743; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s0, v2 9744; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 9745; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 9746; GFX9-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc 9747; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 9748; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v0 9749; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc 9750; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 9751; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 9752; GFX9-NEXT: v_cndmask_b32_e64 v6, -1, v6, s[0:1] 9753; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 9754; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 9755; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 9756; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] 9757; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 9758; GFX9-NEXT: v_xor_b32_e32 v1, s2, v1 9759; GFX9-NEXT: v_mov_b32_e32 v2, s2 9760; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s2, v0 9761; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc 9762; 
GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[4:5] 9763; GFX9-NEXT: s_endpgm 9764 %r = srem i64 %x, 1235195 9765 store i64 %r, i64 addrspace(1)* %out 9766 ret void 9767} 9768 9769define amdgpu_kernel void @srem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 9770; CHECK-LABEL: @srem_i64_pow2k_denom( 9771; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], 4096 9772; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 9773; CHECK-NEXT: ret void 9774; 9775; GFX6-LABEL: srem_i64_pow2k_denom: 9776; GFX6: ; %bb.0: 9777; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 9778; GFX6-NEXT: s_mov_b32 s7, 0xf000 9779; GFX6-NEXT: s_mov_b32 s6, -1 9780; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9781; GFX6-NEXT: s_mov_b32 s4, s0 9782; GFX6-NEXT: s_ashr_i32 s0, s3, 31 9783; GFX6-NEXT: s_lshr_b32 s0, s0, 20 9784; GFX6-NEXT: s_add_u32 s0, s2, s0 9785; GFX6-NEXT: s_mov_b32 s5, s1 9786; GFX6-NEXT: s_addc_u32 s1, s3, 0 9787; GFX6-NEXT: s_and_b32 s0, s0, 0xfffff000 9788; GFX6-NEXT: s_sub_u32 s0, s2, s0 9789; GFX6-NEXT: s_subb_u32 s1, s3, s1 9790; GFX6-NEXT: v_mov_b32_e32 v0, s0 9791; GFX6-NEXT: v_mov_b32_e32 v1, s1 9792; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 9793; GFX6-NEXT: s_endpgm 9794; 9795; GFX9-LABEL: srem_i64_pow2k_denom: 9796; GFX9: ; %bb.0: 9797; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 9798; GFX9-NEXT: v_mov_b32_e32 v2, 0 9799; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9800; GFX9-NEXT: s_ashr_i32 s4, s3, 31 9801; GFX9-NEXT: s_lshr_b32 s4, s4, 20 9802; GFX9-NEXT: s_add_u32 s4, s2, s4 9803; GFX9-NEXT: s_addc_u32 s5, s3, 0 9804; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000 9805; GFX9-NEXT: s_sub_u32 s2, s2, s4 9806; GFX9-NEXT: s_subb_u32 s3, s3, s5 9807; GFX9-NEXT: v_mov_b32_e32 v0, s2 9808; GFX9-NEXT: v_mov_b32_e32 v1, s3 9809; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 9810; GFX9-NEXT: s_endpgm 9811 %r = srem i64 %x, 4096 9812 store i64 %r, i64 addrspace(1)* %out 9813 ret void 9814} 9815 9816define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* 
%out, i64 %x, i64 %y) { 9817; CHECK-LABEL: @srem_i64_pow2_shl_denom( 9818; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 9819; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], [[SHL_Y]] 9820; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 9821; CHECK-NEXT: ret void 9822; 9823; GFX6-LABEL: srem_i64_pow2_shl_denom: 9824; GFX6: ; %bb.0: 9825; GFX6-NEXT: s_load_dword s4, s[0:1], 0xd 9826; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000 9827; GFX6-NEXT: s_mov_b32 s7, 0xf000 9828; GFX6-NEXT: s_mov_b32 s6, -1 9829; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9830; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 9831; GFX6-NEXT: s_ashr_i32 s4, s3, 31 9832; GFX6-NEXT: s_add_u32 s2, s2, s4 9833; GFX6-NEXT: s_mov_b32 s5, s4 9834; GFX6-NEXT: s_addc_u32 s3, s3, s4 9835; GFX6-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5] 9836; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 9837; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 9838; GFX6-NEXT: s_sub_u32 s4, 0, s8 9839; GFX6-NEXT: s_subb_u32 s5, 0, s9 9840; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 9841; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 9842; GFX6-NEXT: v_rcp_f32_e32 v0, v0 9843; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9844; GFX6-NEXT: s_ashr_i32 s10, s3, 31 9845; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 9846; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 9847; GFX6-NEXT: v_trunc_f32_e32 v1, v1 9848; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 9849; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 9850; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 9851; GFX6-NEXT: s_add_u32 s2, s2, s10 9852; GFX6-NEXT: s_mov_b32 s11, s10 9853; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 9854; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 9855; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0 9856; GFX6-NEXT: v_mul_lo_u32 v4, s4, v0 9857; GFX6-NEXT: s_addc_u32 s3, s3, s10 9858; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9859; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 9860; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 9861; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 9862; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 9863; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 9864; 
GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 9865; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 9866; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 9867; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 9868; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 9869; GFX6-NEXT: s_xor_b64 s[12:13], s[2:3], s[10:11] 9870; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 9871; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc 9872; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc 9873; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9874; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 9875; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 9876; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 9877; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 9878; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 9879; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0 9880; GFX6-NEXT: s_mov_b32 s5, s1 9881; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9882; GFX6-NEXT: v_mul_lo_u32 v3, s4, v0 9883; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 9884; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 9885; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 9886; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 9887; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 9888; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 9889; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 9890; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 9891; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc 9892; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 9893; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 9894; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc 9895; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc 9896; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9897; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 9898; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 9899; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 9900; GFX6-NEXT: v_mul_lo_u32 v2, s12, v1 9901; GFX6-NEXT: v_mul_hi_u32 v3, s12, v0 9902; GFX6-NEXT: v_mul_hi_u32 v4, s12, v1 9903; GFX6-NEXT: v_mul_hi_u32 v5, s13, v1 9904; GFX6-NEXT: v_mul_lo_u32 v1, s13, v1 9905; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9906; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 9907; GFX6-NEXT: v_mul_lo_u32 v4, s13, 
v0 9908; GFX6-NEXT: v_mul_hi_u32 v0, s13, v0 9909; GFX6-NEXT: s_mov_b32 s4, s0 9910; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 9911; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 9912; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc 9913; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 9914; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 9915; GFX6-NEXT: v_mul_lo_u32 v1, s8, v1 9916; GFX6-NEXT: v_mul_hi_u32 v2, s8, v0 9917; GFX6-NEXT: v_mul_lo_u32 v3, s9, v0 9918; GFX6-NEXT: v_mul_lo_u32 v0, s8, v0 9919; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 9920; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 9921; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s13, v1 9922; GFX6-NEXT: v_mov_b32_e32 v3, s9 9923; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s12, v0 9924; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc 9925; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s8, v0 9926; GFX6-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] 9927; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v5 9928; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] 9929; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v4 9930; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] 9931; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 9932; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v5 9933; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s8, v4 9934; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] 9935; GFX6-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] 9936; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 9937; GFX6-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] 9938; GFX6-NEXT: v_mov_b32_e32 v5, s13 9939; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc 9940; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 9941; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 9942; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 9943; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 9944; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s9, v1 9945; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 9946; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 9947; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 9948; GFX6-NEXT: v_cndmask_b32_e64 v2, v4, v3, 
s[0:1] 9949; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 9950; GFX6-NEXT: v_xor_b32_e32 v0, s10, v0 9951; GFX6-NEXT: v_xor_b32_e32 v1, s10, v1 9952; GFX6-NEXT: v_mov_b32_e32 v2, s10 9953; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s10, v0 9954; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 9955; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 9956; GFX6-NEXT: s_endpgm 9957; 9958; GFX9-LABEL: srem_i64_pow2_shl_denom: 9959; GFX9: ; %bb.0: 9960; GFX9-NEXT: s_load_dword s4, s[0:1], 0x34 9961; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 9962; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9963; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 9964; GFX9-NEXT: s_ashr_i32 s4, s3, 31 9965; GFX9-NEXT: s_add_u32 s2, s2, s4 9966; GFX9-NEXT: s_mov_b32 s5, s4 9967; GFX9-NEXT: s_addc_u32 s3, s3, s4 9968; GFX9-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5] 9969; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 9970; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 9971; GFX9-NEXT: s_sub_u32 s2, 0, s8 9972; GFX9-NEXT: s_subb_u32 s3, 0, s9 9973; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 9974; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 9975; GFX9-NEXT: v_rcp_f32_e32 v0, v0 9976; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9977; GFX9-NEXT: s_ashr_i32 s10, s7, 31 9978; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 9979; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 9980; GFX9-NEXT: v_trunc_f32_e32 v1, v1 9981; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 9982; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 9983; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 9984; GFX9-NEXT: s_add_u32 s0, s6, s10 9985; GFX9-NEXT: s_mov_b32 s11, s10 9986; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 9987; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 9988; GFX9-NEXT: v_mul_lo_u32 v5, s3, v0 9989; GFX9-NEXT: v_mul_lo_u32 v4, s2, v0 9990; GFX9-NEXT: s_addc_u32 s1, s7, s10 9991; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 9992; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 9993; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 9994; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 9995; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 9996; GFX9-NEXT: v_mul_hi_u32 v6, v1, v4 
9997; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 9998; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 9999; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 10000; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc 10001; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 10002; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 10003; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v6, vcc 10004; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v8, vcc 10005; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 10006; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 10007; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 10008; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 10009; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 10010; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 10011; GFX9-NEXT: v_mul_lo_u32 v4, s3, v0 10012; GFX9-NEXT: v_mul_lo_u32 v5, s2, v0 10013; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] 10014; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 10015; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 10016; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 10017; GFX9-NEXT: v_mul_hi_u32 v7, v0, v5 10018; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 10019; GFX9-NEXT: v_mul_hi_u32 v4, v1, v5 10020; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 10021; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 10022; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 10023; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc 10024; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 10025; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 10026; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc 10027; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 10028; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 10029; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 10030; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 10031; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 10032; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 10033; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 10034; GFX9-NEXT: v_mul_hi_u32 v4, s6, v1 10035; GFX9-NEXT: v_mul_hi_u32 v5, s7, v1 10036; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 10037; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 10038; GFX9-NEXT: v_addc_co_u32_e32 v3, 
vcc, 0, v4, vcc 10039; GFX9-NEXT: v_mul_lo_u32 v4, s7, v0 10040; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 10041; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 10042; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 10043; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc 10044; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 10045; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc 10046; GFX9-NEXT: v_mul_lo_u32 v1, s8, v1 10047; GFX9-NEXT: v_mul_hi_u32 v2, s8, v0 10048; GFX9-NEXT: v_mul_lo_u32 v3, s9, v0 10049; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 10050; GFX9-NEXT: v_mov_b32_e32 v4, 0 10051; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 10052; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 10053; GFX9-NEXT: v_sub_u32_e32 v2, s7, v1 10054; GFX9-NEXT: v_mov_b32_e32 v3, s9 10055; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 10056; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, vcc 10057; GFX9-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s8, v0 10058; GFX9-NEXT: v_subbrev_co_u32_e64 v6, s[2:3], 0, v2, s[0:1] 10059; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v6 10060; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 10061; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v5 10062; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1] 10063; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] 10064; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v6 10065; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s8, v5 10066; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] 10067; GFX9-NEXT: v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1] 10068; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 10069; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[0:1] 10070; GFX9-NEXT: v_mov_b32_e32 v6, s7 10071; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v6, v1, vcc 10072; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 10073; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 10074; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 10075; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 10076; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s9, v1 10077; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc 10078; GFX9-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v6 10079; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 10080; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v3, s[0:1] 10081; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 10082; GFX9-NEXT: v_xor_b32_e32 v0, s10, v0 10083; GFX9-NEXT: v_xor_b32_e32 v1, s10, v1 10084; GFX9-NEXT: v_mov_b32_e32 v2, s10 10085; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s10, v0 10086; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc 10087; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] 10088; GFX9-NEXT: s_endpgm ; i64 srem whose divisor is 4096 shifted left by the kernel argument %y. The opt expectations above keep the shl and srem as written, and the GFX6 and GFX9 expectations contain the long v_rcp_f32 / v_mul_hi_u32 based 64-bit remainder expansion. 10089 %shl.y = shl i64 4096, %y 10090 %r = srem i64 %x, %shl.y 10091 store i64 %r, i64 addrspace(1)* %out 10092 ret void 10093} 10094 10095define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 10096; CHECK-LABEL: @srem_v2i64_pow2k_denom( 10097; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 10098; CHECK-NEXT: [[TMP2:%.*]] = srem i64 [[TMP1]], 4096 10099; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 10100; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 10101; CHECK-NEXT: [[TMP5:%.*]] = srem i64 [[TMP4]], 4096 10102; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 10103; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 10104; CHECK-NEXT: ret void 10105; 10106; GFX6-LABEL: srem_v2i64_pow2k_denom: 10107; GFX6: ; %bb.0: 10108; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 10109; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 10110; GFX6-NEXT: s_mov_b32 s3, 0xf000 10111; GFX6-NEXT: s_mov_b32 s2, -1 10112; GFX6-NEXT: s_waitcnt lgkmcnt(0) 10113; GFX6-NEXT: s_ashr_i32 s8, s5, 31 10114; GFX6-NEXT: s_lshr_b32 s8, s8, 10115; GFX6-NEXT: s_add_u32 s8, s4, s8 10116; GFX6-NEXT: s_addc_u32 s9, s5, 0 10117; GFX6-NEXT: s_and_b32 s8, s8, 0xfffff000 10118; GFX6-NEXT: s_sub_u32 s4, s4, s8 10119; GFX6-NEXT: s_subb_u32 s5, s5, s9 10120; GFX6-NEXT: s_ashr_i32 s8, s7, 31 10121; GFX6-NEXT: s_lshr_b32 s8, s8, 
20 10122; GFX6-NEXT: s_add_u32 s8, s6, s8 10123; GFX6-NEXT: s_addc_u32 s9, s7, 0 10124; GFX6-NEXT: s_and_b32 s8, s8, 0xfffff000 10125; GFX6-NEXT: s_sub_u32 s6, s6, s8 10126; GFX6-NEXT: s_subb_u32 s7, s7, s9 10127; GFX6-NEXT: v_mov_b32_e32 v0, s4 10128; GFX6-NEXT: v_mov_b32_e32 v1, s5 10129; GFX6-NEXT: v_mov_b32_e32 v2, s6 10130; GFX6-NEXT: v_mov_b32_e32 v3, s7 10131; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 10132; GFX6-NEXT: s_endpgm 10133; 10134; GFX9-LABEL: srem_v2i64_pow2k_denom: 10135; GFX9: ; %bb.0: 10136; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 10137; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 10138; GFX9-NEXT: v_mov_b32_e32 v4, 0 10139; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10140; GFX9-NEXT: s_ashr_i32 s0, s5, 31 10141; GFX9-NEXT: s_lshr_b32 s0, s0, 20 10142; GFX9-NEXT: s_add_u32 s0, s4, s0 10143; GFX9-NEXT: s_addc_u32 s1, s5, 0 10144; GFX9-NEXT: s_and_b32 s0, s0, 0xfffff000 10145; GFX9-NEXT: s_sub_u32 s0, s4, s0 10146; GFX9-NEXT: s_subb_u32 s1, s5, s1 10147; GFX9-NEXT: s_ashr_i32 s4, s7, 31 10148; GFX9-NEXT: s_lshr_b32 s4, s4, 20 10149; GFX9-NEXT: s_add_u32 s4, s6, s4 10150; GFX9-NEXT: s_addc_u32 s5, s7, 0 10151; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000 10152; GFX9-NEXT: s_sub_u32 s4, s6, s4 10153; GFX9-NEXT: s_subb_u32 s5, s7, s5 10154; GFX9-NEXT: v_mov_b32_e32 v0, s0 10155; GFX9-NEXT: v_mov_b32_e32 v1, s1 10156; GFX9-NEXT: v_mov_b32_e32 v2, s4 10157; GFX9-NEXT: v_mov_b32_e32 v3, s5 10158; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 10159; GFX9-NEXT: s_endpgm ; Two-element i64 vector srem by a splat of 4096. The opt expectations above split it into per-element extractelement / srem-by-4096 / insertelement, and the GFX6 and GFX9 expectations repeat the scalar s_ashr/s_lshr/s_add/s_addc/s_and/s_sub/s_subb sequence once per element before a single dwordx4 store. 10160 %r = srem <2 x i64> %x, <i64 4096, i64 4096> 10161 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 10162 ret void 10163} 10164 10165define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 10166; CHECK-LABEL: @srem_v2i64_pow2_shl_denom( 10167; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 10168; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 10169; CHECK-NEXT: 
[[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 10170; CHECK-NEXT: [[TMP3:%.*]] = srem i64 [[TMP1]], [[TMP2]] 10171; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 10172; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 10173; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 10174; CHECK-NEXT: [[TMP7:%.*]] = srem i64 [[TMP5]], [[TMP6]] 10175; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 10176; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 10177; CHECK-NEXT: ret void 10178; 10179; GFX6-LABEL: srem_v2i64_pow2_shl_denom: 10180; GFX6: ; %bb.0: 10181; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 10182; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000 10183; GFX6-NEXT: s_waitcnt lgkmcnt(0) 10184; GFX6-NEXT: s_mov_b32 s11, 0xf000 10185; GFX6-NEXT: s_lshl_b64 s[14:15], s[2:3], s10 10186; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 10187; GFX6-NEXT: s_ashr_i32 s8, s3, 31 10188; GFX6-NEXT: s_add_u32 s2, s2, s8 10189; GFX6-NEXT: s_mov_b32 s9, s8 10190; GFX6-NEXT: s_addc_u32 s3, s3, s8 10191; GFX6-NEXT: s_xor_b64 s[16:17], s[2:3], s[8:9] 10192; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s16 10193; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s17 10194; GFX6-NEXT: s_sub_u32 s2, 0, s16 10195; GFX6-NEXT: s_subb_u32 s3, 0, s17 10196; GFX6-NEXT: s_ashr_i32 s12, s5, 31 10197; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 10198; GFX6-NEXT: v_rcp_f32_e32 v0, v0 10199; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9 10200; GFX6-NEXT: s_add_u32 s0, s4, s12 10201; GFX6-NEXT: s_mov_b32 s13, s12 10202; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 10203; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 10204; GFX6-NEXT: v_trunc_f32_e32 v1, v1 10205; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 10206; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 10207; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 10208; GFX6-NEXT: s_addc_u32 s1, s5, s12 10209; GFX6-NEXT: s_xor_b64 s[4:5], s[0:1], s[12:13] 10210; GFX6-NEXT: 
v_mul_lo_u32 v2, s2, v1 10211; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 10212; GFX6-NEXT: v_mul_lo_u32 v5, s3, v0 10213; GFX6-NEXT: v_mul_lo_u32 v4, s2, v0 10214; GFX6-NEXT: s_mov_b32 s10, -1 10215; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10216; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 10217; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 10218; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 10219; GFX6-NEXT: v_mul_hi_u32 v7, v0, v2 10220; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 10221; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 10222; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 10223; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc 10224; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 10225; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 10226; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 10227; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc 10228; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc 10229; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10230; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 10231; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 10232; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 10233; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 10234; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 10235; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 10236; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10237; GFX6-NEXT: v_mul_lo_u32 v3, s2, v0 10238; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 10239; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 10240; GFX6-NEXT: v_mul_hi_u32 v7, v0, v3 10241; GFX6-NEXT: v_mul_hi_u32 v8, v0, v2 10242; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 10243; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 10244; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 10245; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 10246; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc 10247; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 10248; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 10249; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v7, v5, vcc 10250; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc 10251; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10252; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 10253; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 
10254; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 10255; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 10256; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 10257; GFX6-NEXT: v_mul_hi_u32 v4, s4, v1 10258; GFX6-NEXT: v_mul_hi_u32 v5, s5, v1 10259; GFX6-NEXT: v_mul_lo_u32 v1, s5, v1 10260; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10261; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 10262; GFX6-NEXT: v_mul_lo_u32 v4, s5, v0 10263; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 10264; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 10265; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 10266; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc 10267; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 10268; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 10269; GFX6-NEXT: v_mul_lo_u32 v1, s16, v1 10270; GFX6-NEXT: v_mul_hi_u32 v2, s16, v0 10271; GFX6-NEXT: v_mul_lo_u32 v3, s17, v0 10272; GFX6-NEXT: v_mul_lo_u32 v0, s16, v0 10273; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 10274; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 10275; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v1 10276; GFX6-NEXT: v_mov_b32_e32 v3, s17 10277; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 10278; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc 10279; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s16, v0 10280; GFX6-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] 10281; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s17, v5 10282; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] 10283; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s16, v4 10284; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] 10285; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 10286; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s17, v5 10287; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s16, v4 10288; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] 10289; GFX6-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] 10290; GFX6-NEXT: s_ashr_i32 s2, s15, 31 10291; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 10292; GFX6-NEXT: s_add_u32 s4, s14, s2 10293; GFX6-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] 10294; GFX6-NEXT: v_mov_b32_e32 v5, s5 
10295; GFX6-NEXT: s_mov_b32 s3, s2 10296; GFX6-NEXT: s_addc_u32 s5, s15, s2 10297; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[2:3] 10298; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s4 10299; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s5 10300; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc 10301; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s17, v1 10302; GFX6-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7 10303; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 10304; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s16, v0 10305; GFX6-NEXT: v_rcp_f32_e32 v6, v6 10306; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 10307; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s17, v1 10308; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc 10309; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 10310; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 10311; GFX6-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] 10312; GFX6-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v6 10313; GFX6-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 10314; GFX6-NEXT: v_trunc_f32_e32 v4, v4 10315; GFX6-NEXT: v_mac_f32_e32 v3, 0xcf800000, v4 10316; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 10317; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 10318; GFX6-NEXT: s_sub_u32 s0, 0, s4 10319; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 10320; GFX6-NEXT: v_mul_hi_u32 v2, s0, v3 10321; GFX6-NEXT: v_mul_lo_u32 v5, s0, v4 10322; GFX6-NEXT: s_subb_u32 s1, 0, s5 10323; GFX6-NEXT: v_mul_lo_u32 v6, s1, v3 10324; GFX6-NEXT: s_ashr_i32 s14, s7, 31 10325; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 10326; GFX6-NEXT: v_mul_lo_u32 v5, s0, v3 10327; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v6 10328; GFX6-NEXT: v_mul_lo_u32 v6, v3, v2 10329; GFX6-NEXT: v_mul_hi_u32 v7, v3, v5 10330; GFX6-NEXT: v_mul_hi_u32 v8, v3, v2 10331; GFX6-NEXT: v_mul_hi_u32 v9, v4, v2 10332; GFX6-NEXT: v_mul_lo_u32 v2, v4, v2 10333; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 10334; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v8, vcc 10335; GFX6-NEXT: v_mul_lo_u32 v8, v4, v5 10336; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5 10337; GFX6-NEXT: s_mov_b32 s15, s14 10338; GFX6-NEXT: v_xor_b32_e32 v0, s12, v0 
10339; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 10340; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc 10341; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc 10342; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 10343; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 10344; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10345; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc 10346; GFX6-NEXT: v_mul_lo_u32 v4, s0, v3 10347; GFX6-NEXT: v_mul_hi_u32 v5, s0, v2 10348; GFX6-NEXT: v_mul_lo_u32 v6, s1, v2 10349; GFX6-NEXT: v_xor_b32_e32 v1, s12, v1 10350; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 10351; GFX6-NEXT: v_mul_lo_u32 v5, s0, v2 10352; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 10353; GFX6-NEXT: v_mul_lo_u32 v8, v2, v4 10354; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5 10355; GFX6-NEXT: v_mul_hi_u32 v10, v2, v4 10356; GFX6-NEXT: v_mul_hi_u32 v7, v3, v5 10357; GFX6-NEXT: v_mul_lo_u32 v5, v3, v5 10358; GFX6-NEXT: v_mul_hi_u32 v6, v3, v4 10359; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 10360; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc 10361; GFX6-NEXT: v_mul_lo_u32 v4, v3, v4 10362; GFX6-NEXT: v_add_i32_e32 v5, vcc, v8, v5 10363; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v7, vcc 10364; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc 10365; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 10366; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 10367; GFX6-NEXT: s_add_u32 s0, s6, s14 10368; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 10369; GFX6-NEXT: s_addc_u32 s1, s7, s14 10370; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc 10371; GFX6-NEXT: s_xor_b64 s[6:7], s[0:1], s[14:15] 10372; GFX6-NEXT: v_mul_lo_u32 v4, s6, v3 10373; GFX6-NEXT: v_mul_hi_u32 v5, s6, v2 10374; GFX6-NEXT: v_mul_hi_u32 v7, s6, v3 10375; GFX6-NEXT: v_mul_hi_u32 v8, s7, v3 10376; GFX6-NEXT: v_mul_lo_u32 v3, s7, v3 10377; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 10378; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc 10379; GFX6-NEXT: v_mul_lo_u32 v7, s7, v2 10380; GFX6-NEXT: v_mul_hi_u32 v2, s7, v2 10381; GFX6-NEXT: v_mov_b32_e32 v6, s12 10382; 
GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v7 10383; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v2, vcc 10384; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc 10385; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 10386; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 10387; GFX6-NEXT: v_mul_lo_u32 v3, s4, v3 10388; GFX6-NEXT: v_mul_hi_u32 v4, s4, v2 10389; GFX6-NEXT: v_mul_lo_u32 v5, s5, v2 10390; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 10391; GFX6-NEXT: v_mul_lo_u32 v2, s4, v2 10392; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc 10393; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 10394; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 10395; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s7, v3 10396; GFX6-NEXT: v_mov_b32_e32 v5, s5 10397; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 10398; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc 10399; GFX6-NEXT: v_subrev_i32_e64 v6, s[0:1], s4, v2 10400; GFX6-NEXT: v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1] 10401; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s5, v7 10402; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] 10403; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v6 10404; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, s[0:1] 10405; GFX6-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] 10406; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s5, v7 10407; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s4, v6 10408; GFX6-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[2:3] 10409; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] 10410; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 10411; GFX6-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[0:1] 10412; GFX6-NEXT: v_mov_b32_e32 v7, s7 10413; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc 10414; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 10415; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 10416; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 10417; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 10418; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s5, v3 10419; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc 10420; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 10421; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, 
v4, vcc 10422; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] 10423; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 10424; GFX6-NEXT: v_xor_b32_e32 v2, s14, v2 10425; GFX6-NEXT: v_xor_b32_e32 v3, s14, v3 10426; GFX6-NEXT: v_mov_b32_e32 v4, s14 10427; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s14, v2 10428; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc 10429; GFX6-NEXT: s_waitcnt lgkmcnt(0) 10430; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 10431; GFX6-NEXT: s_endpgm 10432; 10433; GFX9-LABEL: srem_v2i64_pow2_shl_denom: 10434; GFX9: ; %bb.0: 10435; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 10436; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 10437; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10438; GFX9-NEXT: s_lshl_b64 s[10:11], s[2:3], s10 10439; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s8 10440; GFX9-NEXT: s_ashr_i32 s8, s3, 31 10441; GFX9-NEXT: s_add_u32 s2, s2, s8 10442; GFX9-NEXT: s_mov_b32 s9, s8 10443; GFX9-NEXT: s_addc_u32 s3, s3, s8 10444; GFX9-NEXT: s_xor_b64 s[12:13], s[2:3], s[8:9] 10445; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12 10446; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 10447; GFX9-NEXT: s_sub_u32 s2, 0, s12 10448; GFX9-NEXT: s_subb_u32 s3, 0, s13 10449; GFX9-NEXT: s_ashr_i32 s8, s5, 31 10450; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 10451; GFX9-NEXT: v_rcp_f32_e32 v0, v0 10452; GFX9-NEXT: s_mov_b32 s9, s8 10453; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 10454; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 10455; GFX9-NEXT: v_trunc_f32_e32 v1, v1 10456; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 10457; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 10458; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 10459; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 10460; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 10461; GFX9-NEXT: v_mul_lo_u32 v5, s3, v0 10462; GFX9-NEXT: v_mul_lo_u32 v4, s2, v0 10463; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 10464; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 10465; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 10466; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 10467; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 10468; 
GFX9-NEXT: v_mul_hi_u32 v6, v1, v4 10469; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 10470; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 10471; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 10472; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc 10473; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 10474; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 10475; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v6, vcc 10476; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v8, vcc 10477; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 10478; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 10479; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 10480; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 10481; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 10482; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 10483; GFX9-NEXT: v_mul_lo_u32 v4, s3, v0 10484; GFX9-NEXT: v_mul_lo_u32 v5, s2, v0 10485; GFX9-NEXT: s_add_u32 s2, s4, s8 10486; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 10487; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 10488; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 10489; GFX9-NEXT: v_mul_hi_u32 v7, v0, v5 10490; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 10491; GFX9-NEXT: v_mul_hi_u32 v4, v1, v5 10492; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 10493; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 10494; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 10495; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v8, vcc 10496; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 10497; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 10498; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc 10499; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 10500; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 10501; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 10502; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 10503; GFX9-NEXT: s_addc_u32 s3, s5, s8 10504; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 10505; GFX9-NEXT: s_xor_b64 s[14:15], s[2:3], s[8:9] 10506; GFX9-NEXT: v_mul_lo_u32 v2, s14, v1 10507; GFX9-NEXT: v_mul_hi_u32 v3, s14, v0 10508; GFX9-NEXT: v_mul_hi_u32 v4, s14, v1 10509; GFX9-NEXT: v_mul_hi_u32 v5, s15, v1 10510; 
GFX9-NEXT: v_mul_lo_u32 v1, s15, v1 10511; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 10512; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 10513; GFX9-NEXT: v_mul_lo_u32 v4, s15, v0 10514; GFX9-NEXT: v_mul_hi_u32 v0, s15, v0 10515; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 10516; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 10517; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 10518; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc 10519; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v0, v1 10520; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v2, vcc 10521; GFX9-NEXT: v_mul_lo_u32 v2, s12, v0 10522; GFX9-NEXT: v_mul_hi_u32 v3, s12, v1 10523; GFX9-NEXT: v_mul_lo_u32 v4, s13, v1 10524; GFX9-NEXT: v_mul_lo_u32 v1, s12, v1 10525; GFX9-NEXT: v_mov_b32_e32 v0, 0 10526; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 10527; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 10528; GFX9-NEXT: v_sub_u32_e32 v3, s15, v2 10529; GFX9-NEXT: v_mov_b32_e32 v4, s13 10530; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s14, v1 10531; GFX9-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, vcc 10532; GFX9-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s12, v1 10533; GFX9-NEXT: v_subbrev_co_u32_e64 v6, s[2:3], 0, v3, s[0:1] 10534; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s13, v6 10535; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 10536; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v5 10537; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] 10538; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s13, v6 10539; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] 10540; GFX9-NEXT: s_ashr_i32 s2, s11, 31 10541; GFX9-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1] 10542; GFX9-NEXT: s_add_u32 s10, s10, s2 10543; GFX9-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s12, v5 10544; GFX9-NEXT: s_mov_b32 s3, s2 10545; GFX9-NEXT: s_addc_u32 s11, s11, s2 10546; GFX9-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] 10547; GFX9-NEXT: s_xor_b64 s[10:11], s[10:11], s[2:3] 10548; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 10549; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s10 10550; GFX9-NEXT: 
v_cvt_f32_u32_e32 v8, s11 10551; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[0:1] 10552; GFX9-NEXT: v_mov_b32_e32 v6, s15 10553; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v6, v2, vcc 10554; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 10555; GFX9-NEXT: v_mac_f32_e32 v7, 0x4f800000, v8 10556; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 10557; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v1 10558; GFX9-NEXT: v_rcp_f32_e32 v7, v7 10559; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc 10560; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s13, v2 10561; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc 10562; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 10563; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc 10564; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[0:1] 10565; GFX9-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v7 10566; GFX9-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 10567; GFX9-NEXT: v_trunc_f32_e32 v5, v5 10568; GFX9-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5 10569; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 10570; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 10571; GFX9-NEXT: s_sub_u32 s0, 0, s10 10572; GFX9-NEXT: s_subb_u32 s1, 0, s11 10573; GFX9-NEXT: v_mul_hi_u32 v6, s0, v4 10574; GFX9-NEXT: v_mul_lo_u32 v7, s0, v5 10575; GFX9-NEXT: v_mul_lo_u32 v8, s1, v4 10576; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 10577; GFX9-NEXT: v_mul_lo_u32 v3, s0, v4 10578; GFX9-NEXT: v_add_u32_e32 v6, v6, v7 10579; GFX9-NEXT: v_add_u32_e32 v6, v6, v8 10580; GFX9-NEXT: v_mul_lo_u32 v7, v4, v6 10581; GFX9-NEXT: v_mul_hi_u32 v8, v4, v3 10582; GFX9-NEXT: v_mul_hi_u32 v9, v4, v6 10583; GFX9-NEXT: v_mul_hi_u32 v10, v5, v6 10584; GFX9-NEXT: v_mul_lo_u32 v6, v5, v6 10585; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 10586; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc 10587; GFX9-NEXT: v_mul_lo_u32 v9, v5, v3 10588; GFX9-NEXT: v_mul_hi_u32 v3, v5, v3 10589; GFX9-NEXT: s_ashr_i32 s12, s7, 31 10590; GFX9-NEXT: s_mov_b32 s13, s12 10591; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 10592; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc 10593; GFX9-NEXT: 
v_addc_co_u32_e32 v7, vcc, 0, v10, vcc 10594; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 10595; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc 10596; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 10597; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v5, v6, vcc 10598; GFX9-NEXT: v_mul_lo_u32 v5, s0, v4 10599; GFX9-NEXT: v_mul_hi_u32 v6, s0, v3 10600; GFX9-NEXT: v_mul_lo_u32 v7, s1, v3 10601; GFX9-NEXT: v_mul_lo_u32 v8, s0, v3 10602; GFX9-NEXT: s_add_u32 s0, s6, s12 10603; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 10604; GFX9-NEXT: v_add_u32_e32 v5, v5, v7 10605; GFX9-NEXT: v_mul_lo_u32 v9, v3, v5 10606; GFX9-NEXT: v_mul_hi_u32 v10, v3, v8 10607; GFX9-NEXT: v_mul_hi_u32 v11, v3, v5 10608; GFX9-NEXT: v_mul_hi_u32 v7, v4, v8 10609; GFX9-NEXT: v_mul_lo_u32 v8, v4, v8 10610; GFX9-NEXT: v_mul_hi_u32 v6, v4, v5 10611; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 10612; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, 0, v11, vcc 10613; GFX9-NEXT: v_mul_lo_u32 v5, v4, v5 10614; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 10615; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v10, v7, vcc 10616; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc 10617; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 10618; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc 10619; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 10620; GFX9-NEXT: s_addc_u32 s1, s7, s12 10621; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc 10622; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13] 10623; GFX9-NEXT: v_mul_lo_u32 v5, s6, v4 10624; GFX9-NEXT: v_mul_hi_u32 v6, s6, v3 10625; GFX9-NEXT: v_mul_hi_u32 v8, s6, v4 10626; GFX9-NEXT: v_mul_hi_u32 v9, s7, v4 10627; GFX9-NEXT: v_mul_lo_u32 v4, s7, v4 10628; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 10629; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc 10630; GFX9-NEXT: v_mul_lo_u32 v8, s7, v3 10631; GFX9-NEXT: v_mul_hi_u32 v3, s7, v3 10632; GFX9-NEXT: v_xor_b32_e32 v1, s8, v1 10633; GFX9-NEXT: v_xor_b32_e32 v2, s8, v2 10634; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 10635; GFX9-NEXT: 
v_addc_co_u32_e32 v3, vcc, v6, v3, vcc 10636; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v9, vcc 10637; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 10638; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc 10639; GFX9-NEXT: v_mul_lo_u32 v4, s10, v4 10640; GFX9-NEXT: v_mul_hi_u32 v5, s10, v3 10641; GFX9-NEXT: v_mul_lo_u32 v6, s11, v3 10642; GFX9-NEXT: v_mul_lo_u32 v3, s10, v3 10643; GFX9-NEXT: v_mov_b32_e32 v7, s8 10644; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s8, v1 10645; GFX9-NEXT: v_add_u32_e32 v4, v5, v4 10646; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v7, vcc 10647; GFX9-NEXT: v_add_u32_e32 v4, v4, v6 10648; GFX9-NEXT: v_sub_u32_e32 v5, s7, v4 10649; GFX9-NEXT: v_mov_b32_e32 v6, s11 10650; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s6, v3 10651; GFX9-NEXT: v_subb_co_u32_e64 v5, s[0:1], v5, v6, vcc 10652; GFX9-NEXT: v_subrev_co_u32_e64 v7, s[0:1], s10, v3 10653; GFX9-NEXT: v_subbrev_co_u32_e64 v8, s[2:3], 0, v5, s[0:1] 10654; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s11, v8 10655; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] 10656; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s10, v7 10657; GFX9-NEXT: v_subb_co_u32_e64 v5, s[0:1], v5, v6, s[0:1] 10658; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[2:3] 10659; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s11, v8 10660; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s10, v7 10661; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[2:3] 10662; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1] 10663; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 10664; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[0:1] 10665; GFX9-NEXT: v_mov_b32_e32 v8, s7 10666; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v8, v4, vcc 10667; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v4 10668; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 10669; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 10670; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc 10671; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s11, v4 10672; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc 10673; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 10674; GFX9-NEXT: 
v_cndmask_b32_e32 v4, v4, v5, vcc 10675; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v6, s[0:1] 10676; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 10677; GFX9-NEXT: v_xor_b32_e32 v3, s12, v3 10678; GFX9-NEXT: v_xor_b32_e32 v4, s12, v4 10679; GFX9-NEXT: v_mov_b32_e32 v5, s12 10680; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s12, v3 10681; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v5, vcc 10682; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10683; GFX9-NEXT: global_store_dwordx4 v0, v[1:4], s[4:5] 10684; GFX9-NEXT: s_endpgm 10685 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 10686 %r = srem <2 x i64> %x, %shl.y 10687 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 10688 ret void 10689} 10690