1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py 2; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 3; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare -amdgpu-bypass-slow-div=0 %s | FileCheck %s 4; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX6 %s 5; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX9 %s 6 7define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) { 8; CHECK-LABEL: @udiv_i32( 9; CHECK-NEXT: [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float 10; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]]) 11; CHECK-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000 12; CHECK-NEXT: [[TMP4:%.*]] = fptoui float [[TMP3]] to i32 13; CHECK-NEXT: [[TMP5:%.*]] = sub i32 0, [[Y]] 14; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]] 15; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP4]] to i64 16; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP6]] to i64 17; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]] 18; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 19; CHECK-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP9]], 32 20; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 21; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]] 22; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64 23; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP13]] to i64 24; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]] 25; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32 26; CHECK-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP16]], 32 27; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 28; CHECK-NEXT: [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]] 29; CHECK-NEXT: [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]] 30; CHECK-NEXT: [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]] 31; CHECK-NEXT: [[TMP23:%.*]] = add i32 [[TMP19]], 1 32; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP19]] 33; CHECK-NEXT: [[TMP25:%.*]] = sub i32 [[TMP21]], [[Y]] 34; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP22]], i32 [[TMP25]], i32 [[TMP21]] 35; CHECK-NEXT: [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[Y]] 36; CHECK-NEXT: [[TMP28:%.*]] = add i32 [[TMP24]], 1 37; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP24]] 38; CHECK-NEXT: store i32 [[TMP29]], i32 addrspace(1)* [[OUT:%.*]], align 4 39; CHECK-NEXT: ret void 40; 41; GFX6-LABEL: udiv_i32: 42; GFX6: ; %bb.0: 43; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 44; GFX6-NEXT: s_mov_b32 s7, 0xf000 45; GFX6-NEXT: s_mov_b32 s6, -1 46; GFX6-NEXT: s_waitcnt lgkmcnt(0) 47; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 48; GFX6-NEXT: s_sub_i32 s4, 0, s3 49; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 50; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 51; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 52; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 53; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 54; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 55; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 56; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 57; GFX6-NEXT: v_mul_lo_u32 v1, v0, s3 58; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 59; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s2, v1 60; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1 61; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 62; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 63; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] 64; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 65; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 66; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 67; GFX6-NEXT: s_waitcnt lgkmcnt(0) 68; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 69; GFX6-NEXT: s_endpgm 70; GFX9-LABEL: udiv_i32: 71; GFX9: ; %bb.0: 72; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 73; GFX9-NEXT: v_mov_b32_e32 v2, 0 74; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 75; GFX9-NEXT: s_waitcnt lgkmcnt(0) 76; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 77; GFX9-NEXT: s_sub_i32 s4, 0, s3 78; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 79; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 80; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 81; GFX9-NEXT: v_mul_lo_u32 v1, s4, v0 82; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 83; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 84; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 85; GFX9-NEXT: v_mul_lo_u32 v1, v0, s3 86; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 87; GFX9-NEXT: v_sub_u32_e32 v1, s2, v1 88; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 89; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 90; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v1 91; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 92; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 93; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 94; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 95; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 96; GFX9-NEXT: s_endpgm 97 %r = udiv i32 %x, %y 98 store i32 %r, i32 addrspace(1)* %out 99 ret void 100} 101 102define amdgpu_kernel void @urem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) { 103; CHECK-LABEL: @urem_i32( 104; CHECK-NEXT: [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float 105; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]]) 106; CHECK-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000 107; CHECK-NEXT: [[TMP4:%.*]] = fptoui float [[TMP3]] to i32 108; CHECK-NEXT: [[TMP5:%.*]] = sub i32 0, [[Y]] 109; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]] 110; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP4]] to i64 111; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP6]] to i64 112; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]] 113; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 114; CHECK-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP9]], 32 115; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 116; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]] 117; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64 118; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP13]] to i64 119; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]] 120; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32 121; CHECK-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP16]], 32 122; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 123; CHECK-NEXT: [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]] 124; CHECK-NEXT: [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]] 125; CHECK-NEXT: [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]] 126; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP21]], [[Y]] 127; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP21]] 128; CHECK-NEXT: [[TMP25:%.*]] = icmp uge i32 [[TMP24]], [[Y]] 129; CHECK-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], [[Y]] 130; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP25]], i32 [[TMP26]], i32 [[TMP24]] 131; CHECK-NEXT: store i32 [[TMP27]], i32 addrspace(1)* [[OUT:%.*]], align 4 132; CHECK-NEXT: ret void 133; 134; GFX6-LABEL: urem_i32: 135; GFX6: ; %bb.0: 136; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 137; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 138; GFX6-NEXT: s_mov_b32 s3, 0xf000 139; GFX6-NEXT: s_waitcnt lgkmcnt(0) 140; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 141; GFX6-NEXT: s_sub_i32 s2, 0, s5 142; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 143; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 144; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 145; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 146; GFX6-NEXT: s_mov_b32 s2, -1 147; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 148; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 149; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 150; GFX6-NEXT: v_mul_lo_u32 v0, v0, s5 151; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 152; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s5, v0 153; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v0 154; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 155; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s5, v0 156; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v0 157; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 158; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 159; GFX6-NEXT: s_endpgm 160; GFX9-LABEL: urem_i32: 161; GFX9: ; %bb.0: 162; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 163; GFX9-NEXT: s_nop 0 164; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 165; GFX9-NEXT: s_waitcnt lgkmcnt(0) 166; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 167; GFX9-NEXT: s_sub_i32 s4, 0, s3 168; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 169; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 170; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 171; GFX9-NEXT: v_mul_lo_u32 v1, s4, v0 172; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 173; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 174; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 175; GFX9-NEXT: v_mov_b32_e32 v1, 0 176; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 177; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 178; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v0 179; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 180; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 181; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v0 182; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 183; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 184; GFX9-NEXT: global_store_dword v1, v0, s[0:1] 185; GFX9-NEXT: s_endpgm 186 %r = urem i32 %x, %y 187 store i32 %r, i32 addrspace(1)* %out 188 ret void 189} 190 191define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) { 192; CHECK-LABEL: @sdiv_i32( 193; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31 194; CHECK-NEXT: [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31 195; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 196; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[X]], [[TMP1]] 197; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[Y]], [[TMP2]] 198; CHECK-NEXT: [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP1]] 199; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP2]] 200; CHECK-NEXT: [[TMP8:%.*]] = uitofp i32 [[TMP7]] to float 201; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP8]]) 202; CHECK-NEXT: [[TMP10:%.*]] = fmul fast float [[TMP9]], 0x41EFFFFFC0000000 203; CHECK-NEXT: [[TMP11:%.*]] = fptoui float [[TMP10]] to i32 204; CHECK-NEXT: [[TMP12:%.*]] = sub i32 0, [[TMP7]] 205; CHECK-NEXT: [[TMP13:%.*]] = mul i32 [[TMP12]], [[TMP11]] 206; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP11]] to i64 207; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP13]] to i64 208; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]] 209; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32 210; CHECK-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP16]], 32 211; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 212; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP11]], [[TMP19]] 213; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP6]] to i64 214; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP20]] to i64 215; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP21]], [[TMP22]] 216; CHECK-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32 217; CHECK-NEXT: [[TMP25:%.*]] = lshr i64 [[TMP23]], 32 218; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32 219; CHECK-NEXT: [[TMP27:%.*]] = mul i32 [[TMP26]], [[TMP7]] 220; CHECK-NEXT: [[TMP28:%.*]] = sub i32 [[TMP6]], [[TMP27]] 221; CHECK-NEXT: [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP7]] 222; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP26]], 1 223; CHECK-NEXT: [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]] 224; CHECK-NEXT: [[TMP32:%.*]] = sub i32 [[TMP28]], [[TMP7]] 225; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP29]], i32 [[TMP32]], i32 [[TMP28]] 226; CHECK-NEXT: [[TMP34:%.*]] = icmp uge i32 [[TMP33]], [[TMP7]] 227; CHECK-NEXT: [[TMP35:%.*]] = add i32 [[TMP31]], 1 228; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP34]], i32 [[TMP35]], i32 [[TMP31]] 229; CHECK-NEXT: [[TMP37:%.*]] = xor i32 [[TMP36]], [[TMP3]] 230; CHECK-NEXT: [[TMP38:%.*]] = sub i32 [[TMP37]], [[TMP3]] 231; CHECK-NEXT: store i32 [[TMP38]], i32 addrspace(1)* [[OUT:%.*]], align 4 232; CHECK-NEXT: ret void 233; 234; GFX6-LABEL: sdiv_i32: 235; GFX6: ; %bb.0: 236; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 237; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 238; GFX6-NEXT: s_mov_b32 s7, 0xf000 239; GFX6-NEXT: s_mov_b32 s6, -1 240; GFX6-NEXT: s_waitcnt lgkmcnt(0) 241; GFX6-NEXT: s_ashr_i32 s8, s3, 31 242; GFX6-NEXT: s_add_i32 s3, s3, s8 243; GFX6-NEXT: s_xor_b32 s9, s3, s8 244; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 245; GFX6-NEXT: s_sub_i32 s3, 0, s9 246; GFX6-NEXT: s_ashr_i32 s0, s2, 31 247; GFX6-NEXT: s_add_i32 s1, s2, s0 248; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 249; GFX6-NEXT: s_xor_b32 s1, s1, s0 250; GFX6-NEXT: s_xor_b32 s2, s0, s8 251; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 252; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 253; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 254; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 255; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 256; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 257; GFX6-NEXT: v_mul_lo_u32 v1, v0, s9 258; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 259; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 260; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v1 261; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 262; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s9, v1 263; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] 264; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 265; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 266; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 267; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 268; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 269; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 270; GFX6-NEXT: s_endpgm 271; GFX9-LABEL: sdiv_i32: 272; GFX9: ; %bb.0: 273; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 274; GFX9-NEXT: v_mov_b32_e32 v2, 0 275; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 276; GFX9-NEXT: s_waitcnt lgkmcnt(0) 277; GFX9-NEXT: s_ashr_i32 s4, s3, 31 278; GFX9-NEXT: s_add_i32 s3, s3, s4 279; GFX9-NEXT: s_xor_b32 s5, s3, s4 280; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 281; GFX9-NEXT: s_sub_i32 s3, 0, s5 282; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 283; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 284; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 285; GFX9-NEXT: v_mul_lo_u32 v1, s3, v0 286; GFX9-NEXT: s_ashr_i32 s3, s2, 31 287; GFX9-NEXT: s_add_i32 s2, s2, s3 288; GFX9-NEXT: s_xor_b32 s2, s2, s3 289; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 290; GFX9-NEXT: s_xor_b32 s3, s3, s4 291; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 292; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 293; GFX9-NEXT: v_mul_lo_u32 v1, v0, s5 294; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 295; GFX9-NEXT: v_sub_u32_e32 v1, s2, v1 296; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 297; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 298; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 299; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 300; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 301; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 302; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 303; GFX9-NEXT: v_xor_b32_e32 v0, s3, v0 304; GFX9-NEXT: v_subrev_u32_e32 v0, s3, v0 305; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 306; GFX9-NEXT: s_endpgm 307 %r = sdiv i32 %x, %y 308 store i32 %r, i32 addrspace(1)* %out 309 ret void 310} 311 312define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) { 313; CHECK-LABEL: @srem_i32( 314; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31 315; CHECK-NEXT: [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31 316; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[X]], [[TMP1]] 317; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[Y]], [[TMP2]] 318; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP1]] 319; CHECK-NEXT: [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP2]] 320; CHECK-NEXT: [[TMP7:%.*]] = uitofp i32 [[TMP6]] to float 321; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 322; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP8]], 0x41EFFFFFC0000000 323; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP9]] to i32 324; CHECK-NEXT: [[TMP11:%.*]] = sub i32 0, [[TMP6]] 325; CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], [[TMP10]] 326; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP10]] to i64 327; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP12]] to i64 328; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP13]], [[TMP14]] 329; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32 330; CHECK-NEXT: [[TMP17:%.*]] = lshr i64 [[TMP15]], 32 331; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 332; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP10]], [[TMP18]] 333; CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP5]] to i64 334; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP19]] to i64 335; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP20]], [[TMP21]] 336; CHECK-NEXT: [[TMP23:%.*]] = trunc i64 [[TMP22]] to i32 337; CHECK-NEXT: [[TMP24:%.*]] = lshr i64 [[TMP22]], 32 338; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32 339; CHECK-NEXT: [[TMP26:%.*]] = mul i32 [[TMP25]], [[TMP6]] 340; CHECK-NEXT: [[TMP27:%.*]] = sub i32 [[TMP5]], [[TMP26]] 341; CHECK-NEXT: [[TMP28:%.*]] = icmp uge i32 [[TMP27]], [[TMP6]] 342; CHECK-NEXT: [[TMP29:%.*]] = sub i32 [[TMP27]], [[TMP6]] 343; CHECK-NEXT: [[TMP30:%.*]] = select i1 [[TMP28]], i32 [[TMP29]], i32 [[TMP27]] 344; CHECK-NEXT: [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP6]] 345; CHECK-NEXT: [[TMP32:%.*]] = sub i32 [[TMP30]], [[TMP6]] 346; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP30]] 347; CHECK-NEXT: [[TMP34:%.*]] = xor i32 [[TMP33]], [[TMP1]] 348; CHECK-NEXT: [[TMP35:%.*]] = sub i32 [[TMP34]], [[TMP1]] 349; CHECK-NEXT: store i32 [[TMP35]], i32 addrspace(1)* [[OUT:%.*]], align 4 350; CHECK-NEXT: ret void 351; 352; GFX6-LABEL: srem_i32: 353; GFX6: ; %bb.0: 354; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 355; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 356; GFX6-NEXT: s_waitcnt lgkmcnt(0) 357; GFX6-NEXT: s_ashr_i32 s4, s3, 31 358; GFX6-NEXT: s_add_i32 s3, s3, s4 359; GFX6-NEXT: s_xor_b32 s6, s3, s4 360; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 361; GFX6-NEXT: s_sub_i32 s3, 0, s6 362; GFX6-NEXT: s_ashr_i32 s4, s2, 31 363; GFX6-NEXT: s_add_i32 s2, s2, s4 364; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 365; GFX6-NEXT: s_xor_b32 s5, s2, s4 366; GFX6-NEXT: s_mov_b32 s2, -1 367; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 368; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 369; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 370; GFX6-NEXT: s_mov_b32 s3, 0xf000 371; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 372; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 373; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 374; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 375; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s5, v0 376; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s6, v0 377; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 378; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 379; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s6, v0 380; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 381; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 382; GFX6-NEXT: v_xor_b32_e32 v0, s4, v0 383; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 384; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 385; GFX6-NEXT: s_endpgm 386; GFX9-LABEL: srem_i32: 387; GFX9: ; %bb.0: 388; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 389; GFX9-NEXT: s_nop 0 390; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 391; GFX9-NEXT: s_waitcnt lgkmcnt(0) 392; GFX9-NEXT: s_ashr_i32 s4, s3, 31 393; GFX9-NEXT: s_add_i32 s3, s3, s4 394; GFX9-NEXT: s_xor_b32 s3, s3, s4 395; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 396; GFX9-NEXT: s_sub_i32 s4, 0, s3 397; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 398; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 399; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 400; GFX9-NEXT: v_mul_lo_u32 v1, s4, v0 401; GFX9-NEXT: s_ashr_i32 s4, s2, 31 402; GFX9-NEXT: s_add_i32 s2, s2, s4 403; GFX9-NEXT: s_xor_b32 s2, s2, s4 404; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 405; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 406; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 407; GFX9-NEXT: v_mov_b32_e32 v1, 0 408; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 409; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 410; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v0 411; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 412; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 413; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v0 414; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 415; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 416; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 417; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 418; GFX9-NEXT: global_store_dword v1, v0, s[0:1] 419; GFX9-NEXT: s_endpgm 420 %r = srem i32 %x, %y 421 store i32 %r, i32 addrspace(1)* %out 422 ret void 423} 424 425define amdgpu_kernel void @udiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) { 426; CHECK-LABEL: @udiv_i16( 427; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32 428; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32 429; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 430; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 431; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 432; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 433; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 434; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 435; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 436; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 437; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 438; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 439; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 440; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 441; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 442; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 65535 443; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16 444; CHECK-NEXT: store i16 [[TMP17]], i16 addrspace(1)* [[OUT:%.*]], align 2 445; CHECK-NEXT: ret void 446; 447; GFX6-LABEL: udiv_i16: 448; GFX6: ; %bb.0: 449; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb 450; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 451; GFX6-NEXT: s_waitcnt lgkmcnt(0) 452; GFX6-NEXT: s_lshr_b32 s3, s2, 16 453; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 454; GFX6-NEXT: s_and_b32 s2, s2, 0xffff 455; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s2 456; GFX6-NEXT: s_mov_b32 s3, 0xf000 457; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 458; GFX6-NEXT: s_mov_b32 s2, -1 459; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 460; GFX6-NEXT: v_trunc_f32_e32 v2, v2 461; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 462; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 463; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 464; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 465; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 466; GFX6-NEXT: s_endpgm 467; GFX9-LABEL: udiv_i16: 468; GFX9: ; %bb.0: 469; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 470; GFX9-NEXT: v_mov_b32_e32 v3, 0 471; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 472; GFX9-NEXT: s_waitcnt lgkmcnt(0) 473; GFX9-NEXT: s_lshr_b32 s3, s2, 16 474; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 475; GFX9-NEXT: s_and_b32 s2, s2, 0xffff 476; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 477; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 478; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 479; GFX9-NEXT: v_trunc_f32_e32 v2, v2 480; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 481; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 482; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 483; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc 484; GFX9-NEXT: global_store_short v3, v0, s[0:1] 485; GFX9-NEXT: s_endpgm 486 %r = udiv i16 %x, %y 487 store i16 %r, i16 addrspace(1)* %out 488 ret void 489} 490 491define amdgpu_kernel void @urem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) { 492; CHECK-LABEL: @urem_i16( 493; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32 494; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32 495; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 496; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 497; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 498; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 499; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 500; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 501; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 502; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 503; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 504; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 505; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 506; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 507; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 508; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]] 509; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]] 510; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 65535 511; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16 512; CHECK-NEXT: store i16 [[TMP19]], i16 addrspace(1)* [[OUT:%.*]], align 2 513; CHECK-NEXT: ret void 514; 515; GFX6-LABEL: urem_i16: 516; GFX6: ; %bb.0: 517; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 518; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 519; GFX6-NEXT: s_waitcnt lgkmcnt(0) 520; GFX6-NEXT: s_lshr_b32 s2, s4, 16 521; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 522; GFX6-NEXT: s_and_b32 s3, s4, 0xffff 523; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 524; GFX6-NEXT: s_mov_b32 s3, 0xf000 525; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 526; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 527; GFX6-NEXT: v_trunc_f32_e32 v2, v2 528; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 529; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 530; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 531; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 532; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 533; GFX6-NEXT: s_mov_b32 s2, -1 534; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 535; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 536; GFX6-NEXT: s_endpgm 537; GFX9-LABEL: urem_i16: 538; GFX9: ; %bb.0: 539; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 540; GFX9-NEXT: s_nop 0 541; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 542; GFX9-NEXT: s_waitcnt lgkmcnt(0) 543; GFX9-NEXT: s_lshr_b32 s3, s2, 16 544; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 545; GFX9-NEXT: s_and_b32 s4, s2, 0xffff 546; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s4 547; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 548; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 549; GFX9-NEXT: v_trunc_f32_e32 v2, v2 550; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v2 551; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 552; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 553; GFX9-NEXT: v_mov_b32_e32 v1, 0 554; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 555; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 556; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 557; GFX9-NEXT: global_store_short v1, v0, s[0:1] 558; GFX9-NEXT: s_endpgm 559 %r = urem i16 %x, %y 560 store i16 %r, i16 addrspace(1)* %out 561 ret void 562} 563 564define amdgpu_kernel void @sdiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) { 565; CHECK-LABEL: @sdiv_i16( 566; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32 567; CHECK-NEXT: [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32 568; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 569; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 570; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 571; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 572; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 573; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 574; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 575; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 576; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 577; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 578; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 579; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 580; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 581; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 582; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 583; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 584; CHECK-NEXT: [[TMP19:%.*]] = shl i32 [[TMP18]], 16 585; CHECK-NEXT: [[TMP20:%.*]] = ashr i32 [[TMP19]], 16 586; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16 587; CHECK-NEXT: store i16 [[TMP21]], i16 addrspace(1)* [[OUT:%.*]], align 2 588; CHECK-NEXT: ret void 589; 590; GFX6-LABEL: sdiv_i16: 591; GFX6: ; %bb.0: 592; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 593; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 594; GFX6-NEXT: s_mov_b32 s7, 0xf000 595; GFX6-NEXT: s_mov_b32 s6, -1 596; GFX6-NEXT: s_waitcnt lgkmcnt(0) 597; GFX6-NEXT: s_ashr_i32 s1, s0, 16 598; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s1 599; GFX6-NEXT: s_sext_i32_i16 s0, s0 600; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s0 601; GFX6-NEXT: s_xor_b32 s0, s0, s1 602; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 603; GFX6-NEXT: s_ashr_i32 s0, s0, 30 604; GFX6-NEXT: s_or_b32 s0, s0, 1 605; GFX6-NEXT: v_mov_b32_e32 v3, s0 606; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 607; GFX6-NEXT: v_trunc_f32_e32 v2, v2 608; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 609; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 610; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 611; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 612; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 613; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 614; GFX6-NEXT: s_endpgm 615; GFX9-LABEL: sdiv_i16: 616; GFX9: ; %bb.0: 617; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 618; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 619; GFX9-NEXT: v_mov_b32_e32 v1, 0 620; GFX9-NEXT: s_waitcnt lgkmcnt(0) 621; GFX9-NEXT: s_ashr_i32 s0, s4, 16 622; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 623; GFX9-NEXT: s_sext_i32_i16 s1, s4 624; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 625; GFX9-NEXT: s_xor_b32 s0, s1, s0 626; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 627; GFX9-NEXT: s_ashr_i32 s0, s0, 30 628; GFX9-NEXT: s_or_b32 s4, s0, 1 629; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 630; GFX9-NEXT: v_trunc_f32_e32 v3, v3 631; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 632; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 633; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 634; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 635; GFX9-NEXT: s_cselect_b32 s0, s4, 0 636; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 637; GFX9-NEXT: global_store_short v1, v0, s[2:3] 638; GFX9-NEXT: s_endpgm 639 %r = sdiv i16 %x, %y 640 store i16 %r, i16 addrspace(1)* %out 641 ret void 642} 643 644define amdgpu_kernel void @srem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) { 645; CHECK-LABEL: @srem_i16( 646; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32 647; CHECK-NEXT: [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32 648; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 649; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 650; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 651; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 652; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 653; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 654; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 655; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 656; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 657; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 658; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 659; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 660; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 661; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 662; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 663; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 664; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]] 665; CHECK-NEXT: [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]] 666; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 16 667; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 16 668; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16 669; CHECK-NEXT: store i16 [[TMP23]], i16 addrspace(1)* [[OUT:%.*]], align 2 670; CHECK-NEXT: ret void 671; 672; GFX6-LABEL: srem_i16: 673; GFX6: ; %bb.0: 674; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 675; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 676; GFX6-NEXT: s_waitcnt lgkmcnt(0) 677; GFX6-NEXT: s_ashr_i32 s2, s4, 16 678; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s2 679; GFX6-NEXT: s_sext_i32_i16 s3, s4 680; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s3 681; GFX6-NEXT: s_xor_b32 s3, s3, s2 682; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 683; GFX6-NEXT: s_ashr_i32 s3, s3, 30 684; GFX6-NEXT: s_or_b32 s3, s3, 1 685; GFX6-NEXT: v_mov_b32_e32 v3, s3 686; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 687; GFX6-NEXT: v_trunc_f32_e32 v2, v2 688; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 689; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 690; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 691; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 692; GFX6-NEXT: s_mov_b32 s3, 0xf000 693; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 694; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 695; GFX6-NEXT: s_mov_b32 s2, -1 696; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 697; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 698; GFX6-NEXT: s_endpgm 699; GFX9-LABEL: srem_i16: 700; GFX9: ; %bb.0: 701; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 702; GFX9-NEXT: s_nop 0 703; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 704; GFX9-NEXT: s_waitcnt lgkmcnt(0) 705; GFX9-NEXT: s_ashr_i32 s5, s4, 16 706; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s5 707; GFX9-NEXT: s_sext_i32_i16 s2, s4 708; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s2 709; GFX9-NEXT: s_xor_b32 s2, s2, s5 710; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 711; GFX9-NEXT: s_ashr_i32 s2, s2, 30 712; GFX9-NEXT: s_or_b32 s6, s2, 1 713; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 714; GFX9-NEXT: v_trunc_f32_e32 v2, v2 715; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 716; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 717; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| 718; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 719; GFX9-NEXT: s_cselect_b32 s2, s6, 0 720; GFX9-NEXT: v_add_u32_e32 v0, s2, v2 721; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 722; GFX9-NEXT: v_mov_b32_e32 v1, 0 723; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 724; GFX9-NEXT: global_store_short v1, v0, s[0:1] 725; GFX9-NEXT: s_endpgm 726 %r = srem i16 %x, %y 727 store i16 %r, i16 addrspace(1)* %out 728 ret void 729} 730 731define amdgpu_kernel void @udiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) { 732; CHECK-LABEL: @udiv_i8( 733; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32 734; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32 735; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 736; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 737; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 738; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 739; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 740; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 741; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 742; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 743; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 744; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 745; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 746; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 747; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 748; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 255 749; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 750; CHECK-NEXT: store i8 [[TMP17]], i8 addrspace(1)* [[OUT:%.*]], align 1 751; CHECK-NEXT: ret void 752; 753; GFX6-LABEL: udiv_i8: 754; GFX6: ; %bb.0: 755; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 756; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 757; GFX6-NEXT: s_mov_b32 s7, 0xf000 758; GFX6-NEXT: s_mov_b32 s6, -1 759; GFX6-NEXT: s_waitcnt lgkmcnt(0) 760; GFX6-NEXT: v_cvt_f32_ubyte1_e32 v0, s0 761; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 762; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 763; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 764; GFX6-NEXT: v_trunc_f32_e32 v1, v1 765; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 766; GFX6-NEXT: v_mad_f32 v1, -v1, v0, v2 767; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 768; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 769; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 770; GFX6-NEXT: s_endpgm 771; GFX9-LABEL: udiv_i8: 772; GFX9: ; %bb.0: 773; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 774; GFX9-NEXT: v_mov_b32_e32 v2, 0 775; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 776; GFX9-NEXT: s_waitcnt lgkmcnt(0) 777; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s2 778; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 779; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 780; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 781; GFX9-NEXT: v_trunc_f32_e32 v1, v1 782; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v1 783; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v3 784; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 785; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc 786; GFX9-NEXT: global_store_byte v2, v0, s[0:1] 787; GFX9-NEXT: s_endpgm 788 %r = udiv i8 %x, %y 789 store i8 %r, i8 addrspace(1)* %out 790 ret void 791} 792 793define amdgpu_kernel void @urem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) { 794; CHECK-LABEL: @urem_i8( 795; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32 796; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32 797; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 798; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 799; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 800; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 801; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 802; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 803; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 804; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 805; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 806; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 807; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 808; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 809; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 810; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]] 811; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]] 812; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 255 813; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i8 814; CHECK-NEXT: store i8 [[TMP19]], i8 addrspace(1)* [[OUT:%.*]], align 1 815; CHECK-NEXT: ret void 816; 817; GFX6-LABEL: urem_i8: 818; GFX6: ; %bb.0: 819; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 820; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 821; GFX6-NEXT: s_mov_b32 s3, 0xf000 822; GFX6-NEXT: s_waitcnt lgkmcnt(0) 823; GFX6-NEXT: v_cvt_f32_ubyte1_e32 v0, s4 824; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 825; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 826; GFX6-NEXT: s_lshr_b32 s2, s4, 8 827; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 828; GFX6-NEXT: v_trunc_f32_e32 v1, v1 829; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 830; GFX6-NEXT: v_mad_f32 v1, -v1, v0, v2 831; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 832; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 833; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 834; GFX6-NEXT: s_mov_b32 s2, -1 835; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 836; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 837; GFX6-NEXT: s_endpgm 838; GFX9-LABEL: urem_i8: 839; GFX9: ; %bb.0: 840; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 841; GFX9-NEXT: s_nop 0 842; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 843; GFX9-NEXT: s_waitcnt lgkmcnt(0) 844; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s2 845; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 846; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 847; GFX9-NEXT: s_lshr_b32 s3, s2, 8 848; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1 849; GFX9-NEXT: v_trunc_f32_e32 v1, v1 850; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v1 851; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v2 852; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 853; GFX9-NEXT: v_mov_b32_e32 v1, 0 854; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 855; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 856; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 857; GFX9-NEXT: global_store_byte v1, v0, s[0:1] 858; GFX9-NEXT: s_endpgm 859 %r = urem i8 %x, %y 860 store i8 %r, i8 addrspace(1)* %out 861 ret void 862} 863 864define amdgpu_kernel void @sdiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) { 865; CHECK-LABEL: @sdiv_i8( 866; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32 867; CHECK-NEXT: [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32 868; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 869; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 870; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 871; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 872; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 873; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 874; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 875; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 876; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 877; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 878; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 879; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 880; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 881; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 882; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 883; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 884; CHECK-NEXT: [[TMP19:%.*]] = shl i32 [[TMP18]], 24 885; CHECK-NEXT: [[TMP20:%.*]] = ashr i32 [[TMP19]], 24 886; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i8 887; CHECK-NEXT: store i8 [[TMP21]], i8 addrspace(1)* [[OUT:%.*]], align 1 888; CHECK-NEXT: ret void 889; 890; GFX6-LABEL: sdiv_i8: 891; GFX6: ; %bb.0: 892; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 893; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 894; GFX6-NEXT: s_mov_b32 s7, 0xf000 895; GFX6-NEXT: s_mov_b32 s6, -1 896; GFX6-NEXT: s_waitcnt lgkmcnt(0) 897; GFX6-NEXT: s_bfe_i32 s1, s0, 0x80008 898; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s1 899; GFX6-NEXT: s_sext_i32_i8 s0, s0 900; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s0 901; GFX6-NEXT: s_xor_b32 s0, s0, s1 902; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 903; GFX6-NEXT: s_ashr_i32 s0, s0, 30 904; GFX6-NEXT: s_or_b32 s0, s0, 1 905; GFX6-NEXT: v_mov_b32_e32 v3, s0 906; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 907; GFX6-NEXT: v_trunc_f32_e32 v2, v2 908; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 909; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 910; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 911; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 912; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 913; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 914; GFX6-NEXT: s_endpgm 915; GFX9-LABEL: sdiv_i8: 916; GFX9: ; %bb.0: 917; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 918; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 919; GFX9-NEXT: v_mov_b32_e32 v1, 0 920; GFX9-NEXT: s_waitcnt lgkmcnt(0) 921; GFX9-NEXT: s_bfe_i32 s0, s4, 0x80008 922; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 923; GFX9-NEXT: s_sext_i32_i8 s1, s4 924; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 925; GFX9-NEXT: s_xor_b32 s0, s1, s0 926; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 927; GFX9-NEXT: s_ashr_i32 s0, s0, 30 928; GFX9-NEXT: s_or_b32 s4, s0, 1 929; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 930; GFX9-NEXT: v_trunc_f32_e32 v3, v3 931; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 932; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 933; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 934; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 935; GFX9-NEXT: s_cselect_b32 s0, s4, 0 936; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 937; GFX9-NEXT: global_store_byte v1, v0, s[2:3] 938; GFX9-NEXT: s_endpgm 939 %r = sdiv i8 %x, %y 940 store i8 %r, i8 addrspace(1)* %out 941 ret void 942} 943 944define amdgpu_kernel void @srem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) { 945; CHECK-LABEL: @srem_i8( 946; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32 947; CHECK-NEXT: [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32 948; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 949; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 950; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 951; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 952; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 953; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 954; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 955; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 956; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 957; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 958; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 959; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 960; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 961; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 962; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 963; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 964; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]] 965; CHECK-NEXT: [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]] 966; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 24 967; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 24 968; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i8 969; CHECK-NEXT: store i8 [[TMP23]], i8 addrspace(1)* [[OUT:%.*]], align 1 970; CHECK-NEXT: ret void 971; 972; GFX6-LABEL: srem_i8: 973; GFX6: ; %bb.0: 974; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 975; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 976; GFX6-NEXT: s_mov_b32 s7, 0xf000 977; GFX6-NEXT: s_mov_b32 s6, -1 978; GFX6-NEXT: s_waitcnt lgkmcnt(0) 979; GFX6-NEXT: s_bfe_i32 s1, s0, 0x80008 980; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s1 981; GFX6-NEXT: s_sext_i32_i8 s3, s0 982; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s3 983; GFX6-NEXT: s_xor_b32 s1, s3, s1 984; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 985; GFX6-NEXT: s_ashr_i32 s1, s1, 30 986; GFX6-NEXT: s_or_b32 s1, s1, 1 987; GFX6-NEXT: v_mov_b32_e32 v3, s1 988; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 989; GFX6-NEXT: v_trunc_f32_e32 v2, v2 990; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 991; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 992; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 993; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 994; GFX6-NEXT: s_lshr_b32 s2, s0, 8 995; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 996; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 997; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 998; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 999; GFX6-NEXT: s_endpgm 1000; GFX9-LABEL: srem_i8: 1001; GFX9: ; %bb.0: 1002; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 1003; GFX9-NEXT: s_nop 0 1004; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1005; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1006; GFX9-NEXT: s_bfe_i32 s2, s4, 0x80008 1007; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 1008; GFX9-NEXT: s_sext_i32_i8 s3, s4 1009; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s3 1010; GFX9-NEXT: s_xor_b32 s2, s3, s2 1011; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 1012; GFX9-NEXT: s_ashr_i32 s2, s2, 30 1013; GFX9-NEXT: s_lshr_b32 s5, s4, 8 1014; GFX9-NEXT: s_or_b32 s6, s2, 1 1015; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 1016; GFX9-NEXT: v_trunc_f32_e32 v2, v2 1017; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 1018; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 1019; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| 1020; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 1021; GFX9-NEXT: s_cselect_b32 s2, s6, 0 1022; GFX9-NEXT: v_add_u32_e32 v0, s2, v2 1023; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 1024; GFX9-NEXT: v_mov_b32_e32 v1, 0 1025; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 1026; GFX9-NEXT: global_store_byte v1, v0, s[0:1] 1027; GFX9-NEXT: s_endpgm 1028 %r = srem i8 %x, %y 1029 store i8 %r, i8 addrspace(1)* %out 1030 ret void 1031} 1032 1033define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 1034; CHECK-LABEL: @udiv_v4i32( 1035; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 1036; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 1037; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 1038; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 1039; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 1040; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 1041; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 1042; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 1043; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 1044; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 1045; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 1046; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 1047; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 1048; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 1049; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 1050; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 1051; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 1052; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 1053; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 1054; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 1055; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 1056; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 1057; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 1058; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 1059; CHECK-NEXT: [[TMP25:%.*]] = add i32 [[TMP21]], 1 1060; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]] 1061; CHECK-NEXT: [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]] 1062; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]] 1063; CHECK-NEXT: [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]] 1064; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP26]], 1 1065; CHECK-NEXT: [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]] 1066; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> undef, i32 [[TMP31]], i64 0 1067; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i32> [[X]], i64 1 1068; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[Y]], i64 1 1069; CHECK-NEXT: [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float 1070; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 1071; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000 1072; CHECK-NEXT: [[TMP38:%.*]] = fptoui float [[TMP37]] to i32 1073; CHECK-NEXT: [[TMP39:%.*]] = sub i32 0, [[TMP34]] 1074; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]] 1075; CHECK-NEXT: [[TMP41:%.*]] = zext i32 [[TMP38]] to i64 1076; CHECK-NEXT: [[TMP42:%.*]] = zext i32 [[TMP40]] to i64 1077; CHECK-NEXT: [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]] 1078; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 1079; CHECK-NEXT: [[TMP45:%.*]] = lshr i64 [[TMP43]], 32 1080; CHECK-NEXT: [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32 1081; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]] 1082; CHECK-NEXT: [[TMP48:%.*]] = zext i32 [[TMP33]] to i64 1083; CHECK-NEXT: [[TMP49:%.*]] = zext i32 [[TMP47]] to i64 1084; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]] 1085; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 1086; CHECK-NEXT: [[TMP52:%.*]] = lshr i64 [[TMP50]], 32 1087; CHECK-NEXT: [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32 1088; CHECK-NEXT: [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]] 1089; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]] 1090; CHECK-NEXT: [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]] 1091; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP53]], 1 1092; CHECK-NEXT: [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]] 1093; CHECK-NEXT: [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]] 1094; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]] 1095; CHECK-NEXT: [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]] 1096; CHECK-NEXT: [[TMP62:%.*]] = add i32 [[TMP58]], 1 1097; CHECK-NEXT: [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]] 1098; CHECK-NEXT: [[TMP64:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP63]], i64 1 1099; CHECK-NEXT: [[TMP65:%.*]] = extractelement <4 x i32> [[X]], i64 2 1100; CHECK-NEXT: [[TMP66:%.*]] = extractelement <4 x i32> [[Y]], i64 2 1101; CHECK-NEXT: [[TMP67:%.*]] = uitofp i32 [[TMP66]] to float 1102; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP67]]) 1103; CHECK-NEXT: [[TMP69:%.*]] = fmul fast float [[TMP68]], 0x41EFFFFFC0000000 1104; CHECK-NEXT: [[TMP70:%.*]] = fptoui float [[TMP69]] to i32 1105; CHECK-NEXT: [[TMP71:%.*]] = sub i32 0, [[TMP66]] 1106; CHECK-NEXT: [[TMP72:%.*]] = mul i32 [[TMP71]], [[TMP70]] 1107; CHECK-NEXT: [[TMP73:%.*]] = zext i32 [[TMP70]] to i64 1108; CHECK-NEXT: [[TMP74:%.*]] = zext i32 [[TMP72]] to i64 1109; CHECK-NEXT: [[TMP75:%.*]] = mul i64 [[TMP73]], [[TMP74]] 1110; CHECK-NEXT: [[TMP76:%.*]] = trunc i64 [[TMP75]] to i32 1111; CHECK-NEXT: [[TMP77:%.*]] = lshr i64 [[TMP75]], 32 1112; CHECK-NEXT: [[TMP78:%.*]] = trunc i64 [[TMP77]] to i32 1113; CHECK-NEXT: [[TMP79:%.*]] = add i32 [[TMP70]], [[TMP78]] 1114; CHECK-NEXT: [[TMP80:%.*]] = zext i32 [[TMP65]] to i64 1115; CHECK-NEXT: [[TMP81:%.*]] = zext i32 [[TMP79]] to i64 1116; CHECK-NEXT: [[TMP82:%.*]] = mul i64 [[TMP80]], [[TMP81]] 1117; CHECK-NEXT: [[TMP83:%.*]] = trunc i64 [[TMP82]] to i32 1118; CHECK-NEXT: [[TMP84:%.*]] = lshr i64 [[TMP82]], 32 1119; CHECK-NEXT: [[TMP85:%.*]] = trunc i64 [[TMP84]] to i32 1120; CHECK-NEXT: [[TMP86:%.*]] = mul i32 [[TMP85]], [[TMP66]] 1121; CHECK-NEXT: [[TMP87:%.*]] = sub i32 [[TMP65]], [[TMP86]] 1122; CHECK-NEXT: [[TMP88:%.*]] = icmp uge i32 [[TMP87]], [[TMP66]] 1123; CHECK-NEXT: [[TMP89:%.*]] = add i32 [[TMP85]], 1 1124; CHECK-NEXT: [[TMP90:%.*]] = select i1 [[TMP88]], i32 [[TMP89]], i32 [[TMP85]] 1125; CHECK-NEXT: [[TMP91:%.*]] = sub i32 [[TMP87]], [[TMP66]] 1126; CHECK-NEXT: [[TMP92:%.*]] = select i1 [[TMP88]], i32 [[TMP91]], i32 [[TMP87]] 1127; CHECK-NEXT: [[TMP93:%.*]] = icmp uge i32 [[TMP92]], [[TMP66]] 1128; CHECK-NEXT: [[TMP94:%.*]] = add i32 [[TMP90]], 1 1129; CHECK-NEXT: [[TMP95:%.*]] = select i1 [[TMP93]], i32 [[TMP94]], i32 [[TMP90]] 1130; CHECK-NEXT: [[TMP96:%.*]] = insertelement <4 x i32> [[TMP64]], i32 [[TMP95]], i64 2 1131; CHECK-NEXT: [[TMP97:%.*]] = extractelement <4 x i32> [[X]], i64 3 1132; CHECK-NEXT: [[TMP98:%.*]] = extractelement <4 x i32> [[Y]], i64 3 1133; CHECK-NEXT: [[TMP99:%.*]] = uitofp i32 [[TMP98]] to float 1134; CHECK-NEXT: [[TMP100:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP99]]) 1135; CHECK-NEXT: [[TMP101:%.*]] = fmul fast float [[TMP100]], 0x41EFFFFFC0000000 1136; CHECK-NEXT: [[TMP102:%.*]] = fptoui float [[TMP101]] to i32 1137; CHECK-NEXT: [[TMP103:%.*]] = sub i32 0, [[TMP98]] 1138; CHECK-NEXT: [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP102]] 1139; CHECK-NEXT: [[TMP105:%.*]] = zext i32 [[TMP102]] to i64 1140; CHECK-NEXT: [[TMP106:%.*]] = zext i32 [[TMP104]] to i64 1141; CHECK-NEXT: [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]] 1142; CHECK-NEXT: [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32 1143; CHECK-NEXT: [[TMP109:%.*]] = lshr i64 [[TMP107]], 32 1144; CHECK-NEXT: [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32 1145; CHECK-NEXT: [[TMP111:%.*]] = add i32 [[TMP102]], [[TMP110]] 1146; CHECK-NEXT: [[TMP112:%.*]] = zext i32 [[TMP97]] to i64 1147; CHECK-NEXT: [[TMP113:%.*]] = zext i32 [[TMP111]] to i64 1148; CHECK-NEXT: [[TMP114:%.*]] = mul i64 [[TMP112]], [[TMP113]] 1149; CHECK-NEXT: [[TMP115:%.*]] = trunc i64 [[TMP114]] to i32 1150; CHECK-NEXT: [[TMP116:%.*]] = lshr i64 [[TMP114]], 32 1151; CHECK-NEXT: [[TMP117:%.*]] = trunc i64 [[TMP116]] to i32 1152; CHECK-NEXT: [[TMP118:%.*]] = mul i32 [[TMP117]], [[TMP98]] 1153; CHECK-NEXT: [[TMP119:%.*]] = sub i32 [[TMP97]], [[TMP118]] 1154; CHECK-NEXT: [[TMP120:%.*]] = icmp uge i32 [[TMP119]], [[TMP98]] 1155; CHECK-NEXT: [[TMP121:%.*]] = add i32 [[TMP117]], 1 1156; CHECK-NEXT: [[TMP122:%.*]] = select i1 [[TMP120]], i32 [[TMP121]], i32 [[TMP117]] 1157; CHECK-NEXT: [[TMP123:%.*]] = sub i32 [[TMP119]], [[TMP98]] 1158; CHECK-NEXT: [[TMP124:%.*]] = select i1 [[TMP120]], i32 [[TMP123]], i32 [[TMP119]] 1159; CHECK-NEXT: [[TMP125:%.*]] = icmp uge i32 [[TMP124]], [[TMP98]] 1160; CHECK-NEXT: [[TMP126:%.*]] = add i32 [[TMP122]], 1 1161; CHECK-NEXT: [[TMP127:%.*]] = select i1 [[TMP125]], i32 [[TMP126]], i32 [[TMP122]] 1162; CHECK-NEXT: [[TMP128:%.*]] = insertelement <4 x i32> [[TMP96]], i32 [[TMP127]], i64 3 1163; CHECK-NEXT: store <4 x i32> [[TMP128]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 1164; CHECK-NEXT: ret void 1165; 1166; GFX6-LABEL: udiv_v4i32: 1167; GFX6: ; %bb.0: 1168; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 1169; GFX6-NEXT: s_mov_b32 s3, 0x4f7ffffe 1170; GFX6-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 1171; GFX6-NEXT: s_mov_b32 s15, 0xf000 1172; GFX6-NEXT: s_mov_b32 s14, -1 1173; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1174; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 1175; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 1176; GFX6-NEXT: s_sub_i32 s2, 0, s8 1177; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s10 1178; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 1179; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 1180; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s11 1181; GFX6-NEXT: v_mul_f32_e32 v0, s3, v0 1182; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 1183; GFX6-NEXT: v_mul_f32_e32 v1, s3, v1 1184; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 1185; GFX6-NEXT: v_mul_lo_u32 v2, s2, v0 1186; GFX6-NEXT: s_sub_i32 s2, 0, s9 1187; GFX6-NEXT: v_mul_lo_u32 v3, s2, v1 1188; GFX6-NEXT: s_sub_i32 s2, 0, s10 1189; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 1190; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 1191; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 1192; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 1193; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 1194; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 1195; GFX6-NEXT: v_mul_lo_u32 v2, v0, s8 1196; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 1197; GFX6-NEXT: v_mul_lo_u32 v5, v1, s9 1198; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s4, v2 1199; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v2 1200; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] 1201; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v2 1202; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 1203; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 1204; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 1205; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v4 1206; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1207; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s5, v5 1208; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v1 1209; GFX6-NEXT: v_mul_f32_e32 v2, s3, v2 1210; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 1211; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v3 1212; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] 1213; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s9, v3 1214; GFX6-NEXT: v_mul_lo_u32 v4, s2, v2 1215; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 1216; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v1 1217; GFX6-NEXT: s_sub_i32 s0, 0, s11 1218; GFX6-NEXT: v_mul_hi_u32 v4, v2, v4 1219; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 1220; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v6 1221; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 1222; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 1223; GFX6-NEXT: v_mul_hi_u32 v2, s6, v2 1224; GFX6-NEXT: v_mul_f32_e32 v4, s3, v4 1225; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 1226; GFX6-NEXT: v_mul_lo_u32 v3, v2, s10 1227; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v2 1228; GFX6-NEXT: v_mul_lo_u32 v5, s0, v4 1229; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s6, v3 1230; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v3 1231; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5 1232; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] 1233; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, s10, v3 1234; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] 1235; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 1236; GFX6-NEXT: v_mul_hi_u32 v4, s7, v4 1237; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v2 1238; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 1239; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 1240; GFX6-NEXT: v_mul_lo_u32 v6, v4, s11 1241; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 1242; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s7, v6 1243; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v3 1244; GFX6-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] 1245; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s11, v3 1246; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 1247; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 1248; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1249; GFX6-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 1250; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 1251; GFX6-NEXT: s_endpgm 1252; GFX9-LABEL: udiv_v4i32: 1253; GFX9: ; %bb.0: 1254; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 1255; GFX9-NEXT: s_mov_b32 s12, 0x4f7ffffe 1256; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1257; GFX9-NEXT: v_mov_b32_e32 v4, 0 1258; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1259; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 1260; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 1261; GFX9-NEXT: s_sub_i32 s2, 0, s8 1262; GFX9-NEXT: s_sub_i32 s3, 0, s9 1263; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 1264; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 1265; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s10 1266; GFX9-NEXT: v_mul_f32_e32 v0, s12, v0 1267; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 1268; GFX9-NEXT: v_mul_f32_e32 v1, s12, v1 1269; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 1270; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v5 1271; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 1272; GFX9-NEXT: s_sub_i32 s2, 0, s10 1273; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1 1274; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 1275; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 1276; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 1277; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 1278; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 1279; GFX9-NEXT: v_mul_f32_e32 v2, s12, v5 1280; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 1281; GFX9-NEXT: v_mul_lo_u32 v3, v0, s8 1282; GFX9-NEXT: v_add_u32_e32 v6, 1, v0 1283; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 1284; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 1285; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 1286; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc 1287; GFX9-NEXT: v_subrev_u32_e32 v6, s8, v3 1288; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 1289; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 1290; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s11 1291; GFX9-NEXT: v_add_u32_e32 v6, 1, v0 1292; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc 1293; GFX9-NEXT: v_mul_lo_u32 v6, s2, v2 1294; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 1295; GFX9-NEXT: v_mul_lo_u32 v5, v1, s9 1296; GFX9-NEXT: s_sub_i32 s2, 0, s11 1297; GFX9-NEXT: v_mul_hi_u32 v6, v2, v6 1298; GFX9-NEXT: v_mul_f32_e32 v3, s12, v3 1299; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 1300; GFX9-NEXT: v_sub_u32_e32 v5, s5, v5 1301; GFX9-NEXT: v_add_u32_e32 v2, v2, v6 1302; GFX9-NEXT: v_add_u32_e32 v7, 1, v1 1303; GFX9-NEXT: v_mul_lo_u32 v6, s2, v3 1304; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v5 1305; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc 1306; GFX9-NEXT: v_subrev_u32_e32 v7, s9, v5 1307; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc 1308; GFX9-NEXT: v_mul_hi_u32 v2, s6, v2 1309; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v5 1310; GFX9-NEXT: v_mul_hi_u32 v5, v3, v6 1311; GFX9-NEXT: v_add_u32_e32 v7, 1, v1 1312; GFX9-NEXT: v_mul_lo_u32 v8, v2, s10 1313; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc 1314; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 1315; GFX9-NEXT: v_mul_hi_u32 v3, s7, v3 1316; GFX9-NEXT: v_sub_u32_e32 v6, s6, v8 1317; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v6 1318; GFX9-NEXT: v_subrev_u32_e32 v5, s10, v6 1319; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc 1320; GFX9-NEXT: v_mul_lo_u32 v6, v3, s11 1321; GFX9-NEXT: v_add_u32_e32 v7, 1, v2 1322; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc 1323; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v5 1324; GFX9-NEXT: v_add_u32_e32 v7, 1, v2 1325; GFX9-NEXT: v_sub_u32_e32 v5, s7, v6 1326; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc 1327; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 1328; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v5 1329; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 1330; GFX9-NEXT: v_subrev_u32_e32 v6, s11, v5 1331; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 1332; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 1333; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v5 1334; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 1335; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 1336; GFX9-NEXT: s_endpgm 1337 %r = udiv <4 x i32> %x, %y 1338 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 1339 ret void 1340} 1341 1342define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 1343; CHECK-LABEL: @urem_v4i32( 1344; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 1345; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 1346; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 1347; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 1348; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 1349; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 1350; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 1351; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 1352; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 1353; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 1354; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 1355; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 1356; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 1357; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 1358; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 1359; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 1360; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 1361; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 1362; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 1363; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 1364; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 1365; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 1366; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 1367; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 1368; CHECK-NEXT: [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]] 1369; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]] 1370; CHECK-NEXT: [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]] 1371; CHECK-NEXT: [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]] 1372; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]] 1373; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x i32> undef, i32 [[TMP29]], i64 0 1374; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[X]], i64 1 1375; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[Y]], i64 1 1376; CHECK-NEXT: [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float 1377; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 1378; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000 1379; CHECK-NEXT: [[TMP36:%.*]] = fptoui float [[TMP35]] to i32 1380; CHECK-NEXT: [[TMP37:%.*]] = sub i32 0, [[TMP32]] 1381; CHECK-NEXT: [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]] 1382; CHECK-NEXT: [[TMP39:%.*]] = zext i32 [[TMP36]] to i64 1383; CHECK-NEXT: [[TMP40:%.*]] = zext i32 [[TMP38]] to i64 1384; CHECK-NEXT: [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]] 1385; CHECK-NEXT: [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32 1386; CHECK-NEXT: [[TMP43:%.*]] = lshr i64 [[TMP41]], 32 1387; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 1388; CHECK-NEXT: [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]] 1389; CHECK-NEXT: [[TMP46:%.*]] = zext i32 [[TMP31]] to i64 1390; CHECK-NEXT: [[TMP47:%.*]] = zext i32 [[TMP45]] to i64 1391; CHECK-NEXT: [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]] 1392; CHECK-NEXT: [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32 1393; CHECK-NEXT: [[TMP50:%.*]] = lshr i64 [[TMP48]], 32 1394; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 1395; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]] 1396; CHECK-NEXT: [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]] 1397; CHECK-NEXT: [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]] 1398; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]] 1399; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]] 1400; CHECK-NEXT: [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]] 1401; CHECK-NEXT: [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]] 1402; CHECK-NEXT: [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]] 1403; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP59]], i64 1 1404; CHECK-NEXT: [[TMP61:%.*]] = extractelement <4 x i32> [[X]], i64 2 1405; CHECK-NEXT: [[TMP62:%.*]] = extractelement <4 x i32> [[Y]], i64 2 1406; CHECK-NEXT: [[TMP63:%.*]] = uitofp i32 [[TMP62]] to float 1407; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP63]]) 1408; CHECK-NEXT: [[TMP65:%.*]] = fmul fast float [[TMP64]], 0x41EFFFFFC0000000 1409; CHECK-NEXT: [[TMP66:%.*]] = fptoui float [[TMP65]] to i32 1410; CHECK-NEXT: [[TMP67:%.*]] = sub i32 0, [[TMP62]] 1411; CHECK-NEXT: [[TMP68:%.*]] = mul i32 [[TMP67]], [[TMP66]] 1412; CHECK-NEXT: [[TMP69:%.*]] = zext i32 [[TMP66]] to i64 1413; CHECK-NEXT: [[TMP70:%.*]] = zext i32 [[TMP68]] to i64 1414; CHECK-NEXT: [[TMP71:%.*]] = mul i64 [[TMP69]], [[TMP70]] 1415; CHECK-NEXT: [[TMP72:%.*]] = trunc i64 [[TMP71]] to i32 1416; CHECK-NEXT: [[TMP73:%.*]] = lshr i64 [[TMP71]], 32 1417; CHECK-NEXT: [[TMP74:%.*]] = trunc i64 [[TMP73]] to i32 1418; CHECK-NEXT: [[TMP75:%.*]] = add i32 [[TMP66]], [[TMP74]] 1419; CHECK-NEXT: [[TMP76:%.*]] = zext i32 [[TMP61]] to i64 1420; CHECK-NEXT: [[TMP77:%.*]] = zext i32 [[TMP75]] to i64 1421; CHECK-NEXT: [[TMP78:%.*]] = mul i64 [[TMP76]], [[TMP77]] 1422; CHECK-NEXT: [[TMP79:%.*]] = trunc i64 [[TMP78]] to i32 1423; CHECK-NEXT: [[TMP80:%.*]] = lshr i64 [[TMP78]], 32 1424; CHECK-NEXT: [[TMP81:%.*]] = trunc i64 [[TMP80]] to i32 1425; CHECK-NEXT: [[TMP82:%.*]] = mul i32 [[TMP81]], [[TMP62]] 1426; CHECK-NEXT: [[TMP83:%.*]] = sub i32 [[TMP61]], [[TMP82]] 1427; CHECK-NEXT: [[TMP84:%.*]] = icmp uge i32 [[TMP83]], [[TMP62]] 1428; CHECK-NEXT: [[TMP85:%.*]] = sub i32 [[TMP83]], [[TMP62]] 1429; CHECK-NEXT: [[TMP86:%.*]] = select i1 [[TMP84]], i32 [[TMP85]], i32 [[TMP83]] 1430; CHECK-NEXT: [[TMP87:%.*]] = icmp uge i32 [[TMP86]], [[TMP62]] 1431; CHECK-NEXT: [[TMP88:%.*]] = sub i32 [[TMP86]], [[TMP62]] 1432; CHECK-NEXT: [[TMP89:%.*]] = select i1 [[TMP87]], i32 [[TMP88]], i32 [[TMP86]] 1433; CHECK-NEXT: [[TMP90:%.*]] = insertelement <4 x i32> [[TMP60]], i32 [[TMP89]], i64 2 1434; CHECK-NEXT: [[TMP91:%.*]] = extractelement <4 x i32> [[X]], i64 3 1435; CHECK-NEXT: [[TMP92:%.*]] = extractelement <4 x i32> [[Y]], i64 3 1436; CHECK-NEXT: [[TMP93:%.*]] = uitofp i32 [[TMP92]] to float 1437; CHECK-NEXT: [[TMP94:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP93]]) 1438; CHECK-NEXT: [[TMP95:%.*]] = fmul fast float [[TMP94]], 0x41EFFFFFC0000000 1439; CHECK-NEXT: [[TMP96:%.*]] = fptoui float [[TMP95]] to i32 1440; CHECK-NEXT: [[TMP97:%.*]] = sub i32 0, [[TMP92]] 1441; CHECK-NEXT: [[TMP98:%.*]] = mul i32 [[TMP97]], [[TMP96]] 1442; CHECK-NEXT: [[TMP99:%.*]] = zext i32 [[TMP96]] to i64 1443; CHECK-NEXT: [[TMP100:%.*]] = zext i32 [[TMP98]] to i64 1444; CHECK-NEXT: [[TMP101:%.*]] = mul i64 [[TMP99]], [[TMP100]] 1445; CHECK-NEXT: [[TMP102:%.*]] = trunc i64 [[TMP101]] to i32 1446; CHECK-NEXT: [[TMP103:%.*]] = lshr i64 [[TMP101]], 32 1447; CHECK-NEXT: [[TMP104:%.*]] = trunc i64 [[TMP103]] to i32 1448; CHECK-NEXT: [[TMP105:%.*]] = add i32 [[TMP96]], [[TMP104]] 1449; CHECK-NEXT: [[TMP106:%.*]] = zext i32 [[TMP91]] to i64 1450; CHECK-NEXT: [[TMP107:%.*]] = zext i32 [[TMP105]] to i64 1451; CHECK-NEXT: [[TMP108:%.*]] = mul i64 [[TMP106]], [[TMP107]] 1452; CHECK-NEXT: [[TMP109:%.*]] = trunc i64 [[TMP108]] to i32 1453; CHECK-NEXT: [[TMP110:%.*]] = lshr i64 [[TMP108]], 32 1454; CHECK-NEXT: [[TMP111:%.*]] = trunc i64 [[TMP110]] to i32 1455; CHECK-NEXT: [[TMP112:%.*]] = mul i32 [[TMP111]], [[TMP92]] 1456; CHECK-NEXT: [[TMP113:%.*]] = sub i32 [[TMP91]], [[TMP112]] 1457; CHECK-NEXT: [[TMP114:%.*]] = icmp uge i32 [[TMP113]], [[TMP92]] 1458; CHECK-NEXT: [[TMP115:%.*]] = sub i32 [[TMP113]], [[TMP92]] 1459; CHECK-NEXT: [[TMP116:%.*]] = select i1 [[TMP114]], i32 [[TMP115]], i32 [[TMP113]] 1460; CHECK-NEXT: [[TMP117:%.*]] = icmp uge i32 [[TMP116]], [[TMP92]] 1461; CHECK-NEXT: [[TMP118:%.*]] = sub i32 [[TMP116]], [[TMP92]] 1462; CHECK-NEXT: [[TMP119:%.*]] = select i1 [[TMP117]], i32 [[TMP118]], i32 [[TMP116]] 1463; CHECK-NEXT: [[TMP120:%.*]] = insertelement <4 x i32> [[TMP90]], i32 [[TMP119]], i64 3 1464; CHECK-NEXT: store <4 x i32> [[TMP120]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 1465; CHECK-NEXT: ret void 1466; 1467; GFX6-LABEL: urem_v4i32: 1468; GFX6: ; %bb.0: 1469; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 1470; GFX6-NEXT: s_mov_b32 s13, 0x4f7ffffe 1471; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1472; GFX6-NEXT: s_mov_b32 s3, 0xf000 1473; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1474; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 1475; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 1476; GFX6-NEXT: s_sub_i32 s2, 0, s8 1477; GFX6-NEXT: s_sub_i32 s12, 0, s9 1478; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 1479; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 1480; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s10 1481; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s11 1482; GFX6-NEXT: v_mul_f32_e32 v0, s13, v0 1483; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 1484; GFX6-NEXT: v_mul_f32_e32 v1, s13, v1 1485; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 1486; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 1487; GFX6-NEXT: v_mul_lo_u32 v2, s2, v0 1488; GFX6-NEXT: s_mov_b32 s2, -1 1489; GFX6-NEXT: v_mul_lo_u32 v4, s12, v1 1490; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 1491; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 1492; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 1493; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 1494; GFX6-NEXT: v_add_i32_e32 v1, vcc, v4, v1 1495; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 1496; GFX6-NEXT: v_mul_f32_e32 v2, s13, v3 1497; GFX6-NEXT: v_mul_lo_u32 v0, v0, s8 1498; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 1499; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9 1500; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 1501; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 1502; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 1503; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1504; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 1505; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 1506; GFX6-NEXT: s_sub_i32 s4, 0, s10 1507; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1508; GFX6-NEXT: v_mul_lo_u32 v3, s4, v2 1509; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 1510; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s9, v1 1511; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 1512; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 1513; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 1514; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 1515; GFX6-NEXT: s_sub_i32 s4, 0, s11 1516; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 1517; GFX6-NEXT: v_mul_f32_e32 v3, s13, v4 1518; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 1519; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s9, v1 1520; GFX6-NEXT: v_mul_hi_u32 v2, s6, v2 1521; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 1522; GFX6-NEXT: v_mul_lo_u32 v5, s4, v3 1523; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 1524; GFX6-NEXT: v_mul_lo_u32 v2, v2, s10 1525; GFX6-NEXT: v_mul_hi_u32 v4, v3, v5 1526; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 1527; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s10, v2 1528; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 1529; GFX6-NEXT: v_mul_hi_u32 v3, s7, v3 1530; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 1531; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 1532; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s10, v2 1533; GFX6-NEXT: v_mul_lo_u32 v3, v3, s11 1534; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 1535; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 1536; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s7, v3 1537; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s11, v3 1538; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1539; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 1540; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s11, v3 1541; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1542; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 1543; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1544; GFX6-NEXT: s_endpgm 1545; GFX9-LABEL: urem_v4i32: 1546; GFX9: ; %bb.0: 1547; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 1548; GFX9-NEXT: s_mov_b32 s12, 0x4f7ffffe 1549; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1550; GFX9-NEXT: v_mov_b32_e32 v4, 0 1551; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1552; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 1553; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 1554; GFX9-NEXT: s_sub_i32 s2, 0, s8 1555; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s10 1556; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 1557; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 1558; GFX9-NEXT: s_sub_i32 s3, 0, s9 1559; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v5 1560; GFX9-NEXT: v_mul_f32_e32 v0, s12, v0 1561; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 1562; GFX9-NEXT: v_mul_f32_e32 v1, s12, v1 1563; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 1564; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s11 1565; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 1566; GFX9-NEXT: s_sub_i32 s2, 0, s10 1567; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1 1568; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 1569; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 1570; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 1571; GFX9-NEXT: v_mul_f32_e32 v2, s12, v5 1572; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 1573; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 1574; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v6 1575; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 1576; GFX9-NEXT: v_mul_lo_u32 v5, s2, v2 1577; GFX9-NEXT: s_sub_i32 s2, 0, s11 1578; GFX9-NEXT: v_mul_f32_e32 v3, s12, v3 1579; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 1580; GFX9-NEXT: v_mul_hi_u32 v5, v2, v5 1581; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 1582; GFX9-NEXT: v_mul_lo_u32 v0, v0, s8 1583; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 1584; GFX9-NEXT: v_mul_lo_u32 v5, s2, v3 1585; GFX9-NEXT: v_mul_hi_u32 v2, s6, v2 1586; GFX9-NEXT: v_mul_lo_u32 v1, v1, s9 1587; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 1588; GFX9-NEXT: v_mul_hi_u32 v5, v3, v5 1589; GFX9-NEXT: v_subrev_u32_e32 v6, s8, v0 1590; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 1591; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc 1592; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 1593; GFX9-NEXT: v_mul_hi_u32 v3, s7, v3 1594; GFX9-NEXT: v_mul_lo_u32 v2, v2, s10 1595; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 1596; GFX9-NEXT: v_subrev_u32_e32 v6, s8, v0 1597; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 1598; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc 1599; GFX9-NEXT: v_subrev_u32_e32 v6, s9, v1 1600; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 1601; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 1602; GFX9-NEXT: v_mul_lo_u32 v3, v3, s11 1603; GFX9-NEXT: v_sub_u32_e32 v2, s6, v2 1604; GFX9-NEXT: v_subrev_u32_e32 v6, s9, v1 1605; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 1606; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 1607; GFX9-NEXT: v_subrev_u32_e32 v5, s10, v2 1608; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 1609; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 1610; GFX9-NEXT: v_subrev_u32_e32 v5, s10, v2 1611; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 1612; GFX9-NEXT: v_sub_u32_e32 v3, s7, v3 1613; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 1614; GFX9-NEXT: v_subrev_u32_e32 v5, s11, v3 1615; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1616; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 1617; GFX9-NEXT: v_subrev_u32_e32 v5, s11, v3 1618; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1619; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 1620; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 1621; GFX9-NEXT: s_endpgm 1622 %r = urem <4 x i32> %x, %y 1623 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 1624 ret void 1625} 1626 1627define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 1628; CHECK-LABEL: @sdiv_v4i32( 1629; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 1630; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 1631; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 1632; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 1633; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 1634; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]] 1635; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]] 1636; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]] 1637; CHECK-NEXT: [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]] 1638; CHECK-NEXT: [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float 1639; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]]) 1640; CHECK-NEXT: [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000 1641; CHECK-NEXT: [[TMP13:%.*]] = fptoui float [[TMP12]] to i32 1642; CHECK-NEXT: [[TMP14:%.*]] = sub i32 0, [[TMP9]] 1643; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]] 1644; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP13]] to i64 1645; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 1646; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 1647; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 1648; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 1649; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 1650; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]] 1651; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP8]] to i64 1652; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[TMP22]] to i64 1653; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]] 1654; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32 1655; CHECK-NEXT: [[TMP27:%.*]] = lshr i64 [[TMP25]], 32 1656; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 1657; CHECK-NEXT: [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]] 1658; CHECK-NEXT: [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]] 1659; CHECK-NEXT: [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]] 1660; CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP28]], 1 1661; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]] 1662; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]] 1663; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]] 1664; CHECK-NEXT: [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]] 1665; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP33]], 1 1666; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]] 1667; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]] 1668; CHECK-NEXT: [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]] 1669; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x i32> undef, i32 [[TMP40]], i64 0 1670; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i32> [[X]], i64 1 1671; CHECK-NEXT: [[TMP43:%.*]] = extractelement <4 x i32> [[Y]], i64 1 1672; CHECK-NEXT: [[TMP44:%.*]] = ashr i32 [[TMP42]], 31 1673; CHECK-NEXT: [[TMP45:%.*]] = ashr i32 [[TMP43]], 31 1674; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]] 1675; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]] 1676; CHECK-NEXT: [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]] 1677; CHECK-NEXT: [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]] 1678; CHECK-NEXT: [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]] 1679; CHECK-NEXT: [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float 1680; CHECK-NEXT: [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]]) 1681; CHECK-NEXT: [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000 1682; CHECK-NEXT: [[TMP54:%.*]] = fptoui float [[TMP53]] to i32 1683; CHECK-NEXT: [[TMP55:%.*]] = sub i32 0, [[TMP50]] 1684; CHECK-NEXT: [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]] 1685; CHECK-NEXT: [[TMP57:%.*]] = zext i32 [[TMP54]] to i64 1686; CHECK-NEXT: [[TMP58:%.*]] = zext i32 [[TMP56]] to i64 1687; CHECK-NEXT: [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]] 1688; CHECK-NEXT: [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32 1689; CHECK-NEXT: [[TMP61:%.*]] = lshr i64 [[TMP59]], 32 1690; CHECK-NEXT: [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32 1691; CHECK-NEXT: [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]] 1692; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[TMP49]] to i64 1693; CHECK-NEXT: [[TMP65:%.*]] = zext i32 [[TMP63]] to i64 1694; CHECK-NEXT: [[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]] 1695; CHECK-NEXT: [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32 1696; CHECK-NEXT: [[TMP68:%.*]] = lshr i64 [[TMP66]], 32 1697; CHECK-NEXT: [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32 1698; CHECK-NEXT: [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]] 1699; CHECK-NEXT: [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]] 1700; CHECK-NEXT: [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]] 1701; CHECK-NEXT: [[TMP73:%.*]] = add i32 [[TMP69]], 1 1702; CHECK-NEXT: [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]] 1703; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]] 1704; CHECK-NEXT: [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]] 1705; CHECK-NEXT: [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]] 1706; CHECK-NEXT: [[TMP78:%.*]] = add i32 [[TMP74]], 1 1707; CHECK-NEXT: [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]] 1708; CHECK-NEXT: [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]] 1709; CHECK-NEXT: [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]] 1710; CHECK-NEXT: [[TMP82:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[TMP81]], i64 1 1711; CHECK-NEXT: [[TMP83:%.*]] = extractelement <4 x i32> [[X]], i64 2 1712; CHECK-NEXT: [[TMP84:%.*]] = extractelement <4 x i32> [[Y]], i64 2 1713; CHECK-NEXT: [[TMP85:%.*]] = ashr i32 [[TMP83]], 31 1714; CHECK-NEXT: [[TMP86:%.*]] = ashr i32 [[TMP84]], 31 1715; CHECK-NEXT: [[TMP87:%.*]] = xor i32 [[TMP85]], [[TMP86]] 1716; CHECK-NEXT: [[TMP88:%.*]] = add i32 [[TMP83]], [[TMP85]] 1717; CHECK-NEXT: [[TMP89:%.*]] = add i32 [[TMP84]], [[TMP86]] 1718; CHECK-NEXT: [[TMP90:%.*]] = xor i32 [[TMP88]], [[TMP85]] 1719; CHECK-NEXT: [[TMP91:%.*]] = xor i32 [[TMP89]], [[TMP86]] 1720; CHECK-NEXT: [[TMP92:%.*]] = uitofp i32 [[TMP91]] to float 1721; CHECK-NEXT: [[TMP93:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP92]]) 1722; CHECK-NEXT: [[TMP94:%.*]] = fmul fast float [[TMP93]], 0x41EFFFFFC0000000 1723; CHECK-NEXT: [[TMP95:%.*]] = fptoui float [[TMP94]] to i32 1724; CHECK-NEXT: [[TMP96:%.*]] = sub i32 0, [[TMP91]] 1725; CHECK-NEXT: [[TMP97:%.*]] = mul i32 [[TMP96]], [[TMP95]] 1726; CHECK-NEXT: [[TMP98:%.*]] = zext i32 [[TMP95]] to i64 1727; CHECK-NEXT: [[TMP99:%.*]] = zext i32 [[TMP97]] to i64 1728; CHECK-NEXT: [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]] 1729; CHECK-NEXT: [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32 1730; CHECK-NEXT: [[TMP102:%.*]] = lshr i64 [[TMP100]], 32 1731; CHECK-NEXT: [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32 1732; CHECK-NEXT: [[TMP104:%.*]] = add i32 [[TMP95]], [[TMP103]] 1733; CHECK-NEXT: [[TMP105:%.*]] = zext i32 [[TMP90]] to i64 1734; CHECK-NEXT: [[TMP106:%.*]] = zext i32 [[TMP104]] to i64 1735; CHECK-NEXT: [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]] 1736; CHECK-NEXT: [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32 1737; CHECK-NEXT: [[TMP109:%.*]] = lshr i64 [[TMP107]], 32 1738; CHECK-NEXT: [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32 1739; CHECK-NEXT: [[TMP111:%.*]] = mul i32 [[TMP110]], [[TMP91]] 1740; CHECK-NEXT: [[TMP112:%.*]] = sub i32 [[TMP90]], [[TMP111]] 1741; CHECK-NEXT: [[TMP113:%.*]] = icmp uge i32 [[TMP112]], [[TMP91]] 1742; CHECK-NEXT: [[TMP114:%.*]] = add i32 [[TMP110]], 1 1743; CHECK-NEXT: [[TMP115:%.*]] = select i1 [[TMP113]], i32 [[TMP114]], i32 [[TMP110]] 1744; CHECK-NEXT: [[TMP116:%.*]] = sub i32 [[TMP112]], [[TMP91]] 1745; CHECK-NEXT: [[TMP117:%.*]] = select i1 [[TMP113]], i32 [[TMP116]], i32 [[TMP112]] 1746; CHECK-NEXT: [[TMP118:%.*]] = icmp uge i32 [[TMP117]], [[TMP91]] 1747; CHECK-NEXT: [[TMP119:%.*]] = add i32 [[TMP115]], 1 1748; CHECK-NEXT: [[TMP120:%.*]] = select i1 [[TMP118]], i32 [[TMP119]], i32 [[TMP115]] 1749; CHECK-NEXT: [[TMP121:%.*]] = xor i32 [[TMP120]], [[TMP87]] 1750; CHECK-NEXT: [[TMP122:%.*]] = sub i32 [[TMP121]], [[TMP87]] 1751; CHECK-NEXT: [[TMP123:%.*]] = insertelement <4 x i32> [[TMP82]], i32 [[TMP122]], i64 2 1752; CHECK-NEXT: [[TMP124:%.*]] = extractelement <4 x i32> [[X]], i64 3 1753; CHECK-NEXT: [[TMP125:%.*]] = extractelement <4 x i32> [[Y]], i64 3 1754; CHECK-NEXT: [[TMP126:%.*]] = ashr i32 [[TMP124]], 31 1755; CHECK-NEXT: [[TMP127:%.*]] = ashr i32 [[TMP125]], 31 1756; CHECK-NEXT: [[TMP128:%.*]] = xor i32 [[TMP126]], [[TMP127]] 1757; CHECK-NEXT: [[TMP129:%.*]] = add i32 [[TMP124]], [[TMP126]] 1758; CHECK-NEXT: [[TMP130:%.*]] = add i32 [[TMP125]], [[TMP127]] 1759; CHECK-NEXT: [[TMP131:%.*]] = xor i32 [[TMP129]], [[TMP126]] 1760; CHECK-NEXT: [[TMP132:%.*]] = xor i32 [[TMP130]], [[TMP127]] 1761; CHECK-NEXT: [[TMP133:%.*]] = uitofp i32 [[TMP132]] to float 1762; CHECK-NEXT: [[TMP134:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP133]]) 1763; CHECK-NEXT: [[TMP135:%.*]] = fmul fast float [[TMP134]], 0x41EFFFFFC0000000 1764; CHECK-NEXT: [[TMP136:%.*]] = fptoui float [[TMP135]] to i32 1765; CHECK-NEXT: [[TMP137:%.*]] = sub i32 0, [[TMP132]] 1766; CHECK-NEXT: [[TMP138:%.*]] = mul i32 [[TMP137]], [[TMP136]] 1767; CHECK-NEXT: [[TMP139:%.*]] = zext i32 [[TMP136]] to i64 1768; CHECK-NEXT: [[TMP140:%.*]] = zext i32 [[TMP138]] to i64 1769; CHECK-NEXT: [[TMP141:%.*]] = mul i64 [[TMP139]], [[TMP140]] 1770; CHECK-NEXT: [[TMP142:%.*]] = trunc i64 [[TMP141]] to i32 1771; CHECK-NEXT: [[TMP143:%.*]] = lshr i64 [[TMP141]], 32 1772; CHECK-NEXT: [[TMP144:%.*]] = trunc i64 [[TMP143]] to i32 1773; CHECK-NEXT: [[TMP145:%.*]] = add i32 [[TMP136]], [[TMP144]] 1774; CHECK-NEXT: [[TMP146:%.*]] = zext i32 [[TMP131]] to i64 1775; CHECK-NEXT: [[TMP147:%.*]] = zext i32 [[TMP145]] to i64 1776; CHECK-NEXT: [[TMP148:%.*]] = mul i64 [[TMP146]], [[TMP147]] 1777; CHECK-NEXT: [[TMP149:%.*]] = trunc i64 [[TMP148]] to i32 1778; CHECK-NEXT: [[TMP150:%.*]] = lshr i64 [[TMP148]], 32 1779; CHECK-NEXT: [[TMP151:%.*]] = trunc i64 [[TMP150]] to i32 1780; CHECK-NEXT: [[TMP152:%.*]] = mul i32 [[TMP151]], [[TMP132]] 1781; CHECK-NEXT: [[TMP153:%.*]] = sub i32 [[TMP131]], [[TMP152]] 1782; CHECK-NEXT: [[TMP154:%.*]] = icmp uge i32 [[TMP153]], [[TMP132]] 1783; CHECK-NEXT: [[TMP155:%.*]] = add i32 [[TMP151]], 1 1784; CHECK-NEXT: [[TMP156:%.*]] = select i1 [[TMP154]], i32 [[TMP155]], i32 [[TMP151]] 1785; CHECK-NEXT: [[TMP157:%.*]] = sub i32 [[TMP153]], [[TMP132]] 1786; CHECK-NEXT: [[TMP158:%.*]] = select i1 [[TMP154]], i32 [[TMP157]], i32 [[TMP153]] 1787; CHECK-NEXT: [[TMP159:%.*]] = icmp uge i32 [[TMP158]], [[TMP132]] 1788; CHECK-NEXT: [[TMP160:%.*]] = add i32 [[TMP156]], 1 1789; CHECK-NEXT: [[TMP161:%.*]] = select i1 [[TMP159]], i32 [[TMP160]], i32 [[TMP156]] 1790; CHECK-NEXT: [[TMP162:%.*]] = xor i32 [[TMP161]], [[TMP128]] 1791; CHECK-NEXT: [[TMP163:%.*]] = sub i32 [[TMP162]], [[TMP128]] 1792; CHECK-NEXT: [[TMP164:%.*]] = insertelement <4 x i32> [[TMP123]], i32 [[TMP163]], i64 3 1793; CHECK-NEXT: store <4 x i32> [[TMP164]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 1794; CHECK-NEXT: ret void 1795; 1796; GFX6-LABEL: sdiv_v4i32: 1797; GFX6: ; %bb.0: 1798; GFX6-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0xd 1799; GFX6-NEXT: s_mov_b32 s16, 0x4f7ffffe 1800; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1801; GFX6-NEXT: s_mov_b32 s7, 0xf000 1802; GFX6-NEXT: s_mov_b32 s6, -1 1803; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1804; GFX6-NEXT: s_ashr_i32 s2, s12, 31 1805; GFX6-NEXT: s_add_i32 s3, s12, s2 1806; GFX6-NEXT: s_xor_b32 s12, s3, s2 1807; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s12 1808; GFX6-NEXT: s_ashr_i32 s3, s13, 31 1809; GFX6-NEXT: s_add_i32 s0, s13, s3 1810; GFX6-NEXT: s_xor_b32 s13, s0, s3 1811; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 1812; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s13 1813; GFX6-NEXT: s_sub_i32 s1, 0, s12 1814; GFX6-NEXT: s_ashr_i32 s0, s8, 31 1815; GFX6-NEXT: v_mul_f32_e32 v0, s16, v0 1816; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 1817; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 1818; GFX6-NEXT: s_xor_b32 s2, s0, s2 1819; GFX6-NEXT: v_mul_lo_u32 v2, s1, v0 1820; GFX6-NEXT: s_add_i32 s1, s8, s0 1821; GFX6-NEXT: v_mul_f32_e32 v1, s16, v1 1822; GFX6-NEXT: s_xor_b32 s1, s1, s0 1823; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 1824; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 1825; GFX6-NEXT: s_sub_i32 s0, 0, s13 1826; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 1827; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 1828; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 1829; GFX6-NEXT: v_mul_lo_u32 v3, v0, s12 1830; GFX6-NEXT: v_mul_hi_u32 v2, v1, v2 1831; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v0 1832; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s1, v3 1833; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v3 1834; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] 1835; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s12, v3 1836; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] 1837; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v0 1838; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 1839; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 1840; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 1841; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 1842; GFX6-NEXT: s_ashr_i32 s0, s9, 31 1843; GFX6-NEXT: s_add_i32 s1, s9, s0 1844; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 1845; GFX6-NEXT: s_xor_b32 s2, s0, s3 1846; GFX6-NEXT: s_ashr_i32 s3, s14, 31 1847; GFX6-NEXT: s_xor_b32 s1, s1, s0 1848; GFX6-NEXT: s_add_i32 s0, s14, s3 1849; GFX6-NEXT: s_xor_b32 s9, s0, s3 1850; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s9 1851; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 1852; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 1853; GFX6-NEXT: v_mul_lo_u32 v2, v1, s13 1854; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v1 1855; GFX6-NEXT: v_mul_f32_e32 v3, s16, v3 1856; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 1857; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 1858; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v2 1859; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] 1860; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s13, v2 1861; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] 1862; GFX6-NEXT: s_sub_i32 s0, 0, s9 1863; GFX6-NEXT: v_mul_lo_u32 v5, s0, v3 1864; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v1 1865; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 1866; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 1867; GFX6-NEXT: v_mul_hi_u32 v2, v3, v5 1868; GFX6-NEXT: v_xor_b32_e32 v1, s2, v1 1869; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s2, v1 1870; GFX6-NEXT: s_ashr_i32 s2, s15, 31 1871; GFX6-NEXT: s_ashr_i32 s0, s10, 31 1872; GFX6-NEXT: s_add_i32 s8, s15, s2 1873; GFX6-NEXT: s_add_i32 s1, s10, s0 1874; GFX6-NEXT: s_xor_b32 s8, s8, s2 1875; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s8 1876; GFX6-NEXT: s_xor_b32 s1, s1, s0 1877; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 1878; GFX6-NEXT: v_mul_hi_u32 v2, s1, v2 1879; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v4 1880; GFX6-NEXT: s_xor_b32 s3, s0, s3 1881; GFX6-NEXT: v_mul_lo_u32 v3, v2, s9 1882; GFX6-NEXT: v_mul_f32_e32 v4, s16, v4 1883; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 1884; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v2 1885; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s1, v3 1886; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v3 1887; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] 1888; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s9, v3 1889; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 1890; GFX6-NEXT: s_sub_i32 s0, 0, s8 1891; GFX6-NEXT: v_mul_lo_u32 v5, s0, v4 1892; GFX6-NEXT: s_ashr_i32 s0, s11, 31 1893; GFX6-NEXT: s_add_i32 s1, s11, s0 1894; GFX6-NEXT: s_xor_b32 s1, s1, s0 1895; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5 1896; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v2 1897; GFX6-NEXT: s_xor_b32 s2, s0, s2 1898; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 1899; GFX6-NEXT: v_mul_hi_u32 v4, s1, v4 1900; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 1901; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 1902; GFX6-NEXT: v_xor_b32_e32 v2, s3, v2 1903; GFX6-NEXT: v_mul_lo_u32 v3, v4, s8 1904; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 1905; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v2 1906; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s1, v3 1907; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v3 1908; GFX6-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] 1909; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s8, v3 1910; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 1911; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 1912; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 1913; GFX6-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 1914; GFX6-NEXT: v_xor_b32_e32 v3, s2, v3 1915; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v3 1916; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 1917; GFX6-NEXT: s_endpgm 1918; GFX9-LABEL: sdiv_v4i32: 1919; GFX9: ; %bb.0: 1920; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 1921; GFX9-NEXT: s_mov_b32 s13, 0x4f7ffffe 1922; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1923; GFX9-NEXT: v_mov_b32_e32 v4, 0 1924; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1925; GFX9-NEXT: s_ashr_i32 s2, s8, 31 1926; GFX9-NEXT: s_add_i32 s3, s8, s2 1927; GFX9-NEXT: s_xor_b32 s14, s3, s2 1928; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s14 1929; GFX9-NEXT: s_ashr_i32 s8, s9, 31 1930; GFX9-NEXT: s_add_i32 s9, s9, s8 1931; GFX9-NEXT: s_xor_b32 s15, s9, s8 1932; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 1933; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s15 1934; GFX9-NEXT: s_sub_i32 s12, 0, s14 1935; GFX9-NEXT: s_ashr_i32 s3, s4, 31 1936; GFX9-NEXT: v_mul_f32_e32 v0, s13, v0 1937; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 1938; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 1939; GFX9-NEXT: s_add_i32 s4, s4, s3 1940; GFX9-NEXT: s_xor_b32 s4, s4, s3 1941; GFX9-NEXT: v_mul_lo_u32 v2, s12, v0 1942; GFX9-NEXT: v_mul_f32_e32 v1, s13, v1 1943; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 1944; GFX9-NEXT: s_sub_i32 s12, 0, s15 1945; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 1946; GFX9-NEXT: s_ashr_i32 s9, s5, 31 1947; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1 1948; GFX9-NEXT: s_xor_b32 s2, s3, s2 1949; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 1950; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 1951; GFX9-NEXT: v_mul_hi_u32 v2, v1, v3 1952; GFX9-NEXT: s_add_i32 s3, s5, s9 1953; GFX9-NEXT: s_xor_b32 s3, s3, s9 1954; GFX9-NEXT: v_mul_lo_u32 v3, v0, s14 1955; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 1956; GFX9-NEXT: v_mul_hi_u32 v1, s3, v1 1957; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 1958; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 1959; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s14, v3 1960; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 1961; GFX9-NEXT: v_subrev_u32_e32 v2, s14, v3 1962; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc 1963; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s14, v2 1964; GFX9-NEXT: v_mul_lo_u32 v2, v1, s15 1965; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 1966; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1967; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 1968; GFX9-NEXT: v_sub_u32_e32 v2, s3, v2 1969; GFX9-NEXT: s_ashr_i32 s3, s10, 31 1970; GFX9-NEXT: s_add_i32 s4, s10, s3 1971; GFX9-NEXT: v_subrev_u32_e32 v0, s2, v0 1972; GFX9-NEXT: s_xor_b32 s2, s9, s8 1973; GFX9-NEXT: s_xor_b32 s9, s4, s3 1974; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s9 1975; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 1976; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s15, v2 1977; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 1978; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 1979; GFX9-NEXT: v_subrev_u32_e32 v5, s15, v2 1980; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 1981; GFX9-NEXT: s_sub_i32 s4, 0, s9 1982; GFX9-NEXT: v_mul_f32_e32 v3, s13, v3 1983; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 1984; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s15, v2 1985; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 1986; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 1987; GFX9-NEXT: v_mul_lo_u32 v2, s4, v3 1988; GFX9-NEXT: s_ashr_i32 s4, s6, 31 1989; GFX9-NEXT: s_add_i32 s5, s6, s4 1990; GFX9-NEXT: s_ashr_i32 s6, s11, 31 1991; GFX9-NEXT: s_add_i32 s8, s11, s6 1992; GFX9-NEXT: s_xor_b32 s8, s8, s6 1993; GFX9-NEXT: v_mul_hi_u32 v2, v3, v2 1994; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s8 1995; GFX9-NEXT: s_xor_b32 s5, s5, s4 1996; GFX9-NEXT: v_xor_b32_e32 v1, s2, v1 1997; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 1998; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v5 1999; GFX9-NEXT: v_mul_hi_u32 v2, s5, v2 2000; GFX9-NEXT: v_subrev_u32_e32 v1, s2, v1 2001; GFX9-NEXT: s_xor_b32 s2, s4, s3 2002; GFX9-NEXT: v_mul_f32_e32 v3, s13, v3 2003; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 2004; GFX9-NEXT: v_mul_lo_u32 v5, v2, s9 2005; GFX9-NEXT: s_sub_i32 s3, 0, s8 2006; GFX9-NEXT: v_add_u32_e32 v6, 1, v2 2007; GFX9-NEXT: v_mul_lo_u32 v7, s3, v3 2008; GFX9-NEXT: v_sub_u32_e32 v5, s5, v5 2009; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v5 2010; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 2011; GFX9-NEXT: v_subrev_u32_e32 v6, s9, v5 2012; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 2013; GFX9-NEXT: v_mul_hi_u32 v6, v3, v7 2014; GFX9-NEXT: s_ashr_i32 s3, s7, 31 2015; GFX9-NEXT: s_add_i32 s4, s7, s3 2016; GFX9-NEXT: s_xor_b32 s4, s4, s3 2017; GFX9-NEXT: v_add_u32_e32 v3, v3, v6 2018; GFX9-NEXT: v_mul_hi_u32 v3, s4, v3 2019; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v5 2020; GFX9-NEXT: v_add_u32_e32 v6, 1, v2 2021; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 2022; GFX9-NEXT: v_mul_lo_u32 v5, v3, s8 2023; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 2024; GFX9-NEXT: v_xor_b32_e32 v2, s2, v2 2025; GFX9-NEXT: v_subrev_u32_e32 v2, s2, v2 2026; GFX9-NEXT: v_sub_u32_e32 v5, s4, v5 2027; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v5 2028; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 2029; GFX9-NEXT: v_subrev_u32_e32 v6, s8, v5 2030; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 2031; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 2032; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v5 2033; GFX9-NEXT: s_xor_b32 s2, s3, s6 2034; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 2035; GFX9-NEXT: v_xor_b32_e32 v3, s2, v3 2036; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v3 2037; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 2038; GFX9-NEXT: s_endpgm 2039 %r = sdiv <4 x i32> %x, %y 2040 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 2041 ret void 2042} 2043 2044define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 2045; CHECK-LABEL: @srem_v4i32( 2046; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 2047; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 2048; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 2049; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 2050; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]] 2051; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]] 2052; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]] 2053; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]] 2054; CHECK-NEXT: [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float 2055; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 2056; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000 2057; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP11]] to i32 2058; CHECK-NEXT: [[TMP13:%.*]] = sub i32 0, [[TMP8]] 2059; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]] 2060; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP12]] to i64 2061; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 2062; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]] 2063; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 2064; CHECK-NEXT: [[TMP19:%.*]] = lshr i64 [[TMP17]], 32 2065; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 2066; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]] 2067; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP7]] to i64 2068; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP21]] to i64 2069; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]] 2070; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32 2071; CHECK-NEXT: [[TMP26:%.*]] = lshr i64 [[TMP24]], 32 2072; CHECK-NEXT: [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32 2073; CHECK-NEXT: [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]] 2074; CHECK-NEXT: [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]] 2075; CHECK-NEXT: [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]] 2076; CHECK-NEXT: [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]] 2077; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP30]], i32 [[TMP31]], i32 [[TMP29]] 2078; CHECK-NEXT: [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]] 2079; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]] 2080; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]] 2081; CHECK-NEXT: [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]] 2082; CHECK-NEXT: [[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]] 2083; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i32> undef, i32 [[TMP37]], i64 0 2084; CHECK-NEXT: [[TMP39:%.*]] = extractelement <4 x i32> [[X]], i64 1 2085; CHECK-NEXT: [[TMP40:%.*]] = extractelement <4 x i32> [[Y]], i64 1 2086; CHECK-NEXT: [[TMP41:%.*]] = ashr i32 [[TMP39]], 31 2087; CHECK-NEXT: [[TMP42:%.*]] = ashr i32 [[TMP40]], 31 2088; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]] 2089; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]] 2090; CHECK-NEXT: [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]] 2091; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]] 2092; CHECK-NEXT: [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float 2093; CHECK-NEXT: [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]]) 2094; CHECK-NEXT: [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000 2095; CHECK-NEXT: [[TMP50:%.*]] = fptoui float [[TMP49]] to i32 2096; CHECK-NEXT: [[TMP51:%.*]] = sub i32 0, [[TMP46]] 2097; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]] 2098; CHECK-NEXT: [[TMP53:%.*]] = zext i32 [[TMP50]] to i64 2099; CHECK-NEXT: [[TMP54:%.*]] = zext i32 [[TMP52]] to i64 2100; CHECK-NEXT: [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]] 2101; CHECK-NEXT: [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32 2102; CHECK-NEXT: [[TMP57:%.*]] = lshr i64 [[TMP55]], 32 2103; CHECK-NEXT: [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32 2104; CHECK-NEXT: [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]] 2105; CHECK-NEXT: [[TMP60:%.*]] = zext i32 [[TMP45]] to i64 2106; CHECK-NEXT: [[TMP61:%.*]] = zext i32 [[TMP59]] to i64 2107; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]] 2108; CHECK-NEXT: [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32 2109; CHECK-NEXT: [[TMP64:%.*]] = lshr i64 [[TMP62]], 32 2110; CHECK-NEXT: [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32 2111; CHECK-NEXT: [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]] 2112; CHECK-NEXT: [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]] 2113; CHECK-NEXT: [[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]] 2114; CHECK-NEXT: [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]] 2115; CHECK-NEXT: [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]] 2116; CHECK-NEXT: [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]] 2117; CHECK-NEXT: [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]] 2118; CHECK-NEXT: [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]] 2119; CHECK-NEXT: [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]] 2120; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]] 2121; CHECK-NEXT: [[TMP76:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP75]], i64 1 2122; CHECK-NEXT: [[TMP77:%.*]] = extractelement <4 x i32> [[X]], i64 2 2123; CHECK-NEXT: [[TMP78:%.*]] = extractelement <4 x i32> [[Y]], i64 2 2124; CHECK-NEXT: [[TMP79:%.*]] = ashr i32 [[TMP77]], 31 2125; CHECK-NEXT: [[TMP80:%.*]] = ashr i32 [[TMP78]], 31 2126; CHECK-NEXT: [[TMP81:%.*]] = add i32 [[TMP77]], [[TMP79]] 2127; CHECK-NEXT: [[TMP82:%.*]] = add i32 [[TMP78]], [[TMP80]] 2128; CHECK-NEXT: [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP79]] 2129; CHECK-NEXT: [[TMP84:%.*]] = xor i32 [[TMP82]], [[TMP80]] 2130; CHECK-NEXT: [[TMP85:%.*]] = uitofp i32 [[TMP84]] to float 2131; CHECK-NEXT: [[TMP86:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP85]]) 2132; CHECK-NEXT: [[TMP87:%.*]] = fmul fast float [[TMP86]], 0x41EFFFFFC0000000 2133; CHECK-NEXT: [[TMP88:%.*]] = fptoui float [[TMP87]] to i32 2134; CHECK-NEXT: [[TMP89:%.*]] = sub i32 0, [[TMP84]] 2135; CHECK-NEXT: [[TMP90:%.*]] = mul i32 [[TMP89]], [[TMP88]] 2136; CHECK-NEXT: [[TMP91:%.*]] = zext i32 [[TMP88]] to i64 2137; CHECK-NEXT: [[TMP92:%.*]] = zext i32 [[TMP90]] to i64 2138; CHECK-NEXT: [[TMP93:%.*]] = mul i64 [[TMP91]], [[TMP92]] 2139; CHECK-NEXT: [[TMP94:%.*]] = trunc i64 [[TMP93]] to i32 2140; CHECK-NEXT: [[TMP95:%.*]] = lshr i64 [[TMP93]], 32 2141; CHECK-NEXT: [[TMP96:%.*]] = trunc i64 [[TMP95]] to i32 2142; CHECK-NEXT: [[TMP97:%.*]] = add i32 [[TMP88]], [[TMP96]] 2143; CHECK-NEXT: [[TMP98:%.*]] = zext i32 [[TMP83]] to i64 2144; CHECK-NEXT: [[TMP99:%.*]] = zext i32 [[TMP97]] to i64 2145; CHECK-NEXT: [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]] 2146; CHECK-NEXT: [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32 2147; CHECK-NEXT: [[TMP102:%.*]] = lshr i64 [[TMP100]], 32 2148; CHECK-NEXT: [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32 2149; CHECK-NEXT: [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP84]] 2150; CHECK-NEXT: [[TMP105:%.*]] = sub i32 [[TMP83]], [[TMP104]] 2151; CHECK-NEXT: [[TMP106:%.*]] = icmp uge i32 [[TMP105]], [[TMP84]] 2152; CHECK-NEXT: [[TMP107:%.*]] = sub i32 [[TMP105]], [[TMP84]] 2153; CHECK-NEXT: [[TMP108:%.*]] = select i1 [[TMP106]], i32 [[TMP107]], i32 [[TMP105]] 2154; CHECK-NEXT: [[TMP109:%.*]] = icmp uge i32 [[TMP108]], [[TMP84]] 2155; CHECK-NEXT: [[TMP110:%.*]] = sub i32 [[TMP108]], [[TMP84]] 2156; CHECK-NEXT: [[TMP111:%.*]] = select i1 [[TMP109]], i32 [[TMP110]], i32 [[TMP108]] 2157; CHECK-NEXT: [[TMP112:%.*]] = xor i32 [[TMP111]], [[TMP79]] 2158; CHECK-NEXT: [[TMP113:%.*]] = sub i32 [[TMP112]], [[TMP79]] 2159; CHECK-NEXT: [[TMP114:%.*]] = insertelement <4 x i32> [[TMP76]], i32 [[TMP113]], i64 2 2160; CHECK-NEXT: [[TMP115:%.*]] = extractelement <4 x i32> [[X]], i64 3 2161; CHECK-NEXT: [[TMP116:%.*]] = extractelement <4 x i32> [[Y]], i64 3 2162; CHECK-NEXT: [[TMP117:%.*]] = ashr i32 [[TMP115]], 31 2163; CHECK-NEXT: [[TMP118:%.*]] = ashr i32 [[TMP116]], 31 2164; CHECK-NEXT: [[TMP119:%.*]] = add i32 [[TMP115]], [[TMP117]] 2165; CHECK-NEXT: [[TMP120:%.*]] = add i32 [[TMP116]], [[TMP118]] 2166; CHECK-NEXT: [[TMP121:%.*]] = xor i32 [[TMP119]], [[TMP117]] 2167; CHECK-NEXT: [[TMP122:%.*]] = xor i32 [[TMP120]], [[TMP118]] 2168; CHECK-NEXT: [[TMP123:%.*]] = uitofp i32 [[TMP122]] to float 2169; CHECK-NEXT: [[TMP124:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP123]]) 2170; CHECK-NEXT: [[TMP125:%.*]] = fmul fast float [[TMP124]], 0x41EFFFFFC0000000 2171; CHECK-NEXT: [[TMP126:%.*]] = fptoui float [[TMP125]] to i32 2172; CHECK-NEXT: [[TMP127:%.*]] = sub i32 0, [[TMP122]] 2173; CHECK-NEXT: [[TMP128:%.*]] = mul i32 [[TMP127]], [[TMP126]] 2174; CHECK-NEXT: [[TMP129:%.*]] = zext i32 [[TMP126]] to i64 2175; CHECK-NEXT: [[TMP130:%.*]] = zext i32 [[TMP128]] to i64 2176; CHECK-NEXT: [[TMP131:%.*]] = mul i64 [[TMP129]], [[TMP130]] 2177; CHECK-NEXT: [[TMP132:%.*]] = trunc i64 [[TMP131]] to i32 2178; CHECK-NEXT: [[TMP133:%.*]] = lshr i64 [[TMP131]], 32 2179; CHECK-NEXT: [[TMP134:%.*]] = trunc i64 [[TMP133]] to i32 2180; CHECK-NEXT: [[TMP135:%.*]] = add i32 [[TMP126]], [[TMP134]] 2181; CHECK-NEXT: [[TMP136:%.*]] = zext i32 [[TMP121]] to i64 2182; CHECK-NEXT: [[TMP137:%.*]] = zext i32 [[TMP135]] to i64 2183; CHECK-NEXT: [[TMP138:%.*]] = mul i64 [[TMP136]], [[TMP137]] 2184; CHECK-NEXT: [[TMP139:%.*]] = trunc i64 [[TMP138]] to i32 2185; CHECK-NEXT: [[TMP140:%.*]] = lshr i64 [[TMP138]], 32 2186; CHECK-NEXT: [[TMP141:%.*]] = trunc i64 [[TMP140]] to i32 2187; CHECK-NEXT: [[TMP142:%.*]] = mul i32 [[TMP141]], [[TMP122]] 2188; CHECK-NEXT: [[TMP143:%.*]] = sub i32 [[TMP121]], [[TMP142]] 2189; CHECK-NEXT: [[TMP144:%.*]] = icmp uge i32 [[TMP143]], [[TMP122]] 2190; CHECK-NEXT: [[TMP145:%.*]] = sub i32 [[TMP143]], [[TMP122]] 2191; CHECK-NEXT: [[TMP146:%.*]] = select i1 [[TMP144]], i32 [[TMP145]], i32 [[TMP143]] 2192; CHECK-NEXT: [[TMP147:%.*]] = icmp uge i32 [[TMP146]], [[TMP122]] 2193; CHECK-NEXT: [[TMP148:%.*]] = sub i32 [[TMP146]], [[TMP122]] 2194; CHECK-NEXT: [[TMP149:%.*]] = select i1 [[TMP147]], i32 [[TMP148]], i32 [[TMP146]] 2195; CHECK-NEXT: [[TMP150:%.*]] = xor i32 [[TMP149]], [[TMP117]] 2196; CHECK-NEXT: [[TMP151:%.*]] = sub i32 [[TMP150]], [[TMP117]] 2197; CHECK-NEXT: [[TMP152:%.*]] = insertelement <4 x i32> [[TMP114]], i32 [[TMP151]], i64 3 2198; CHECK-NEXT: store <4 x i32> [[TMP152]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 2199; CHECK-NEXT: ret void 2200; 2201; GFX6-LABEL: srem_v4i32: 2202; GFX6: ; %bb.0: 2203; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 2204; GFX6-NEXT: s_mov_b32 s13, 0x4f7ffffe 2205; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2206; GFX6-NEXT: s_mov_b32 s3, 0xf000 2207; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2208; GFX6-NEXT: s_ashr_i32 s2, s8, 31 2209; GFX6-NEXT: s_add_i32 s8, s8, s2 2210; GFX6-NEXT: s_xor_b32 s12, s8, s2 2211; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s12 2212; GFX6-NEXT: s_ashr_i32 s8, s9, 31 2213; GFX6-NEXT: s_add_i32 s9, s9, s8 2214; GFX6-NEXT: s_xor_b32 s14, s9, s8 2215; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 2216; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s14 2217; GFX6-NEXT: s_sub_i32 s9, 0, s12 2218; GFX6-NEXT: s_ashr_i32 s8, s4, 31 2219; GFX6-NEXT: v_mul_f32_e32 v0, s13, v0 2220; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 2221; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 2222; GFX6-NEXT: s_add_i32 s4, s4, s8 2223; GFX6-NEXT: s_xor_b32 s4, s4, s8 2224; GFX6-NEXT: v_mul_lo_u32 v2, s9, v0 2225; GFX6-NEXT: v_mul_f32_e32 v1, s13, v1 2226; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 2227; GFX6-NEXT: s_sub_i32 s9, 0, s14 2228; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 2229; GFX6-NEXT: s_mov_b32 s2, -1 2230; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 2231; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 2232; GFX6-NEXT: v_mul_lo_u32 v2, s9, v1 2233; GFX6-NEXT: s_ashr_i32 s9, s5, 31 2234; GFX6-NEXT: s_add_i32 s5, s5, s9 2235; GFX6-NEXT: v_mul_lo_u32 v0, v0, s12 2236; GFX6-NEXT: v_mul_hi_u32 v2, v1, v2 2237; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 2238; GFX6-NEXT: s_xor_b32 s4, s5, s9 2239; GFX6-NEXT: s_ashr_i32 s5, s10, 31 2240; GFX6-NEXT: s_add_i32 s10, s10, s5 2241; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s12, v0 2242; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 2243; GFX6-NEXT: s_xor_b32 s10, s10, s5 2244; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 2245; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 2246; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s10 2247; GFX6-NEXT: v_mul_hi_u32 v1, s4, v1 2248; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s12, v0 2249; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 2250; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 2251; GFX6-NEXT: v_mul_lo_u32 v1, v1, s14 2252; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 2253; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 2254; GFX6-NEXT: v_mul_f32_e32 v2, s13, v2 2255; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 2256; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v1 2257; GFX6-NEXT: s_sub_i32 s4, 0, s10 2258; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 2259; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s14, v1 2260; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s14, v1 2261; GFX6-NEXT: v_mul_lo_u32 v4, s4, v2 2262; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 2263; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s14, v1 2264; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s14, v1 2265; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 2266; GFX6-NEXT: v_mul_hi_u32 v3, v2, v4 2267; GFX6-NEXT: s_ashr_i32 s4, s6, 31 2268; GFX6-NEXT: s_add_i32 s5, s6, s4 2269; GFX6-NEXT: s_ashr_i32 s6, s11, 31 2270; GFX6-NEXT: s_add_i32 s8, s11, s6 2271; GFX6-NEXT: s_xor_b32 s8, s8, s6 2272; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 2273; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s8 2274; GFX6-NEXT: s_xor_b32 s5, s5, s4 2275; GFX6-NEXT: v_mul_hi_u32 v2, s5, v2 2276; GFX6-NEXT: v_xor_b32_e32 v1, s9, v1 2277; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 2278; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s9, v1 2279; GFX6-NEXT: v_mul_lo_u32 v2, v2, s10 2280; GFX6-NEXT: v_mul_f32_e32 v3, s13, v3 2281; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 2282; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v2 2283; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s10, v2 2284; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 2285; GFX6-NEXT: s_sub_i32 s5, 0, s8 2286; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 2287; GFX6-NEXT: v_mul_lo_u32 v4, s5, v3 2288; GFX6-NEXT: s_ashr_i32 s5, s7, 31 2289; GFX6-NEXT: s_add_i32 s6, s7, s5 2290; GFX6-NEXT: s_xor_b32 s6, s6, s5 2291; GFX6-NEXT: v_mul_hi_u32 v4, v3, v4 2292; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s10, v2 2293; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 2294; GFX6-NEXT: v_mul_hi_u32 v3, s6, v3 2295; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 2296; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 2297; GFX6-NEXT: v_xor_b32_e32 v2, s4, v2 2298; GFX6-NEXT: v_mul_lo_u32 v3, v3, s8 2299; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s4, v2 2300; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s6, v3 2301; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s8, v3 2302; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 2303; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 2304; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s8, v3 2305; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 2306; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 2307; GFX6-NEXT: v_xor_b32_e32 v3, s5, v3 2308; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s5, v3 2309; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 2310; GFX6-NEXT: s_endpgm 2311; GFX9-LABEL: srem_v4i32: 2312; GFX9: ; %bb.0: 2313; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 2314; GFX9-NEXT: s_mov_b32 s13, 0x4f7ffffe 2315; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2316; GFX9-NEXT: v_mov_b32_e32 v4, 0 2317; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2318; GFX9-NEXT: s_ashr_i32 s2, s8, 31 2319; GFX9-NEXT: s_add_i32 s8, s8, s2 2320; GFX9-NEXT: s_xor_b32 s2, s8, s2 2321; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 2322; GFX9-NEXT: s_ashr_i32 s3, s9, 31 2323; GFX9-NEXT: s_sub_i32 s12, 0, s2 2324; GFX9-NEXT: s_add_i32 s8, s9, s3 2325; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 2326; GFX9-NEXT: s_xor_b32 s3, s8, s3 2327; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 2328; GFX9-NEXT: s_ashr_i32 s8, s4, 31 2329; GFX9-NEXT: v_mul_f32_e32 v0, s13, v0 2330; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 2331; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 2332; GFX9-NEXT: s_add_i32 s4, s4, s8 2333; GFX9-NEXT: s_xor_b32 s4, s4, s8 2334; GFX9-NEXT: v_mul_lo_u32 v2, s12, v0 2335; GFX9-NEXT: v_mul_f32_e32 v1, s13, v1 2336; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 2337; GFX9-NEXT: s_sub_i32 s12, 0, s3 2338; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 2339; GFX9-NEXT: s_ashr_i32 s9, s5, 31 2340; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1 2341; GFX9-NEXT: s_add_i32 s5, s5, s9 2342; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 2343; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 2344; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 2345; GFX9-NEXT: s_xor_b32 s5, s5, s9 2346; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2 2347; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 2348; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 2349; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 2350; GFX9-NEXT: v_subrev_u32_e32 v2, s2, v0 2351; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 2352; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 2353; GFX9-NEXT: v_subrev_u32_e32 v2, s2, v0 2354; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 2355; GFX9-NEXT: s_ashr_i32 s2, s10, 31 2356; GFX9-NEXT: s_add_i32 s4, s10, s2 2357; GFX9-NEXT: s_xor_b32 s2, s4, s2 2358; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 2359; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 2360; GFX9-NEXT: v_mul_lo_u32 v1, v1, s3 2361; GFX9-NEXT: s_sub_i32 s4, 0, s2 2362; GFX9-NEXT: v_xor_b32_e32 v0, s8, v0 2363; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 2364; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 2365; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v1 2366; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 2367; GFX9-NEXT: v_mul_f32_e32 v2, s13, v2 2368; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 2369; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 2370; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v1 2371; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 2372; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 2373; GFX9-NEXT: v_mul_lo_u32 v3, s4, v2 2374; GFX9-NEXT: s_ashr_i32 s4, s11, 31 2375; GFX9-NEXT: s_add_i32 s5, s11, s4 2376; GFX9-NEXT: s_xor_b32 s4, s5, s4 2377; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s4 2378; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 2379; GFX9-NEXT: s_ashr_i32 s3, s6, 31 2380; GFX9-NEXT: s_add_i32 s5, s6, s3 2381; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v5 2382; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 2383; GFX9-NEXT: s_xor_b32 s5, s5, s3 2384; GFX9-NEXT: v_mul_hi_u32 v2, s5, v2 2385; GFX9-NEXT: v_mul_f32_e32 v3, s13, v5 2386; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 2387; GFX9-NEXT: s_sub_i32 s6, 0, s4 2388; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 2389; GFX9-NEXT: v_xor_b32_e32 v1, s9, v1 2390; GFX9-NEXT: v_mul_lo_u32 v5, s6, v3 2391; GFX9-NEXT: v_subrev_u32_e32 v0, s8, v0 2392; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 2393; GFX9-NEXT: s_ashr_i32 s5, s7, 31 2394; GFX9-NEXT: v_mul_hi_u32 v5, v3, v5 2395; GFX9-NEXT: s_add_i32 s6, s7, s5 2396; GFX9-NEXT: s_xor_b32 s6, s6, s5 2397; GFX9-NEXT: v_subrev_u32_e32 v6, s2, v2 2398; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 2399; GFX9-NEXT: v_mul_hi_u32 v3, s6, v3 2400; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 2401; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 2402; GFX9-NEXT: v_subrev_u32_e32 v5, s2, v2 2403; GFX9-NEXT: v_mul_lo_u32 v3, v3, s4 2404; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 2405; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 2406; GFX9-NEXT: v_xor_b32_e32 v2, s3, v2 2407; GFX9-NEXT: v_sub_u32_e32 v3, s6, v3 2408; GFX9-NEXT: v_subrev_u32_e32 v5, s4, v3 2409; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 2410; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 2411; GFX9-NEXT: v_subrev_u32_e32 v5, s4, v3 2412; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 2413; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 2414; GFX9-NEXT: v_xor_b32_e32 v3, s5, v3 2415; GFX9-NEXT: v_subrev_u32_e32 v1, s9, v1 2416; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v2 2417; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v3 2418; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 2419; GFX9-NEXT: s_endpgm 2420 %r = srem <4 x i32> %x, %y 2421 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 2422 ret void 2423} 2424 2425define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 2426; CHECK-LABEL: @udiv_v4i16( 2427; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 2428; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 2429; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 2430; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 2431; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 2432; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 2433; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 2434; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 2435; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 2436; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 2437; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 2438; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 2439; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 2440; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 2441; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 2442; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 2443; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 2444; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 65535 2445; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16 2446; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i16> undef, i16 [[TMP19]], i64 0 2447; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i16> [[X]], i64 1 2448; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i16> [[Y]], i64 1 2449; CHECK-NEXT: [[TMP23:%.*]] = zext i16 [[TMP21]] to i32 2450; CHECK-NEXT: [[TMP24:%.*]] = zext i16 [[TMP22]] to i32 2451; CHECK-NEXT: [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float 2452; CHECK-NEXT: [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float 2453; CHECK-NEXT: [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]]) 2454; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]] 2455; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]]) 2456; CHECK-NEXT: [[TMP30:%.*]] = fneg fast float [[TMP29]] 2457; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]]) 2458; CHECK-NEXT: [[TMP32:%.*]] = fptoui float [[TMP29]] to i32 2459; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]]) 2460; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]]) 2461; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]] 2462; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0 2463; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]] 2464; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 65535 2465; CHECK-NEXT: [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16 2466; CHECK-NEXT: [[TMP40:%.*]] = insertelement <4 x i16> [[TMP20]], i16 [[TMP39]], i64 1 2467; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i16> [[X]], i64 2 2468; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i16> [[Y]], i64 2 2469; CHECK-NEXT: [[TMP43:%.*]] = zext i16 [[TMP41]] to i32 2470; CHECK-NEXT: [[TMP44:%.*]] = zext i16 [[TMP42]] to i32 2471; CHECK-NEXT: [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float 2472; CHECK-NEXT: [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float 2473; CHECK-NEXT: [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]]) 2474; CHECK-NEXT: [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]] 2475; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]]) 2476; CHECK-NEXT: [[TMP50:%.*]] = fneg fast float [[TMP49]] 2477; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]]) 2478; CHECK-NEXT: [[TMP52:%.*]] = fptoui float [[TMP49]] to i32 2479; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]]) 2480; CHECK-NEXT: [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]]) 2481; CHECK-NEXT: [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]] 2482; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0 2483; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]] 2484; CHECK-NEXT: [[TMP58:%.*]] = and i32 [[TMP57]], 65535 2485; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16 2486; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i16> [[TMP40]], i16 [[TMP59]], i64 2 2487; CHECK-NEXT: [[TMP61:%.*]] = extractelement <4 x i16> [[X]], i64 3 2488; CHECK-NEXT: [[TMP62:%.*]] = extractelement <4 x i16> [[Y]], i64 3 2489; CHECK-NEXT: [[TMP63:%.*]] = zext i16 [[TMP61]] to i32 2490; CHECK-NEXT: [[TMP64:%.*]] = zext i16 [[TMP62]] to i32 2491; CHECK-NEXT: [[TMP65:%.*]] = uitofp i32 [[TMP63]] to float 2492; CHECK-NEXT: [[TMP66:%.*]] = uitofp i32 [[TMP64]] to float 2493; CHECK-NEXT: [[TMP67:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP66]]) 2494; CHECK-NEXT: [[TMP68:%.*]] = fmul fast float [[TMP65]], [[TMP67]] 2495; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.trunc.f32(float [[TMP68]]) 2496; CHECK-NEXT: [[TMP70:%.*]] = fneg fast float [[TMP69]] 2497; CHECK-NEXT: [[TMP71:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP70]], float [[TMP66]], float [[TMP65]]) 2498; CHECK-NEXT: [[TMP72:%.*]] = fptoui float [[TMP69]] to i32 2499; CHECK-NEXT: [[TMP73:%.*]] = call fast float @llvm.fabs.f32(float [[TMP71]]) 2500; CHECK-NEXT: [[TMP74:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 2501; CHECK-NEXT: [[TMP75:%.*]] = fcmp fast oge float [[TMP73]], [[TMP74]] 2502; CHECK-NEXT: [[TMP76:%.*]] = select i1 [[TMP75]], i32 1, i32 0 2503; CHECK-NEXT: [[TMP77:%.*]] = add i32 [[TMP72]], [[TMP76]] 2504; CHECK-NEXT: [[TMP78:%.*]] = and i32 [[TMP77]], 65535 2505; CHECK-NEXT: [[TMP79:%.*]] = trunc i32 [[TMP78]] to i16 2506; CHECK-NEXT: [[TMP80:%.*]] = insertelement <4 x i16> [[TMP60]], i16 [[TMP79]], i64 3 2507; CHECK-NEXT: store <4 x i16> [[TMP80]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 2508; CHECK-NEXT: ret void 2509; 2510; GFX6-LABEL: udiv_v4i16: 2511; GFX6: ; %bb.0: 2512; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2513; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 2514; GFX6-NEXT: s_mov_b32 s8, 0xffff 2515; GFX6-NEXT: s_mov_b32 s7, 0xf000 2516; GFX6-NEXT: s_mov_b32 s6, -1 2517; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2518; GFX6-NEXT: s_and_b32 s9, s2, s8 2519; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 2520; GFX6-NEXT: s_lshr_b32 s9, s0, 16 2521; GFX6-NEXT: s_and_b32 s0, s0, s8 2522; GFX6-NEXT: s_lshr_b32 s2, s2, 16 2523; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s2 2524; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s0 2525; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 2526; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s9 2527; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 2528; GFX6-NEXT: s_and_b32 s2, s3, s8 2529; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 2530; GFX6-NEXT: v_trunc_f32_e32 v2, v2 2531; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 2532; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 2533; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 2534; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 2535; GFX6-NEXT: v_trunc_f32_e32 v1, v1 2536; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc 2537; GFX6-NEXT: v_mad_f32 v2, -v1, v3, v4 2538; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s2 2539; GFX6-NEXT: s_lshr_b32 s0, s1, 16 2540; GFX6-NEXT: s_and_b32 s1, s1, s8 2541; GFX6-NEXT: s_lshr_b32 s10, s3, 16 2542; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 2543; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 2544; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s10 2545; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s1 2546; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 2547; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v1, vcc 2548; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v3 2549; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 2550; GFX6-NEXT: v_mul_f32_e32 v1, v5, v6 2551; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s0 2552; GFX6-NEXT: v_trunc_f32_e32 v1, v1 2553; GFX6-NEXT: v_mad_f32 v5, -v1, v4, v5 2554; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 2555; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 2556; GFX6-NEXT: v_mul_f32_e32 v4, v6, v7 2557; GFX6-NEXT: v_trunc_f32_e32 v4, v4 2558; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v4 2559; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2560; GFX6-NEXT: v_mad_f32 v4, -v4, v3, v6 2561; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v3 2562; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 2563; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 2564; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 2565; GFX6-NEXT: v_and_b32_e32 v1, s8, v1 2566; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 2567; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 2568; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2569; GFX6-NEXT: s_endpgm 2570; GFX9-LABEL: udiv_v4i16: 2571; GFX9: ; %bb.0: 2572; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 2573; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 2574; GFX9-NEXT: s_mov_b32 s8, 0xffff 2575; GFX9-NEXT: v_mov_b32_e32 v2, 0 2576; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2577; GFX9-NEXT: s_and_b32 s1, s6, s8 2578; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 2579; GFX9-NEXT: s_lshr_b32 s0, s4, 16 2580; GFX9-NEXT: s_and_b32 s4, s4, s8 2581; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s4 2582; GFX9-NEXT: s_lshr_b32 s4, s6, 16 2583; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 2584; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s4 2585; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 2586; GFX9-NEXT: s_and_b32 s0, s7, s8 2587; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 2588; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 2589; GFX9-NEXT: v_trunc_f32_e32 v3, v3 2590; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 2591; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 2592; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 2593; GFX9-NEXT: v_mul_f32_e32 v1, v5, v6 2594; GFX9-NEXT: v_trunc_f32_e32 v1, v1 2595; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 2596; GFX9-NEXT: v_mad_f32 v3, -v1, v4, v5 2597; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 2598; GFX9-NEXT: s_and_b32 s0, s5, s8 2599; GFX9-NEXT: s_lshr_b32 s6, s7, 16 2600; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 2601; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 2602; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s6 2603; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 2604; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 2605; GFX9-NEXT: s_lshr_b32 s1, s5, 16 2606; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc 2607; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v4 2608; GFX9-NEXT: v_mul_f32_e32 v1, v6, v7 2609; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 2610; GFX9-NEXT: v_trunc_f32_e32 v1, v1 2611; GFX9-NEXT: v_mad_f32 v6, -v1, v5, v6 2612; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 2613; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 2614; GFX9-NEXT: v_mul_f32_e32 v5, v7, v8 2615; GFX9-NEXT: v_trunc_f32_e32 v5, v5 2616; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v5 2617; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 2618; GFX9-NEXT: v_mad_f32 v5, -v5, v4, v7 2619; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 2620; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff 2621; GFX9-NEXT: v_and_b32_e32 v0, v5, v0 2622; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc 2623; GFX9-NEXT: v_and_b32_e32 v1, v5, v1 2624; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 2625; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 2626; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 2627; GFX9-NEXT: s_endpgm 2628 %r = udiv <4 x i16> %x, %y 2629 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 2630 ret void 2631} 2632 2633define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 2634; CHECK-LABEL: @urem_v4i16( 2635; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 2636; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 2637; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 2638; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 2639; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 2640; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 2641; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 2642; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 2643; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 2644; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 2645; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 2646; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 2647; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 2648; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 2649; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 2650; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 2651; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 2652; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]] 2653; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]] 2654; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 65535 2655; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16 2656; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i16> undef, i16 [[TMP21]], i64 0 2657; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i16> [[X]], i64 1 2658; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i16> [[Y]], i64 1 2659; CHECK-NEXT: [[TMP25:%.*]] = zext i16 [[TMP23]] to i32 2660; CHECK-NEXT: [[TMP26:%.*]] = zext i16 [[TMP24]] to i32 2661; CHECK-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float 2662; CHECK-NEXT: [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float 2663; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]]) 2664; CHECK-NEXT: [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]] 2665; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]]) 2666; CHECK-NEXT: [[TMP32:%.*]] = fneg fast float [[TMP31]] 2667; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]]) 2668; CHECK-NEXT: [[TMP34:%.*]] = fptoui float [[TMP31]] to i32 2669; CHECK-NEXT: [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 2670; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]]) 2671; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]] 2672; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0 2673; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]] 2674; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]] 2675; CHECK-NEXT: [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]] 2676; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], 65535 2677; CHECK-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16 2678; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i16> [[TMP22]], i16 [[TMP43]], i64 1 2679; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i16> [[X]], i64 2 2680; CHECK-NEXT: [[TMP46:%.*]] = extractelement <4 x i16> [[Y]], i64 2 2681; CHECK-NEXT: [[TMP47:%.*]] = zext i16 [[TMP45]] to i32 2682; CHECK-NEXT: [[TMP48:%.*]] = zext i16 [[TMP46]] to i32 2683; CHECK-NEXT: [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float 2684; CHECK-NEXT: [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float 2685; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]]) 2686; CHECK-NEXT: [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]] 2687; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]]) 2688; CHECK-NEXT: [[TMP54:%.*]] = fneg fast float [[TMP53]] 2689; CHECK-NEXT: [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]]) 2690; CHECK-NEXT: [[TMP56:%.*]] = fptoui float [[TMP53]] to i32 2691; CHECK-NEXT: [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]]) 2692; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]]) 2693; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]] 2694; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0 2695; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]] 2696; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]] 2697; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]] 2698; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 65535 2699; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16 2700; CHECK-NEXT: [[TMP66:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[TMP65]], i64 2 2701; CHECK-NEXT: [[TMP67:%.*]] = extractelement <4 x i16> [[X]], i64 3 2702; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x i16> [[Y]], i64 3 2703; CHECK-NEXT: [[TMP69:%.*]] = zext i16 [[TMP67]] to i32 2704; CHECK-NEXT: [[TMP70:%.*]] = zext i16 [[TMP68]] to i32 2705; CHECK-NEXT: [[TMP71:%.*]] = uitofp i32 [[TMP69]] to float 2706; CHECK-NEXT: [[TMP72:%.*]] = uitofp i32 [[TMP70]] to float 2707; CHECK-NEXT: [[TMP73:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP72]]) 2708; CHECK-NEXT: [[TMP74:%.*]] = fmul fast float [[TMP71]], [[TMP73]] 2709; CHECK-NEXT: [[TMP75:%.*]] = call fast float @llvm.trunc.f32(float [[TMP74]]) 2710; CHECK-NEXT: [[TMP76:%.*]] = fneg fast float [[TMP75]] 2711; CHECK-NEXT: [[TMP77:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP76]], float [[TMP72]], float [[TMP71]]) 2712; CHECK-NEXT: [[TMP78:%.*]] = fptoui float [[TMP75]] to i32 2713; CHECK-NEXT: [[TMP79:%.*]] = call fast float @llvm.fabs.f32(float [[TMP77]]) 2714; CHECK-NEXT: [[TMP80:%.*]] = call fast float @llvm.fabs.f32(float [[TMP72]]) 2715; CHECK-NEXT: [[TMP81:%.*]] = fcmp fast oge float [[TMP79]], [[TMP80]] 2716; CHECK-NEXT: [[TMP82:%.*]] = select i1 [[TMP81]], i32 1, i32 0 2717; CHECK-NEXT: [[TMP83:%.*]] = add i32 [[TMP78]], [[TMP82]] 2718; CHECK-NEXT: [[TMP84:%.*]] = mul i32 [[TMP83]], [[TMP70]] 2719; CHECK-NEXT: [[TMP85:%.*]] = sub i32 [[TMP69]], [[TMP84]] 2720; CHECK-NEXT: [[TMP86:%.*]] = and i32 [[TMP85]], 65535 2721; CHECK-NEXT: [[TMP87:%.*]] = trunc i32 [[TMP86]] to i16 2722; CHECK-NEXT: [[TMP88:%.*]] = insertelement <4 x i16> [[TMP66]], i16 [[TMP87]], i64 3 2723; CHECK-NEXT: store <4 x i16> [[TMP88]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 2724; CHECK-NEXT: ret void 2725; 2726; GFX6-LABEL: urem_v4i16: 2727; GFX6: ; %bb.0: 2728; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2729; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 2730; GFX6-NEXT: s_mov_b32 s8, 0xffff 2731; GFX6-NEXT: s_mov_b32 s7, 0xf000 2732; GFX6-NEXT: s_mov_b32 s6, -1 2733; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2734; GFX6-NEXT: s_and_b32 s9, s2, s8 2735; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 2736; GFX6-NEXT: s_and_b32 s10, s0, s8 2737; GFX6-NEXT: s_lshr_b32 s11, s2, 16 2738; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s10 2739; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 2740; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s11 2741; GFX6-NEXT: s_lshr_b32 s9, s0, 16 2742; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s9 2743; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 2744; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 2745; GFX6-NEXT: v_trunc_f32_e32 v2, v2 2746; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 2747; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 2748; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 2749; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 2750; GFX6-NEXT: v_trunc_f32_e32 v1, v1 2751; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc 2752; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v1 2753; GFX6-NEXT: v_mad_f32 v1, -v1, v3, v4 2754; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v3 2755; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 2756; GFX6-NEXT: s_and_b32 s2, s3, s8 2757; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 2758; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s2 2759; GFX6-NEXT: s_and_b32 s2, s1, s8 2760; GFX6-NEXT: v_mul_lo_u32 v1, v1, s11 2761; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s2 2762; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 2763; GFX6-NEXT: s_lshr_b32 s12, s3, 16 2764; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s9, v1 2765; GFX6-NEXT: s_lshr_b32 s10, s1, 16 2766; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 2767; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s12 2768; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s10 2769; GFX6-NEXT: v_trunc_f32_e32 v1, v1 2770; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 2771; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v4 2772; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v3 2773; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 2774; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 2775; GFX6-NEXT: v_mul_f32_e32 v2, v6, v7 2776; GFX6-NEXT: v_trunc_f32_e32 v2, v2 2777; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 2778; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2779; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v6 2780; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 2781; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc 2782; GFX6-NEXT: v_mul_lo_u32 v1, v1, s3 2783; GFX6-NEXT: v_mul_lo_u32 v2, v2, s12 2784; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 2785; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 2786; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s10, v2 2787; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 2788; GFX6-NEXT: v_and_b32_e32 v1, s8, v1 2789; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 2790; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 2791; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 2792; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2793; GFX6-NEXT: s_endpgm 2794; GFX9-LABEL: urem_v4i16: 2795; GFX9: ; %bb.0: 2796; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 2797; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 2798; GFX9-NEXT: s_mov_b32 s8, 0xffff 2799; GFX9-NEXT: v_mov_b32_e32 v2, 0 2800; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2801; GFX9-NEXT: s_and_b32 s1, s6, s8 2802; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 2803; GFX9-NEXT: s_and_b32 s9, s4, s8 2804; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 2805; GFX9-NEXT: s_lshr_b32 s9, s6, 16 2806; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 2807; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s9 2808; GFX9-NEXT: s_lshr_b32 s0, s4, 16 2809; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 2810; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 2811; GFX9-NEXT: v_trunc_f32_e32 v3, v3 2812; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 2813; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 2814; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 2815; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 2816; GFX9-NEXT: s_lshr_b32 s10, s7, 16 2817; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 2818; GFX9-NEXT: v_mul_f32_e32 v1, v5, v6 2819; GFX9-NEXT: v_mul_lo_u32 v0, v0, s6 2820; GFX9-NEXT: v_trunc_f32_e32 v1, v1 2821; GFX9-NEXT: s_and_b32 s6, s7, s8 2822; GFX9-NEXT: v_mad_f32 v3, -v1, v4, v5 2823; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s6 2824; GFX9-NEXT: s_and_b32 s6, s5, s8 2825; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 2826; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s10 2827; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s6 2828; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 2829; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 2830; GFX9-NEXT: s_lshr_b32 s1, s5, 16 2831; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v4 2832; GFX9-NEXT: v_mul_f32_e32 v3, v6, v7 2833; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 2834; GFX9-NEXT: v_trunc_f32_e32 v3, v3 2835; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 2836; GFX9-NEXT: v_mad_f32 v6, -v3, v5, v6 2837; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 2838; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 2839; GFX9-NEXT: v_mul_f32_e32 v5, v7, v8 2840; GFX9-NEXT: v_trunc_f32_e32 v5, v5 2841; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v5 2842; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 2843; GFX9-NEXT: v_mad_f32 v5, -v5, v4, v7 2844; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 2845; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc 2846; GFX9-NEXT: v_mul_lo_u32 v1, v1, s9 2847; GFX9-NEXT: v_mul_lo_u32 v3, v3, s7 2848; GFX9-NEXT: v_mul_lo_u32 v4, v4, s10 2849; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 2850; GFX9-NEXT: v_sub_u32_e32 v5, s0, v1 2851; GFX9-NEXT: v_sub_u32_e32 v1, s5, v3 2852; GFX9-NEXT: v_sub_u32_e32 v3, s1, v4 2853; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff 2854; GFX9-NEXT: v_and_b32_e32 v1, v4, v1 2855; GFX9-NEXT: v_and_b32_e32 v0, v4, v0 2856; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 2857; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 2858; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 2859; GFX9-NEXT: s_endpgm 2860 %r = urem <4 x i16> %x, %y 2861 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 2862 ret void 2863} 2864 2865define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 2866; CHECK-LABEL: @sdiv_v4i16( 2867; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 2868; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 2869; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 2870; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 2871; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 2872; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 2873; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 2874; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 2875; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 2876; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 2877; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 2878; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 2879; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 2880; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 2881; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 2882; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 2883; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 2884; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 2885; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 2886; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 2887; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 16 2888; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 16 2889; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16 2890; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x i16> undef, i16 [[TMP23]], i64 0 2891; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[X]], i64 1 2892; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[Y]], i64 1 2893; CHECK-NEXT: [[TMP27:%.*]] = sext i16 [[TMP25]] to i32 2894; CHECK-NEXT: [[TMP28:%.*]] = sext i16 [[TMP26]] to i32 2895; CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]] 2896; CHECK-NEXT: [[TMP30:%.*]] = ashr i32 [[TMP29]], 30 2897; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 1 2898; CHECK-NEXT: [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float 2899; CHECK-NEXT: [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float 2900; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 2901; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]] 2902; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]]) 2903; CHECK-NEXT: [[TMP37:%.*]] = fneg fast float [[TMP36]] 2904; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]]) 2905; CHECK-NEXT: [[TMP39:%.*]] = fptosi float [[TMP36]] to i32 2906; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]]) 2907; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 2908; CHECK-NEXT: [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]] 2909; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0 2910; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]] 2911; CHECK-NEXT: [[TMP45:%.*]] = shl i32 [[TMP44]], 16 2912; CHECK-NEXT: [[TMP46:%.*]] = ashr i32 [[TMP45]], 16 2913; CHECK-NEXT: [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16 2914; CHECK-NEXT: [[TMP48:%.*]] = insertelement <4 x i16> [[TMP24]], i16 [[TMP47]], i64 1 2915; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i16> [[X]], i64 2 2916; CHECK-NEXT: [[TMP50:%.*]] = extractelement <4 x i16> [[Y]], i64 2 2917; CHECK-NEXT: [[TMP51:%.*]] = sext i16 [[TMP49]] to i32 2918; CHECK-NEXT: [[TMP52:%.*]] = sext i16 [[TMP50]] to i32 2919; CHECK-NEXT: [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]] 2920; CHECK-NEXT: [[TMP54:%.*]] = ashr i32 [[TMP53]], 30 2921; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 1 2922; CHECK-NEXT: [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float 2923; CHECK-NEXT: [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float 2924; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]]) 2925; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]] 2926; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]]) 2927; CHECK-NEXT: [[TMP61:%.*]] = fneg fast float [[TMP60]] 2928; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]]) 2929; CHECK-NEXT: [[TMP63:%.*]] = fptosi float [[TMP60]] to i32 2930; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]]) 2931; CHECK-NEXT: [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]]) 2932; CHECK-NEXT: [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]] 2933; CHECK-NEXT: [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0 2934; CHECK-NEXT: [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]] 2935; CHECK-NEXT: [[TMP69:%.*]] = shl i32 [[TMP68]], 16 2936; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 16 2937; CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16 2938; CHECK-NEXT: [[TMP72:%.*]] = insertelement <4 x i16> [[TMP48]], i16 [[TMP71]], i64 2 2939; CHECK-NEXT: [[TMP73:%.*]] = extractelement <4 x i16> [[X]], i64 3 2940; CHECK-NEXT: [[TMP74:%.*]] = extractelement <4 x i16> [[Y]], i64 3 2941; CHECK-NEXT: [[TMP75:%.*]] = sext i16 [[TMP73]] to i32 2942; CHECK-NEXT: [[TMP76:%.*]] = sext i16 [[TMP74]] to i32 2943; CHECK-NEXT: [[TMP77:%.*]] = xor i32 [[TMP75]], [[TMP76]] 2944; CHECK-NEXT: [[TMP78:%.*]] = ashr i32 [[TMP77]], 30 2945; CHECK-NEXT: [[TMP79:%.*]] = or i32 [[TMP78]], 1 2946; CHECK-NEXT: [[TMP80:%.*]] = sitofp i32 [[TMP75]] to float 2947; CHECK-NEXT: [[TMP81:%.*]] = sitofp i32 [[TMP76]] to float 2948; CHECK-NEXT: [[TMP82:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP81]]) 2949; CHECK-NEXT: [[TMP83:%.*]] = fmul fast float [[TMP80]], [[TMP82]] 2950; CHECK-NEXT: [[TMP84:%.*]] = call fast float @llvm.trunc.f32(float [[TMP83]]) 2951; CHECK-NEXT: [[TMP85:%.*]] = fneg fast float [[TMP84]] 2952; CHECK-NEXT: [[TMP86:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP85]], float [[TMP81]], float [[TMP80]]) 2953; CHECK-NEXT: [[TMP87:%.*]] = fptosi float [[TMP84]] to i32 2954; CHECK-NEXT: [[TMP88:%.*]] = call fast float @llvm.fabs.f32(float [[TMP86]]) 2955; CHECK-NEXT: [[TMP89:%.*]] = call fast float @llvm.fabs.f32(float [[TMP81]]) 2956; CHECK-NEXT: [[TMP90:%.*]] = fcmp fast oge float [[TMP88]], [[TMP89]] 2957; CHECK-NEXT: [[TMP91:%.*]] = select i1 [[TMP90]], i32 [[TMP79]], i32 0 2958; CHECK-NEXT: [[TMP92:%.*]] = add i32 [[TMP87]], [[TMP91]] 2959; CHECK-NEXT: [[TMP93:%.*]] = shl i32 [[TMP92]], 16 2960; CHECK-NEXT: [[TMP94:%.*]] = ashr i32 [[TMP93]], 16 2961; CHECK-NEXT: [[TMP95:%.*]] = trunc i32 [[TMP94]] to i16 2962; CHECK-NEXT: [[TMP96:%.*]] = insertelement <4 x i16> [[TMP72]], i16 [[TMP95]], i64 3 2963; CHECK-NEXT: store <4 x i16> [[TMP96]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 2964; CHECK-NEXT: ret void 2965; 2966; GFX6-LABEL: sdiv_v4i16: 2967; GFX6: ; %bb.0: 2968; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2969; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 2970; GFX6-NEXT: s_mov_b32 s7, 0xf000 2971; GFX6-NEXT: s_mov_b32 s6, -1 2972; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2973; GFX6-NEXT: s_sext_i32_i16 s8, s2 2974; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s8 2975; GFX6-NEXT: s_sext_i32_i16 s9, s0 2976; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9 2977; GFX6-NEXT: s_xor_b32 s8, s9, s8 2978; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 2979; GFX6-NEXT: s_ashr_i32 s2, s2, 16 2980; GFX6-NEXT: s_ashr_i32 s8, s8, 30 2981; GFX6-NEXT: s_or_b32 s8, s8, 1 2982; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 2983; GFX6-NEXT: v_trunc_f32_e32 v2, v2 2984; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 2985; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 2986; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 2987; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s2 2988; GFX6-NEXT: v_mov_b32_e32 v3, s8 2989; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 2990; GFX6-NEXT: s_ashr_i32 s0, s0, 16 2991; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 2992; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 2993; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 2994; GFX6-NEXT: s_xor_b32 s0, s0, s2 2995; GFX6-NEXT: s_ashr_i32 s0, s0, 30 2996; GFX6-NEXT: s_or_b32 s0, s0, 1 2997; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 2998; GFX6-NEXT: v_trunc_f32_e32 v3, v3 2999; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 3000; GFX6-NEXT: v_mov_b32_e32 v4, s0 3001; GFX6-NEXT: s_sext_i32_i16 s0, s3 3002; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| 3003; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 3004; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 3005; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc 3006; GFX6-NEXT: s_sext_i32_i16 s2, s1 3007; GFX6-NEXT: v_add_i32_e32 v3, vcc, v1, v3 3008; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s2 3009; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 3010; GFX6-NEXT: s_xor_b32 s0, s2, s0 3011; GFX6-NEXT: s_ashr_i32 s0, s0, 30 3012; GFX6-NEXT: s_or_b32 s0, s0, 1 3013; GFX6-NEXT: v_mul_f32_e32 v4, v1, v4 3014; GFX6-NEXT: v_trunc_f32_e32 v4, v4 3015; GFX6-NEXT: v_mad_f32 v1, -v4, v2, v1 3016; GFX6-NEXT: v_mov_b32_e32 v5, s0 3017; GFX6-NEXT: s_ashr_i32 s0, s3, 16 3018; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2| 3019; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 3020; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 3021; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc 3022; GFX6-NEXT: s_ashr_i32 s1, s1, 16 3023; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 3024; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s1 3025; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 3026; GFX6-NEXT: s_xor_b32 s0, s1, s0 3027; GFX6-NEXT: s_ashr_i32 s0, s0, 30 3028; GFX6-NEXT: s_or_b32 s0, s0, 1 3029; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 3030; GFX6-NEXT: v_trunc_f32_e32 v5, v5 3031; GFX6-NEXT: v_mad_f32 v4, -v5, v2, v4 3032; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 3033; GFX6-NEXT: v_mov_b32_e32 v6, s0 3034; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| 3035; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc 3036; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 3037; GFX6-NEXT: s_mov_b32 s0, 0xffff 3038; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 3039; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 3040; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 3041; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 3042; GFX6-NEXT: v_and_b32_e32 v0, s0, v0 3043; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 3044; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 3045; GFX6-NEXT: s_endpgm 3046; GFX9-LABEL: sdiv_v4i16: 3047; GFX9: ; %bb.0: 3048; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 3049; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 3050; GFX9-NEXT: v_mov_b32_e32 v2, 0 3051; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3052; GFX9-NEXT: s_sext_i32_i16 s0, s6 3053; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 3054; GFX9-NEXT: s_sext_i32_i16 s1, s4 3055; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s1 3056; GFX9-NEXT: s_xor_b32 s0, s1, s0 3057; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 3058; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3059; GFX9-NEXT: s_or_b32 s8, s0, 1 3060; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 3061; GFX9-NEXT: v_trunc_f32_e32 v3, v3 3062; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 3063; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 3064; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 3065; GFX9-NEXT: s_cselect_b32 s0, s8, 0 3066; GFX9-NEXT: s_ashr_i32 s1, s6, 16 3067; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 3068; GFX9-NEXT: s_ashr_i32 s4, s4, 16 3069; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s4 3070; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 3071; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 3072; GFX9-NEXT: v_add_u32_e32 v3, s0, v3 3073; GFX9-NEXT: v_mul_f32_e32 v4, v1, v4 3074; GFX9-NEXT: s_xor_b32 s0, s4, s1 3075; GFX9-NEXT: v_trunc_f32_e32 v4, v4 3076; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3077; GFX9-NEXT: v_mad_f32 v1, -v4, v0, v1 3078; GFX9-NEXT: s_or_b32 s4, s0, 1 3079; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 3080; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 3081; GFX9-NEXT: s_sext_i32_i16 s1, s7 3082; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 3083; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 3084; GFX9-NEXT: s_cselect_b32 s0, s4, 0 3085; GFX9-NEXT: v_add_u32_e32 v4, s0, v4 3086; GFX9-NEXT: s_sext_i32_i16 s0, s5 3087; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s0 3088; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 3089; GFX9-NEXT: s_xor_b32 s0, s0, s1 3090; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3091; GFX9-NEXT: s_or_b32 s4, s0, 1 3092; GFX9-NEXT: v_mul_f32_e32 v5, v1, v5 3093; GFX9-NEXT: v_trunc_f32_e32 v5, v5 3094; GFX9-NEXT: v_mad_f32 v1, -v5, v0, v1 3095; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 3096; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 3097; GFX9-NEXT: s_cselect_b32 s0, s4, 0 3098; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 3099; GFX9-NEXT: s_ashr_i32 s1, s7, 16 3100; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 3101; GFX9-NEXT: v_add_u32_e32 v1, s0, v5 3102; GFX9-NEXT: s_ashr_i32 s0, s5, 16 3103; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 3104; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v0 3105; GFX9-NEXT: s_xor_b32 s0, s0, s1 3106; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3107; GFX9-NEXT: s_or_b32 s4, s0, 1 3108; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 3109; GFX9-NEXT: v_trunc_f32_e32 v6, v6 3110; GFX9-NEXT: v_mad_f32 v5, -v6, v0, v5 3111; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 3112; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v0| 3113; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 3114; GFX9-NEXT: s_cselect_b32 s0, s4, 0 3115; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff 3116; GFX9-NEXT: v_add_u32_e32 v0, s0, v6 3117; GFX9-NEXT: v_and_b32_e32 v1, v5, v1 3118; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1 3119; GFX9-NEXT: v_and_b32_e32 v0, v5, v3 3120; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 3121; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 3122; GFX9-NEXT: s_endpgm 3123 %r = sdiv <4 x i16> %x, %y 3124 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 3125 ret void 3126} 3127 3128define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 3129; CHECK-LABEL: @srem_v4i16( 3130; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 3131; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 3132; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 3133; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 3134; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 3135; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 3136; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 3137; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 3138; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 3139; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 3140; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 3141; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 3142; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 3143; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 3144; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 3145; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 3146; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 3147; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 3148; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 3149; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 3150; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]] 3151; CHECK-NEXT: [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]] 3152; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 16 3153; CHECK-NEXT: [[TMP24:%.*]] = ashr i32 [[TMP23]], 16 3154; CHECK-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16 3155; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i16> undef, i16 [[TMP25]], i64 0 3156; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[X]], i64 1 3157; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i16> [[Y]], i64 1 3158; CHECK-NEXT: [[TMP29:%.*]] = sext i16 [[TMP27]] to i32 3159; CHECK-NEXT: [[TMP30:%.*]] = sext i16 [[TMP28]] to i32 3160; CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]] 3161; CHECK-NEXT: [[TMP32:%.*]] = ashr i32 [[TMP31]], 30 3162; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP32]], 1 3163; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float 3164; CHECK-NEXT: [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float 3165; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 3166; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]] 3167; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]]) 3168; CHECK-NEXT: [[TMP39:%.*]] = fneg fast float [[TMP38]] 3169; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]]) 3170; CHECK-NEXT: [[TMP41:%.*]] = fptosi float [[TMP38]] to i32 3171; CHECK-NEXT: [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]]) 3172; CHECK-NEXT: [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]]) 3173; CHECK-NEXT: [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]] 3174; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0 3175; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]] 3176; CHECK-NEXT: [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]] 3177; CHECK-NEXT: [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]] 3178; CHECK-NEXT: [[TMP49:%.*]] = shl i32 [[TMP48]], 16 3179; CHECK-NEXT: [[TMP50:%.*]] = ashr i32 [[TMP49]], 16 3180; CHECK-NEXT: [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16 3181; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i16> [[TMP26]], i16 [[TMP51]], i64 1 3182; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i16> [[X]], i64 2 3183; CHECK-NEXT: [[TMP54:%.*]] = extractelement <4 x i16> [[Y]], i64 2 3184; CHECK-NEXT: [[TMP55:%.*]] = sext i16 [[TMP53]] to i32 3185; CHECK-NEXT: [[TMP56:%.*]] = sext i16 [[TMP54]] to i32 3186; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]] 3187; CHECK-NEXT: [[TMP58:%.*]] = ashr i32 [[TMP57]], 30 3188; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 1 3189; CHECK-NEXT: [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float 3190; CHECK-NEXT: [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float 3191; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]]) 3192; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]] 3193; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]]) 3194; CHECK-NEXT: [[TMP65:%.*]] = fneg fast float [[TMP64]] 3195; CHECK-NEXT: [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]]) 3196; CHECK-NEXT: [[TMP67:%.*]] = fptosi float [[TMP64]] to i32 3197; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 3198; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]]) 3199; CHECK-NEXT: [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]] 3200; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0 3201; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]] 3202; CHECK-NEXT: [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]] 3203; CHECK-NEXT: [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]] 3204; CHECK-NEXT: [[TMP75:%.*]] = shl i32 [[TMP74]], 16 3205; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 [[TMP75]], 16 3206; CHECK-NEXT: [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16 3207; CHECK-NEXT: [[TMP78:%.*]] = insertelement <4 x i16> [[TMP52]], i16 [[TMP77]], i64 2 3208; CHECK-NEXT: [[TMP79:%.*]] = extractelement <4 x i16> [[X]], i64 3 3209; CHECK-NEXT: [[TMP80:%.*]] = extractelement <4 x i16> [[Y]], i64 3 3210; CHECK-NEXT: [[TMP81:%.*]] = sext i16 [[TMP79]] to i32 3211; CHECK-NEXT: [[TMP82:%.*]] = sext i16 [[TMP80]] to i32 3212; CHECK-NEXT: [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP82]] 3213; CHECK-NEXT: [[TMP84:%.*]] = ashr i32 [[TMP83]], 30 3214; CHECK-NEXT: [[TMP85:%.*]] = or i32 [[TMP84]], 1 3215; CHECK-NEXT: [[TMP86:%.*]] = sitofp i32 [[TMP81]] to float 3216; CHECK-NEXT: [[TMP87:%.*]] = sitofp i32 [[TMP82]] to float 3217; CHECK-NEXT: [[TMP88:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP87]]) 3218; CHECK-NEXT: [[TMP89:%.*]] = fmul fast float [[TMP86]], [[TMP88]] 3219; CHECK-NEXT: [[TMP90:%.*]] = call fast float @llvm.trunc.f32(float [[TMP89]]) 3220; CHECK-NEXT: [[TMP91:%.*]] = fneg fast float [[TMP90]] 3221; CHECK-NEXT: [[TMP92:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP91]], float [[TMP87]], float [[TMP86]]) 3222; CHECK-NEXT: [[TMP93:%.*]] = fptosi float [[TMP90]] to i32 3223; CHECK-NEXT: [[TMP94:%.*]] = call fast float @llvm.fabs.f32(float [[TMP92]]) 3224; CHECK-NEXT: [[TMP95:%.*]] = call fast float @llvm.fabs.f32(float [[TMP87]]) 3225; CHECK-NEXT: [[TMP96:%.*]] = fcmp fast oge float [[TMP94]], [[TMP95]] 3226; CHECK-NEXT: [[TMP97:%.*]] = select i1 [[TMP96]], i32 [[TMP85]], i32 0 3227; CHECK-NEXT: [[TMP98:%.*]] = add i32 [[TMP93]], [[TMP97]] 3228; CHECK-NEXT: [[TMP99:%.*]] = mul i32 [[TMP98]], [[TMP82]] 3229; CHECK-NEXT: [[TMP100:%.*]] = sub i32 [[TMP81]], [[TMP99]] 3230; CHECK-NEXT: [[TMP101:%.*]] = shl i32 [[TMP100]], 16 3231; CHECK-NEXT: [[TMP102:%.*]] = ashr i32 [[TMP101]], 16 3232; CHECK-NEXT: [[TMP103:%.*]] = trunc i32 [[TMP102]] to i16 3233; CHECK-NEXT: [[TMP104:%.*]] = insertelement <4 x i16> [[TMP78]], i16 [[TMP103]], i64 3 3234; CHECK-NEXT: store <4 x i16> [[TMP104]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 3235; CHECK-NEXT: ret void 3236; 3237; GFX6-LABEL: srem_v4i16: 3238; GFX6: ; %bb.0: 3239; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3240; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 3241; GFX6-NEXT: s_mov_b32 s7, 0xf000 3242; GFX6-NEXT: s_mov_b32 s6, -1 3243; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3244; GFX6-NEXT: s_sext_i32_i16 s8, s2 3245; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s8 3246; GFX6-NEXT: s_sext_i32_i16 s9, s0 3247; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9 3248; GFX6-NEXT: s_xor_b32 s8, s9, s8 3249; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 3250; GFX6-NEXT: s_ashr_i32 s8, s8, 30 3251; GFX6-NEXT: s_or_b32 s8, s8, 1 3252; GFX6-NEXT: v_mov_b32_e32 v3, s8 3253; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 3254; GFX6-NEXT: v_trunc_f32_e32 v2, v2 3255; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 3256; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 3257; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 3258; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 3259; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 3260; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 3261; GFX6-NEXT: s_ashr_i32 s2, s2, 16 3262; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s2 3263; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 3264; GFX6-NEXT: s_ashr_i32 s0, s0, 16 3265; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 3266; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 3267; GFX6-NEXT: s_xor_b32 s8, s0, s2 3268; GFX6-NEXT: s_ashr_i32 s8, s8, 30 3269; GFX6-NEXT: s_or_b32 s8, s8, 1 3270; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 3271; GFX6-NEXT: v_trunc_f32_e32 v3, v3 3272; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 3273; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 3274; GFX6-NEXT: v_mov_b32_e32 v4, s8 3275; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| 3276; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc 3277; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 3278; GFX6-NEXT: v_mul_lo_u32 v1, v1, s2 3279; GFX6-NEXT: s_sext_i32_i16 s2, s3 3280; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s2 3281; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s0, v1 3282; GFX6-NEXT: s_sext_i32_i16 s0, s1 3283; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s0 3284; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 3285; GFX6-NEXT: s_xor_b32 s0, s0, s2 3286; GFX6-NEXT: s_ashr_i32 s0, s0, 30 3287; GFX6-NEXT: s_or_b32 s0, s0, 1 3288; GFX6-NEXT: v_mul_f32_e32 v4, v1, v4 3289; GFX6-NEXT: v_trunc_f32_e32 v4, v4 3290; GFX6-NEXT: v_mad_f32 v1, -v4, v2, v1 3291; GFX6-NEXT: v_mov_b32_e32 v5, s0 3292; GFX6-NEXT: s_ashr_i32 s0, s3, 16 3293; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2| 3294; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 3295; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 3296; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc 3297; GFX6-NEXT: s_ashr_i32 s2, s1, 16 3298; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 3299; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s2 3300; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 3301; GFX6-NEXT: v_mul_lo_u32 v1, v1, s3 3302; GFX6-NEXT: s_xor_b32 s3, s2, s0 3303; GFX6-NEXT: s_ashr_i32 s3, s3, 30 3304; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 3305; GFX6-NEXT: v_trunc_f32_e32 v5, v5 3306; GFX6-NEXT: v_mad_f32 v4, -v5, v2, v4 3307; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 3308; GFX6-NEXT: s_or_b32 s3, s3, 1 3309; GFX6-NEXT: v_mov_b32_e32 v6, s3 3310; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| 3311; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc 3312; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 3313; GFX6-NEXT: v_mul_lo_u32 v2, v2, s0 3314; GFX6-NEXT: s_mov_b32 s0, 0xffff 3315; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 3316; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 3317; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 3318; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 3319; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 3320; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 3321; GFX6-NEXT: v_and_b32_e32 v0, s0, v0 3322; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 3323; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 3324; GFX6-NEXT: s_endpgm 3325; GFX9-LABEL: srem_v4i16: 3326; GFX9: ; %bb.0: 3327; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 3328; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 3329; GFX9-NEXT: v_mov_b32_e32 v2, 0 3330; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3331; GFX9-NEXT: s_sext_i32_i16 s0, s6 3332; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 3333; GFX9-NEXT: s_sext_i32_i16 s1, s4 3334; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s1 3335; GFX9-NEXT: s_xor_b32 s0, s1, s0 3336; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 3337; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3338; GFX9-NEXT: s_or_b32 s8, s0, 1 3339; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 3340; GFX9-NEXT: v_trunc_f32_e32 v3, v3 3341; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 3342; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 3343; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 3344; GFX9-NEXT: s_cselect_b32 s0, s8, 0 3345; GFX9-NEXT: s_ashr_i32 s9, s6, 16 3346; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 3347; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s9 3348; GFX9-NEXT: s_ashr_i32 s8, s4, 16 3349; GFX9-NEXT: v_add_u32_e32 v1, s0, v3 3350; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s8 3351; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 3352; GFX9-NEXT: s_xor_b32 s0, s8, s9 3353; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3354; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 3355; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 3356; GFX9-NEXT: v_trunc_f32_e32 v4, v4 3357; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 3358; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 3359; GFX9-NEXT: s_or_b32 s6, s0, 1 3360; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v0| 3361; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 3362; GFX9-NEXT: s_cselect_b32 s0, s6, 0 3363; GFX9-NEXT: v_add_u32_e32 v0, s0, v4 3364; GFX9-NEXT: s_sext_i32_i16 s0, s7 3365; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 3366; GFX9-NEXT: s_sext_i32_i16 s1, s5 3367; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s1 3368; GFX9-NEXT: s_xor_b32 s0, s1, s0 3369; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 3370; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3371; GFX9-NEXT: s_or_b32 s6, s0, 1 3372; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9 3373; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 3374; GFX9-NEXT: v_trunc_f32_e32 v5, v5 3375; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 3376; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| 3377; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 3378; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 3379; GFX9-NEXT: s_cselect_b32 s0, s6, 0 3380; GFX9-NEXT: s_ashr_i32 s6, s7, 16 3381; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s6 3382; GFX9-NEXT: v_add_u32_e32 v3, s0, v5 3383; GFX9-NEXT: v_mul_lo_u32 v3, v3, s7 3384; GFX9-NEXT: s_ashr_i32 s7, s5, 16 3385; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s7 3386; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 3387; GFX9-NEXT: s_xor_b32 s0, s7, s6 3388; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3389; GFX9-NEXT: s_or_b32 s9, s0, 1 3390; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 3391; GFX9-NEXT: v_trunc_f32_e32 v6, v6 3392; GFX9-NEXT: v_mad_f32 v5, -v6, v4, v5 3393; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 3394; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4| 3395; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 3396; GFX9-NEXT: s_cselect_b32 s0, s9, 0 3397; GFX9-NEXT: v_add_u32_e32 v4, s0, v6 3398; GFX9-NEXT: v_mul_lo_u32 v4, v4, s6 3399; GFX9-NEXT: v_sub_u32_e32 v5, s4, v1 3400; GFX9-NEXT: v_sub_u32_e32 v1, s5, v3 3401; GFX9-NEXT: v_sub_u32_e32 v0, s8, v0 3402; GFX9-NEXT: v_sub_u32_e32 v3, s7, v4 3403; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff 3404; GFX9-NEXT: v_and_b32_e32 v1, v4, v1 3405; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 3406; GFX9-NEXT: v_and_b32_e32 v3, v4, v5 3407; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v3 3408; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 3409; GFX9-NEXT: s_endpgm 3410 %r = srem <4 x i16> %x, %y 3411 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 3412 ret void 3413} 3414 3415define amdgpu_kernel void @udiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 3416; CHECK-LABEL: @udiv_i3( 3417; CHECK-NEXT: [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32 3418; CHECK-NEXT: [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32 3419; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 3420; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 3421; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 3422; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 3423; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 3424; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 3425; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 3426; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 3427; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 3428; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 3429; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 3430; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 3431; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 3432; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 7 3433; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i3 3434; CHECK-NEXT: store i3 [[TMP17]], i3 addrspace(1)* [[OUT:%.*]], align 1 3435; CHECK-NEXT: ret void 3436; 3437; GFX6-LABEL: udiv_i3: 3438; GFX6: ; %bb.0: 3439; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3440; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 3441; GFX6-NEXT: s_mov_b32 s7, 0xf000 3442; GFX6-NEXT: s_mov_b32 s6, -1 3443; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3444; GFX6-NEXT: s_bfe_u32 s1, s0, 0x30008 3445; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, s1 3446; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 3447; GFX6-NEXT: s_and_b32 s0, s0, 7 3448; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 3449; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 3450; GFX6-NEXT: v_trunc_f32_e32 v1, v1 3451; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 3452; GFX6-NEXT: v_mad_f32 v1, -v1, v0, v2 3453; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3454; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 3455; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 3456; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 3457; GFX6-NEXT: s_endpgm 3458; GFX9-LABEL: udiv_i3: 3459; GFX9: ; %bb.0: 3460; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 3461; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 3462; GFX9-NEXT: v_mov_b32_e32 v2, 0 3463; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3464; GFX9-NEXT: s_bfe_u32 s0, s4, 0x30008 3465; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 3466; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 3467; GFX9-NEXT: s_and_b32 s0, s4, 7 3468; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 3469; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 3470; GFX9-NEXT: v_trunc_f32_e32 v1, v1 3471; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v1 3472; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v3 3473; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3474; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc 3475; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 3476; GFX9-NEXT: global_store_byte v2, v0, s[2:3] 3477; GFX9-NEXT: s_endpgm 3478 %r = udiv i3 %x, %y 3479 store i3 %r, i3 addrspace(1)* %out 3480 ret void 3481} 3482 3483define amdgpu_kernel void @urem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 3484; CHECK-LABEL: @urem_i3( 3485; CHECK-NEXT: [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32 3486; CHECK-NEXT: [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32 3487; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 3488; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 3489; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 3490; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 3491; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 3492; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 3493; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 3494; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 3495; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 3496; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 3497; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 3498; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 3499; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 3500; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]] 3501; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]] 3502; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 7 3503; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i3 3504; CHECK-NEXT: store i3 [[TMP19]], i3 addrspace(1)* [[OUT:%.*]], align 1 3505; CHECK-NEXT: ret void 3506; 3507; GFX6-LABEL: urem_i3: 3508; GFX6: ; %bb.0: 3509; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3510; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 3511; GFX6-NEXT: s_mov_b32 s7, 0xf000 3512; GFX6-NEXT: s_mov_b32 s6, -1 3513; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3514; GFX6-NEXT: s_bfe_u32 s1, s0, 0x30008 3515; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, s1 3516; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 3517; GFX6-NEXT: s_and_b32 s2, s0, 7 3518; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 3519; GFX6-NEXT: s_lshr_b32 s1, s0, 8 3520; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 3521; GFX6-NEXT: v_trunc_f32_e32 v1, v1 3522; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 3523; GFX6-NEXT: v_mad_f32 v1, -v1, v0, v2 3524; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3525; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 3526; GFX6-NEXT: v_mul_lo_u32 v0, v0, s1 3527; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 3528; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 3529; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 3530; GFX6-NEXT: s_endpgm 3531; GFX9-LABEL: urem_i3: 3532; GFX9: ; %bb.0: 3533; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 3534; GFX9-NEXT: s_nop 0 3535; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3536; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3537; GFX9-NEXT: s_bfe_u32 s3, s2, 0x30008 3538; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 3539; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 3540; GFX9-NEXT: s_and_b32 s4, s2, 7 3541; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 3542; GFX9-NEXT: s_lshr_b32 s3, s2, 8 3543; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1 3544; GFX9-NEXT: v_trunc_f32_e32 v1, v1 3545; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v1 3546; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v2 3547; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3548; GFX9-NEXT: v_mov_b32_e32 v1, 0 3549; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 3550; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 3551; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 3552; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 3553; GFX9-NEXT: global_store_byte v1, v0, s[0:1] 3554; GFX9-NEXT: s_endpgm 3555 %r = urem i3 %x, %y 3556 store i3 %r, i3 addrspace(1)* %out 3557 ret void 3558} 3559 3560define amdgpu_kernel void @sdiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 3561; CHECK-LABEL: @sdiv_i3( 3562; CHECK-NEXT: [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32 3563; CHECK-NEXT: [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32 3564; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 3565; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 3566; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 3567; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 3568; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 3569; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 3570; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 3571; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 3572; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 3573; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 3574; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 3575; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 3576; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 3577; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 3578; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 3579; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 3580; CHECK-NEXT: [[TMP19:%.*]] = shl i32 [[TMP18]], 29 3581; CHECK-NEXT: [[TMP20:%.*]] = ashr i32 [[TMP19]], 29 3582; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i3 3583; CHECK-NEXT: store i3 [[TMP21]], i3 addrspace(1)* [[OUT:%.*]], align 1 3584; CHECK-NEXT: ret void 3585; 3586; GFX6-LABEL: sdiv_i3: 3587; GFX6: ; %bb.0: 3588; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3589; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 3590; GFX6-NEXT: s_mov_b32 s7, 0xf000 3591; GFX6-NEXT: s_mov_b32 s6, -1 3592; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3593; GFX6-NEXT: s_bfe_i32 s1, s0, 0x30008 3594; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s1 3595; GFX6-NEXT: s_bfe_i32 s0, s0, 0x30000 3596; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s0 3597; GFX6-NEXT: s_xor_b32 s0, s0, s1 3598; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 3599; GFX6-NEXT: s_ashr_i32 s0, s0, 30 3600; GFX6-NEXT: s_or_b32 s0, s0, 1 3601; GFX6-NEXT: v_mov_b32_e32 v3, s0 3602; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 3603; GFX6-NEXT: v_trunc_f32_e32 v2, v2 3604; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 3605; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 3606; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 3607; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 3608; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 3609; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 3610; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 3611; GFX6-NEXT: s_endpgm 3612; GFX9-LABEL: sdiv_i3: 3613; GFX9: ; %bb.0: 3614; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 3615; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 3616; GFX9-NEXT: v_mov_b32_e32 v1, 0 3617; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3618; GFX9-NEXT: s_bfe_i32 s0, s4, 0x30008 3619; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 3620; GFX9-NEXT: s_bfe_i32 s1, s4, 0x30000 3621; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 3622; GFX9-NEXT: s_xor_b32 s0, s1, s0 3623; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 3624; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3625; GFX9-NEXT: s_or_b32 s4, s0, 1 3626; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 3627; GFX9-NEXT: v_trunc_f32_e32 v3, v3 3628; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 3629; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 3630; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 3631; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 3632; GFX9-NEXT: s_cselect_b32 s0, s4, 0 3633; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 3634; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 3635; GFX9-NEXT: global_store_byte v1, v0, s[2:3] 3636; GFX9-NEXT: s_endpgm 3637 %r = sdiv i3 %x, %y 3638 store i3 %r, i3 addrspace(1)* %out 3639 ret void 3640} 3641 3642define amdgpu_kernel void @srem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 3643; CHECK-LABEL: @srem_i3( 3644; CHECK-NEXT: [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32 3645; CHECK-NEXT: [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32 3646; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 3647; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 3648; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 3649; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 3650; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 3651; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 3652; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 3653; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 3654; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 3655; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 3656; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 3657; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 3658; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 3659; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 3660; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 3661; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 3662; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]] 3663; CHECK-NEXT: [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]] 3664; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 29 3665; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 29 3666; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i3 3667; CHECK-NEXT: store i3 [[TMP23]], i3 addrspace(1)* [[OUT:%.*]], align 1 3668; CHECK-NEXT: ret void 3669; 3670; GFX6-LABEL: srem_i3: 3671; GFX6: ; %bb.0: 3672; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3673; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 3674; GFX6-NEXT: s_mov_b32 s7, 0xf000 3675; GFX6-NEXT: s_mov_b32 s6, -1 3676; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3677; GFX6-NEXT: s_bfe_i32 s1, s0, 0x30008 3678; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s1 3679; GFX6-NEXT: s_bfe_i32 s3, s0, 0x30000 3680; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s3 3681; GFX6-NEXT: s_xor_b32 s1, s3, s1 3682; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 3683; GFX6-NEXT: s_ashr_i32 s1, s1, 30 3684; GFX6-NEXT: s_or_b32 s1, s1, 1 3685; GFX6-NEXT: v_mov_b32_e32 v3, s1 3686; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 3687; GFX6-NEXT: v_trunc_f32_e32 v2, v2 3688; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 3689; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 3690; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 3691; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 3692; GFX6-NEXT: s_lshr_b32 s2, s0, 8 3693; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 3694; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 3695; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 3696; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 3697; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 3698; GFX6-NEXT: s_endpgm 3699; GFX9-LABEL: srem_i3: 3700; GFX9: ; %bb.0: 3701; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 3702; GFX9-NEXT: s_nop 0 3703; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3704; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3705; GFX9-NEXT: s_bfe_i32 s2, s4, 0x30008 3706; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 3707; GFX9-NEXT: s_bfe_i32 s3, s4, 0x30000 3708; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s3 3709; GFX9-NEXT: s_xor_b32 s2, s3, s2 3710; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 3711; GFX9-NEXT: s_ashr_i32 s2, s2, 30 3712; GFX9-NEXT: s_lshr_b32 s5, s4, 8 3713; GFX9-NEXT: s_or_b32 s6, s2, 1 3714; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 3715; GFX9-NEXT: v_trunc_f32_e32 v2, v2 3716; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 3717; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 3718; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| 3719; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 3720; GFX9-NEXT: s_cselect_b32 s2, s6, 0 3721; GFX9-NEXT: v_add_u32_e32 v0, s2, v2 3722; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 3723; GFX9-NEXT: v_mov_b32_e32 v1, 0 3724; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 3725; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 3726; GFX9-NEXT: global_store_byte v1, v0, s[0:1] 3727; GFX9-NEXT: s_endpgm 3728 %r = srem i3 %x, %y 3729 store i3 %r, i3 addrspace(1)* %out 3730 ret void 3731} 3732 3733define amdgpu_kernel void @udiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 3734; CHECK-LABEL: @udiv_v3i16( 3735; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 3736; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 3737; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 3738; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 3739; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 3740; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 3741; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 3742; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 3743; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 3744; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 3745; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 3746; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 3747; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 3748; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 3749; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 3750; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 3751; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 3752; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 65535 3753; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16 3754; CHECK-NEXT: [[TMP20:%.*]] = insertelement <3 x i16> undef, i16 [[TMP19]], i64 0 3755; CHECK-NEXT: [[TMP21:%.*]] = extractelement <3 x i16> [[X]], i64 1 3756; CHECK-NEXT: [[TMP22:%.*]] = extractelement <3 x i16> [[Y]], i64 1 3757; CHECK-NEXT: [[TMP23:%.*]] = zext i16 [[TMP21]] to i32 3758; CHECK-NEXT: [[TMP24:%.*]] = zext i16 [[TMP22]] to i32 3759; CHECK-NEXT: [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float 3760; CHECK-NEXT: [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float 3761; CHECK-NEXT: [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]]) 3762; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]] 3763; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]]) 3764; CHECK-NEXT: [[TMP30:%.*]] = fneg fast float [[TMP29]] 3765; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]]) 3766; CHECK-NEXT: [[TMP32:%.*]] = fptoui float [[TMP29]] to i32 3767; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]]) 3768; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]]) 3769; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]] 3770; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0 3771; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]] 3772; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 65535 3773; CHECK-NEXT: [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16 3774; CHECK-NEXT: [[TMP40:%.*]] = insertelement <3 x i16> [[TMP20]], i16 [[TMP39]], i64 1 3775; CHECK-NEXT: [[TMP41:%.*]] = extractelement <3 x i16> [[X]], i64 2 3776; CHECK-NEXT: [[TMP42:%.*]] = extractelement <3 x i16> [[Y]], i64 2 3777; CHECK-NEXT: [[TMP43:%.*]] = zext i16 [[TMP41]] to i32 3778; CHECK-NEXT: [[TMP44:%.*]] = zext i16 [[TMP42]] to i32 3779; CHECK-NEXT: [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float 3780; CHECK-NEXT: [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float 3781; CHECK-NEXT: [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]]) 3782; CHECK-NEXT: [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]] 3783; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]]) 3784; CHECK-NEXT: [[TMP50:%.*]] = fneg fast float [[TMP49]] 3785; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]]) 3786; CHECK-NEXT: [[TMP52:%.*]] = fptoui float [[TMP49]] to i32 3787; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]]) 3788; CHECK-NEXT: [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]]) 3789; CHECK-NEXT: [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]] 3790; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0 3791; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]] 3792; CHECK-NEXT: [[TMP58:%.*]] = and i32 [[TMP57]], 65535 3793; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16 3794; CHECK-NEXT: [[TMP60:%.*]] = insertelement <3 x i16> [[TMP40]], i16 [[TMP59]], i64 2 3795; CHECK-NEXT: store <3 x i16> [[TMP60]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 3796; CHECK-NEXT: ret void 3797; 3798; GFX6-LABEL: udiv_v3i16: 3799; GFX6: ; %bb.0: 3800; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3801; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 3802; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 3803; GFX6-NEXT: s_mov_b32 s8, 0xffff 3804; GFX6-NEXT: s_mov_b32 s7, 0xf000 3805; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3806; GFX6-NEXT: s_and_b32 s6, s0, s8 3807; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 3808; GFX6-NEXT: s_and_b32 s6, s2, s8 3809; GFX6-NEXT: s_lshr_b32 s0, s0, 16 3810; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s0 3811; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s6 3812; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 3813; GFX6-NEXT: s_lshr_b32 s0, s2, 16 3814; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s0 3815; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 3816; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 3817; GFX6-NEXT: v_trunc_f32_e32 v2, v2 3818; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 3819; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 3820; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3821; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 3822; GFX6-NEXT: v_trunc_f32_e32 v1, v1 3823; GFX6-NEXT: s_and_b32 s0, s1, s8 3824; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc 3825; GFX6-NEXT: v_mad_f32 v2, -v1, v3, v4 3826; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s0 3827; GFX6-NEXT: s_and_b32 s0, s3, s8 3828; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s0 3829; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 3830; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 3831; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 3832; GFX6-NEXT: s_mov_b32 s6, -1 3833; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3834; GFX6-NEXT: v_mul_f32_e32 v2, v5, v6 3835; GFX6-NEXT: v_trunc_f32_e32 v2, v2 3836; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 3837; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v5 3838; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 3839; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 3840; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 3841; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc 3842; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 3843; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 3844; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 3845; GFX6-NEXT: s_endpgm 3846; GFX9-LABEL: udiv_v3i16: 3847; GFX9: ; %bb.0: 3848; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 3849; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 3850; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 3851; GFX9-NEXT: s_mov_b32 s8, 0xffff 3852; GFX9-NEXT: v_mov_b32_e32 v1, 0 3853; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3854; GFX9-NEXT: s_and_b32 s0, s6, s8 3855; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 3856; GFX9-NEXT: s_and_b32 s0, s4, s8 3857; GFX9-NEXT: s_lshr_b32 s1, s6, 16 3858; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s0 3859; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 3860; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s1 3861; GFX9-NEXT: s_lshr_b32 s0, s4, 16 3862; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 3863; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 3864; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 3865; GFX9-NEXT: v_trunc_f32_e32 v3, v3 3866; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 3867; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 3868; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 3869; GFX9-NEXT: v_mul_f32_e32 v2, v5, v6 3870; GFX9-NEXT: v_trunc_f32_e32 v2, v2 3871; GFX9-NEXT: s_and_b32 s0, s7, s8 3872; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 3873; GFX9-NEXT: v_mad_f32 v3, -v2, v4, v5 3874; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 3875; GFX9-NEXT: s_and_b32 s0, s5, s8 3876; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 3877; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 3878; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 3879; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 3880; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 3881; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc 3882; GFX9-NEXT: v_mul_f32_e32 v3, v6, v7 3883; GFX9-NEXT: v_trunc_f32_e32 v3, v3 3884; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3 3885; GFX9-NEXT: v_mad_f32 v3, -v3, v5, v6 3886; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 3887; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 3888; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 3889; GFX9-NEXT: global_store_short v1, v3, s[2:3] offset:4 3890; GFX9-NEXT: global_store_dword v1, v0, s[2:3] 3891; GFX9-NEXT: s_endpgm 3892 %r = udiv <3 x i16> %x, %y 3893 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 3894 ret void 3895} 3896 3897define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 3898; CHECK-LABEL: @urem_v3i16( 3899; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 3900; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 3901; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 3902; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 3903; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 3904; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 3905; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 3906; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 3907; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 3908; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 3909; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 3910; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 3911; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 3912; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 3913; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 3914; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 3915; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 3916; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]] 3917; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]] 3918; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 65535 3919; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16 3920; CHECK-NEXT: [[TMP22:%.*]] = insertelement <3 x i16> undef, i16 [[TMP21]], i64 0 3921; CHECK-NEXT: [[TMP23:%.*]] = extractelement <3 x i16> [[X]], i64 1 3922; CHECK-NEXT: [[TMP24:%.*]] = extractelement <3 x i16> [[Y]], i64 1 3923; CHECK-NEXT: [[TMP25:%.*]] = zext i16 [[TMP23]] to i32 3924; CHECK-NEXT: [[TMP26:%.*]] = zext i16 [[TMP24]] to i32 3925; CHECK-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float 3926; CHECK-NEXT: [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float 3927; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]]) 3928; CHECK-NEXT: [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]] 3929; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]]) 3930; CHECK-NEXT: [[TMP32:%.*]] = fneg fast float [[TMP31]] 3931; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]]) 3932; CHECK-NEXT: [[TMP34:%.*]] = fptoui float [[TMP31]] to i32 3933; CHECK-NEXT: [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 3934; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]]) 3935; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]] 3936; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0 3937; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]] 3938; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]] 3939; CHECK-NEXT: [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]] 3940; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], 65535 3941; CHECK-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16 3942; CHECK-NEXT: [[TMP44:%.*]] = insertelement <3 x i16> [[TMP22]], i16 [[TMP43]], i64 1 3943; CHECK-NEXT: [[TMP45:%.*]] = extractelement <3 x i16> [[X]], i64 2 3944; CHECK-NEXT: [[TMP46:%.*]] = extractelement <3 x i16> [[Y]], i64 2 3945; CHECK-NEXT: [[TMP47:%.*]] = zext i16 [[TMP45]] to i32 3946; CHECK-NEXT: [[TMP48:%.*]] = zext i16 [[TMP46]] to i32 3947; CHECK-NEXT: [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float 3948; CHECK-NEXT: [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float 3949; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]]) 3950; CHECK-NEXT: [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]] 3951; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]]) 3952; CHECK-NEXT: [[TMP54:%.*]] = fneg fast float [[TMP53]] 3953; CHECK-NEXT: [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]]) 3954; CHECK-NEXT: [[TMP56:%.*]] = fptoui float [[TMP53]] to i32 3955; CHECK-NEXT: [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]]) 3956; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]]) 3957; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]] 3958; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0 3959; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]] 3960; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]] 3961; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]] 3962; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 65535 3963; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16 3964; CHECK-NEXT: [[TMP66:%.*]] = insertelement <3 x i16> [[TMP44]], i16 [[TMP65]], i64 2 3965; CHECK-NEXT: store <3 x i16> [[TMP66]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 3966; CHECK-NEXT: ret void 3967; 3968; GFX6-LABEL: urem_v3i16: 3969; GFX6: ; %bb.0: 3970; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3971; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 3972; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 3973; GFX6-NEXT: s_mov_b32 s8, 0xffff 3974; GFX6-NEXT: s_mov_b32 s7, 0xf000 3975; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3976; GFX6-NEXT: v_mov_b32_e32 v1, s2 3977; GFX6-NEXT: s_and_b32 s6, s0, s8 3978; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 3979; GFX6-NEXT: s_and_b32 s6, s2, s8 3980; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 3981; GFX6-NEXT: v_mov_b32_e32 v4, s0 3982; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 3983; GFX6-NEXT: v_alignbit_b32 v4, s1, v4, 16 3984; GFX6-NEXT: v_and_b32_e32 v5, s8, v4 3985; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16 3986; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 3987; GFX6-NEXT: v_trunc_f32_e32 v3, v3 3988; GFX6-NEXT: v_mad_f32 v2, -v3, v0, v2 3989; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 3990; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 3991; GFX6-NEXT: v_cvt_f32_u32_e32 v2, v5 3992; GFX6-NEXT: v_and_b32_e32 v3, s8, v1 3993; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc 3994; GFX6-NEXT: v_mul_lo_u32 v0, v0, s0 3995; GFX6-NEXT: s_and_b32 s0, s1, s8 3996; GFX6-NEXT: v_cvt_f32_u32_e32 v3, v3 3997; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 3998; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s0 3999; GFX6-NEXT: s_and_b32 s0, s3, s8 4000; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s0 4001; GFX6-NEXT: v_mul_f32_e32 v5, v3, v5 4002; GFX6-NEXT: v_trunc_f32_e32 v5, v5 4003; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v6 4004; GFX6-NEXT: v_mad_f32 v3, -v5, v2, v3 4005; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v5 4006; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 4007; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 4008; GFX6-NEXT: v_mul_f32_e32 v3, v7, v8 4009; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc 4010; GFX6-NEXT: v_trunc_f32_e32 v3, v3 4011; GFX6-NEXT: v_mul_lo_u32 v2, v2, v4 4012; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v3 4013; GFX6-NEXT: v_mad_f32 v3, -v3, v6, v7 4014; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v6 4015; GFX6-NEXT: s_mov_b32 s6, -1 4016; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 4017; GFX6-NEXT: v_mul_lo_u32 v3, v3, s1 4018; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 4019; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 4020; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 4021; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s3, v3 4022; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 4023; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 4024; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 4025; GFX6-NEXT: s_endpgm 4026; GFX9-LABEL: urem_v3i16: 4027; GFX9: ; %bb.0: 4028; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4029; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 4030; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 4031; GFX9-NEXT: s_mov_b32 s8, 0xffff 4032; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4033; GFX9-NEXT: s_and_b32 s0, s4, s8 4034; GFX9-NEXT: s_and_b32 s1, s6, s8 4035; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 4036; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s0 4037; GFX9-NEXT: s_lshr_b32 s6, s6, 16 4038; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 4039; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 4040; GFX9-NEXT: s_lshr_b32 s4, s4, 16 4041; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s4 4042; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v2 4043; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 4044; GFX9-NEXT: v_trunc_f32_e32 v3, v3 4045; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v3 4046; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 4047; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 4048; GFX9-NEXT: v_mul_f32_e32 v1, v4, v5 4049; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc 4050; GFX9-NEXT: v_mul_lo_u32 v0, v0, s1 4051; GFX9-NEXT: v_trunc_f32_e32 v1, v1 4052; GFX9-NEXT: s_and_b32 s1, s7, s8 4053; GFX9-NEXT: v_mad_f32 v3, -v1, v2, v4 4054; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s1 4055; GFX9-NEXT: s_and_b32 s5, s5, s8 4056; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s5 4057; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 4058; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 4059; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 4060; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 4061; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 4062; GFX9-NEXT: v_mul_f32_e32 v2, v5, v6 4063; GFX9-NEXT: v_trunc_f32_e32 v2, v2 4064; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v2 4065; GFX9-NEXT: v_mad_f32 v2, -v2, v4, v5 4066; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 4067; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 4068; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc 4069; GFX9-NEXT: v_mul_lo_u32 v2, v2, s1 4070; GFX9-NEXT: v_mov_b32_e32 v3, 0 4071; GFX9-NEXT: v_sub_u32_e32 v1, s4, v1 4072; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 4073; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 4074; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 4075; GFX9-NEXT: global_store_short v3, v2, s[2:3] offset:4 4076; GFX9-NEXT: global_store_dword v3, v0, s[2:3] 4077; GFX9-NEXT: s_endpgm 4078 %r = urem <3 x i16> %x, %y 4079 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 4080 ret void 4081} 4082 4083define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 4084; CHECK-LABEL: @sdiv_v3i16( 4085; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 4086; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 4087; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 4088; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 4089; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 4090; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 4091; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 4092; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 4093; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 4094; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 4095; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 4096; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 4097; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 4098; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 4099; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 4100; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 4101; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 4102; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 4103; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 4104; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 4105; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 16 4106; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 16 4107; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16 4108; CHECK-NEXT: [[TMP24:%.*]] = insertelement <3 x i16> undef, i16 [[TMP23]], i64 0 4109; CHECK-NEXT: [[TMP25:%.*]] = extractelement <3 x i16> [[X]], i64 1 4110; CHECK-NEXT: [[TMP26:%.*]] = extractelement <3 x i16> [[Y]], i64 1 4111; CHECK-NEXT: [[TMP27:%.*]] = sext i16 [[TMP25]] to i32 4112; CHECK-NEXT: [[TMP28:%.*]] = sext i16 [[TMP26]] to i32 4113; CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]] 4114; CHECK-NEXT: [[TMP30:%.*]] = ashr i32 [[TMP29]], 30 4115; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 1 4116; CHECK-NEXT: [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float 4117; CHECK-NEXT: [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float 4118; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 4119; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]] 4120; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]]) 4121; CHECK-NEXT: [[TMP37:%.*]] = fneg fast float [[TMP36]] 4122; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]]) 4123; CHECK-NEXT: [[TMP39:%.*]] = fptosi float [[TMP36]] to i32 4124; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]]) 4125; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 4126; CHECK-NEXT: [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]] 4127; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0 4128; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]] 4129; CHECK-NEXT: [[TMP45:%.*]] = shl i32 [[TMP44]], 16 4130; CHECK-NEXT: [[TMP46:%.*]] = ashr i32 [[TMP45]], 16 4131; CHECK-NEXT: [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16 4132; CHECK-NEXT: [[TMP48:%.*]] = insertelement <3 x i16> [[TMP24]], i16 [[TMP47]], i64 1 4133; CHECK-NEXT: [[TMP49:%.*]] = extractelement <3 x i16> [[X]], i64 2 4134; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x i16> [[Y]], i64 2 4135; CHECK-NEXT: [[TMP51:%.*]] = sext i16 [[TMP49]] to i32 4136; CHECK-NEXT: [[TMP52:%.*]] = sext i16 [[TMP50]] to i32 4137; CHECK-NEXT: [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]] 4138; CHECK-NEXT: [[TMP54:%.*]] = ashr i32 [[TMP53]], 30 4139; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 1 4140; CHECK-NEXT: [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float 4141; CHECK-NEXT: [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float 4142; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]]) 4143; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]] 4144; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]]) 4145; CHECK-NEXT: [[TMP61:%.*]] = fneg fast float [[TMP60]] 4146; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]]) 4147; CHECK-NEXT: [[TMP63:%.*]] = fptosi float [[TMP60]] to i32 4148; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]]) 4149; CHECK-NEXT: [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]]) 4150; CHECK-NEXT: [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]] 4151; CHECK-NEXT: [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0 4152; CHECK-NEXT: [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]] 4153; CHECK-NEXT: [[TMP69:%.*]] = shl i32 [[TMP68]], 16 4154; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 16 4155; CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16 4156; CHECK-NEXT: [[TMP72:%.*]] = insertelement <3 x i16> [[TMP48]], i16 [[TMP71]], i64 2 4157; CHECK-NEXT: store <3 x i16> [[TMP72]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 4158; CHECK-NEXT: ret void 4159; 4160; GFX6-LABEL: sdiv_v3i16: 4161; GFX6: ; %bb.0: 4162; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4163; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 4164; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 4165; GFX6-NEXT: s_mov_b32 s7, 0xf000 4166; GFX6-NEXT: s_mov_b32 s6, -1 4167; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4168; GFX6-NEXT: s_sext_i32_i16 s9, s2 4169; GFX6-NEXT: s_sext_i32_i16 s8, s0 4170; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s8 4171; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9 4172; GFX6-NEXT: s_xor_b32 s8, s9, s8 4173; GFX6-NEXT: s_ashr_i32 s0, s0, 16 4174; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 4175; GFX6-NEXT: s_ashr_i32 s8, s8, 30 4176; GFX6-NEXT: s_or_b32 s8, s8, 1 4177; GFX6-NEXT: v_mov_b32_e32 v3, s8 4178; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 4179; GFX6-NEXT: v_trunc_f32_e32 v2, v2 4180; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 4181; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 4182; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 4183; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s0 4184; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 4185; GFX6-NEXT: s_ashr_i32 s2, s2, 16 4186; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 4187; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s2 4188; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 4189; GFX6-NEXT: s_xor_b32 s0, s2, s0 4190; GFX6-NEXT: s_ashr_i32 s0, s0, 30 4191; GFX6-NEXT: s_or_b32 s0, s0, 1 4192; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 4193; GFX6-NEXT: v_trunc_f32_e32 v3, v3 4194; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 4195; GFX6-NEXT: v_mov_b32_e32 v4, s0 4196; GFX6-NEXT: s_sext_i32_i16 s0, s1 4197; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| 4198; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 4199; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 4200; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc 4201; GFX6-NEXT: s_sext_i32_i16 s1, s3 4202; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 4203; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s1 4204; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 4205; GFX6-NEXT: s_xor_b32 s0, s1, s0 4206; GFX6-NEXT: s_ashr_i32 s0, s0, 30 4207; GFX6-NEXT: s_or_b32 s0, s0, 1 4208; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 4209; GFX6-NEXT: v_trunc_f32_e32 v4, v4 4210; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 4211; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 4212; GFX6-NEXT: v_mov_b32_e32 v5, s0 4213; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| 4214; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc 4215; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 4216; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 4217; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 4218; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 4219; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 4220; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 4221; GFX6-NEXT: s_endpgm 4222; GFX9-LABEL: sdiv_v3i16: 4223; GFX9: ; %bb.0: 4224; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4225; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 4226; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 4227; GFX9-NEXT: v_mov_b32_e32 v1, 0 4228; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4229; GFX9-NEXT: s_sext_i32_i16 s1, s4 4230; GFX9-NEXT: s_sext_i32_i16 s0, s6 4231; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 4232; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 4233; GFX9-NEXT: s_xor_b32 s0, s1, s0 4234; GFX9-NEXT: s_ashr_i32 s0, s0, 30 4235; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 4236; GFX9-NEXT: s_or_b32 s8, s0, 1 4237; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 4238; GFX9-NEXT: v_trunc_f32_e32 v3, v3 4239; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 4240; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 4241; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 4242; GFX9-NEXT: s_cselect_b32 s0, s8, 0 4243; GFX9-NEXT: s_ashr_i32 s1, s6, 16 4244; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 4245; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 4246; GFX9-NEXT: s_ashr_i32 s4, s4, 16 4247; GFX9-NEXT: v_add_u32_e32 v2, s0, v3 4248; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s4 4249; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 4250; GFX9-NEXT: s_xor_b32 s0, s4, s1 4251; GFX9-NEXT: s_ashr_i32 s0, s0, 30 4252; GFX9-NEXT: s_or_b32 s4, s0, 1 4253; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 4254; GFX9-NEXT: v_trunc_f32_e32 v4, v4 4255; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 4256; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v0| 4257; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 4258; GFX9-NEXT: s_sext_i32_i16 s1, s7 4259; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 4260; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 4261; GFX9-NEXT: s_cselect_b32 s0, s4, 0 4262; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 4263; GFX9-NEXT: v_add_u32_e32 v3, s0, v4 4264; GFX9-NEXT: s_sext_i32_i16 s0, s5 4265; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s0 4266; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 4267; GFX9-NEXT: s_xor_b32 s0, s0, s1 4268; GFX9-NEXT: s_ashr_i32 s0, s0, 30 4269; GFX9-NEXT: s_or_b32 s4, s0, 1 4270; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 4271; GFX9-NEXT: v_trunc_f32_e32 v5, v5 4272; GFX9-NEXT: v_mad_f32 v4, -v5, v0, v4 4273; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 4274; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v0| 4275; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 4276; GFX9-NEXT: s_cselect_b32 s0, s4, 0 4277; GFX9-NEXT: v_add_u32_e32 v0, s0, v5 4278; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 4279; GFX9-NEXT: global_store_short v1, v0, s[2:3] offset:4 4280; GFX9-NEXT: global_store_dword v1, v2, s[2:3] 4281; GFX9-NEXT: s_endpgm 4282 %r = sdiv <3 x i16> %x, %y 4283 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 4284 ret void 4285} 4286 4287define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 4288; CHECK-LABEL: @srem_v3i16( 4289; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 4290; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 4291; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 4292; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 4293; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 4294; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 4295; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 4296; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 4297; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 4298; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 4299; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 4300; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 4301; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 4302; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 4303; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 4304; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 4305; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 4306; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 4307; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 4308; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 4309; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]] 4310; CHECK-NEXT: [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]] 4311; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 16 4312; CHECK-NEXT: [[TMP24:%.*]] = ashr i32 [[TMP23]], 16 4313; CHECK-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16 4314; CHECK-NEXT: [[TMP26:%.*]] = insertelement <3 x i16> undef, i16 [[TMP25]], i64 0 4315; CHECK-NEXT: [[TMP27:%.*]] = extractelement <3 x i16> [[X]], i64 1 4316; CHECK-NEXT: [[TMP28:%.*]] = extractelement <3 x i16> [[Y]], i64 1 4317; CHECK-NEXT: [[TMP29:%.*]] = sext i16 [[TMP27]] to i32 4318; CHECK-NEXT: [[TMP30:%.*]] = sext i16 [[TMP28]] to i32 4319; CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]] 4320; CHECK-NEXT: [[TMP32:%.*]] = ashr i32 [[TMP31]], 30 4321; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP32]], 1 4322; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float 4323; CHECK-NEXT: [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float 4324; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 4325; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]] 4326; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]]) 4327; CHECK-NEXT: [[TMP39:%.*]] = fneg fast float [[TMP38]] 4328; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]]) 4329; CHECK-NEXT: [[TMP41:%.*]] = fptosi float [[TMP38]] to i32 4330; CHECK-NEXT: [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]]) 4331; CHECK-NEXT: [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]]) 4332; CHECK-NEXT: [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]] 4333; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0 4334; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]] 4335; CHECK-NEXT: [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]] 4336; CHECK-NEXT: [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]] 4337; CHECK-NEXT: [[TMP49:%.*]] = shl i32 [[TMP48]], 16 4338; CHECK-NEXT: [[TMP50:%.*]] = ashr i32 [[TMP49]], 16 4339; CHECK-NEXT: [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16 4340; CHECK-NEXT: [[TMP52:%.*]] = insertelement <3 x i16> [[TMP26]], i16 [[TMP51]], i64 1 4341; CHECK-NEXT: [[TMP53:%.*]] = extractelement <3 x i16> [[X]], i64 2 4342; CHECK-NEXT: [[TMP54:%.*]] = extractelement <3 x i16> [[Y]], i64 2 4343; CHECK-NEXT: [[TMP55:%.*]] = sext i16 [[TMP53]] to i32 4344; CHECK-NEXT: [[TMP56:%.*]] = sext i16 [[TMP54]] to i32 4345; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]] 4346; CHECK-NEXT: [[TMP58:%.*]] = ashr i32 [[TMP57]], 30 4347; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 1 4348; CHECK-NEXT: [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float 4349; CHECK-NEXT: [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float 4350; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]]) 4351; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]] 4352; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]]) 4353; CHECK-NEXT: [[TMP65:%.*]] = fneg fast float [[TMP64]] 4354; CHECK-NEXT: [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]]) 4355; CHECK-NEXT: [[TMP67:%.*]] = fptosi float [[TMP64]] to i32 4356; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 4357; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]]) 4358; CHECK-NEXT: [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]] 4359; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0 4360; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]] 4361; CHECK-NEXT: [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]] 4362; CHECK-NEXT: [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]] 4363; CHECK-NEXT: [[TMP75:%.*]] = shl i32 [[TMP74]], 16 4364; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 [[TMP75]], 16 4365; CHECK-NEXT: [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16 4366; CHECK-NEXT: [[TMP78:%.*]] = insertelement <3 x i16> [[TMP52]], i16 [[TMP77]], i64 2 4367; CHECK-NEXT: store <3 x i16> [[TMP78]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 4368; CHECK-NEXT: ret void 4369; 4370; GFX6-LABEL: srem_v3i16: 4371; GFX6: ; %bb.0: 4372; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4373; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 4374; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 4375; GFX6-NEXT: s_mov_b32 s7, 0xf000 4376; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4377; GFX6-NEXT: s_sext_i32_i16 s8, s2 4378; GFX6-NEXT: s_sext_i32_i16 s6, s0 4379; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s6 4380; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s8 4381; GFX6-NEXT: s_xor_b32 s6, s8, s6 4382; GFX6-NEXT: s_ashr_i32 s6, s6, 30 4383; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 4384; GFX6-NEXT: s_or_b32 s6, s6, 1 4385; GFX6-NEXT: v_mov_b32_e32 v3, s6 4386; GFX6-NEXT: s_mov_b32 s6, -1 4387; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 4388; GFX6-NEXT: v_trunc_f32_e32 v2, v2 4389; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 4390; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 4391; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 4392; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 4393; GFX6-NEXT: v_mov_b32_e32 v1, s2 4394; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 4395; GFX6-NEXT: v_mov_b32_e32 v2, s0 4396; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 16 4397; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16 4398; GFX6-NEXT: v_cvt_f32_i32_e32 v4, v3 4399; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16 4400; GFX6-NEXT: v_bfe_i32 v5, v1, 0, 16 4401; GFX6-NEXT: v_cvt_f32_i32_e32 v6, v5 4402; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v4 4403; GFX6-NEXT: v_mul_lo_u32 v0, v0, s0 4404; GFX6-NEXT: v_xor_b32_e32 v3, v5, v3 4405; GFX6-NEXT: s_sext_i32_i16 s0, s1 4406; GFX6-NEXT: v_mul_f32_e32 v5, v6, v7 4407; GFX6-NEXT: v_trunc_f32_e32 v5, v5 4408; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 4409; GFX6-NEXT: v_mad_f32 v6, -v5, v4, v6 4410; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 4411; GFX6-NEXT: v_ashrrev_i32_e32 v3, 30, v3 4412; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, |v4| 4413; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s0 4414; GFX6-NEXT: v_or_b32_e32 v3, 1, v3 4415; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc 4416; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 4417; GFX6-NEXT: s_sext_i32_i16 s2, s3 4418; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 4419; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s2 4420; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v4 4421; GFX6-NEXT: s_xor_b32 s0, s2, s0 4422; GFX6-NEXT: s_ashr_i32 s0, s0, 30 4423; GFX6-NEXT: s_or_b32 s0, s0, 1 4424; GFX6-NEXT: v_mul_f32_e32 v5, v3, v5 4425; GFX6-NEXT: v_trunc_f32_e32 v5, v5 4426; GFX6-NEXT: v_mad_f32 v3, -v5, v4, v3 4427; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 4428; GFX6-NEXT: v_mov_b32_e32 v6, s0 4429; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v4| 4430; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc 4431; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 4432; GFX6-NEXT: v_mul_lo_u32 v3, v3, s1 4433; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 4434; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 4435; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 4436; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s3, v3 4437; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 4438; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 4439; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 4440; GFX6-NEXT: s_endpgm 4441; GFX9-LABEL: srem_v3i16: 4442; GFX9: ; %bb.0: 4443; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 4444; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 4445; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c 4446; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4447; GFX9-NEXT: s_sext_i32_i16 s8, s2 4448; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s8 4449; GFX9-NEXT: s_sext_i32_i16 s9, s6 4450; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s9 4451; GFX9-NEXT: s_xor_b32 s0, s9, s8 4452; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 4453; GFX9-NEXT: s_ashr_i32 s0, s0, 30 4454; GFX9-NEXT: s_or_b32 s10, s0, 1 4455; GFX9-NEXT: s_sext_i32_i16 s3, s3 4456; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 4457; GFX9-NEXT: v_trunc_f32_e32 v2, v2 4458; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 4459; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 4460; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 4461; GFX9-NEXT: s_cselect_b32 s0, s10, 0 4462; GFX9-NEXT: s_ashr_i32 s2, s2, 16 4463; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 4464; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 4465; GFX9-NEXT: s_ashr_i32 s6, s6, 16 4466; GFX9-NEXT: v_add_u32_e32 v1, s0, v2 4467; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s6 4468; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 4469; GFX9-NEXT: s_xor_b32 s0, s6, s2 4470; GFX9-NEXT: s_ashr_i32 s0, s0, 30 4471; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8 4472; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 4473; GFX9-NEXT: v_trunc_f32_e32 v3, v3 4474; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 4475; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 4476; GFX9-NEXT: s_or_b32 s8, s0, 1 4477; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 4478; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 4479; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s3 4480; GFX9-NEXT: s_cselect_b32 s0, s8, 0 4481; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 4482; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2 4483; GFX9-NEXT: s_sext_i32_i16 s2, s7 4484; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s2 4485; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2 4486; GFX9-NEXT: s_xor_b32 s0, s2, s3 4487; GFX9-NEXT: s_ashr_i32 s0, s0, 30 4488; GFX9-NEXT: s_or_b32 s7, s0, 1 4489; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 4490; GFX9-NEXT: v_trunc_f32_e32 v4, v4 4491; GFX9-NEXT: v_mad_f32 v3, -v4, v2, v3 4492; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 4493; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v2| 4494; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 4495; GFX9-NEXT: s_cselect_b32 s0, s7, 0 4496; GFX9-NEXT: v_add_u32_e32 v2, s0, v4 4497; GFX9-NEXT: v_mul_lo_u32 v2, v2, s3 4498; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 4499; GFX9-NEXT: v_mov_b32_e32 v3, 0 4500; GFX9-NEXT: v_sub_u32_e32 v0, s6, v0 4501; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 4502; GFX9-NEXT: v_sub_u32_e32 v2, s2, v2 4503; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 4504; GFX9-NEXT: global_store_short v3, v2, s[4:5] offset:4 4505; GFX9-NEXT: global_store_dword v3, v0, s[4:5] 4506; GFX9-NEXT: s_endpgm 4507 %r = srem <3 x i16> %x, %y 4508 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 4509 ret void 4510} 4511 4512define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 4513; CHECK-LABEL: @udiv_v3i15( 4514; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 4515; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 4516; CHECK-NEXT: [[TMP3:%.*]] = zext i15 [[TMP1]] to i32 4517; CHECK-NEXT: [[TMP4:%.*]] = zext i15 [[TMP2]] to i32 4518; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 4519; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 4520; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 4521; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 4522; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 4523; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 4524; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 4525; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 4526; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 4527; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 4528; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 4529; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 4530; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 4531; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 32767 4532; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i15 4533; CHECK-NEXT: [[TMP20:%.*]] = insertelement <3 x i15> undef, i15 [[TMP19]], i64 0 4534; CHECK-NEXT: [[TMP21:%.*]] = extractelement <3 x i15> [[X]], i64 1 4535; CHECK-NEXT: [[TMP22:%.*]] = extractelement <3 x i15> [[Y]], i64 1 4536; CHECK-NEXT: [[TMP23:%.*]] = zext i15 [[TMP21]] to i32 4537; CHECK-NEXT: [[TMP24:%.*]] = zext i15 [[TMP22]] to i32 4538; CHECK-NEXT: [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float 4539; CHECK-NEXT: [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float 4540; CHECK-NEXT: [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]]) 4541; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]] 4542; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]]) 4543; CHECK-NEXT: [[TMP30:%.*]] = fneg fast float [[TMP29]] 4544; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]]) 4545; CHECK-NEXT: [[TMP32:%.*]] = fptoui float [[TMP29]] to i32 4546; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]]) 4547; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]]) 4548; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]] 4549; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0 4550; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]] 4551; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 32767 4552; CHECK-NEXT: [[TMP39:%.*]] = trunc i32 [[TMP38]] to i15 4553; CHECK-NEXT: [[TMP40:%.*]] = insertelement <3 x i15> [[TMP20]], i15 [[TMP39]], i64 1 4554; CHECK-NEXT: [[TMP41:%.*]] = extractelement <3 x i15> [[X]], i64 2 4555; CHECK-NEXT: [[TMP42:%.*]] = extractelement <3 x i15> [[Y]], i64 2 4556; CHECK-NEXT: [[TMP43:%.*]] = zext i15 [[TMP41]] to i32 4557; CHECK-NEXT: [[TMP44:%.*]] = zext i15 [[TMP42]] to i32 4558; CHECK-NEXT: [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float 4559; CHECK-NEXT: [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float 4560; CHECK-NEXT: [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]]) 4561; CHECK-NEXT: [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]] 4562; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]]) 4563; CHECK-NEXT: [[TMP50:%.*]] = fneg fast float [[TMP49]] 4564; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]]) 4565; CHECK-NEXT: [[TMP52:%.*]] = fptoui float [[TMP49]] to i32 4566; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]]) 4567; CHECK-NEXT: [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]]) 4568; CHECK-NEXT: [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]] 4569; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0 4570; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]] 4571; CHECK-NEXT: [[TMP58:%.*]] = and i32 [[TMP57]], 32767 4572; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i15 4573; CHECK-NEXT: [[TMP60:%.*]] = insertelement <3 x i15> [[TMP40]], i15 [[TMP59]], i64 2 4574; CHECK-NEXT: store <3 x i15> [[TMP60]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 4575; CHECK-NEXT: ret void 4576; 4577; GFX6-LABEL: udiv_v3i15: 4578; GFX6: ; %bb.0: 4579; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4580; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 4581; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 4582; GFX6-NEXT: s_mov_b32 s7, 0xf000 4583; GFX6-NEXT: s_mov_b32 s6, -1 4584; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4585; GFX6-NEXT: v_mov_b32_e32 v0, s2 4586; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 4587; GFX6-NEXT: s_movk_i32 s3, 0x7fff 4588; GFX6-NEXT: s_and_b32 s9, s0, s3 4589; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 4590; GFX6-NEXT: v_mov_b32_e32 v2, s0 4591; GFX6-NEXT: s_and_b32 s8, s2, s3 4592; GFX6-NEXT: s_bfe_u32 s0, s0, 0xf000f 4593; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s0 4594; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s8 4595; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 4596; GFX6-NEXT: s_bfe_u32 s2, s2, 0xf000f 4597; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 4598; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s2 4599; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 4600; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 4601; GFX6-NEXT: v_and_b32_e32 v2, s3, v2 4602; GFX6-NEXT: v_trunc_f32_e32 v4, v4 4603; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 4604; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 4605; GFX6-NEXT: v_cvt_f32_u32_e32 v2, v2 4606; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 4607; GFX6-NEXT: v_mul_f32_e32 v1, v6, v7 4608; GFX6-NEXT: v_and_b32_e32 v0, s3, v0 4609; GFX6-NEXT: v_trunc_f32_e32 v1, v1 4610; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 4611; GFX6-NEXT: v_mad_f32 v4, -v1, v5, v6 4612; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 4613; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0 4614; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v2 4615; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v5 4616; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 4617; GFX6-NEXT: v_mul_f32_e32 v1, v0, v6 4618; GFX6-NEXT: v_trunc_f32_e32 v1, v1 4619; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v1 4620; GFX6-NEXT: v_mad_f32 v0, -v1, v2, v0 4621; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v2 4622; GFX6-NEXT: v_and_b32_e32 v2, s3, v3 4623; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc 4624; GFX6-NEXT: v_and_b32_e32 v3, s3, v4 4625; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 4626; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 4627; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 4628; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 4629; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 4630; GFX6-NEXT: s_waitcnt expcnt(0) 4631; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 4632; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 4633; GFX6-NEXT: s_endpgm 4634; GFX9-LABEL: udiv_v3i15: 4635; GFX9: ; %bb.0: 4636; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4637; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 4638; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 4639; GFX9-NEXT: s_movk_i32 s8, 0x7fff 4640; GFX9-NEXT: v_mov_b32_e32 v2, 0 4641; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4642; GFX9-NEXT: s_and_b32 s0, s4, s8 4643; GFX9-NEXT: s_and_b32 s1, s6, s8 4644; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s1 4645; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0 4646; GFX9-NEXT: s_bfe_u32 s0, s6, 0xf000f 4647; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 4648; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 4649; GFX9-NEXT: v_mov_b32_e32 v3, s6 4650; GFX9-NEXT: s_bfe_u32 s1, s4, 0xf000f 4651; GFX9-NEXT: v_alignbit_b32 v3, s7, v3, 30 4652; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 4653; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 4654; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 4655; GFX9-NEXT: v_and_b32_e32 v3, s8, v3 4656; GFX9-NEXT: v_trunc_f32_e32 v5, v5 4657; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4 4658; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 4659; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v3 4660; GFX9-NEXT: v_mov_b32_e32 v0, s4 4661; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 4662; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 30 4663; GFX9-NEXT: v_mul_f32_e32 v1, v7, v8 4664; GFX9-NEXT: v_and_b32_e32 v0, s8, v0 4665; GFX9-NEXT: v_trunc_f32_e32 v1, v1 4666; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc 4667; GFX9-NEXT: v_mad_f32 v5, -v1, v6, v7 4668; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 4669; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 4670; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v3 4671; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v6 4672; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc 4673; GFX9-NEXT: v_mul_f32_e32 v1, v0, v7 4674; GFX9-NEXT: v_trunc_f32_e32 v1, v1 4675; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v1 4676; GFX9-NEXT: v_mad_f32 v0, -v1, v3, v0 4677; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v3 4678; GFX9-NEXT: v_and_b32_e32 v3, s8, v4 4679; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc 4680; GFX9-NEXT: v_and_b32_e32 v4, s8, v5 4681; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 4682; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 4683; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 4684; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 4685; GFX9-NEXT: global_store_dword v2, v0, s[2:3] 4686; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 4687; GFX9-NEXT: global_store_short v2, v0, s[2:3] offset:4 4688; GFX9-NEXT: s_endpgm 4689 %r = udiv <3 x i15> %x, %y 4690 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 4691 ret void 4692} 4693 4694define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 4695; CHECK-LABEL: @urem_v3i15( 4696; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 4697; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 4698; CHECK-NEXT: [[TMP3:%.*]] = zext i15 [[TMP1]] to i32 4699; CHECK-NEXT: [[TMP4:%.*]] = zext i15 [[TMP2]] to i32 4700; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 4701; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 4702; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 4703; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 4704; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 4705; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 4706; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 4707; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 4708; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 4709; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 4710; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 4711; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 4712; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 4713; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]] 4714; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]] 4715; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 32767 4716; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i15 4717; CHECK-NEXT: [[TMP22:%.*]] = insertelement <3 x i15> undef, i15 [[TMP21]], i64 0 4718; CHECK-NEXT: [[TMP23:%.*]] = extractelement <3 x i15> [[X]], i64 1 4719; CHECK-NEXT: [[TMP24:%.*]] = extractelement <3 x i15> [[Y]], i64 1 4720; CHECK-NEXT: [[TMP25:%.*]] = zext i15 [[TMP23]] to i32 4721; CHECK-NEXT: [[TMP26:%.*]] = zext i15 [[TMP24]] to i32 4722; CHECK-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float 4723; CHECK-NEXT: [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float 4724; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]]) 4725; CHECK-NEXT: [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]] 4726; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]]) 4727; CHECK-NEXT: [[TMP32:%.*]] = fneg fast float [[TMP31]] 4728; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]]) 4729; CHECK-NEXT: [[TMP34:%.*]] = fptoui float [[TMP31]] to i32 4730; CHECK-NEXT: [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 4731; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]]) 4732; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]] 4733; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0 4734; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]] 4735; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]] 4736; CHECK-NEXT: [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]] 4737; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], 32767 4738; CHECK-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i15 4739; CHECK-NEXT: [[TMP44:%.*]] = insertelement <3 x i15> [[TMP22]], i15 [[TMP43]], i64 1 4740; CHECK-NEXT: [[TMP45:%.*]] = extractelement <3 x i15> [[X]], i64 2 4741; CHECK-NEXT: [[TMP46:%.*]] = extractelement <3 x i15> [[Y]], i64 2 4742; CHECK-NEXT: [[TMP47:%.*]] = zext i15 [[TMP45]] to i32 4743; CHECK-NEXT: [[TMP48:%.*]] = zext i15 [[TMP46]] to i32 4744; CHECK-NEXT: [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float 4745; CHECK-NEXT: [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float 4746; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]]) 4747; CHECK-NEXT: [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]] 4748; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]]) 4749; CHECK-NEXT: [[TMP54:%.*]] = fneg fast float [[TMP53]] 4750; CHECK-NEXT: [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]]) 4751; CHECK-NEXT: [[TMP56:%.*]] = fptoui float [[TMP53]] to i32 4752; CHECK-NEXT: [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]]) 4753; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]]) 4754; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]] 4755; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0 4756; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]] 4757; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]] 4758; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]] 4759; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 32767 4760; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i15 4761; CHECK-NEXT: [[TMP66:%.*]] = insertelement <3 x i15> [[TMP44]], i15 [[TMP65]], i64 2 4762; CHECK-NEXT: store <3 x i15> [[TMP66]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 4763; CHECK-NEXT: ret void 4764; 4765; GFX6-LABEL: urem_v3i15: 4766; GFX6: ; %bb.0: 4767; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4768; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 4769; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 4770; GFX6-NEXT: s_mov_b32 s7, 0xf000 4771; GFX6-NEXT: s_mov_b32 s6, -1 4772; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4773; GFX6-NEXT: v_mov_b32_e32 v0, s2 4774; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 4775; GFX6-NEXT: s_movk_i32 s3, 0x7fff 4776; GFX6-NEXT: s_and_b32 s10, s0, s3 4777; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s10 4778; GFX6-NEXT: s_and_b32 s9, s2, s3 4779; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s9 4780; GFX6-NEXT: v_mov_b32_e32 v2, s0 4781; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 4782; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 4783; GFX6-NEXT: s_bfe_u32 s1, s0, 0xf000f 4784; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s1 4785; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 4786; GFX6-NEXT: v_trunc_f32_e32 v4, v4 4787; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 4788; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 4789; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 4790; GFX6-NEXT: s_bfe_u32 s10, s2, 0xf000f 4791; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s10 4792; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc 4793; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 4794; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 4795; GFX6-NEXT: v_and_b32_e32 v2, s3, v2 4796; GFX6-NEXT: v_and_b32_e32 v0, s3, v0 4797; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s2, v1 4798; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 4799; GFX6-NEXT: v_cvt_f32_u32_e32 v4, v2 4800; GFX6-NEXT: v_cvt_f32_u32_e32 v7, v0 4801; GFX6-NEXT: v_trunc_f32_e32 v1, v1 4802; GFX6-NEXT: v_mad_f32 v3, -v1, v5, v3 4803; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v4 4804; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 4805; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 4806; GFX6-NEXT: s_lshr_b32 s0, s0, 15 4807; GFX6-NEXT: v_mul_f32_e32 v3, v7, v8 4808; GFX6-NEXT: v_trunc_f32_e32 v3, v3 4809; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v3 4810; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 4811; GFX6-NEXT: v_mad_f32 v3, -v3, v4, v7 4812; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 4813; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 4814; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 4815; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 4816; GFX6-NEXT: s_lshr_b32 s8, s2, 15 4817; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s8, v1 4818; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, v2, v0 4819; GFX6-NEXT: v_and_b32_e32 v3, s3, v3 4820; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 4821; GFX6-NEXT: v_and_b32_e32 v2, s3, v6 4822; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 4823; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 4824; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 4825; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 4826; GFX6-NEXT: s_waitcnt expcnt(0) 4827; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 4828; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 4829; GFX6-NEXT: s_endpgm 4830; GFX9-LABEL: urem_v3i15: 4831; GFX9: ; %bb.0: 4832; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4833; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 4834; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 4835; GFX9-NEXT: s_movk_i32 s8, 0x7fff 4836; GFX9-NEXT: v_mov_b32_e32 v2, 0 4837; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4838; GFX9-NEXT: v_mov_b32_e32 v0, s4 4839; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 30 4840; GFX9-NEXT: s_and_b32 s5, s6, s8 4841; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 4842; GFX9-NEXT: s_and_b32 s0, s4, s8 4843; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0 4844; GFX9-NEXT: s_bfe_u32 s5, s6, 0xf000f 4845; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 4846; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s5 4847; GFX9-NEXT: v_mov_b32_e32 v3, s6 4848; GFX9-NEXT: v_alignbit_b32 v3, s7, v3, 30 4849; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 4850; GFX9-NEXT: v_trunc_f32_e32 v5, v5 4851; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4 4852; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 4853; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 4854; GFX9-NEXT: s_bfe_u32 s1, s4, 0xf000f 4855; GFX9-NEXT: v_and_b32_e32 v3, s8, v3 4856; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc 4857; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v3 4858; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 4859; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 4860; GFX9-NEXT: v_and_b32_e32 v0, s8, v0 4861; GFX9-NEXT: v_rcp_iflag_f32_e32 v9, v5 4862; GFX9-NEXT: s_lshr_b32 s0, s6, 15 4863; GFX9-NEXT: v_mul_f32_e32 v4, v7, v8 4864; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v0 4865; GFX9-NEXT: v_trunc_f32_e32 v4, v4 4866; GFX9-NEXT: v_mad_f32 v7, -v4, v6, v7 4867; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, v6 4868; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 4869; GFX9-NEXT: v_mul_f32_e32 v6, v8, v9 4870; GFX9-NEXT: v_trunc_f32_e32 v6, v6 4871; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v6 4872; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc 4873; GFX9-NEXT: v_mad_f32 v6, -v6, v5, v8 4874; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 4875; GFX9-NEXT: v_mul_lo_u32 v4, v4, s0 4876; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc 4877; GFX9-NEXT: v_mul_lo_u32 v3, v5, v3 4878; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 4879; GFX9-NEXT: s_lshr_b32 s0, s4, 15 4880; GFX9-NEXT: v_sub_u32_e32 v4, s0, v4 4881; GFX9-NEXT: v_and_b32_e32 v4, s8, v4 4882; GFX9-NEXT: v_sub_u32_e32 v5, s4, v1 4883; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 4884; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 4885; GFX9-NEXT: v_and_b32_e32 v3, s8, v5 4886; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 4887; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 4888; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 4889; GFX9-NEXT: global_store_dword v2, v0, s[2:3] 4890; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 4891; GFX9-NEXT: global_store_short v2, v0, s[2:3] offset:4 4892; GFX9-NEXT: s_endpgm 4893 %r = urem <3 x i15> %x, %y 4894 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 4895 ret void 4896} 4897 4898define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 4899; CHECK-LABEL: @sdiv_v3i15( 4900; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 4901; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 4902; CHECK-NEXT: [[TMP3:%.*]] = sext i15 [[TMP1]] to i32 4903; CHECK-NEXT: [[TMP4:%.*]] = sext i15 [[TMP2]] to i32 4904; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 4905; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 4906; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 4907; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 4908; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 4909; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 4910; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 4911; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 4912; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 4913; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 4914; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 4915; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 4916; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 4917; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 4918; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 4919; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 4920; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 17 4921; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 17 4922; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i15 4923; CHECK-NEXT: [[TMP24:%.*]] = insertelement <3 x i15> undef, i15 [[TMP23]], i64 0 4924; CHECK-NEXT: [[TMP25:%.*]] = extractelement <3 x i15> [[X]], i64 1 4925; CHECK-NEXT: [[TMP26:%.*]] = extractelement <3 x i15> [[Y]], i64 1 4926; CHECK-NEXT: [[TMP27:%.*]] = sext i15 [[TMP25]] to i32 4927; CHECK-NEXT: [[TMP28:%.*]] = sext i15 [[TMP26]] to i32 4928; CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]] 4929; CHECK-NEXT: [[TMP30:%.*]] = ashr i32 [[TMP29]], 30 4930; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 1 4931; CHECK-NEXT: [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float 4932; CHECK-NEXT: [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float 4933; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 4934; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]] 4935; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]]) 4936; CHECK-NEXT: [[TMP37:%.*]] = fneg fast float [[TMP36]] 4937; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]]) 4938; CHECK-NEXT: [[TMP39:%.*]] = fptosi float [[TMP36]] to i32 4939; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]]) 4940; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 4941; CHECK-NEXT: [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]] 4942; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0 4943; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]] 4944; CHECK-NEXT: [[TMP45:%.*]] = shl i32 [[TMP44]], 17 4945; CHECK-NEXT: [[TMP46:%.*]] = ashr i32 [[TMP45]], 17 4946; CHECK-NEXT: [[TMP47:%.*]] = trunc i32 [[TMP46]] to i15 4947; CHECK-NEXT: [[TMP48:%.*]] = insertelement <3 x i15> [[TMP24]], i15 [[TMP47]], i64 1 4948; CHECK-NEXT: [[TMP49:%.*]] = extractelement <3 x i15> [[X]], i64 2 4949; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x i15> [[Y]], i64 2 4950; CHECK-NEXT: [[TMP51:%.*]] = sext i15 [[TMP49]] to i32 4951; CHECK-NEXT: [[TMP52:%.*]] = sext i15 [[TMP50]] to i32 4952; CHECK-NEXT: [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]] 4953; CHECK-NEXT: [[TMP54:%.*]] = ashr i32 [[TMP53]], 30 4954; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 1 4955; CHECK-NEXT: [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float 4956; CHECK-NEXT: [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float 4957; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]]) 4958; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]] 4959; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]]) 4960; CHECK-NEXT: [[TMP61:%.*]] = fneg fast float [[TMP60]] 4961; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]]) 4962; CHECK-NEXT: [[TMP63:%.*]] = fptosi float [[TMP60]] to i32 4963; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]]) 4964; CHECK-NEXT: [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]]) 4965; CHECK-NEXT: [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]] 4966; CHECK-NEXT: [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0 4967; CHECK-NEXT: [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]] 4968; CHECK-NEXT: [[TMP69:%.*]] = shl i32 [[TMP68]], 17 4969; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 17 4970; CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i15 4971; CHECK-NEXT: [[TMP72:%.*]] = insertelement <3 x i15> [[TMP48]], i15 [[TMP71]], i64 2 4972; CHECK-NEXT: store <3 x i15> [[TMP72]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 4973; CHECK-NEXT: ret void 4974; 4975; GFX6-LABEL: sdiv_v3i15: 4976; GFX6: ; %bb.0: 4977; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4978; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 4979; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 4980; GFX6-NEXT: s_mov_b32 s7, 0xf000 4981; GFX6-NEXT: s_mov_b32 s6, -1 4982; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4983; GFX6-NEXT: v_mov_b32_e32 v0, s2 4984; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 4985; GFX6-NEXT: s_bfe_i32 s3, s0, 0xf0000 4986; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s3 4987; GFX6-NEXT: v_mov_b32_e32 v1, s0 4988; GFX6-NEXT: v_alignbit_b32 v1, s1, v1, 30 4989; GFX6-NEXT: s_bfe_i32 s1, s2, 0xf0000 4990; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s1 4991; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 4992; GFX6-NEXT: s_xor_b32 s1, s1, s3 4993; GFX6-NEXT: s_bfe_i32 s0, s0, 0xf000f 4994; GFX6-NEXT: s_ashr_i32 s1, s1, 30 4995; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 4996; GFX6-NEXT: v_trunc_f32_e32 v4, v4 4997; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 4998; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| 4999; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 5000; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s0 5001; GFX6-NEXT: s_or_b32 s1, s1, 1 5002; GFX6-NEXT: v_mov_b32_e32 v5, s1 5003; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc 5004; GFX6-NEXT: s_bfe_i32 s1, s2, 0xf000f 5005; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 5006; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s1 5007; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 5008; GFX6-NEXT: s_xor_b32 s0, s1, s0 5009; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 15 5010; GFX6-NEXT: s_ashr_i32 s0, s0, 30 5011; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 5012; GFX6-NEXT: v_trunc_f32_e32 v5, v5 5013; GFX6-NEXT: v_mad_f32 v4, -v5, v3, v4 5014; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v3| 5015; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 5016; GFX6-NEXT: v_cvt_f32_i32_e32 v4, v1 5017; GFX6-NEXT: s_or_b32 s0, s0, 1 5018; GFX6-NEXT: v_mov_b32_e32 v6, s0 5019; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc 5020; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15 5021; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 5022; GFX6-NEXT: v_cvt_f32_i32_e32 v5, v0 5023; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 5024; GFX6-NEXT: v_xor_b32_e32 v0, v0, v1 5025; GFX6-NEXT: v_ashrrev_i32_e32 v0, 30, v0 5026; GFX6-NEXT: v_or_b32_e32 v0, 1, v0 5027; GFX6-NEXT: v_mul_f32_e32 v1, v5, v6 5028; GFX6-NEXT: v_trunc_f32_e32 v1, v1 5029; GFX6-NEXT: v_mad_f32 v5, -v1, v4, v5 5030; GFX6-NEXT: v_cvt_i32_f32_e32 v1, v1 5031; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v4| 5032; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 5033; GFX6-NEXT: s_movk_i32 s0, 0x7fff 5034; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 5035; GFX6-NEXT: v_and_b32_e32 v3, s0, v3 5036; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 5037; GFX6-NEXT: v_and_b32_e32 v2, s0, v2 5038; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 5039; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 5040; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 5041; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 5042; GFX6-NEXT: s_waitcnt expcnt(0) 5043; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 5044; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 5045; GFX6-NEXT: s_endpgm 5046; GFX9-LABEL: sdiv_v3i15: 5047; GFX9: ; %bb.0: 5048; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5049; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 5050; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 5051; GFX9-NEXT: v_mov_b32_e32 v2, 0 5052; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5053; GFX9-NEXT: s_bfe_i32 s1, s4, 0xf0000 5054; GFX9-NEXT: s_bfe_i32 s0, s6, 0xf0000 5055; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 5056; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s1 5057; GFX9-NEXT: s_xor_b32 s0, s1, s0 5058; GFX9-NEXT: v_mov_b32_e32 v0, s4 5059; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 5060; GFX9-NEXT: s_ashr_i32 s0, s0, 30 5061; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 30 5062; GFX9-NEXT: s_or_b32 s5, s0, 1 5063; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 5064; GFX9-NEXT: v_trunc_f32_e32 v5, v5 5065; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 5066; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| 5067; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 5068; GFX9-NEXT: s_cselect_b32 s0, s5, 0 5069; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 5070; GFX9-NEXT: s_bfe_i32 s1, s6, 0xf000f 5071; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s1 5072; GFX9-NEXT: v_mov_b32_e32 v1, s6 5073; GFX9-NEXT: v_add_u32_e32 v4, s0, v5 5074; GFX9-NEXT: s_bfe_i32 s0, s4, 0xf000f 5075; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 5076; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 5077; GFX9-NEXT: v_alignbit_b32 v1, s7, v1, 30 5078; GFX9-NEXT: s_xor_b32 s0, s0, s1 5079; GFX9-NEXT: s_ashr_i32 s0, s0, 30 5080; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 5081; GFX9-NEXT: v_trunc_f32_e32 v6, v6 5082; GFX9-NEXT: v_mad_f32 v5, -v6, v3, v5 5083; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 5084; GFX9-NEXT: s_or_b32 s4, s0, 1 5085; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v3| 5086; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v1 5087; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 5088; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 5089; GFX9-NEXT: s_cselect_b32 s0, s4, 0 5090; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 15 5091; GFX9-NEXT: v_add_u32_e32 v5, s0, v6 5092; GFX9-NEXT: v_cvt_f32_i32_e32 v6, v0 5093; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v3 5094; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1 5095; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v0 5096; GFX9-NEXT: v_or_b32_e32 v0, 1, v0 5097; GFX9-NEXT: v_mul_f32_e32 v1, v6, v7 5098; GFX9-NEXT: v_trunc_f32_e32 v1, v1 5099; GFX9-NEXT: v_cvt_i32_f32_e32 v7, v1 5100; GFX9-NEXT: v_mad_f32 v1, -v1, v3, v6 5101; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v3| 5102; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 5103; GFX9-NEXT: s_movk_i32 s0, 0x7fff 5104; GFX9-NEXT: v_add_u32_e32 v0, v7, v0 5105; GFX9-NEXT: v_and_b32_e32 v3, s0, v4 5106; GFX9-NEXT: v_and_b32_e32 v4, s0, v5 5107; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 5108; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 5109; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 5110; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 5111; GFX9-NEXT: global_store_dword v2, v0, s[2:3] 5112; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 5113; GFX9-NEXT: global_store_short v2, v0, s[2:3] offset:4 5114; GFX9-NEXT: s_endpgm 5115 %r = sdiv <3 x i15> %x, %y 5116 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 5117 ret void 5118} 5119 5120define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 5121; CHECK-LABEL: @srem_v3i15( 5122; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 5123; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 5124; CHECK-NEXT: [[TMP3:%.*]] = sext i15 [[TMP1]] to i32 5125; CHECK-NEXT: [[TMP4:%.*]] = sext i15 [[TMP2]] to i32 5126; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 5127; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 5128; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 5129; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 5130; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 5131; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 5132; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 5133; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 5134; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 5135; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 5136; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 5137; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 5138; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 5139; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 5140; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 5141; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 5142; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]] 5143; CHECK-NEXT: [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]] 5144; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 17 5145; CHECK-NEXT: [[TMP24:%.*]] = ashr i32 [[TMP23]], 17 5146; CHECK-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i15 5147; CHECK-NEXT: [[TMP26:%.*]] = insertelement <3 x i15> undef, i15 [[TMP25]], i64 0 5148; CHECK-NEXT: [[TMP27:%.*]] = extractelement <3 x i15> [[X]], i64 1 5149; CHECK-NEXT: [[TMP28:%.*]] = extractelement <3 x i15> [[Y]], i64 1 5150; CHECK-NEXT: [[TMP29:%.*]] = sext i15 [[TMP27]] to i32 5151; CHECK-NEXT: [[TMP30:%.*]] = sext i15 [[TMP28]] to i32 5152; CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]] 5153; CHECK-NEXT: [[TMP32:%.*]] = ashr i32 [[TMP31]], 30 5154; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP32]], 1 5155; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float 5156; CHECK-NEXT: [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float 5157; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 5158; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]] 5159; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]]) 5160; CHECK-NEXT: [[TMP39:%.*]] = fneg fast float [[TMP38]] 5161; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]]) 5162; CHECK-NEXT: [[TMP41:%.*]] = fptosi float [[TMP38]] to i32 5163; CHECK-NEXT: [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]]) 5164; CHECK-NEXT: [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]]) 5165; CHECK-NEXT: [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]] 5166; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0 5167; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]] 5168; CHECK-NEXT: [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]] 5169; CHECK-NEXT: [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]] 5170; CHECK-NEXT: [[TMP49:%.*]] = shl i32 [[TMP48]], 17 5171; CHECK-NEXT: [[TMP50:%.*]] = ashr i32 [[TMP49]], 17 5172; CHECK-NEXT: [[TMP51:%.*]] = trunc i32 [[TMP50]] to i15 5173; CHECK-NEXT: [[TMP52:%.*]] = insertelement <3 x i15> [[TMP26]], i15 [[TMP51]], i64 1 5174; CHECK-NEXT: [[TMP53:%.*]] = extractelement <3 x i15> [[X]], i64 2 5175; CHECK-NEXT: [[TMP54:%.*]] = extractelement <3 x i15> [[Y]], i64 2 5176; CHECK-NEXT: [[TMP55:%.*]] = sext i15 [[TMP53]] to i32 5177; CHECK-NEXT: [[TMP56:%.*]] = sext i15 [[TMP54]] to i32 5178; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]] 5179; CHECK-NEXT: [[TMP58:%.*]] = ashr i32 [[TMP57]], 30 5180; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 1 5181; CHECK-NEXT: [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float 5182; CHECK-NEXT: [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float 5183; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]]) 5184; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]] 5185; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]]) 5186; CHECK-NEXT: [[TMP65:%.*]] = fneg fast float [[TMP64]] 5187; CHECK-NEXT: [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]]) 5188; CHECK-NEXT: [[TMP67:%.*]] = fptosi float [[TMP64]] to i32 5189; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 5190; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]]) 5191; CHECK-NEXT: [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]] 5192; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0 5193; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]] 5194; CHECK-NEXT: [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]] 5195; CHECK-NEXT: [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]] 5196; CHECK-NEXT: [[TMP75:%.*]] = shl i32 [[TMP74]], 17 5197; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 [[TMP75]], 17 5198; CHECK-NEXT: [[TMP77:%.*]] = trunc i32 [[TMP76]] to i15 5199; CHECK-NEXT: [[TMP78:%.*]] = insertelement <3 x i15> [[TMP52]], i15 [[TMP77]], i64 2 5200; CHECK-NEXT: store <3 x i15> [[TMP78]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 5201; CHECK-NEXT: ret void 5202; 5203; GFX6-LABEL: srem_v3i15: 5204; GFX6: ; %bb.0: 5205; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5206; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 5207; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 5208; GFX6-NEXT: s_mov_b32 s7, 0xf000 5209; GFX6-NEXT: s_mov_b32 s6, -1 5210; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5211; GFX6-NEXT: v_mov_b32_e32 v0, s2 5212; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 5213; GFX6-NEXT: s_movk_i32 s3, 0x7fff 5214; GFX6-NEXT: s_and_b32 s11, s0, s3 5215; GFX6-NEXT: s_bfe_i32 s11, s11, 0xf0000 5216; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s11 5217; GFX6-NEXT: s_and_b32 s9, s2, s3 5218; GFX6-NEXT: s_bfe_i32 s9, s9, 0xf0000 5219; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s9 5220; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 5221; GFX6-NEXT: s_xor_b32 s9, s9, s11 5222; GFX6-NEXT: s_ashr_i32 s9, s9, 30 5223; GFX6-NEXT: s_or_b32 s9, s9, 1 5224; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 5225; GFX6-NEXT: v_trunc_f32_e32 v4, v4 5226; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 5227; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 5228; GFX6-NEXT: v_mov_b32_e32 v5, s9 5229; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| 5230; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc 5231; GFX6-NEXT: v_mov_b32_e32 v1, s0 5232; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 5233; GFX6-NEXT: s_bfe_u32 s12, s0, 0xf000f 5234; GFX6-NEXT: v_alignbit_b32 v1, s1, v1, 30 5235; GFX6-NEXT: v_mul_lo_u32 v2, v2, s0 5236; GFX6-NEXT: s_lshr_b32 s1, s0, 15 5237; GFX6-NEXT: s_bfe_i32 s0, s12, 0xf0000 5238; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s0 5239; GFX6-NEXT: s_bfe_u32 s10, s2, 0xf000f 5240; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 5241; GFX6-NEXT: s_lshr_b32 s8, s2, 15 5242; GFX6-NEXT: s_bfe_i32 s2, s10, 0xf0000 5243; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s2 5244; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 5245; GFX6-NEXT: s_xor_b32 s0, s2, s0 5246; GFX6-NEXT: s_ashr_i32 s0, s0, 30 5247; GFX6-NEXT: s_or_b32 s0, s0, 1 5248; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 5249; GFX6-NEXT: v_trunc_f32_e32 v5, v5 5250; GFX6-NEXT: v_mad_f32 v4, -v5, v3, v4 5251; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 5252; GFX6-NEXT: v_and_b32_e32 v1, s3, v1 5253; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v3| 5254; GFX6-NEXT: v_mov_b32_e32 v6, s0 5255; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc 5256; GFX6-NEXT: v_bfe_i32 v4, v1, 0, 15 5257; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 5258; GFX6-NEXT: v_cvt_f32_i32_e32 v5, v4 5259; GFX6-NEXT: v_and_b32_e32 v0, s3, v0 5260; GFX6-NEXT: v_bfe_i32 v6, v0, 0, 15 5261; GFX6-NEXT: v_cvt_f32_i32_e32 v7, v6 5262; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v5 5263; GFX6-NEXT: v_xor_b32_e32 v4, v6, v4 5264; GFX6-NEXT: v_ashrrev_i32_e32 v4, 30, v4 5265; GFX6-NEXT: v_or_b32_e32 v4, 1, v4 5266; GFX6-NEXT: v_mul_f32_e32 v6, v7, v8 5267; GFX6-NEXT: v_trunc_f32_e32 v6, v6 5268; GFX6-NEXT: v_mad_f32 v7, -v6, v5, v7 5269; GFX6-NEXT: v_cvt_i32_f32_e32 v6, v6 5270; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v5| 5271; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc 5272; GFX6-NEXT: v_mul_lo_u32 v3, v3, s1 5273; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 5274; GFX6-NEXT: v_mul_lo_u32 v1, v4, v1 5275; GFX6-NEXT: v_and_b32_e32 v2, s3, v2 5276; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s8, v3 5277; GFX6-NEXT: v_and_b32_e32 v3, s3, v3 5278; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 5279; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 5280; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 5281; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 5282; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 5283; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 5284; GFX6-NEXT: s_waitcnt expcnt(0) 5285; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 5286; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 5287; GFX6-NEXT: s_endpgm 5288; GFX9-LABEL: srem_v3i15: 5289; GFX9: ; %bb.0: 5290; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5291; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 5292; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 5293; GFX9-NEXT: s_movk_i32 s8, 0x7fff 5294; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5295; GFX9-NEXT: s_and_b32 s0, s4, s8 5296; GFX9-NEXT: s_and_b32 s1, s6, s8 5297; GFX9-NEXT: s_bfe_i32 s1, s1, 0xf0000 5298; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 5299; GFX9-NEXT: s_bfe_i32 s0, s0, 0xf0000 5300; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 5301; GFX9-NEXT: s_xor_b32 s0, s0, s1 5302; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2 5303; GFX9-NEXT: v_mov_b32_e32 v0, s4 5304; GFX9-NEXT: v_mov_b32_e32 v1, s6 5305; GFX9-NEXT: s_ashr_i32 s0, s0, 30 5306; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 5307; GFX9-NEXT: v_trunc_f32_e32 v4, v4 5308; GFX9-NEXT: v_mad_f32 v3, -v4, v2, v3 5309; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 5310; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 30 5311; GFX9-NEXT: v_alignbit_b32 v1, s7, v1, 30 5312; GFX9-NEXT: s_or_b32 s11, s0, 1 5313; GFX9-NEXT: s_lshr_b32 s9, s4, 15 5314; GFX9-NEXT: s_bfe_u32 s5, s4, 0xf000f 5315; GFX9-NEXT: s_lshr_b32 s7, s6, 15 5316; GFX9-NEXT: s_bfe_u32 s10, s6, 0xf000f 5317; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v2| 5318; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 5319; GFX9-NEXT: s_cselect_b32 s0, s11, 0 5320; GFX9-NEXT: v_add_u32_e32 v2, s0, v4 5321; GFX9-NEXT: s_bfe_i32 s0, s10, 0xf0000 5322; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 5323; GFX9-NEXT: s_bfe_i32 s1, s5, 0xf0000 5324; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s1 5325; GFX9-NEXT: s_xor_b32 s0, s1, s0 5326; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 5327; GFX9-NEXT: s_ashr_i32 s0, s0, 30 5328; GFX9-NEXT: s_or_b32 s5, s0, 1 5329; GFX9-NEXT: v_and_b32_e32 v1, s8, v1 5330; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 5331; GFX9-NEXT: v_trunc_f32_e32 v5, v5 5332; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 5333; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 5334; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| 5335; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 5336; GFX9-NEXT: s_cselect_b32 s0, s5, 0 5337; GFX9-NEXT: v_bfe_i32 v4, v1, 0, 15 5338; GFX9-NEXT: v_add_u32_e32 v3, s0, v5 5339; GFX9-NEXT: v_cvt_f32_i32_e32 v5, v4 5340; GFX9-NEXT: v_and_b32_e32 v0, s8, v0 5341; GFX9-NEXT: v_bfe_i32 v6, v0, 0, 15 5342; GFX9-NEXT: v_cvt_f32_i32_e32 v7, v6 5343; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5 5344; GFX9-NEXT: v_xor_b32_e32 v4, v6, v4 5345; GFX9-NEXT: v_ashrrev_i32_e32 v4, 30, v4 5346; GFX9-NEXT: v_or_b32_e32 v4, 1, v4 5347; GFX9-NEXT: v_mul_f32_e32 v6, v7, v8 5348; GFX9-NEXT: v_trunc_f32_e32 v6, v6 5349; GFX9-NEXT: v_cvt_i32_f32_e32 v8, v6 5350; GFX9-NEXT: v_mad_f32 v6, -v6, v5, v7 5351; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, |v5| 5352; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc 5353; GFX9-NEXT: v_mul_lo_u32 v3, v3, s7 5354; GFX9-NEXT: v_add_u32_e32 v4, v8, v4 5355; GFX9-NEXT: v_mul_lo_u32 v2, v2, s6 5356; GFX9-NEXT: v_mul_lo_u32 v1, v4, v1 5357; GFX9-NEXT: v_sub_u32_e32 v3, s9, v3 5358; GFX9-NEXT: v_and_b32_e32 v3, s8, v3 5359; GFX9-NEXT: v_sub_u32_e32 v2, s4, v2 5360; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1 5361; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 5362; GFX9-NEXT: v_and_b32_e32 v2, s8, v2 5363; GFX9-NEXT: v_lshlrev_b32_e32 v3, 15, v3 5364; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 5365; GFX9-NEXT: v_mov_b32_e32 v4, 0 5366; GFX9-NEXT: v_or_b32_e32 v0, v2, v0 5367; GFX9-NEXT: global_store_dword v4, v0, s[2:3] 5368; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 5369; GFX9-NEXT: global_store_short v4, v0, s[2:3] offset:4 5370; GFX9-NEXT: s_endpgm 5371 %r = srem <3 x i15> %x, %y 5372 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 5373 ret void 5374} 5375 5376define amdgpu_kernel void @udiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 5377; CHECK-LABEL: @udiv_i32_oddk_denom( 5378; CHECK-NEXT: [[R:%.*]] = udiv i32 [[X:%.*]], 1235195 5379; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 5380; CHECK-NEXT: ret void 5381; 5382; GFX6-LABEL: udiv_i32_oddk_denom: 5383; GFX6: ; %bb.0: 5384; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5385; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 5386; GFX6-NEXT: v_mov_b32_e32 v0, 0xb2a50881 5387; GFX6-NEXT: s_mov_b32 s7, 0xf000 5388; GFX6-NEXT: s_mov_b32 s6, -1 5389; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5390; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 5391; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s0, v0 5392; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 5393; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 5394; GFX6-NEXT: v_lshrrev_b32_e32 v0, 20, v0 5395; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 5396; GFX6-NEXT: s_endpgm 5397; GFX9-LABEL: udiv_i32_oddk_denom: 5398; GFX9: ; %bb.0: 5399; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5400; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 5401; GFX9-NEXT: v_mov_b32_e32 v0, 0 5402; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5403; GFX9-NEXT: s_mul_hi_u32 s0, s4, 0xb2a50881 5404; GFX9-NEXT: s_sub_i32 s1, s4, s0 5405; GFX9-NEXT: s_lshr_b32 s1, s1, 1 5406; GFX9-NEXT: s_add_i32 s1, s1, s0 5407; GFX9-NEXT: s_lshr_b32 s0, s1, 20 5408; GFX9-NEXT: v_mov_b32_e32 v1, s0 5409; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 5410; GFX9-NEXT: s_endpgm 5411 %r = udiv i32 %x, 1235195 5412 store i32 %r, i32 addrspace(1)* %out 5413 ret void 5414} 5415 5416define amdgpu_kernel void @udiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 5417; CHECK-LABEL: @udiv_i32_pow2k_denom( 5418; CHECK-NEXT: [[R:%.*]] = udiv i32 [[X:%.*]], 4096 5419; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 5420; CHECK-NEXT: ret void 5421; 5422; GFX6-LABEL: udiv_i32_pow2k_denom: 5423; GFX6: ; %bb.0: 5424; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5425; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 5426; GFX6-NEXT: s_mov_b32 s7, 0xf000 5427; GFX6-NEXT: s_mov_b32 s6, -1 5428; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5429; GFX6-NEXT: s_lshr_b32 s0, s0, 12 5430; GFX6-NEXT: v_mov_b32_e32 v0, s0 5431; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 5432; GFX6-NEXT: s_endpgm 5433; GFX9-LABEL: udiv_i32_pow2k_denom: 5434; GFX9: ; %bb.0: 5435; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5436; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 5437; GFX9-NEXT: v_mov_b32_e32 v0, 0 5438; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5439; GFX9-NEXT: s_lshr_b32 s0, s4, 12 5440; GFX9-NEXT: v_mov_b32_e32 v1, s0 5441; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 5442; GFX9-NEXT: s_endpgm 5443 %r = udiv i32 %x, 4096 5444 store i32 %r, i32 addrspace(1)* %out 5445 ret void 5446} 5447 5448define amdgpu_kernel void @udiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 5449; CHECK-LABEL: @udiv_i32_pow2_shl_denom( 5450; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 5451; CHECK-NEXT: [[R:%.*]] = udiv i32 [[X:%.*]], [[SHL_Y]] 5452; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 5453; CHECK-NEXT: ret void 5454; 5455; GFX6-LABEL: udiv_i32_pow2_shl_denom: 5456; GFX6: ; %bb.0: 5457; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5458; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 5459; GFX6-NEXT: s_mov_b32 s7, 0xf000 5460; GFX6-NEXT: s_mov_b32 s6, -1 5461; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5462; GFX6-NEXT: s_add_i32 s1, s1, 12 5463; GFX6-NEXT: s_lshr_b32 s0, s0, s1 5464; GFX6-NEXT: v_mov_b32_e32 v0, s0 5465; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 5466; GFX6-NEXT: s_endpgm 5467; GFX9-LABEL: udiv_i32_pow2_shl_denom: 5468; GFX9: ; %bb.0: 5469; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5470; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 5471; GFX9-NEXT: v_mov_b32_e32 v0, 0 5472; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5473; GFX9-NEXT: s_add_i32 s0, s5, 12 5474; GFX9-NEXT: s_lshr_b32 s0, s4, s0 5475; GFX9-NEXT: v_mov_b32_e32 v1, s0 5476; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 5477; GFX9-NEXT: s_endpgm 5478 %shl.y = shl i32 4096, %y 5479 %r = udiv i32 %x, %shl.y 5480 store i32 %r, i32 addrspace(1)* %out 5481 ret void 5482} 5483 5484define amdgpu_kernel void @udiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 5485; CHECK-LABEL: @udiv_v2i32_pow2k_denom( 5486; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 5487; CHECK-NEXT: [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096 5488; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 5489; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 5490; CHECK-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP4]], 4096 5491; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 5492; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 5493; CHECK-NEXT: ret void 5494; 5495; GFX6-LABEL: udiv_v2i32_pow2k_denom: 5496; GFX6: ; %bb.0: 5497; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5498; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 5499; GFX6-NEXT: s_mov_b32 s7, 0xf000 5500; GFX6-NEXT: s_mov_b32 s6, -1 5501; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5502; GFX6-NEXT: s_lshr_b32 s0, s0, 12 5503; GFX6-NEXT: s_lshr_b32 s1, s1, 12 5504; GFX6-NEXT: v_mov_b32_e32 v0, s0 5505; GFX6-NEXT: v_mov_b32_e32 v1, s1 5506; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 5507; GFX6-NEXT: s_endpgm 5508; GFX9-LABEL: udiv_v2i32_pow2k_denom: 5509; GFX9: ; %bb.0: 5510; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5511; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 5512; GFX9-NEXT: v_mov_b32_e32 v2, 0 5513; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5514; GFX9-NEXT: s_lshr_b32 s0, s4, 12 5515; GFX9-NEXT: s_lshr_b32 s1, s5, 12 5516; GFX9-NEXT: v_mov_b32_e32 v0, s0 5517; GFX9-NEXT: v_mov_b32_e32 v1, s1 5518; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 5519; GFX9-NEXT: s_endpgm 5520 %r = udiv <2 x i32> %x, <i32 4096, i32 4096> 5521 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 5522 ret void 5523} 5524 5525define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 5526; CHECK-LABEL: @udiv_v2i32_mixed_pow2k_denom( 5527; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 5528; CHECK-NEXT: [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096 5529; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 5530; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 5531; CHECK-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP4]], 4095 5532; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 5533; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 5534; CHECK-NEXT: ret void 5535; 5536; GFX6-LABEL: udiv_v2i32_mixed_pow2k_denom: 5537; GFX6: ; %bb.0: 5538; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5539; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 5540; GFX6-NEXT: v_mov_b32_e32 v0, 0x100101 5541; GFX6-NEXT: s_mov_b32 s7, 0xf000 5542; GFX6-NEXT: s_mov_b32 s6, -1 5543; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5544; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 5545; GFX6-NEXT: s_lshr_b32 s0, s0, 12 5546; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v0 5547; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 5548; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 5549; GFX6-NEXT: v_lshrrev_b32_e32 v1, 11, v0 5550; GFX6-NEXT: v_mov_b32_e32 v0, s0 5551; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 5552; GFX6-NEXT: s_endpgm 5553; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom: 5554; GFX9: ; %bb.0: 5555; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5556; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 5557; GFX9-NEXT: v_mov_b32_e32 v2, 0 5558; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5559; GFX9-NEXT: s_mul_hi_u32 s1, s5, 0x100101 5560; GFX9-NEXT: s_lshr_b32 s0, s4, 12 5561; GFX9-NEXT: s_sub_i32 s4, s5, s1 5562; GFX9-NEXT: s_lshr_b32 s4, s4, 1 5563; GFX9-NEXT: s_add_i32 s4, s4, s1 5564; GFX9-NEXT: s_lshr_b32 s1, s4, 11 5565; GFX9-NEXT: v_mov_b32_e32 v0, s0 5566; GFX9-NEXT: v_mov_b32_e32 v1, s1 5567; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 5568; GFX9-NEXT: s_endpgm 5569 %r = udiv <2 x i32> %x, <i32 4096, i32 4095> 5570 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 5571 ret void 5572} 5573 5574define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 5575; CHECK-LABEL: @udiv_v2i32_pow2_shl_denom( 5576; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 5577; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 5578; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 5579; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 5580; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 5581; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 5582; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 5583; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 5584; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 5585; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 5586; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 5587; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 5588; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 5589; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 5590; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 5591; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 5592; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 5593; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 5594; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 5595; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 5596; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 5597; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 5598; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 5599; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 5600; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 5601; CHECK-NEXT: [[TMP25:%.*]] = add i32 [[TMP21]], 1 5602; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]] 5603; CHECK-NEXT: [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]] 5604; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]] 5605; CHECK-NEXT: [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]] 5606; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP26]], 1 5607; CHECK-NEXT: [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]] 5608; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x i32> undef, i32 [[TMP31]], i64 0 5609; CHECK-NEXT: [[TMP33:%.*]] = extractelement <2 x i32> [[X]], i64 1 5610; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 5611; CHECK-NEXT: [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float 5612; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 5613; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000 5614; CHECK-NEXT: [[TMP38:%.*]] = fptoui float [[TMP37]] to i32 5615; CHECK-NEXT: [[TMP39:%.*]] = sub i32 0, [[TMP34]] 5616; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]] 5617; CHECK-NEXT: [[TMP41:%.*]] = zext i32 [[TMP38]] to i64 5618; CHECK-NEXT: [[TMP42:%.*]] = zext i32 [[TMP40]] to i64 5619; CHECK-NEXT: [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]] 5620; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 5621; CHECK-NEXT: [[TMP45:%.*]] = lshr i64 [[TMP43]], 32 5622; CHECK-NEXT: [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32 5623; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]] 5624; CHECK-NEXT: [[TMP48:%.*]] = zext i32 [[TMP33]] to i64 5625; CHECK-NEXT: [[TMP49:%.*]] = zext i32 [[TMP47]] to i64 5626; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]] 5627; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 5628; CHECK-NEXT: [[TMP52:%.*]] = lshr i64 [[TMP50]], 32 5629; CHECK-NEXT: [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32 5630; CHECK-NEXT: [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]] 5631; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]] 5632; CHECK-NEXT: [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]] 5633; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP53]], 1 5634; CHECK-NEXT: [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]] 5635; CHECK-NEXT: [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]] 5636; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]] 5637; CHECK-NEXT: [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]] 5638; CHECK-NEXT: [[TMP62:%.*]] = add i32 [[TMP58]], 1 5639; CHECK-NEXT: [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]] 5640; CHECK-NEXT: [[TMP64:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP63]], i64 1 5641; CHECK-NEXT: store <2 x i32> [[TMP64]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 5642; CHECK-NEXT: ret void 5643; 5644; GFX6-LABEL: udiv_v2i32_pow2_shl_denom: 5645; GFX6: ; %bb.0: 5646; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 5647; GFX6-NEXT: s_movk_i32 s4, 0x1000 5648; GFX6-NEXT: s_mov_b32 s7, 0xf000 5649; GFX6-NEXT: s_mov_b32 s6, -1 5650; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5651; GFX6-NEXT: s_lshl_b32 s8, s4, s2 5652; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 5653; GFX6-NEXT: s_lshl_b32 s9, s4, s3 5654; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 5655; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5656; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 5657; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 5658; GFX6-NEXT: s_mov_b32 s0, 0x4f7ffffe 5659; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 5660; GFX6-NEXT: v_mul_f32_e32 v0, s0, v0 5661; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 5662; GFX6-NEXT: v_mul_f32_e32 v1, s0, v1 5663; GFX6-NEXT: s_sub_i32 s0, 0, s8 5664; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 5665; GFX6-NEXT: v_mul_lo_u32 v2, s0, v0 5666; GFX6-NEXT: s_sub_i32 s0, 0, s9 5667; GFX6-NEXT: v_mul_lo_u32 v3, s0, v1 5668; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 5669; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 5670; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 5671; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5672; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 5673; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 5674; GFX6-NEXT: v_mul_hi_u32 v1, s3, v1 5675; GFX6-NEXT: v_mul_lo_u32 v2, v0, s8 5676; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 5677; GFX6-NEXT: v_mul_lo_u32 v4, v1, s9 5678; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 5679; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v2 5680; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] 5681; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v2 5682; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 5683; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 5684; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 5685; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 5686; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s3, v4 5687; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 5688; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2 5689; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 5690; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s9, v2 5691; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 5692; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 5693; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 5694; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 5695; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 5696; GFX6-NEXT: s_endpgm 5697; GFX9-LABEL: udiv_v2i32_pow2_shl_denom: 5698; GFX9: ; %bb.0: 5699; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 5700; GFX9-NEXT: s_movk_i32 s4, 0x1000 5701; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5702; GFX9-NEXT: s_lshl_b32 s7, s4, s2 5703; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7 5704; GFX9-NEXT: s_lshl_b32 s6, s4, s3 5705; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 5706; GFX9-NEXT: s_mov_b32 s2, 0x4f7ffffe 5707; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 5708; GFX9-NEXT: s_sub_i32 s3, 0, s6 5709; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 5710; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 5711; GFX9-NEXT: v_mul_f32_e32 v0, s2, v0 5712; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 5713; GFX9-NEXT: v_mul_f32_e32 v1, s2, v1 5714; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 5715; GFX9-NEXT: s_sub_i32 s2, 0, s7 5716; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 5717; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1 5718; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 5719; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 5720; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 5721; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 5722; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5723; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 5724; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 5725; GFX9-NEXT: v_mul_hi_u32 v1, s3, v1 5726; GFX9-NEXT: v_mov_b32_e32 v2, 0 5727; GFX9-NEXT: v_mul_lo_u32 v3, v0, s7 5728; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 5729; GFX9-NEXT: v_mul_lo_u32 v4, v1, s6 5730; GFX9-NEXT: v_add_u32_e32 v6, 1, v1 5731; GFX9-NEXT: v_sub_u32_e32 v3, s2, v3 5732; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 5733; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc 5734; GFX9-NEXT: v_subrev_u32_e32 v5, s7, v3 5735; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 5736; GFX9-NEXT: v_sub_u32_e32 v4, s3, v4 5737; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v4 5738; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 5739; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v4 5740; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 5741; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[0:1] 5742; GFX9-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] 5743; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc 5744; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 5745; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 5746; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 5747; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 5748; GFX9-NEXT: s_endpgm 5749 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 5750 %r = udiv <2 x i32> %x, %shl.y 5751 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 5752 ret void 5753} 5754 5755define amdgpu_kernel void @urem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 5756; CHECK-LABEL: @urem_i32_oddk_denom( 5757; CHECK-NEXT: [[R:%.*]] = urem i32 [[X:%.*]], 1235195 5758; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 5759; CHECK-NEXT: ret void 5760; 5761; GFX6-LABEL: urem_i32_oddk_denom: 5762; GFX6: ; %bb.0: 5763; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 5764; GFX6-NEXT: v_mov_b32_e32 v0, 0xb2a50881 5765; GFX6-NEXT: s_mov_b32 s2, 0x12d8fb 5766; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5767; GFX6-NEXT: s_mov_b32 s3, 0xf000 5768; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5769; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 5770; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v0 5771; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 5772; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 5773; GFX6-NEXT: v_lshrrev_b32_e32 v0, 20, v0 5774; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 5775; GFX6-NEXT: s_mov_b32 s2, -1 5776; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 5777; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 5778; GFX6-NEXT: s_endpgm 5779; GFX9-LABEL: urem_i32_oddk_denom: 5780; GFX9: ; %bb.0: 5781; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5782; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 5783; GFX9-NEXT: v_mov_b32_e32 v0, 0 5784; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5785; GFX9-NEXT: s_mul_hi_u32 s0, s4, 0xb2a50881 5786; GFX9-NEXT: s_sub_i32 s1, s4, s0 5787; GFX9-NEXT: s_lshr_b32 s1, s1, 1 5788; GFX9-NEXT: s_add_i32 s1, s1, s0 5789; GFX9-NEXT: s_lshr_b32 s0, s1, 20 5790; GFX9-NEXT: s_mul_i32 s0, s0, 0x12d8fb 5791; GFX9-NEXT: s_sub_i32 s0, s4, s0 5792; GFX9-NEXT: v_mov_b32_e32 v1, s0 5793; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 5794; GFX9-NEXT: s_endpgm 5795 %r = urem i32 %x, 1235195 5796 store i32 %r, i32 addrspace(1)* %out 5797 ret void 5798} 5799 5800define amdgpu_kernel void @urem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 5801; CHECK-LABEL: @urem_i32_pow2k_denom( 5802; CHECK-NEXT: [[R:%.*]] = urem i32 [[X:%.*]], 4096 5803; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 5804; CHECK-NEXT: ret void 5805; 5806; GFX6-LABEL: urem_i32_pow2k_denom: 5807; GFX6: ; %bb.0: 5808; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5809; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 5810; GFX6-NEXT: s_mov_b32 s7, 0xf000 5811; GFX6-NEXT: s_mov_b32 s6, -1 5812; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5813; GFX6-NEXT: s_and_b32 s0, s0, 0xfff 5814; GFX6-NEXT: v_mov_b32_e32 v0, s0 5815; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 5816; GFX6-NEXT: s_endpgm 5817; GFX9-LABEL: urem_i32_pow2k_denom: 5818; GFX9: ; %bb.0: 5819; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5820; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 5821; GFX9-NEXT: v_mov_b32_e32 v0, 0 5822; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5823; GFX9-NEXT: s_and_b32 s0, s4, 0xfff 5824; GFX9-NEXT: v_mov_b32_e32 v1, s0 5825; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 5826; GFX9-NEXT: s_endpgm 5827 %r = urem i32 %x, 4096 5828 store i32 %r, i32 addrspace(1)* %out 5829 ret void 5830} 5831 5832define amdgpu_kernel void @urem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 5833; CHECK-LABEL: @urem_i32_pow2_shl_denom( 5834; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 5835; CHECK-NEXT: [[R:%.*]] = urem i32 [[X:%.*]], [[SHL_Y]] 5836; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 5837; CHECK-NEXT: ret void 5838; 5839; GFX6-LABEL: urem_i32_pow2_shl_denom: 5840; GFX6: ; %bb.0: 5841; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5842; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 5843; GFX6-NEXT: s_mov_b32 s7, 0xf000 5844; GFX6-NEXT: s_mov_b32 s6, -1 5845; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5846; GFX6-NEXT: s_lshl_b32 s1, 0x1000, s1 5847; GFX6-NEXT: s_add_i32 s1, s1, -1 5848; GFX6-NEXT: s_and_b32 s0, s0, s1 5849; GFX6-NEXT: v_mov_b32_e32 v0, s0 5850; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 5851; GFX6-NEXT: s_endpgm 5852; GFX9-LABEL: urem_i32_pow2_shl_denom: 5853; GFX9: ; %bb.0: 5854; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5855; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 5856; GFX9-NEXT: v_mov_b32_e32 v0, 0 5857; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5858; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s5 5859; GFX9-NEXT: s_add_i32 s0, s0, -1 5860; GFX9-NEXT: s_and_b32 s0, s4, s0 5861; GFX9-NEXT: v_mov_b32_e32 v1, s0 5862; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 5863; GFX9-NEXT: s_endpgm 5864 %shl.y = shl i32 4096, %y 5865 %r = urem i32 %x, %shl.y 5866 store i32 %r, i32 addrspace(1)* %out 5867 ret void 5868} 5869 5870define amdgpu_kernel void @urem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 5871; CHECK-LABEL: @urem_v2i32_pow2k_denom( 5872; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 5873; CHECK-NEXT: [[TMP2:%.*]] = urem i32 [[TMP1]], 4096 5874; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 5875; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 5876; CHECK-NEXT: [[TMP5:%.*]] = urem i32 [[TMP4]], 4096 5877; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 5878; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 5879; CHECK-NEXT: ret void 5880; 5881; GFX6-LABEL: urem_v2i32_pow2k_denom: 5882; GFX6: ; %bb.0: 5883; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5884; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 5885; GFX6-NEXT: s_movk_i32 s2, 0xfff 5886; GFX6-NEXT: s_mov_b32 s7, 0xf000 5887; GFX6-NEXT: s_mov_b32 s6, -1 5888; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5889; GFX6-NEXT: s_and_b32 s0, s0, s2 5890; GFX6-NEXT: s_and_b32 s1, s1, s2 5891; GFX6-NEXT: v_mov_b32_e32 v0, s0 5892; GFX6-NEXT: v_mov_b32_e32 v1, s1 5893; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 5894; GFX6-NEXT: s_endpgm 5895; GFX9-LABEL: urem_v2i32_pow2k_denom: 5896; GFX9: ; %bb.0: 5897; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5898; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 5899; GFX9-NEXT: s_movk_i32 s0, 0xfff 5900; GFX9-NEXT: v_mov_b32_e32 v2, 0 5901; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5902; GFX9-NEXT: s_and_b32 s1, s4, s0 5903; GFX9-NEXT: s_and_b32 s0, s5, s0 5904; GFX9-NEXT: v_mov_b32_e32 v0, s1 5905; GFX9-NEXT: v_mov_b32_e32 v1, s0 5906; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 5907; GFX9-NEXT: s_endpgm 5908 %r = urem <2 x i32> %x, <i32 4096, i32 4096> 5909 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 5910 ret void 5911} 5912 5913define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 5914; CHECK-LABEL: @urem_v2i32_pow2_shl_denom( 5915; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 5916; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 5917; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 5918; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 5919; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 5920; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 5921; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 5922; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 5923; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 5924; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 5925; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 5926; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 5927; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 5928; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 5929; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 5930; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 5931; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 5932; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 5933; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 5934; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 5935; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 5936; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 5937; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 5938; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 5939; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 5940; CHECK-NEXT: [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]] 5941; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]] 5942; CHECK-NEXT: [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]] 5943; CHECK-NEXT: [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]] 5944; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]] 5945; CHECK-NEXT: [[TMP30:%.*]] = insertelement <2 x i32> undef, i32 [[TMP29]], i64 0 5946; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x i32> [[X]], i64 1 5947; CHECK-NEXT: [[TMP32:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 5948; CHECK-NEXT: [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float 5949; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 5950; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000 5951; CHECK-NEXT: [[TMP36:%.*]] = fptoui float [[TMP35]] to i32 5952; CHECK-NEXT: [[TMP37:%.*]] = sub i32 0, [[TMP32]] 5953; CHECK-NEXT: [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]] 5954; CHECK-NEXT: [[TMP39:%.*]] = zext i32 [[TMP36]] to i64 5955; CHECK-NEXT: [[TMP40:%.*]] = zext i32 [[TMP38]] to i64 5956; CHECK-NEXT: [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]] 5957; CHECK-NEXT: [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32 5958; CHECK-NEXT: [[TMP43:%.*]] = lshr i64 [[TMP41]], 32 5959; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 5960; CHECK-NEXT: [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]] 5961; CHECK-NEXT: [[TMP46:%.*]] = zext i32 [[TMP31]] to i64 5962; CHECK-NEXT: [[TMP47:%.*]] = zext i32 [[TMP45]] to i64 5963; CHECK-NEXT: [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]] 5964; CHECK-NEXT: [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32 5965; CHECK-NEXT: [[TMP50:%.*]] = lshr i64 [[TMP48]], 32 5966; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 5967; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]] 5968; CHECK-NEXT: [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]] 5969; CHECK-NEXT: [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]] 5970; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]] 5971; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]] 5972; CHECK-NEXT: [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]] 5973; CHECK-NEXT: [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]] 5974; CHECK-NEXT: [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]] 5975; CHECK-NEXT: [[TMP60:%.*]] = insertelement <2 x i32> [[TMP30]], i32 [[TMP59]], i64 1 5976; CHECK-NEXT: store <2 x i32> [[TMP60]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 5977; CHECK-NEXT: ret void 5978; 5979; GFX6-LABEL: urem_v2i32_pow2_shl_denom: 5980; GFX6: ; %bb.0: 5981; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 5982; GFX6-NEXT: s_movk_i32 s4, 0x1000 5983; GFX6-NEXT: s_mov_b32 s7, 0xf000 5984; GFX6-NEXT: s_mov_b32 s6, -1 5985; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5986; GFX6-NEXT: s_lshl_b32 s8, s4, s2 5987; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 5988; GFX6-NEXT: s_lshl_b32 s3, s4, s3 5989; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 5990; GFX6-NEXT: s_mov_b32 s4, 0x4f7ffffe 5991; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 5992; GFX6-NEXT: s_sub_i32 s2, 0, s8 5993; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 5994; GFX6-NEXT: v_mul_f32_e32 v0, s4, v0 5995; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 5996; GFX6-NEXT: v_mul_f32_e32 v1, s4, v1 5997; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 5998; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5999; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 6000; GFX6-NEXT: v_mul_lo_u32 v2, s2, v0 6001; GFX6-NEXT: s_sub_i32 s2, 0, s3 6002; GFX6-NEXT: v_mul_lo_u32 v3, s2, v1 6003; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 6004; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 6005; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 6006; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6007; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 6008; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 6009; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 6010; GFX6-NEXT: v_mul_lo_u32 v0, v0, s8 6011; GFX6-NEXT: v_mul_lo_u32 v1, v1, s3 6012; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 6013; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s8, v0 6014; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 6015; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6016; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s8, v0 6017; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 6018; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6019; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 6020; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 6021; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 6022; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 6023; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 6024; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 6025; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 6026; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 6027; GFX6-NEXT: s_endpgm 6028; GFX9-LABEL: urem_v2i32_pow2_shl_denom: 6029; GFX9: ; %bb.0: 6030; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 6031; GFX9-NEXT: s_movk_i32 s4, 0x1000 6032; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6033; GFX9-NEXT: s_lshl_b32 s5, s4, s3 6034; GFX9-NEXT: s_lshl_b32 s4, s4, s2 6035; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 6036; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 6037; GFX9-NEXT: s_mov_b32 s2, 0x4f7ffffe 6038; GFX9-NEXT: s_sub_i32 s3, 0, s5 6039; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 6040; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 6041; GFX9-NEXT: v_mul_f32_e32 v0, s2, v0 6042; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 6043; GFX9-NEXT: v_mul_f32_e32 v1, s2, v1 6044; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 6045; GFX9-NEXT: s_sub_i32 s2, 0, s4 6046; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 6047; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1 6048; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 6049; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 6050; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6051; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 6052; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 6053; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6054; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 6055; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 6056; GFX9-NEXT: v_mul_hi_u32 v1, s3, v1 6057; GFX9-NEXT: v_mov_b32_e32 v2, 0 6058; GFX9-NEXT: v_mul_lo_u32 v0, v0, s4 6059; GFX9-NEXT: v_mul_lo_u32 v1, v1, s5 6060; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 6061; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v0 6062; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 6063; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 6064; GFX9-NEXT: v_sub_u32_e32 v1, s3, v1 6065; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v0 6066; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 6067; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 6068; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v1 6069; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 6070; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 6071; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 6072; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 6073; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 6074; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 6075; GFX9-NEXT: s_endpgm 6076 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 6077 %r = urem <2 x i32> %x, %shl.y 6078 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 6079 ret void 6080} 6081 6082define amdgpu_kernel void @sdiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 6083; CHECK-LABEL: @sdiv_i32_oddk_denom( 6084; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], 1235195 6085; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 6086; CHECK-NEXT: ret void 6087; 6088; GFX6-LABEL: sdiv_i32_oddk_denom: 6089; GFX6: ; %bb.0: 6090; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 6091; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 6092; GFX6-NEXT: v_mov_b32_e32 v0, 0xd9528441 6093; GFX6-NEXT: s_mov_b32 s7, 0xf000 6094; GFX6-NEXT: s_mov_b32 s6, -1 6095; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6096; GFX6-NEXT: v_mul_hi_i32 v0, s0, v0 6097; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 6098; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0 6099; GFX6-NEXT: v_ashrrev_i32_e32 v0, 20, v0 6100; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 6101; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 6102; GFX6-NEXT: s_endpgm 6103; GFX9-LABEL: sdiv_i32_oddk_denom: 6104; GFX9: ; %bb.0: 6105; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6106; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 6107; GFX9-NEXT: v_mov_b32_e32 v0, 0 6108; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6109; GFX9-NEXT: s_mul_hi_i32 s0, s4, 0xd9528441 6110; GFX9-NEXT: s_add_i32 s0, s0, s4 6111; GFX9-NEXT: s_lshr_b32 s1, s0, 31 6112; GFX9-NEXT: s_ashr_i32 s0, s0, 20 6113; GFX9-NEXT: s_add_i32 s0, s0, s1 6114; GFX9-NEXT: v_mov_b32_e32 v1, s0 6115; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 6116; GFX9-NEXT: s_endpgm 6117 %r = sdiv i32 %x, 1235195 6118 store i32 %r, i32 addrspace(1)* %out 6119 ret void 6120} 6121 6122define amdgpu_kernel void @sdiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 6123; CHECK-LABEL: @sdiv_i32_pow2k_denom( 6124; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], 4096 6125; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 6126; CHECK-NEXT: ret void 6127; 6128; GFX6-LABEL: sdiv_i32_pow2k_denom: 6129; GFX6: ; %bb.0: 6130; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 6131; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 6132; GFX6-NEXT: s_mov_b32 s7, 0xf000 6133; GFX6-NEXT: s_mov_b32 s6, -1 6134; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6135; GFX6-NEXT: s_ashr_i32 s1, s0, 31 6136; GFX6-NEXT: s_lshr_b32 s1, s1, 20 6137; GFX6-NEXT: s_add_i32 s0, s0, s1 6138; GFX6-NEXT: s_ashr_i32 s0, s0, 12 6139; GFX6-NEXT: v_mov_b32_e32 v0, s0 6140; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 6141; GFX6-NEXT: s_endpgm 6142; GFX9-LABEL: sdiv_i32_pow2k_denom: 6143; GFX9: ; %bb.0: 6144; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6145; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 6146; GFX9-NEXT: v_mov_b32_e32 v0, 0 6147; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6148; GFX9-NEXT: s_ashr_i32 s0, s4, 31 6149; GFX9-NEXT: s_lshr_b32 s0, s0, 20 6150; GFX9-NEXT: s_add_i32 s4, s4, s0 6151; GFX9-NEXT: s_ashr_i32 s0, s4, 12 6152; GFX9-NEXT: v_mov_b32_e32 v1, s0 6153; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 6154; GFX9-NEXT: s_endpgm 6155 %r = sdiv i32 %x, 4096 6156 store i32 %r, i32 addrspace(1)* %out 6157 ret void 6158} 6159 6160define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 6161; CHECK-LABEL: @sdiv_i32_pow2_shl_denom( 6162; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 6163; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], [[SHL_Y]] 6164; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 6165; CHECK-NEXT: ret void 6166; 6167; GFX6-LABEL: sdiv_i32_pow2_shl_denom: 6168; GFX6: ; %bb.0: 6169; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 6170; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 6171; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6172; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 6173; GFX6-NEXT: s_ashr_i32 s4, s3, 31 6174; GFX6-NEXT: s_add_i32 s3, s3, s4 6175; GFX6-NEXT: s_xor_b32 s7, s3, s4 6176; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s7 6177; GFX6-NEXT: s_sub_i32 s3, 0, s7 6178; GFX6-NEXT: s_ashr_i32 s5, s2, 31 6179; GFX6-NEXT: s_add_i32 s2, s2, s5 6180; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 6181; GFX6-NEXT: s_xor_b32 s6, s2, s5 6182; GFX6-NEXT: s_xor_b32 s4, s5, s4 6183; GFX6-NEXT: s_mov_b32 s2, -1 6184; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 6185; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 6186; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 6187; GFX6-NEXT: s_mov_b32 s3, 0xf000 6188; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 6189; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 6190; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 6191; GFX6-NEXT: v_mul_lo_u32 v1, v0, s7 6192; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 6193; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s6, v1 6194; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s7, v1 6195; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 6196; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6197; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 6198; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 6199; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 6200; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6201; GFX6-NEXT: v_xor_b32_e32 v0, s4, v0 6202; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 6203; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 6204; GFX6-NEXT: s_endpgm 6205; GFX9-LABEL: sdiv_i32_pow2_shl_denom: 6206; GFX9: ; %bb.0: 6207; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 6208; GFX9-NEXT: v_mov_b32_e32 v2, 0 6209; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6210; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6211; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 6212; GFX9-NEXT: s_ashr_i32 s4, s3, 31 6213; GFX9-NEXT: s_add_i32 s3, s3, s4 6214; GFX9-NEXT: s_xor_b32 s5, s3, s4 6215; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5 6216; GFX9-NEXT: s_sub_i32 s3, 0, s5 6217; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 6218; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 6219; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 6220; GFX9-NEXT: v_mul_lo_u32 v1, s3, v0 6221; GFX9-NEXT: s_ashr_i32 s3, s2, 31 6222; GFX9-NEXT: s_add_i32 s2, s2, s3 6223; GFX9-NEXT: s_xor_b32 s2, s2, s3 6224; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 6225; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 6226; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 6227; GFX9-NEXT: v_mul_lo_u32 v1, v0, s5 6228; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 6229; GFX9-NEXT: v_sub_u32_e32 v1, s2, v1 6230; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 6231; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 6232; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v1 6233; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 6234; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 6235; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 6236; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 6237; GFX9-NEXT: s_xor_b32 s2, s3, s4 6238; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 6239; GFX9-NEXT: v_subrev_u32_e32 v0, s2, v0 6240; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 6241; GFX9-NEXT: s_endpgm 6242 %shl.y = shl i32 4096, %y 6243 %r = sdiv i32 %x, %shl.y 6244 store i32 %r, i32 addrspace(1)* %out 6245 ret void 6246} 6247 6248define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 6249; CHECK-LABEL: @sdiv_v2i32_pow2k_denom( 6250; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 6251; CHECK-NEXT: [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096 6252; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 6253; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 6254; CHECK-NEXT: [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4096 6255; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 6256; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 6257; CHECK-NEXT: ret void 6258; 6259; GFX6-LABEL: sdiv_v2i32_pow2k_denom: 6260; GFX6: ; %bb.0: 6261; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 6262; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 6263; GFX6-NEXT: s_mov_b32 s7, 0xf000 6264; GFX6-NEXT: s_mov_b32 s6, -1 6265; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6266; GFX6-NEXT: s_ashr_i32 s2, s0, 31 6267; GFX6-NEXT: s_lshr_b32 s2, s2, 20 6268; GFX6-NEXT: s_ashr_i32 s3, s1, 31 6269; GFX6-NEXT: s_add_i32 s0, s0, s2 6270; GFX6-NEXT: s_lshr_b32 s2, s3, 20 6271; GFX6-NEXT: s_add_i32 s1, s1, s2 6272; GFX6-NEXT: s_ashr_i32 s0, s0, 12 6273; GFX6-NEXT: s_ashr_i32 s1, s1, 12 6274; GFX6-NEXT: v_mov_b32_e32 v0, s0 6275; GFX6-NEXT: v_mov_b32_e32 v1, s1 6276; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 6277; GFX6-NEXT: s_endpgm 6278; GFX9-LABEL: sdiv_v2i32_pow2k_denom: 6279; GFX9: ; %bb.0: 6280; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6281; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 6282; GFX9-NEXT: v_mov_b32_e32 v2, 0 6283; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6284; GFX9-NEXT: s_ashr_i32 s0, s4, 31 6285; GFX9-NEXT: s_ashr_i32 s1, s5, 31 6286; GFX9-NEXT: s_lshr_b32 s0, s0, 20 6287; GFX9-NEXT: s_lshr_b32 s1, s1, 20 6288; GFX9-NEXT: s_add_i32 s0, s4, s0 6289; GFX9-NEXT: s_add_i32 s1, s5, s1 6290; GFX9-NEXT: s_ashr_i32 s0, s0, 12 6291; GFX9-NEXT: s_ashr_i32 s1, s1, 12 6292; GFX9-NEXT: v_mov_b32_e32 v0, s0 6293; GFX9-NEXT: v_mov_b32_e32 v1, s1 6294; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 6295; GFX9-NEXT: s_endpgm 6296 %r = sdiv <2 x i32> %x, <i32 4096, i32 4096> 6297 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 6298 ret void 6299} 6300 6301define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 6302; CHECK-LABEL: @ssdiv_v2i32_mixed_pow2k_denom( 6303; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 6304; CHECK-NEXT: [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096 6305; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 6306; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 6307; CHECK-NEXT: [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4095 6308; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 6309; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 6310; CHECK-NEXT: ret void 6311; 6312; GFX6-LABEL: ssdiv_v2i32_mixed_pow2k_denom: 6313; GFX6: ; %bb.0: 6314; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 6315; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 6316; GFX6-NEXT: v_mov_b32_e32 v0, 0x80080081 6317; GFX6-NEXT: s_mov_b32 s7, 0xf000 6318; GFX6-NEXT: s_mov_b32 s6, -1 6319; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6320; GFX6-NEXT: v_mul_hi_i32 v0, s1, v0 6321; GFX6-NEXT: s_ashr_i32 s2, s0, 31 6322; GFX6-NEXT: s_lshr_b32 s2, s2, 20 6323; GFX6-NEXT: s_add_i32 s0, s0, s2 6324; GFX6-NEXT: v_add_i32_e32 v0, vcc, s1, v0 6325; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0 6326; GFX6-NEXT: v_ashrrev_i32_e32 v0, 11, v0 6327; GFX6-NEXT: s_ashr_i32 s0, s0, 12 6328; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v0 6329; GFX6-NEXT: v_mov_b32_e32 v0, s0 6330; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 6331; GFX6-NEXT: s_endpgm 6332; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom: 6333; GFX9: ; %bb.0: 6334; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6335; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 6336; GFX9-NEXT: v_mov_b32_e32 v2, 0 6337; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6338; GFX9-NEXT: s_ashr_i32 s0, s4, 31 6339; GFX9-NEXT: s_mul_hi_i32 s1, s5, 0x80080081 6340; GFX9-NEXT: s_lshr_b32 s0, s0, 20 6341; GFX9-NEXT: s_add_i32 s1, s1, s5 6342; GFX9-NEXT: s_add_i32 s0, s4, s0 6343; GFX9-NEXT: s_lshr_b32 s4, s1, 31 6344; GFX9-NEXT: s_ashr_i32 s1, s1, 11 6345; GFX9-NEXT: s_ashr_i32 s0, s0, 12 6346; GFX9-NEXT: s_add_i32 s1, s1, s4 6347; GFX9-NEXT: v_mov_b32_e32 v0, s0 6348; GFX9-NEXT: v_mov_b32_e32 v1, s1 6349; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 6350; GFX9-NEXT: s_endpgm 6351 %r = sdiv <2 x i32> %x, <i32 4096, i32 4095> 6352 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 6353 ret void 6354} 6355 6356define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 6357; CHECK-LABEL: @sdiv_v2i32_pow2_shl_denom( 6358; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 6359; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 6360; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 6361; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 6362; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 6363; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 6364; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]] 6365; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]] 6366; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]] 6367; CHECK-NEXT: [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]] 6368; CHECK-NEXT: [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float 6369; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]]) 6370; CHECK-NEXT: [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000 6371; CHECK-NEXT: [[TMP13:%.*]] = fptoui float [[TMP12]] to i32 6372; CHECK-NEXT: [[TMP14:%.*]] = sub i32 0, [[TMP9]] 6373; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]] 6374; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP13]] to i64 6375; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 6376; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 6377; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 6378; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 6379; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 6380; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]] 6381; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP8]] to i64 6382; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[TMP22]] to i64 6383; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]] 6384; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32 6385; CHECK-NEXT: [[TMP27:%.*]] = lshr i64 [[TMP25]], 32 6386; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 6387; CHECK-NEXT: [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]] 6388; CHECK-NEXT: [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]] 6389; CHECK-NEXT: [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]] 6390; CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP28]], 1 6391; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]] 6392; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]] 6393; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]] 6394; CHECK-NEXT: [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]] 6395; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP33]], 1 6396; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]] 6397; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]] 6398; CHECK-NEXT: [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]] 6399; CHECK-NEXT: [[TMP41:%.*]] = insertelement <2 x i32> undef, i32 [[TMP40]], i64 0 6400; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x i32> [[X]], i64 1 6401; CHECK-NEXT: [[TMP43:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 6402; CHECK-NEXT: [[TMP44:%.*]] = ashr i32 [[TMP42]], 31 6403; CHECK-NEXT: [[TMP45:%.*]] = ashr i32 [[TMP43]], 31 6404; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]] 6405; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]] 6406; CHECK-NEXT: [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]] 6407; CHECK-NEXT: [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]] 6408; CHECK-NEXT: [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]] 6409; CHECK-NEXT: [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float 6410; CHECK-NEXT: [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]]) 6411; CHECK-NEXT: [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000 6412; CHECK-NEXT: [[TMP54:%.*]] = fptoui float [[TMP53]] to i32 6413; CHECK-NEXT: [[TMP55:%.*]] = sub i32 0, [[TMP50]] 6414; CHECK-NEXT: [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]] 6415; CHECK-NEXT: [[TMP57:%.*]] = zext i32 [[TMP54]] to i64 6416; CHECK-NEXT: [[TMP58:%.*]] = zext i32 [[TMP56]] to i64 6417; CHECK-NEXT: [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]] 6418; CHECK-NEXT: [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32 6419; CHECK-NEXT: [[TMP61:%.*]] = lshr i64 [[TMP59]], 32 6420; CHECK-NEXT: [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32 6421; CHECK-NEXT: [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]] 6422; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[TMP49]] to i64 6423; CHECK-NEXT: [[TMP65:%.*]] = zext i32 [[TMP63]] to i64 6424; CHECK-NEXT: [[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]] 6425; CHECK-NEXT: [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32 6426; CHECK-NEXT: [[TMP68:%.*]] = lshr i64 [[TMP66]], 32 6427; CHECK-NEXT: [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32 6428; CHECK-NEXT: [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]] 6429; CHECK-NEXT: [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]] 6430; CHECK-NEXT: [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]] 6431; CHECK-NEXT: [[TMP73:%.*]] = add i32 [[TMP69]], 1 6432; CHECK-NEXT: [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]] 6433; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]] 6434; CHECK-NEXT: [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]] 6435; CHECK-NEXT: [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]] 6436; CHECK-NEXT: [[TMP78:%.*]] = add i32 [[TMP74]], 1 6437; CHECK-NEXT: [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]] 6438; CHECK-NEXT: [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]] 6439; CHECK-NEXT: [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]] 6440; CHECK-NEXT: [[TMP82:%.*]] = insertelement <2 x i32> [[TMP41]], i32 [[TMP81]], i64 1 6441; CHECK-NEXT: store <2 x i32> [[TMP82]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 6442; CHECK-NEXT: ret void 6443; 6444; GFX6-LABEL: sdiv_v2i32_pow2_shl_denom: 6445; GFX6: ; %bb.0: 6446; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 6447; GFX6-NEXT: s_movk_i32 s10, 0x1000 6448; GFX6-NEXT: s_mov_b32 s13, 0x4f7ffffe 6449; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 6450; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb 6451; GFX6-NEXT: s_mov_b32 s7, 0xf000 6452; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6453; GFX6-NEXT: s_lshl_b32 s2, s10, s2 6454; GFX6-NEXT: s_ashr_i32 s11, s2, 31 6455; GFX6-NEXT: s_add_i32 s2, s2, s11 6456; GFX6-NEXT: s_xor_b32 s12, s2, s11 6457; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s12 6458; GFX6-NEXT: s_lshl_b32 s0, s10, s3 6459; GFX6-NEXT: s_sub_i32 s3, 0, s12 6460; GFX6-NEXT: s_ashr_i32 s2, s0, 31 6461; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 6462; GFX6-NEXT: s_add_i32 s0, s0, s2 6463; GFX6-NEXT: s_xor_b32 s10, s0, s2 6464; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s10 6465; GFX6-NEXT: v_mul_f32_e32 v0, s13, v0 6466; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 6467; GFX6-NEXT: s_ashr_i32 s1, s8, 31 6468; GFX6-NEXT: s_add_i32 s0, s8, s1 6469; GFX6-NEXT: s_xor_b32 s0, s0, s1 6470; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 6471; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 6472; GFX6-NEXT: s_xor_b32 s3, s1, s11 6473; GFX6-NEXT: s_mov_b32 s6, -1 6474; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 6475; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 6476; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 6477; GFX6-NEXT: v_mul_f32_e32 v1, s13, v2 6478; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 6479; GFX6-NEXT: v_mul_lo_u32 v2, v0, s12 6480; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 6481; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s0, v2 6482; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v2 6483; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] 6484; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s12, v2 6485; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 6486; GFX6-NEXT: s_sub_i32 s0, 0, s10 6487; GFX6-NEXT: v_mul_lo_u32 v3, s0, v1 6488; GFX6-NEXT: s_ashr_i32 s0, s9, 31 6489; GFX6-NEXT: s_add_i32 s1, s9, s0 6490; GFX6-NEXT: s_xor_b32 s1, s1, s0 6491; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 6492; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v0 6493; GFX6-NEXT: s_xor_b32 s2, s0, s2 6494; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 6495; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 6496; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v2 6497; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 6498; GFX6-NEXT: v_xor_b32_e32 v0, s3, v0 6499; GFX6-NEXT: v_mul_lo_u32 v2, v1, s10 6500; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 6501; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s3, v0 6502; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 6503; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2 6504; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 6505; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s10, v2 6506; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 6507; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 6508; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 6509; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 6510; GFX6-NEXT: v_xor_b32_e32 v1, s2, v1 6511; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s2, v1 6512; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 6513; GFX6-NEXT: s_endpgm 6514; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom: 6515; GFX9: ; %bb.0: 6516; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 6517; GFX9-NEXT: s_movk_i32 s8, 0x1000 6518; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 6519; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c 6520; GFX9-NEXT: s_mov_b32 s11, 0x4f7ffffe 6521; GFX9-NEXT: v_mov_b32_e32 v2, 0 6522; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6523; GFX9-NEXT: s_lshl_b32 s2, s8, s2 6524; GFX9-NEXT: s_ashr_i32 s9, s2, 31 6525; GFX9-NEXT: s_add_i32 s2, s2, s9 6526; GFX9-NEXT: s_xor_b32 s10, s2, s9 6527; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s10 6528; GFX9-NEXT: s_lshl_b32 s0, s8, s3 6529; GFX9-NEXT: s_ashr_i32 s1, s0, 31 6530; GFX9-NEXT: s_add_i32 s0, s0, s1 6531; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 6532; GFX9-NEXT: s_xor_b32 s8, s0, s1 6533; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s8 6534; GFX9-NEXT: s_sub_i32 s0, 0, s10 6535; GFX9-NEXT: v_mul_f32_e32 v0, s11, v0 6536; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 6537; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 6538; GFX9-NEXT: s_sub_i32 s3, 0, s8 6539; GFX9-NEXT: v_mul_lo_u32 v3, s0, v0 6540; GFX9-NEXT: v_mul_f32_e32 v1, s11, v1 6541; GFX9-NEXT: s_ashr_i32 s0, s6, 31 6542; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 6543; GFX9-NEXT: v_mul_hi_u32 v3, v0, v3 6544; GFX9-NEXT: s_add_i32 s2, s6, s0 6545; GFX9-NEXT: s_xor_b32 s2, s2, s0 6546; GFX9-NEXT: s_xor_b32 s0, s0, s9 6547; GFX9-NEXT: v_add_u32_e32 v0, v0, v3 6548; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 6549; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1 6550; GFX9-NEXT: s_ashr_i32 s3, s7, 31 6551; GFX9-NEXT: v_mul_lo_u32 v4, v0, s10 6552; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 6553; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 6554; GFX9-NEXT: v_sub_u32_e32 v4, s2, v4 6555; GFX9-NEXT: s_add_i32 s2, s7, s3 6556; GFX9-NEXT: s_xor_b32 s2, s2, s3 6557; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 6558; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v4 6559; GFX9-NEXT: v_mul_hi_u32 v1, s2, v1 6560; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc 6561; GFX9-NEXT: v_subrev_u32_e32 v5, s10, v4 6562; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc 6563; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v4 6564; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 6565; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 6566; GFX9-NEXT: v_mul_lo_u32 v3, v1, s8 6567; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 6568; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 6569; GFX9-NEXT: v_subrev_u32_e32 v0, s0, v0 6570; GFX9-NEXT: v_sub_u32_e32 v3, s2, v3 6571; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 6572; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 6573; GFX9-NEXT: v_subrev_u32_e32 v4, s8, v3 6574; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 6575; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 6576; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 6577; GFX9-NEXT: s_xor_b32 s0, s3, s1 6578; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 6579; GFX9-NEXT: v_xor_b32_e32 v1, s0, v1 6580; GFX9-NEXT: v_subrev_u32_e32 v1, s0, v1 6581; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 6582; GFX9-NEXT: s_endpgm 6583 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 6584 %r = sdiv <2 x i32> %x, %shl.y 6585 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 6586 ret void 6587} 6588 6589define amdgpu_kernel void @srem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 6590; CHECK-LABEL: @srem_i32_oddk_denom( 6591; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], 1235195 6592; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 6593; CHECK-NEXT: ret void 6594; 6595; GFX6-LABEL: srem_i32_oddk_denom: 6596; GFX6: ; %bb.0: 6597; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 6598; GFX6-NEXT: v_mov_b32_e32 v0, 0xd9528441 6599; GFX6-NEXT: s_mov_b32 s2, 0x12d8fb 6600; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 6601; GFX6-NEXT: s_mov_b32 s3, 0xf000 6602; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6603; GFX6-NEXT: v_mul_hi_i32 v0, s4, v0 6604; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0 6605; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0 6606; GFX6-NEXT: v_ashrrev_i32_e32 v0, 20, v0 6607; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 6608; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 6609; GFX6-NEXT: s_mov_b32 s2, -1 6610; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 6611; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 6612; GFX6-NEXT: s_endpgm 6613; GFX9-LABEL: srem_i32_oddk_denom: 6614; GFX9: ; %bb.0: 6615; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6616; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 6617; GFX9-NEXT: v_mov_b32_e32 v0, 0 6618; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6619; GFX9-NEXT: s_mul_hi_i32 s0, s4, 0xd9528441 6620; GFX9-NEXT: s_add_i32 s0, s0, s4 6621; GFX9-NEXT: s_lshr_b32 s1, s0, 31 6622; GFX9-NEXT: s_ashr_i32 s0, s0, 20 6623; GFX9-NEXT: s_add_i32 s0, s0, s1 6624; GFX9-NEXT: s_mul_i32 s0, s0, 0x12d8fb 6625; GFX9-NEXT: s_sub_i32 s0, s4, s0 6626; GFX9-NEXT: v_mov_b32_e32 v1, s0 6627; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 6628; GFX9-NEXT: s_endpgm 6629 %r = srem i32 %x, 1235195 6630 store i32 %r, i32 addrspace(1)* %out 6631 ret void 6632} 6633 6634define amdgpu_kernel void @srem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 6635; CHECK-LABEL: @srem_i32_pow2k_denom( 6636; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], 4096 6637; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 6638; CHECK-NEXT: ret void 6639; 6640; GFX6-LABEL: srem_i32_pow2k_denom: 6641; GFX6: ; %bb.0: 6642; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 6643; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 6644; GFX6-NEXT: s_mov_b32 s7, 0xf000 6645; GFX6-NEXT: s_mov_b32 s6, -1 6646; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6647; GFX6-NEXT: s_ashr_i32 s1, s0, 31 6648; GFX6-NEXT: s_lshr_b32 s1, s1, 20 6649; GFX6-NEXT: s_add_i32 s1, s0, s1 6650; GFX6-NEXT: s_and_b32 s1, s1, 0xfffff000 6651; GFX6-NEXT: s_sub_i32 s0, s0, s1 6652; GFX6-NEXT: v_mov_b32_e32 v0, s0 6653; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 6654; GFX6-NEXT: s_endpgm 6655; GFX9-LABEL: srem_i32_pow2k_denom: 6656; GFX9: ; %bb.0: 6657; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6658; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 6659; GFX9-NEXT: v_mov_b32_e32 v0, 0 6660; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6661; GFX9-NEXT: s_ashr_i32 s0, s4, 31 6662; GFX9-NEXT: s_lshr_b32 s0, s0, 20 6663; GFX9-NEXT: s_add_i32 s0, s4, s0 6664; GFX9-NEXT: s_and_b32 s0, s0, 0xfffff000 6665; GFX9-NEXT: s_sub_i32 s0, s4, s0 6666; GFX9-NEXT: v_mov_b32_e32 v1, s0 6667; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 6668; GFX9-NEXT: s_endpgm 6669 %r = srem i32 %x, 4096 6670 store i32 %r, i32 addrspace(1)* %out 6671 ret void 6672} 6673 6674define amdgpu_kernel void @srem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 6675; CHECK-LABEL: @srem_i32_pow2_shl_denom( 6676; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 6677; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], [[SHL_Y]] 6678; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 6679; CHECK-NEXT: ret void 6680; 6681; GFX6-LABEL: srem_i32_pow2_shl_denom: 6682; GFX6: ; %bb.0: 6683; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 6684; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 6685; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6686; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 6687; GFX6-NEXT: s_ashr_i32 s4, s3, 31 6688; GFX6-NEXT: s_add_i32 s3, s3, s4 6689; GFX6-NEXT: s_xor_b32 s6, s3, s4 6690; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 6691; GFX6-NEXT: s_sub_i32 s3, 0, s6 6692; GFX6-NEXT: s_ashr_i32 s4, s2, 31 6693; GFX6-NEXT: s_add_i32 s2, s2, s4 6694; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 6695; GFX6-NEXT: s_xor_b32 s5, s2, s4 6696; GFX6-NEXT: s_mov_b32 s2, -1 6697; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 6698; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 6699; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 6700; GFX6-NEXT: s_mov_b32 s3, 0xf000 6701; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 6702; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 6703; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0 6704; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 6705; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s5, v0 6706; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s6, v0 6707; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 6708; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 6709; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s6, v0 6710; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 6711; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 6712; GFX6-NEXT: v_xor_b32_e32 v0, s4, v0 6713; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 6714; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 6715; GFX6-NEXT: s_endpgm 6716; GFX9-LABEL: srem_i32_pow2_shl_denom: 6717; GFX9: ; %bb.0: 6718; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 6719; GFX9-NEXT: s_nop 0 6720; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6721; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6722; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 6723; GFX9-NEXT: s_ashr_i32 s4, s3, 31 6724; GFX9-NEXT: s_add_i32 s3, s3, s4 6725; GFX9-NEXT: s_xor_b32 s3, s3, s4 6726; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 6727; GFX9-NEXT: s_sub_i32 s4, 0, s3 6728; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 6729; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 6730; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 6731; GFX9-NEXT: v_mul_lo_u32 v1, s4, v0 6732; GFX9-NEXT: s_ashr_i32 s4, s2, 31 6733; GFX9-NEXT: s_add_i32 s2, s2, s4 6734; GFX9-NEXT: s_xor_b32 s2, s2, s4 6735; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 6736; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 6737; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 6738; GFX9-NEXT: v_mov_b32_e32 v1, 0 6739; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 6740; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 6741; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v0 6742; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 6743; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6744; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v0 6745; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 6746; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6747; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 6748; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 6749; GFX9-NEXT: global_store_dword v1, v0, s[0:1] 6750; GFX9-NEXT: s_endpgm 6751 %shl.y = shl i32 4096, %y 6752 %r = srem i32 %x, %shl.y 6753 store i32 %r, i32 addrspace(1)* %out 6754 ret void 6755} 6756 6757define amdgpu_kernel void @srem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 6758; CHECK-LABEL: @srem_v2i32_pow2k_denom( 6759; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 6760; CHECK-NEXT: [[TMP2:%.*]] = srem i32 [[TMP1]], 4096 6761; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 6762; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 6763; CHECK-NEXT: [[TMP5:%.*]] = srem i32 [[TMP4]], 4096 6764; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 6765; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 6766; CHECK-NEXT: ret void 6767; 6768; GFX6-LABEL: srem_v2i32_pow2k_denom: 6769; GFX6: ; %bb.0: 6770; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 6771; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 6772; GFX6-NEXT: s_movk_i32 s2, 0xf000 6773; GFX6-NEXT: s_mov_b32 s7, 0xf000 6774; GFX6-NEXT: s_mov_b32 s6, -1 6775; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6776; GFX6-NEXT: s_ashr_i32 s3, s0, 31 6777; GFX6-NEXT: s_lshr_b32 s3, s3, 20 6778; GFX6-NEXT: s_add_i32 s3, s0, s3 6779; GFX6-NEXT: s_and_b32 s3, s3, s2 6780; GFX6-NEXT: s_sub_i32 s0, s0, s3 6781; GFX6-NEXT: s_ashr_i32 s3, s1, 31 6782; GFX6-NEXT: s_lshr_b32 s3, s3, 20 6783; GFX6-NEXT: s_add_i32 s3, s1, s3 6784; GFX6-NEXT: s_and_b32 s2, s3, s2 6785; GFX6-NEXT: s_sub_i32 s1, s1, s2 6786; GFX6-NEXT: v_mov_b32_e32 v0, s0 6787; GFX6-NEXT: v_mov_b32_e32 v1, s1 6788; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 6789; GFX6-NEXT: s_endpgm 6790; GFX9-LABEL: srem_v2i32_pow2k_denom: 6791; GFX9: ; %bb.0: 6792; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6793; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 6794; GFX9-NEXT: s_movk_i32 s6, 0xf000 6795; GFX9-NEXT: v_mov_b32_e32 v2, 0 6796; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6797; GFX9-NEXT: s_ashr_i32 s0, s4, 31 6798; GFX9-NEXT: s_ashr_i32 s1, s5, 31 6799; GFX9-NEXT: s_lshr_b32 s0, s0, 20 6800; GFX9-NEXT: s_lshr_b32 s1, s1, 20 6801; GFX9-NEXT: s_add_i32 s0, s4, s0 6802; GFX9-NEXT: s_add_i32 s1, s5, s1 6803; GFX9-NEXT: s_and_b32 s0, s0, s6 6804; GFX9-NEXT: s_and_b32 s1, s1, s6 6805; GFX9-NEXT: s_sub_i32 s0, s4, s0 6806; GFX9-NEXT: s_sub_i32 s1, s5, s1 6807; GFX9-NEXT: v_mov_b32_e32 v0, s0 6808; GFX9-NEXT: v_mov_b32_e32 v1, s1 6809; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 6810; GFX9-NEXT: s_endpgm 6811 %r = srem <2 x i32> %x, <i32 4096, i32 4096> 6812 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 6813 ret void 6814} 6815 6816define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 6817; CHECK-LABEL: @srem_v2i32_pow2_shl_denom( 6818; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 6819; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 6820; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 6821; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 6822; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 6823; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]] 6824; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]] 6825; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]] 6826; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]] 6827; CHECK-NEXT: [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float 6828; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 6829; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000 6830; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP11]] to i32 6831; CHECK-NEXT: [[TMP13:%.*]] = sub i32 0, [[TMP8]] 6832; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]] 6833; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP12]] to i64 6834; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 6835; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]] 6836; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 6837; CHECK-NEXT: [[TMP19:%.*]] = lshr i64 [[TMP17]], 32 6838; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 6839; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]] 6840; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP7]] to i64 6841; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP21]] to i64 6842; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]] 6843; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32 6844; CHECK-NEXT: [[TMP26:%.*]] = lshr i64 [[TMP24]], 32 6845; CHECK-NEXT: [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32 6846; CHECK-NEXT: [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]] 6847; CHECK-NEXT: [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]] 6848; CHECK-NEXT: [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]] 6849; CHECK-NEXT: [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]] 6850; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP30]], i32 [[TMP31]], i32 [[TMP29]] 6851; CHECK-NEXT: [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]] 6852; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]] 6853; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]] 6854; CHECK-NEXT: [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]] 6855; CHECK-NEXT: [[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]] 6856; CHECK-NEXT: [[TMP38:%.*]] = insertelement <2 x i32> undef, i32 [[TMP37]], i64 0 6857; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i32> [[X]], i64 1 6858; CHECK-NEXT: [[TMP40:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 6859; CHECK-NEXT: [[TMP41:%.*]] = ashr i32 [[TMP39]], 31 6860; CHECK-NEXT: [[TMP42:%.*]] = ashr i32 [[TMP40]], 31 6861; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]] 6862; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]] 6863; CHECK-NEXT: [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]] 6864; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]] 6865; CHECK-NEXT: [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float 6866; CHECK-NEXT: [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]]) 6867; CHECK-NEXT: [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000 6868; CHECK-NEXT: [[TMP50:%.*]] = fptoui float [[TMP49]] to i32 6869; CHECK-NEXT: [[TMP51:%.*]] = sub i32 0, [[TMP46]] 6870; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]] 6871; CHECK-NEXT: [[TMP53:%.*]] = zext i32 [[TMP50]] to i64 6872; CHECK-NEXT: [[TMP54:%.*]] = zext i32 [[TMP52]] to i64 6873; CHECK-NEXT: [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]] 6874; CHECK-NEXT: [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32 6875; CHECK-NEXT: [[TMP57:%.*]] = lshr i64 [[TMP55]], 32 6876; CHECK-NEXT: [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32 6877; CHECK-NEXT: [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]] 6878; CHECK-NEXT: [[TMP60:%.*]] = zext i32 [[TMP45]] to i64 6879; CHECK-NEXT: [[TMP61:%.*]] = zext i32 [[TMP59]] to i64 6880; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]] 6881; CHECK-NEXT: [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32 6882; CHECK-NEXT: [[TMP64:%.*]] = lshr i64 [[TMP62]], 32 6883; CHECK-NEXT: [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32 6884; CHECK-NEXT: [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]] 6885; CHECK-NEXT: [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]] 6886; CHECK-NEXT: [[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]] 6887; CHECK-NEXT: [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]] 6888; CHECK-NEXT: [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]] 6889; CHECK-NEXT: [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]] 6890; CHECK-NEXT: [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]] 6891; CHECK-NEXT: [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]] 6892; CHECK-NEXT: [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]] 6893; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]] 6894; CHECK-NEXT: [[TMP76:%.*]] = insertelement <2 x i32> [[TMP38]], i32 [[TMP75]], i64 1 6895; CHECK-NEXT: store <2 x i32> [[TMP76]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 6896; CHECK-NEXT: ret void 6897; 6898; GFX6-LABEL: srem_v2i32_pow2_shl_denom: 6899; GFX6: ; %bb.0: 6900; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 6901; GFX6-NEXT: s_movk_i32 s6, 0x1000 6902; GFX6-NEXT: s_mov_b32 s10, 0x4f7ffffe 6903; GFX6-NEXT: s_mov_b32 s7, 0xf000 6904; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6905; GFX6-NEXT: s_lshl_b32 s2, s6, s2 6906; GFX6-NEXT: s_ashr_i32 s4, s2, 31 6907; GFX6-NEXT: s_add_i32 s2, s2, s4 6908; GFX6-NEXT: s_xor_b32 s9, s2, s4 6909; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 6910; GFX6-NEXT: s_lshl_b32 s2, s6, s3 6911; GFX6-NEXT: s_ashr_i32 s6, s2, 31 6912; GFX6-NEXT: s_add_i32 s2, s2, s6 6913; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 6914; GFX6-NEXT: s_sub_i32 s8, 0, s9 6915; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 6916; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 6917; GFX6-NEXT: v_mul_f32_e32 v0, s10, v0 6918; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 6919; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6920; GFX6-NEXT: s_ashr_i32 s3, s0, 31 6921; GFX6-NEXT: s_add_i32 s0, s0, s3 6922; GFX6-NEXT: v_mul_lo_u32 v1, s8, v0 6923; GFX6-NEXT: s_xor_b32 s8, s2, s6 6924; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s8 6925; GFX6-NEXT: s_xor_b32 s0, s0, s3 6926; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 6927; GFX6-NEXT: s_sub_i32 s2, 0, s8 6928; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 6929; GFX6-NEXT: s_mov_b32 s6, -1 6930; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 6931; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 6932; GFX6-NEXT: v_mul_f32_e32 v1, s10, v2 6933; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 6934; GFX6-NEXT: v_mul_lo_u32 v0, v0, s9 6935; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 6936; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 6937; GFX6-NEXT: s_ashr_i32 s0, s1, 31 6938; GFX6-NEXT: v_mul_hi_u32 v2, v1, v2 6939; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s9, v0 6940; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v0 6941; GFX6-NEXT: s_add_i32 s1, s1, s0 6942; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 6943; GFX6-NEXT: s_xor_b32 s1, s1, s0 6944; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 6945; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 6946; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s9, v0 6947; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v0 6948; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 6949; GFX6-NEXT: v_mul_lo_u32 v1, v1, s8 6950; GFX6-NEXT: v_xor_b32_e32 v0, s3, v0 6951; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s3, v0 6952; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 6953; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s8, v1 6954; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v1 6955; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 6956; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s8, v1 6957; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v1 6958; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 6959; GFX6-NEXT: v_xor_b32_e32 v1, s0, v1 6960; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s0, v1 6961; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 6962; GFX6-NEXT: s_endpgm 6963; GFX9-LABEL: srem_v2i32_pow2_shl_denom: 6964; GFX9: ; %bb.0: 6965; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6966; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 6967; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 6968; GFX9-NEXT: s_movk_i32 s8, 0x1000 6969; GFX9-NEXT: s_mov_b32 s9, 0x4f7ffffe 6970; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6971; GFX9-NEXT: s_lshl_b32 s0, s8, s6 6972; GFX9-NEXT: s_ashr_i32 s1, s0, 31 6973; GFX9-NEXT: s_add_i32 s0, s0, s1 6974; GFX9-NEXT: s_xor_b32 s0, s0, s1 6975; GFX9-NEXT: s_lshl_b32 s1, s8, s7 6976; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 6977; GFX9-NEXT: s_ashr_i32 s6, s1, 31 6978; GFX9-NEXT: s_add_i32 s1, s1, s6 6979; GFX9-NEXT: s_xor_b32 s1, s1, s6 6980; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s1 6981; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 6982; GFX9-NEXT: s_sub_i32 s7, 0, s0 6983; GFX9-NEXT: s_ashr_i32 s6, s4, 31 6984; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 6985; GFX9-NEXT: v_mul_f32_e32 v0, s9, v0 6986; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 6987; GFX9-NEXT: s_add_i32 s4, s4, s6 6988; GFX9-NEXT: v_mul_f32_e32 v1, s9, v1 6989; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 6990; GFX9-NEXT: v_mul_lo_u32 v2, s7, v0 6991; GFX9-NEXT: s_sub_i32 s7, 0, s1 6992; GFX9-NEXT: s_xor_b32 s4, s4, s6 6993; GFX9-NEXT: v_mul_lo_u32 v3, s7, v1 6994; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 6995; GFX9-NEXT: s_ashr_i32 s7, s5, 31 6996; GFX9-NEXT: s_add_i32 s5, s5, s7 6997; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 6998; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 6999; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 7000; GFX9-NEXT: s_xor_b32 s5, s5, s7 7001; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 7002; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 7003; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0 7004; GFX9-NEXT: v_mov_b32_e32 v2, 0 7005; GFX9-NEXT: v_mul_lo_u32 v1, v1, s1 7006; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 7007; GFX9-NEXT: v_subrev_u32_e32 v3, s0, v0 7008; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v0 7009; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 7010; GFX9-NEXT: v_subrev_u32_e32 v3, s0, v0 7011; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v0 7012; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 7013; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 7014; GFX9-NEXT: v_subrev_u32_e32 v3, s1, v1 7015; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v1 7016; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 7017; GFX9-NEXT: v_subrev_u32_e32 v3, s1, v1 7018; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v1 7019; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 7020; GFX9-NEXT: v_xor_b32_e32 v0, s6, v0 7021; GFX9-NEXT: v_xor_b32_e32 v1, s7, v1 7022; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 7023; GFX9-NEXT: v_subrev_u32_e32 v1, s7, v1 7024; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 7025; GFX9-NEXT: s_endpgm 7026 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 7027 %r = srem <2 x i32> %x, %shl.y 7028 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 7029 ret void 7030} 7031 7032define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 7033; CHECK-LABEL: @udiv_i64_oddk_denom( 7034; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], 1235195949943 7035; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 7036; CHECK-NEXT: ret void 7037; 7038; GFX6-LABEL: udiv_i64_oddk_denom: 7039; GFX6: ; %bb.0: 7040; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f176a73 7041; GFX6-NEXT: v_mov_b32_e32 v1, 0x4f800000 7042; GFX6-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 7043; GFX6-NEXT: v_rcp_f32_e32 v0, v0 7044; GFX6-NEXT: s_movk_i32 s2, 0xfee0 7045; GFX6-NEXT: s_mov_b32 s3, 0x68958c89 7046; GFX6-NEXT: v_mov_b32_e32 v8, 0 7047; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 7048; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 7049; GFX6-NEXT: v_trunc_f32_e32 v1, v1 7050; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 7051; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 7052; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 7053; GFX6-NEXT: v_mov_b32_e32 v7, 0 7054; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 7055; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 7056; GFX6-NEXT: v_mul_hi_u32 v3, v0, s3 7057; GFX6-NEXT: v_mul_lo_u32 v4, v1, s3 7058; GFX6-NEXT: s_mov_b32 s11, 0xf000 7059; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7060; GFX6-NEXT: s_mov_b32 s8, s4 7061; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7062; GFX6-NEXT: v_mul_lo_u32 v3, v0, s3 7063; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 7064; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 7065; GFX6-NEXT: v_mul_hi_u32 v4, v0, v2 7066; GFX6-NEXT: v_mul_hi_u32 v6, v0, v3 7067; GFX6-NEXT: v_mul_hi_u32 v9, v1, v2 7068; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 7069; GFX6-NEXT: s_movk_i32 s4, 0x11e 7070; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 7071; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 7072; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 7073; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc 7074; GFX6-NEXT: s_mov_b32 s10, -1 7075; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 7076; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc 7077; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc 7078; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7079; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 7080; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 7081; GFX6-NEXT: v_mul_lo_u32 v4, v0, s2 7082; GFX6-NEXT: v_mul_hi_u32 v5, v0, s3 7083; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 7084; GFX6-NEXT: v_mul_lo_u32 v6, v2, s3 7085; GFX6-NEXT: s_mov_b32 s2, 0x976a7377 7086; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 7087; GFX6-NEXT: v_mul_lo_u32 v5, v0, s3 7088; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 7089; GFX6-NEXT: v_mul_lo_u32 v6, v0, v4 7090; GFX6-NEXT: v_mul_hi_u32 v10, v0, v4 7091; GFX6-NEXT: v_mul_hi_u32 v9, v0, v5 7092; GFX6-NEXT: v_mul_hi_u32 v11, v2, v4 7093; GFX6-NEXT: s_movk_i32 s3, 0x11f 7094; GFX6-NEXT: s_mov_b32 s9, s5 7095; GFX6-NEXT: v_add_i32_e32 v6, vcc, v9, v6 7096; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc 7097; GFX6-NEXT: v_mul_lo_u32 v10, v2, v5 7098; GFX6-NEXT: v_mul_hi_u32 v5, v2, v5 7099; GFX6-NEXT: v_mul_lo_u32 v2, v2, v4 7100; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v10 7101; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v5, vcc 7102; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v11, v7, vcc 7103; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 7104; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc 7105; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 7106; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] 7107; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 7108; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 7109; GFX6-NEXT: v_mul_lo_u32 v2, s6, v1 7110; GFX6-NEXT: v_mul_hi_u32 v3, s6, v0 7111; GFX6-NEXT: v_mul_hi_u32 v4, s6, v1 7112; GFX6-NEXT: v_mul_hi_u32 v5, s7, v1 7113; GFX6-NEXT: v_mul_lo_u32 v1, s7, v1 7114; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7115; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 7116; GFX6-NEXT: v_mul_lo_u32 v4, s7, v0 7117; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 7118; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 7119; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 7120; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc 7121; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 7122; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc 7123; GFX6-NEXT: v_mul_lo_u32 v2, v0, s3 7124; GFX6-NEXT: v_mul_hi_u32 v3, v0, s2 7125; GFX6-NEXT: v_mul_lo_u32 v4, v1, s2 7126; GFX6-NEXT: v_mov_b32_e32 v5, s3 7127; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7128; GFX6-NEXT: v_mul_lo_u32 v3, v0, s2 7129; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 7130; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s7, v2 7131; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s6, v3 7132; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc 7133; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s2, v3 7134; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] 7135; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s4, v4 7136; GFX6-NEXT: s_mov_b32 s2, 0x976a7376 7137; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 7138; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s2, v5 7139; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 7140; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v4 7141; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] 7142; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 7143; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] 7144; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 1, v0 7145; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] 7146; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 7147; GFX6-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] 7148; GFX6-NEXT: v_mov_b32_e32 v6, s7 7149; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc 7150; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s4, v2 7151; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 7152; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s2, v3 7153; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 7154; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s3, v2 7155; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc 7156; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 7157; GFX6-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] 7158; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 7159; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 7160; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 7161; GFX6-NEXT: s_endpgm 7162; GFX9-LABEL: udiv_i64_oddk_denom: 7163; GFX9: ; %bb.0: 7164; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f176a73 7165; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000 7166; GFX9-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 7167; GFX9-NEXT: v_rcp_f32_e32 v0, v0 7168; GFX9-NEXT: s_movk_i32 s4, 0xfee0 7169; GFX9-NEXT: s_mov_b32 s5, 0x68958c89 7170; GFX9-NEXT: v_mov_b32_e32 v6, 0 7171; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 7172; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 7173; GFX9-NEXT: v_trunc_f32_e32 v1, v1 7174; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 7175; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 7176; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 7177; GFX9-NEXT: s_movk_i32 s8, 0x11f 7178; GFX9-NEXT: s_mov_b32 s9, 0x976a7376 7179; GFX9-NEXT: v_mul_lo_u32 v2, v0, s4 7180; GFX9-NEXT: v_mul_hi_u32 v3, v0, s5 7181; GFX9-NEXT: v_mul_lo_u32 v5, v1, s5 7182; GFX9-NEXT: v_mul_lo_u32 v4, v0, s5 7183; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 7184; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 7185; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 7186; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 7187; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 7188; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 7189; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 7190; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 7191; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc 7192; GFX9-NEXT: v_mul_lo_u32 v7, v1, v4 7193; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 7194; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 7195; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc 7196; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v6, vcc 7197; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 7198; GFX9-NEXT: v_mov_b32_e32 v5, 0 7199; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 7200; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc 7201; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3] 7202; GFX9-NEXT: v_mul_lo_u32 v4, v0, s4 7203; GFX9-NEXT: v_mul_hi_u32 v7, v0, s5 7204; GFX9-NEXT: v_mul_lo_u32 v8, v2, s5 7205; GFX9-NEXT: v_mul_lo_u32 v9, v0, s5 7206; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 7207; GFX9-NEXT: v_add_u32_e32 v4, v7, v4 7208; GFX9-NEXT: v_add_u32_e32 v4, v4, v8 7209; GFX9-NEXT: v_mul_lo_u32 v7, v0, v4 7210; GFX9-NEXT: v_mul_hi_u32 v8, v0, v9 7211; GFX9-NEXT: v_mul_hi_u32 v10, v0, v4 7212; GFX9-NEXT: v_mul_hi_u32 v11, v2, v4 7213; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 7214; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 7215; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v10, vcc 7216; GFX9-NEXT: v_mul_lo_u32 v10, v2, v9 7217; GFX9-NEXT: v_mul_hi_u32 v9, v2, v9 7218; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 7219; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v10 7220; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v8, v9, vcc 7221; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v11, v6, vcc 7222; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v7, v2 7223; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v5, v4, vcc 7224; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3] 7225; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 7226; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 7227; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7228; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 7229; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 7230; GFX9-NEXT: v_mul_hi_u32 v4, s6, v1 7231; GFX9-NEXT: v_mul_hi_u32 v7, s7, v1 7232; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 7233; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 7234; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 7235; GFX9-NEXT: v_mul_lo_u32 v4, s7, v0 7236; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 7237; GFX9-NEXT: s_mov_b32 s2, 0x976a7377 7238; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 7239; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 7240; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v7, v6, vcc 7241; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 7242; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v2, vcc 7243; GFX9-NEXT: v_mul_lo_u32 v2, v0, s8 7244; GFX9-NEXT: v_mul_hi_u32 v3, v0, s2 7245; GFX9-NEXT: v_mul_lo_u32 v4, v1, s2 7246; GFX9-NEXT: v_mov_b32_e32 v5, s8 7247; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 7248; GFX9-NEXT: v_mul_lo_u32 v3, v0, s2 7249; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 7250; GFX9-NEXT: v_sub_u32_e32 v4, s7, v2 7251; GFX9-NEXT: v_sub_co_u32_e64 v3, s[0:1], s6, v3 7252; GFX9-NEXT: v_subb_co_u32_e64 v4, vcc, v4, v5, s[0:1] 7253; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s2, v3 7254; GFX9-NEXT: v_subbrev_co_u32_e32 v4, vcc, 0, v4, vcc 7255; GFX9-NEXT: s_movk_i32 s6, 0x11e 7256; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s6, v4 7257; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 7258; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s9, v5 7259; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 7260; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s8, v4 7261; GFX9-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc 7262; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 2, v0 7263; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc 7264; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 1, v0 7265; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc 7266; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, v4 7267; GFX9-NEXT: v_cndmask_b32_e64 v4, v9, v7, s[2:3] 7268; GFX9-NEXT: v_mov_b32_e32 v7, s7 7269; GFX9-NEXT: v_subb_co_u32_e64 v2, vcc, v7, v2, s[0:1] 7270; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s6, v2 7271; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 7272; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s9, v3 7273; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 7274; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s8, v2 7275; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc 7276; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 7277; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, v5, s[2:3] 7278; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 7279; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 7280; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[4:5] 7281; GFX9-NEXT: s_endpgm 7282 %r = udiv i64 %x, 1235195949943 7283 store i64 %r, i64 addrspace(1)* %out 7284 ret void 7285} 7286 7287define amdgpu_kernel void @udiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 7288; CHECK-LABEL: @udiv_i64_pow2k_denom( 7289; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], 4096 7290; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 7291; CHECK-NEXT: ret void 7292; 7293; GFX6-LABEL: udiv_i64_pow2k_denom: 7294; GFX6: ; %bb.0: 7295; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 7296; GFX6-NEXT: s_mov_b32 s7, 0xf000 7297; GFX6-NEXT: s_mov_b32 s6, -1 7298; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7299; GFX6-NEXT: s_mov_b32 s4, s0 7300; GFX6-NEXT: s_mov_b32 s5, s1 7301; GFX6-NEXT: s_lshr_b64 s[0:1], s[2:3], 12 7302; GFX6-NEXT: v_mov_b32_e32 v0, s0 7303; GFX6-NEXT: v_mov_b32_e32 v1, s1 7304; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 7305; GFX6-NEXT: s_endpgm 7306; GFX9-LABEL: udiv_i64_pow2k_denom: 7307; GFX9: ; %bb.0: 7308; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 7309; GFX9-NEXT: v_mov_b32_e32 v2, 0 7310; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7311; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 12 7312; GFX9-NEXT: v_mov_b32_e32 v0, s2 7313; GFX9-NEXT: v_mov_b32_e32 v1, s3 7314; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 7315; GFX9-NEXT: s_endpgm 7316 %r = udiv i64 %x, 4096 7317 store i64 %r, i64 addrspace(1)* %out 7318 ret void 7319} 7320 7321define amdgpu_kernel void @udiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 7322; CHECK-LABEL: @udiv_i64_pow2_shl_denom( 7323; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 7324; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], [[SHL_Y]] 7325; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 7326; CHECK-NEXT: ret void 7327; 7328; GFX6-LABEL: udiv_i64_pow2_shl_denom: 7329; GFX6: ; %bb.0: 7330; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 7331; GFX6-NEXT: s_load_dword s8, s[0:1], 0xd 7332; GFX6-NEXT: s_mov_b32 s3, 0xf000 7333; GFX6-NEXT: s_mov_b32 s2, -1 7334; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7335; GFX6-NEXT: s_mov_b32 s0, s4 7336; GFX6-NEXT: s_add_i32 s8, s8, 12 7337; GFX6-NEXT: s_mov_b32 s1, s5 7338; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], s8 7339; GFX6-NEXT: v_mov_b32_e32 v0, s4 7340; GFX6-NEXT: v_mov_b32_e32 v1, s5 7341; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 7342; GFX6-NEXT: s_endpgm 7343; GFX9-LABEL: udiv_i64_pow2_shl_denom: 7344; GFX9: ; %bb.0: 7345; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 7346; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 7347; GFX9-NEXT: v_mov_b32_e32 v2, 0 7348; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7349; GFX9-NEXT: s_add_i32 s2, s2, 12 7350; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], s2 7351; GFX9-NEXT: v_mov_b32_e32 v0, s0 7352; GFX9-NEXT: v_mov_b32_e32 v1, s1 7353; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 7354; GFX9-NEXT: s_endpgm 7355 %shl.y = shl i64 4096, %y 7356 %r = udiv i64 %x, %shl.y 7357 store i64 %r, i64 addrspace(1)* %out 7358 ret void 7359} 7360 7361define amdgpu_kernel void @udiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 7362; CHECK-LABEL: @udiv_v2i64_pow2k_denom( 7363; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 7364; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096 7365; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 7366; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 7367; CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[TMP4]], 4096 7368; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 7369; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 7370; CHECK-NEXT: ret void 7371; 7372; GFX6-LABEL: udiv_v2i64_pow2k_denom: 7373; GFX6: ; %bb.0: 7374; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 7375; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 7376; GFX6-NEXT: s_mov_b32 s7, 0xf000 7377; GFX6-NEXT: s_mov_b32 s6, -1 7378; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7379; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], 12 7380; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], 12 7381; GFX6-NEXT: v_mov_b32_e32 v0, s0 7382; GFX6-NEXT: v_mov_b32_e32 v1, s1 7383; GFX6-NEXT: v_mov_b32_e32 v2, s2 7384; GFX6-NEXT: v_mov_b32_e32 v3, s3 7385; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 7386; GFX6-NEXT: s_endpgm 7387; GFX9-LABEL: udiv_v2i64_pow2k_denom: 7388; GFX9: ; %bb.0: 7389; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7390; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 7391; GFX9-NEXT: v_mov_b32_e32 v4, 0 7392; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7393; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 12 7394; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], 12 7395; GFX9-NEXT: v_mov_b32_e32 v0, s0 7396; GFX9-NEXT: v_mov_b32_e32 v1, s1 7397; GFX9-NEXT: v_mov_b32_e32 v2, s4 7398; GFX9-NEXT: v_mov_b32_e32 v3, s5 7399; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 7400; GFX9-NEXT: s_endpgm 7401 %r = udiv <2 x i64> %x, <i64 4096, i64 4096> 7402 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 7403 ret void 7404} 7405 7406define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 7407; CHECK-LABEL: @udiv_v2i64_mixed_pow2k_denom( 7408; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 7409; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096 7410; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 7411; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 7412; CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[TMP4]], 4095 7413; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 7414; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 7415; CHECK-NEXT: ret void 7416; 7417; GFX6-LABEL: udiv_v2i64_mixed_pow2k_denom: 7418; GFX6: ; %bb.0: 7419; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f800000 7420; GFX6-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 7421; GFX6-NEXT: v_rcp_f32_e32 v0, v0 7422; GFX6-NEXT: s_movk_i32 s6, 0xf001 7423; GFX6-NEXT: v_mov_b32_e32 v7, 0 7424; GFX6-NEXT: v_mov_b32_e32 v2, 0 7425; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 7426; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 7427; GFX6-NEXT: v_trunc_f32_e32 v1, v1 7428; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 7429; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 7430; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 7431; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 7432; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 7433; GFX6-NEXT: s_movk_i32 s0, 0xfff 7434; GFX6-NEXT: v_mul_hi_u32 v3, v0, s6 7435; GFX6-NEXT: v_mul_lo_u32 v5, v1, s6 7436; GFX6-NEXT: v_mul_lo_u32 v4, v0, s6 7437; GFX6-NEXT: s_mov_b32 s7, 0xf000 7438; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, v0, v3 7439; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 7440; GFX6-NEXT: v_mul_hi_u32 v6, v0, v4 7441; GFX6-NEXT: v_mul_lo_u32 v5, v0, v3 7442; GFX6-NEXT: v_mul_hi_u32 v8, v0, v3 7443; GFX6-NEXT: v_mul_hi_u32 v9, v1, v3 7444; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 7445; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 7446; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v7, v8, vcc 7447; GFX6-NEXT: v_mul_lo_u32 v8, v1, v4 7448; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 7449; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v8 7450; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v6, v4, vcc 7451; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v2, vcc 7452; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 7453; GFX6-NEXT: v_add_i32_e64 v0, s[2:3], v0, v3 7454; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v7, v5, vcc 7455; GFX6-NEXT: v_mul_hi_u32 v5, v0, s6 7456; GFX6-NEXT: v_addc_u32_e64 v3, vcc, v1, v4, s[2:3] 7457; GFX6-NEXT: v_mul_lo_u32 v6, v3, s6 7458; GFX6-NEXT: v_mul_lo_u32 v8, v0, s6 7459; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, v0, v5 7460; GFX6-NEXT: s_mov_b32 s6, -1 7461; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 7462; GFX6-NEXT: v_mul_lo_u32 v6, v0, v5 7463; GFX6-NEXT: v_mul_hi_u32 v9, v0, v8 7464; GFX6-NEXT: v_mul_hi_u32 v10, v0, v5 7465; GFX6-NEXT: v_mul_hi_u32 v11, v3, v5 7466; GFX6-NEXT: v_add_i32_e32 v6, vcc, v9, v6 7467; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v7, v10, vcc 7468; GFX6-NEXT: v_mul_lo_u32 v10, v3, v8 7469; GFX6-NEXT: v_mul_hi_u32 v8, v3, v8 7470; GFX6-NEXT: v_mul_lo_u32 v3, v3, v5 7471; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v10 7472; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v9, v8, vcc 7473; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v11, v2, vcc 7474; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 7475; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc 7476; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 7477; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[2:3] 7478; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v3 7479; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 7480; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7481; GFX6-NEXT: v_mul_lo_u32 v3, s10, v1 7482; GFX6-NEXT: v_mul_hi_u32 v4, s10, v0 7483; GFX6-NEXT: v_mul_hi_u32 v5, s10, v1 7484; GFX6-NEXT: v_mul_hi_u32 v6, s11, v1 7485; GFX6-NEXT: v_mul_lo_u32 v1, s11, v1 7486; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 7487; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v7, v5, vcc 7488; GFX6-NEXT: v_mul_lo_u32 v5, s11, v0 7489; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0 7490; GFX6-NEXT: s_lshr_b64 s[2:3], s[8:9], 12 7491; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 7492; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v4, v0, vcc 7493; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v6, v2, vcc 7494; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 7495; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v7, v2, vcc 7496; GFX6-NEXT: v_mul_lo_u32 v2, v1, s0 7497; GFX6-NEXT: v_mul_hi_u32 v3, v0, s0 7498; GFX6-NEXT: v_mul_lo_u32 v4, v0, s0 7499; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7500; GFX6-NEXT: v_mov_b32_e32 v3, s11 7501; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s10, v4 7502; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v3, v2, vcc 7503; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s0, v4 7504; GFX6-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v2, vcc 7505; GFX6-NEXT: s_movk_i32 s0, 0xffe 7506; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s0, v3 7507; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 7508; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 7509; GFX6-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc 7510; GFX6-NEXT: v_add_i32_e32 v5, vcc, 2, v0 7511; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc 7512; GFX6-NEXT: v_add_i32_e32 v7, vcc, 1, v0 7513; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v4 7514; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc 7515; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] 7516; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 7517; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 7518; GFX6-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[0:1] 7519; GFX6-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc 7520; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 7521; GFX6-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[0:1] 7522; GFX6-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc 7523; GFX6-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[0:1] 7524; GFX6-NEXT: v_mov_b32_e32 v0, s2 7525; GFX6-NEXT: v_mov_b32_e32 v1, s3 7526; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 7527; GFX6-NEXT: s_endpgm 7528; GFX9-LABEL: udiv_v2i64_mixed_pow2k_denom: 7529; GFX9: ; %bb.0: 7530; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f800000 7531; GFX9-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 7532; GFX9-NEXT: v_rcp_f32_e32 v0, v0 7533; GFX9-NEXT: s_movk_i32 s4, 0xf001 7534; GFX9-NEXT: v_mov_b32_e32 v7, 0 7535; GFX9-NEXT: v_mov_b32_e32 v5, 0 7536; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 7537; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 7538; GFX9-NEXT: v_trunc_f32_e32 v1, v1 7539; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 7540; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 7541; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 7542; GFX9-NEXT: s_movk_i32 s8, 0xfff 7543; GFX9-NEXT: v_mul_hi_u32 v2, v0, s4 7544; GFX9-NEXT: v_mul_lo_u32 v4, v1, s4 7545; GFX9-NEXT: v_mul_lo_u32 v3, v0, s4 7546; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 7547; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 7548; GFX9-NEXT: v_mul_hi_u32 v6, v0, v3 7549; GFX9-NEXT: v_mul_lo_u32 v4, v0, v2 7550; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 7551; GFX9-NEXT: v_mul_hi_u32 v9, v1, v2 7552; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 7553; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v6, v4 7554; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v8, vcc 7555; GFX9-NEXT: v_mul_lo_u32 v8, v1, v3 7556; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 7557; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 7558; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v3, vcc 7559; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc 7560; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 7561; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 7562; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v4, vcc 7563; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3] 7564; GFX9-NEXT: v_mul_hi_u32 v4, v0, s4 7565; GFX9-NEXT: v_mul_lo_u32 v6, v2, s4 7566; GFX9-NEXT: v_mul_lo_u32 v8, v0, s4 7567; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 7568; GFX9-NEXT: v_sub_u32_e32 v4, v4, v0 7569; GFX9-NEXT: v_add_u32_e32 v4, v4, v6 7570; GFX9-NEXT: v_mul_lo_u32 v6, v0, v4 7571; GFX9-NEXT: v_mul_hi_u32 v9, v0, v8 7572; GFX9-NEXT: v_mul_hi_u32 v10, v0, v4 7573; GFX9-NEXT: v_mul_hi_u32 v11, v2, v4 7574; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 7575; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v9, v6 7576; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v7, v10, vcc 7577; GFX9-NEXT: v_mul_lo_u32 v10, v2, v8 7578; GFX9-NEXT: v_mul_hi_u32 v8, v2, v8 7579; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 7580; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7581; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], 12 7582; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v10 7583; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v9, v8, vcc 7584; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v11, v5, vcc 7585; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v6, v2 7586; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc 7587; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3] 7588; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 7589; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 7590; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 7591; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 7592; GFX9-NEXT: v_mul_hi_u32 v4, s6, v1 7593; GFX9-NEXT: v_mul_hi_u32 v6, s7, v1 7594; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 7595; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 7596; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v4, vcc 7597; GFX9-NEXT: v_mul_lo_u32 v4, s7, v0 7598; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 7599; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7600; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 7601; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 7602; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v5, vcc 7603; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 7604; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v2, vcc 7605; GFX9-NEXT: v_mul_lo_u32 v4, v0, s8 7606; GFX9-NEXT: v_mul_lo_u32 v2, v1, s8 7607; GFX9-NEXT: v_mul_hi_u32 v3, v0, s8 7608; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, s6, v4 7609; GFX9-NEXT: s_movk_i32 s6, 0xffe 7610; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 7611; GFX9-NEXT: v_mov_b32_e32 v3, s7 7612; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v3, v2, vcc 7613; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s8, v4 7614; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc 7615; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s6, v3 7616; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 7617; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 7618; GFX9-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc 7619; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 2, v0 7620; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc 7621; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 1, v0 7622; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc 7623; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s6, v4 7624; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc 7625; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 7626; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3 7627; GFX9-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc 7628; GFX9-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[0:1] 7629; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 7630; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc 7631; GFX9-NEXT: v_cndmask_b32_e64 v1, v8, v6, s[0:1] 7632; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc 7633; GFX9-NEXT: v_mov_b32_e32 v0, s4 7634; GFX9-NEXT: v_mov_b32_e32 v1, s5 7635; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7636; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[2:3] 7637; GFX9-NEXT: s_endpgm 7638 %r = udiv <2 x i64> %x, <i64 4096, i64 4095> 7639 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 7640 ret void 7641} 7642 7643define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 7644; CHECK-LABEL: @udiv_v2i64_pow2_shl_denom( 7645; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 7646; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 7647; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 7648; CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[TMP1]], [[TMP2]] 7649; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 7650; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 7651; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 7652; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP5]], [[TMP6]] 7653; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 7654; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 7655; CHECK-NEXT: ret void 7656; 7657; GFX6-LABEL: udiv_v2i64_pow2_shl_denom: 7658; GFX6: ; %bb.0: 7659; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 7660; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 7661; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 7662; GFX6-NEXT: s_mov_b32 s7, 0xf000 7663; GFX6-NEXT: s_mov_b32 s6, -1 7664; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7665; GFX6-NEXT: s_add_i32 s0, s0, 12 7666; GFX6-NEXT: s_add_i32 s2, s2, 12 7667; GFX6-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 7668; GFX6-NEXT: s_lshr_b64 s[2:3], s[10:11], s2 7669; GFX6-NEXT: v_mov_b32_e32 v0, s0 7670; GFX6-NEXT: v_mov_b32_e32 v1, s1 7671; GFX6-NEXT: v_mov_b32_e32 v2, s2 7672; GFX6-NEXT: v_mov_b32_e32 v3, s3 7673; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 7674; GFX6-NEXT: s_endpgm 7675; GFX9-LABEL: udiv_v2i64_pow2_shl_denom: 7676; GFX9: ; %bb.0: 7677; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7678; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 7679; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 7680; GFX9-NEXT: v_mov_b32_e32 v4, 0 7681; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7682; GFX9-NEXT: s_add_i32 s0, s8, 12 7683; GFX9-NEXT: s_add_i32 s8, s10, 12 7684; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], s0 7685; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], s8 7686; GFX9-NEXT: v_mov_b32_e32 v0, s0 7687; GFX9-NEXT: v_mov_b32_e32 v1, s1 7688; GFX9-NEXT: v_mov_b32_e32 v2, s4 7689; GFX9-NEXT: v_mov_b32_e32 v3, s5 7690; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 7691; GFX9-NEXT: s_endpgm 7692 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 7693 %r = udiv <2 x i64> %x, %shl.y 7694 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 7695 ret void 7696} 7697 7698define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 7699; CHECK-LABEL: @urem_i64_oddk_denom( 7700; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], 1235195393993 7701; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 7702; CHECK-NEXT: ret void 7703; 7704; GFX6-LABEL: urem_i64_oddk_denom: 7705; GFX6: ; %bb.0: 7706; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f1761f8 7707; GFX6-NEXT: v_mov_b32_e32 v1, 0x4f800000 7708; GFX6-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 7709; GFX6-NEXT: v_rcp_f32_e32 v0, v0 7710; GFX6-NEXT: s_movk_i32 s2, 0xfee0 7711; GFX6-NEXT: s_mov_b32 s3, 0x689e0837 7712; GFX6-NEXT: v_mov_b32_e32 v8, 0 7713; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 7714; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 7715; GFX6-NEXT: v_trunc_f32_e32 v1, v1 7716; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 7717; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 7718; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 7719; GFX6-NEXT: v_mov_b32_e32 v7, 0 7720; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 7721; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 7722; GFX6-NEXT: v_mul_hi_u32 v3, v0, s3 7723; GFX6-NEXT: v_mul_lo_u32 v4, v1, s3 7724; GFX6-NEXT: s_movk_i32 s12, 0x11f 7725; GFX6-NEXT: s_mov_b32 s13, 0x9761f7c9 7726; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7727; GFX6-NEXT: v_mul_lo_u32 v3, v0, s3 7728; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 7729; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 7730; GFX6-NEXT: v_mul_hi_u32 v4, v0, v2 7731; GFX6-NEXT: v_mul_hi_u32 v6, v0, v3 7732; GFX6-NEXT: v_mul_hi_u32 v9, v1, v2 7733; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 7734; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7735; GFX6-NEXT: s_mov_b32 s9, s5 7736; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 7737; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 7738; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 7739; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc 7740; GFX6-NEXT: s_movk_i32 s5, 0x11e 7741; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 7742; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc 7743; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc 7744; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7745; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 7746; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 7747; GFX6-NEXT: v_mul_lo_u32 v4, v0, s2 7748; GFX6-NEXT: v_mul_hi_u32 v5, v0, s3 7749; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 7750; GFX6-NEXT: v_mul_lo_u32 v6, v2, s3 7751; GFX6-NEXT: s_mov_b32 s8, s4 7752; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 7753; GFX6-NEXT: v_mul_lo_u32 v5, v0, s3 7754; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 7755; GFX6-NEXT: v_mul_lo_u32 v6, v0, v4 7756; GFX6-NEXT: v_mul_hi_u32 v10, v0, v4 7757; GFX6-NEXT: v_mul_hi_u32 v9, v0, v5 7758; GFX6-NEXT: v_mul_hi_u32 v11, v2, v4 7759; GFX6-NEXT: s_mov_b32 s4, 0x9761f7c8 7760; GFX6-NEXT: s_mov_b32 s11, 0xf000 7761; GFX6-NEXT: v_add_i32_e32 v6, vcc, v9, v6 7762; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc 7763; GFX6-NEXT: v_mul_lo_u32 v10, v2, v5 7764; GFX6-NEXT: v_mul_hi_u32 v5, v2, v5 7765; GFX6-NEXT: v_mul_lo_u32 v2, v2, v4 7766; GFX6-NEXT: s_mov_b32 s10, -1 7767; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v10 7768; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v5, vcc 7769; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v11, v7, vcc 7770; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 7771; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc 7772; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 7773; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] 7774; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 7775; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 7776; GFX6-NEXT: v_mul_lo_u32 v2, s6, v1 7777; GFX6-NEXT: v_mul_hi_u32 v3, s6, v0 7778; GFX6-NEXT: v_mul_hi_u32 v4, s6, v1 7779; GFX6-NEXT: v_mul_hi_u32 v5, s7, v1 7780; GFX6-NEXT: v_mul_lo_u32 v1, s7, v1 7781; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7782; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 7783; GFX6-NEXT: v_mul_lo_u32 v4, s7, v0 7784; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 7785; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 7786; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 7787; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc 7788; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 7789; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc 7790; GFX6-NEXT: v_mul_lo_u32 v2, v0, s12 7791; GFX6-NEXT: v_mul_hi_u32 v3, v0, s13 7792; GFX6-NEXT: v_mul_lo_u32 v1, v1, s13 7793; GFX6-NEXT: v_mul_lo_u32 v0, v0, s13 7794; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7795; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 7796; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s7, v1 7797; GFX6-NEXT: v_mov_b32_e32 v3, s12 7798; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 7799; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc 7800; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s13, v0 7801; GFX6-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] 7802; GFX6-NEXT: v_cmp_lt_u32_e64 s[2:3], s5, v5 7803; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] 7804; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] 7805; GFX6-NEXT: v_cmp_lt_u32_e64 s[2:3], s4, v4 7806; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s13, v4 7807; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 7808; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s12, v5 7809; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] 7810; GFX6-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] 7811; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 7812; GFX6-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] 7813; GFX6-NEXT: v_mov_b32_e32 v5, s7 7814; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc 7815; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s5, v1 7816; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 7817; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s4, v0 7818; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 7819; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s12, v1 7820; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 7821; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 7822; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 7823; GFX6-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] 7824; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 7825; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 7826; GFX6-NEXT: s_endpgm 7827; GFX9-LABEL: urem_i64_oddk_denom: 7828; GFX9: ; %bb.0: 7829; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f1761f8 7830; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000 7831; GFX9-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 7832; GFX9-NEXT: v_rcp_f32_e32 v0, v0 7833; GFX9-NEXT: s_movk_i32 s4, 0xfee0 7834; GFX9-NEXT: s_mov_b32 s5, 0x689e0837 7835; GFX9-NEXT: v_mov_b32_e32 v6, 0 7836; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 7837; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 7838; GFX9-NEXT: v_trunc_f32_e32 v1, v1 7839; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 7840; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 7841; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 7842; GFX9-NEXT: s_movk_i32 s8, 0x11f 7843; GFX9-NEXT: s_mov_b32 s9, 0x9761f7c9 7844; GFX9-NEXT: v_mul_lo_u32 v2, v0, s4 7845; GFX9-NEXT: v_mul_hi_u32 v3, v0, s5 7846; GFX9-NEXT: v_mul_lo_u32 v5, v1, s5 7847; GFX9-NEXT: v_mul_lo_u32 v4, v0, s5 7848; GFX9-NEXT: s_mov_b32 s10, 0x9761f7c8 7849; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 7850; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 7851; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 7852; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 7853; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 7854; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 7855; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 7856; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 7857; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc 7858; GFX9-NEXT: v_mul_lo_u32 v7, v1, v4 7859; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 7860; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 7861; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc 7862; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v6, vcc 7863; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 7864; GFX9-NEXT: v_mov_b32_e32 v5, 0 7865; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 7866; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc 7867; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3] 7868; GFX9-NEXT: v_mul_lo_u32 v4, v0, s4 7869; GFX9-NEXT: v_mul_hi_u32 v7, v0, s5 7870; GFX9-NEXT: v_mul_lo_u32 v8, v2, s5 7871; GFX9-NEXT: v_mul_lo_u32 v9, v0, s5 7872; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 7873; GFX9-NEXT: v_add_u32_e32 v4, v7, v4 7874; GFX9-NEXT: v_add_u32_e32 v4, v4, v8 7875; GFX9-NEXT: v_mul_lo_u32 v7, v0, v4 7876; GFX9-NEXT: v_mul_hi_u32 v8, v0, v9 7877; GFX9-NEXT: v_mul_hi_u32 v10, v0, v4 7878; GFX9-NEXT: v_mul_hi_u32 v11, v2, v4 7879; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 7880; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 7881; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v10, vcc 7882; GFX9-NEXT: v_mul_lo_u32 v10, v2, v9 7883; GFX9-NEXT: v_mul_hi_u32 v9, v2, v9 7884; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 7885; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v10 7886; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v8, v9, vcc 7887; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v11, v6, vcc 7888; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v7, v2 7889; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v5, v4, vcc 7890; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3] 7891; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 7892; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 7893; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7894; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 7895; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 7896; GFX9-NEXT: v_mul_hi_u32 v4, s6, v1 7897; GFX9-NEXT: v_mul_hi_u32 v7, s7, v1 7898; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 7899; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 7900; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 7901; GFX9-NEXT: v_mul_lo_u32 v4, s7, v0 7902; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 7903; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 7904; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 7905; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v7, v6, vcc 7906; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 7907; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v2, vcc 7908; GFX9-NEXT: v_mul_lo_u32 v2, v0, s8 7909; GFX9-NEXT: v_mul_hi_u32 v3, v0, s9 7910; GFX9-NEXT: v_mul_lo_u32 v1, v1, s9 7911; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9 7912; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 7913; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 7914; GFX9-NEXT: v_sub_co_u32_e64 v0, s[0:1], s6, v0 7915; GFX9-NEXT: v_sub_u32_e32 v2, s7, v1 7916; GFX9-NEXT: v_mov_b32_e32 v3, s8 7917; GFX9-NEXT: v_subb_co_u32_e64 v2, vcc, v2, v3, s[0:1] 7918; GFX9-NEXT: v_subrev_co_u32_e64 v4, s[2:3], s9, v0 7919; GFX9-NEXT: v_subbrev_co_u32_e64 v5, vcc, 0, v2, s[2:3] 7920; GFX9-NEXT: s_movk_i32 s6, 0x11e 7921; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s6, v5 7922; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 7923; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s10, v4 7924; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 7925; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s8, v5 7926; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc 7927; GFX9-NEXT: v_subb_co_u32_e64 v2, vcc, v2, v3, s[2:3] 7928; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s9, v4 7929; GFX9-NEXT: v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc 7930; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, v7 7931; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[2:3] 7932; GFX9-NEXT: v_mov_b32_e32 v5, s7 7933; GFX9-NEXT: v_subb_co_u32_e64 v1, vcc, v5, v1, s[0:1] 7934; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s6, v1 7935; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 7936; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s10, v0 7937; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 7938; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s8, v1 7939; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc 7940; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 7941; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 7942; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[2:3] 7943; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 7944; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[4:5] 7945; GFX9-NEXT: s_endpgm 7946 %r = urem i64 %x, 1235195393993 7947 store i64 %r, i64 addrspace(1)* %out 7948 ret void 7949} 7950 7951define amdgpu_kernel void @urem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 7952; CHECK-LABEL: @urem_i64_pow2k_denom( 7953; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], 4096 7954; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 7955; CHECK-NEXT: ret void 7956; 7957; GFX6-LABEL: urem_i64_pow2k_denom: 7958; GFX6: ; %bb.0: 7959; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 7960; GFX6-NEXT: s_mov_b32 s3, 0xf000 7961; GFX6-NEXT: s_mov_b32 s2, -1 7962; GFX6-NEXT: v_mov_b32_e32 v1, 0 7963; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7964; GFX6-NEXT: s_mov_b32 s0, s4 7965; GFX6-NEXT: s_and_b32 s4, s6, 0xfff 7966; GFX6-NEXT: s_mov_b32 s1, s5 7967; GFX6-NEXT: v_mov_b32_e32 v0, s4 7968; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 7969; GFX6-NEXT: s_endpgm 7970; GFX9-LABEL: urem_i64_pow2k_denom: 7971; GFX9: ; %bb.0: 7972; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 7973; GFX9-NEXT: v_mov_b32_e32 v1, 0 7974; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7975; GFX9-NEXT: s_and_b32 s2, s2, 0xfff 7976; GFX9-NEXT: v_mov_b32_e32 v0, s2 7977; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] 7978; GFX9-NEXT: s_endpgm 7979 %r = urem i64 %x, 4096 7980 store i64 %r, i64 addrspace(1)* %out 7981 ret void 7982} 7983 7984define amdgpu_kernel void @urem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 7985; CHECK-LABEL: @urem_i64_pow2_shl_denom( 7986; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 7987; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], [[SHL_Y]] 7988; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 7989; CHECK-NEXT: ret void 7990; 7991; GFX6-LABEL: urem_i64_pow2_shl_denom: 7992; GFX6: ; %bb.0: 7993; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 7994; GFX6-NEXT: s_load_dword s8, s[0:1], 0xd 7995; GFX6-NEXT: s_mov_b32 s3, 0xf000 7996; GFX6-NEXT: s_mov_b32 s2, -1 7997; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7998; GFX6-NEXT: s_mov_b32 s0, s4 7999; GFX6-NEXT: s_mov_b32 s1, s5 8000; GFX6-NEXT: s_mov_b32 s5, 0 8001; GFX6-NEXT: s_movk_i32 s4, 0x1000 8002; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s8 8003; GFX6-NEXT: s_add_u32 s4, s4, -1 8004; GFX6-NEXT: s_addc_u32 s5, s5, -1 8005; GFX6-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] 8006; GFX6-NEXT: v_mov_b32_e32 v0, s4 8007; GFX6-NEXT: v_mov_b32_e32 v1, s5 8008; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 8009; GFX6-NEXT: s_endpgm 8010; GFX9-LABEL: urem_i64_pow2_shl_denom: 8011; GFX9: ; %bb.0: 8012; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 8013; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 8014; GFX9-NEXT: s_mov_b32 s1, 0 8015; GFX9-NEXT: s_movk_i32 s0, 0x1000 8016; GFX9-NEXT: v_mov_b32_e32 v2, 0 8017; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8018; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 8019; GFX9-NEXT: s_add_u32 s0, s0, -1 8020; GFX9-NEXT: s_addc_u32 s1, s1, -1 8021; GFX9-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1] 8022; GFX9-NEXT: v_mov_b32_e32 v0, s0 8023; GFX9-NEXT: v_mov_b32_e32 v1, s1 8024; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 8025; GFX9-NEXT: s_endpgm 8026 %shl.y = shl i64 4096, %y 8027 %r = urem i64 %x, %shl.y 8028 store i64 %r, i64 addrspace(1)* %out 8029 ret void 8030} 8031 8032define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 8033; CHECK-LABEL: @urem_v2i64_pow2k_denom( 8034; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 8035; CHECK-NEXT: [[TMP2:%.*]] = urem i64 [[TMP1]], 4096 8036; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 8037; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 8038; CHECK-NEXT: [[TMP5:%.*]] = urem i64 [[TMP4]], 4096 8039; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 8040; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 8041; CHECK-NEXT: ret void 8042; 8043; GFX6-LABEL: urem_v2i64_pow2k_denom: 8044; GFX6: ; %bb.0: 8045; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 8046; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 8047; GFX6-NEXT: s_movk_i32 s8, 0xfff 8048; GFX6-NEXT: v_mov_b32_e32 v1, 0 8049; GFX6-NEXT: s_mov_b32 s7, 0xf000 8050; GFX6-NEXT: s_mov_b32 s6, -1 8051; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8052; GFX6-NEXT: s_and_b32 s0, s0, s8 8053; GFX6-NEXT: s_and_b32 s1, s2, s8 8054; GFX6-NEXT: v_mov_b32_e32 v0, s0 8055; GFX6-NEXT: v_mov_b32_e32 v2, s1 8056; GFX6-NEXT: v_mov_b32_e32 v3, v1 8057; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 8058; GFX6-NEXT: s_endpgm 8059; GFX9-LABEL: urem_v2i64_pow2k_denom: 8060; GFX9: ; %bb.0: 8061; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8062; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 8063; GFX9-NEXT: s_movk_i32 s0, 0xfff 8064; GFX9-NEXT: v_mov_b32_e32 v1, 0 8065; GFX9-NEXT: v_mov_b32_e32 v3, v1 8066; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8067; GFX9-NEXT: s_and_b32 s1, s4, s0 8068; GFX9-NEXT: s_and_b32 s0, s6, s0 8069; GFX9-NEXT: v_mov_b32_e32 v0, s1 8070; GFX9-NEXT: v_mov_b32_e32 v2, s0 8071; GFX9-NEXT: global_store_dwordx4 v1, v[0:3], s[2:3] 8072; GFX9-NEXT: s_endpgm 8073 %r = urem <2 x i64> %x, <i64 4096, i64 4096> 8074 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 8075 ret void 8076} 8077 8078define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 8079; CHECK-LABEL: @urem_v2i64_pow2_shl_denom( 8080; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 8081; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 8082; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 8083; CHECK-NEXT: [[TMP3:%.*]] = urem i64 [[TMP1]], [[TMP2]] 8084; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 8085; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 8086; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 8087; CHECK-NEXT: [[TMP7:%.*]] = urem i64 [[TMP5]], [[TMP6]] 8088; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 8089; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 8090; CHECK-NEXT: ret void 8091; 8092; GFX6-LABEL: urem_v2i64_pow2_shl_denom: 8093; GFX6: ; %bb.0: 8094; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 8095; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 8096; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 8097; GFX6-NEXT: s_mov_b32 s13, 0 8098; GFX6-NEXT: s_movk_i32 s12, 0x1000 8099; GFX6-NEXT: s_mov_b32 s7, 0xf000 8100; GFX6-NEXT: s_mov_b32 s6, -1 8101; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8102; GFX6-NEXT: s_lshl_b64 s[2:3], s[12:13], s2 8103; GFX6-NEXT: s_lshl_b64 s[0:1], s[12:13], s0 8104; GFX6-NEXT: s_add_u32 s0, s0, -1 8105; GFX6-NEXT: s_addc_u32 s1, s1, -1 8106; GFX6-NEXT: s_and_b64 s[0:1], s[8:9], s[0:1] 8107; GFX6-NEXT: s_add_u32 s2, s2, -1 8108; GFX6-NEXT: s_addc_u32 s3, s3, -1 8109; GFX6-NEXT: s_and_b64 s[2:3], s[10:11], s[2:3] 8110; GFX6-NEXT: v_mov_b32_e32 v0, s0 8111; GFX6-NEXT: v_mov_b32_e32 v1, s1 8112; GFX6-NEXT: v_mov_b32_e32 v2, s2 8113; GFX6-NEXT: v_mov_b32_e32 v3, s3 8114; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 8115; GFX6-NEXT: s_endpgm 8116; GFX9-LABEL: urem_v2i64_pow2_shl_denom: 8117; GFX9: ; %bb.0: 8118; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8119; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 8120; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 8121; GFX9-NEXT: s_mov_b32 s1, 0 8122; GFX9-NEXT: s_movk_i32 s0, 0x1000 8123; GFX9-NEXT: v_mov_b32_e32 v4, 0 8124; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8125; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], s10 8126; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 8127; GFX9-NEXT: s_add_u32 s0, s0, -1 8128; GFX9-NEXT: s_addc_u32 s1, s1, -1 8129; GFX9-NEXT: s_and_b64 s[0:1], s[4:5], s[0:1] 8130; GFX9-NEXT: s_add_u32 s4, s10, -1 8131; GFX9-NEXT: s_addc_u32 s5, s11, -1 8132; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] 8133; GFX9-NEXT: v_mov_b32_e32 v0, s0 8134; GFX9-NEXT: v_mov_b32_e32 v1, s1 8135; GFX9-NEXT: v_mov_b32_e32 v2, s4 8136; GFX9-NEXT: v_mov_b32_e32 v3, s5 8137; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 8138; GFX9-NEXT: s_endpgm 8139 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 8140 %r = urem <2 x i64> %x, %shl.y 8141 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 8142 ret void 8143} 8144 8145define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 8146; CHECK-LABEL: @sdiv_i64_oddk_denom( 8147; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], 1235195 8148; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 8149; CHECK-NEXT: ret void 8150; 8151; GFX6-LABEL: sdiv_i64_oddk_denom: 8152; GFX6: ; %bb.0: 8153; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f800000 8154; GFX6-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 8155; GFX6-NEXT: v_rcp_f32_e32 v0, v0 8156; GFX6-NEXT: s_mov_b32 s2, 0xffed2705 8157; GFX6-NEXT: v_mov_b32_e32 v8, 0 8158; GFX6-NEXT: v_mov_b32_e32 v7, 0 8159; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 8160; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 8161; GFX6-NEXT: v_trunc_f32_e32 v1, v1 8162; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 8163; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 8164; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 8165; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 8166; GFX6-NEXT: s_mov_b32 s7, 0xf000 8167; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 8168; GFX6-NEXT: v_mul_lo_u32 v2, v1, s2 8169; GFX6-NEXT: v_mul_lo_u32 v4, v0, s2 8170; GFX6-NEXT: s_mov_b32 s6, -1 8171; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8172; GFX6-NEXT: s_mov_b32 s4, s8 8173; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8174; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 8175; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 8176; GFX6-NEXT: v_mul_hi_u32 v6, v0, v4 8177; GFX6-NEXT: v_mul_hi_u32 v3, v0, v2 8178; GFX6-NEXT: v_mul_hi_u32 v9, v1, v2 8179; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 8180; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 8181; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 8182; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 8183; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v3, vcc 8184; GFX6-NEXT: s_mov_b32 s5, s9 8185; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 8186; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc 8187; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc 8188; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8189; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 8190; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 8191; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 8192; GFX6-NEXT: v_mul_lo_u32 v4, v2, s2 8193; GFX6-NEXT: v_mul_hi_u32 v5, s2, v0 8194; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 8195; GFX6-NEXT: v_mul_lo_u32 v5, v0, s2 8196; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, v0, v4 8197; GFX6-NEXT: v_mul_lo_u32 v10, v0, v4 8198; GFX6-NEXT: v_mul_hi_u32 v12, v0, v4 8199; GFX6-NEXT: v_mul_hi_u32 v11, v0, v5 8200; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5 8201; GFX6-NEXT: v_mul_lo_u32 v5, v2, v5 8202; GFX6-NEXT: v_mul_hi_u32 v6, v2, v4 8203; GFX6-NEXT: v_add_i32_e32 v10, vcc, v11, v10 8204; GFX6-NEXT: v_addc_u32_e32 v11, vcc, v8, v12, vcc 8205; GFX6-NEXT: v_mul_lo_u32 v2, v2, v4 8206; GFX6-NEXT: v_add_i32_e32 v5, vcc, v10, v5 8207; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v11, v9, vcc 8208; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v6, v7, vcc 8209; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 8210; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc 8211; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 8212; GFX6-NEXT: s_ashr_i32 s2, s11, 31 8213; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] 8214; GFX6-NEXT: s_add_u32 s0, s10, s2 8215; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 8216; GFX6-NEXT: s_mov_b32 s3, s2 8217; GFX6-NEXT: s_addc_u32 s1, s11, s2 8218; GFX6-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] 8219; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 8220; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 8221; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 8222; GFX6-NEXT: v_mul_hi_u32 v4, s0, v1 8223; GFX6-NEXT: v_mul_hi_u32 v5, s1, v1 8224; GFX6-NEXT: v_mul_lo_u32 v1, s1, v1 8225; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8226; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 8227; GFX6-NEXT: v_mul_lo_u32 v4, s1, v0 8228; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 8229; GFX6-NEXT: s_mov_b32 s3, 0x12d8fb 8230; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 8231; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 8232; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc 8233; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 8234; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc 8235; GFX6-NEXT: v_mul_lo_u32 v2, v1, s3 8236; GFX6-NEXT: v_mul_hi_u32 v3, s3, v0 8237; GFX6-NEXT: v_mul_lo_u32 v4, v0, s3 8238; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8239; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s0, v4 8240; GFX6-NEXT: v_mov_b32_e32 v3, s1 8241; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v3, v2, vcc 8242; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s3, v4 8243; GFX6-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v2, vcc 8244; GFX6-NEXT: s_mov_b32 s0, 0x12d8fa 8245; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s0, v3 8246; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 8247; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 8248; GFX6-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc 8249; GFX6-NEXT: v_add_i32_e32 v5, vcc, 2, v0 8250; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc 8251; GFX6-NEXT: v_add_i32_e32 v7, vcc, 1, v0 8252; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v4 8253; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc 8254; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] 8255; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 8256; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 8257; GFX6-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[0:1] 8258; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 8259; GFX6-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc 8260; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 8261; GFX6-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc 8262; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 8263; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 8264; GFX6-NEXT: v_xor_b32_e32 v1, s2, v1 8265; GFX6-NEXT: v_mov_b32_e32 v2, s2 8266; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 8267; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 8268; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 8269; GFX6-NEXT: s_endpgm 8270; GFX9-LABEL: sdiv_i64_oddk_denom: 8271; GFX9: ; %bb.0: 8272; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f800000 8273; GFX9-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 8274; GFX9-NEXT: v_rcp_f32_e32 v0, v0 8275; GFX9-NEXT: s_mov_b32 s8, 0xffed2705 8276; GFX9-NEXT: v_mov_b32_e32 v7, 0 8277; GFX9-NEXT: v_mov_b32_e32 v5, 0 8278; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 8279; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 8280; GFX9-NEXT: v_trunc_f32_e32 v1, v1 8281; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 8282; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 8283; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 8284; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 8285; GFX9-NEXT: v_mul_hi_u32 v3, s8, v0 8286; GFX9-NEXT: v_mul_lo_u32 v2, v1, s8 8287; GFX9-NEXT: v_mul_lo_u32 v4, v0, s8 8288; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 8289; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 8290; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 8291; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 8292; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 8293; GFX9-NEXT: v_mul_hi_u32 v9, v1, v2 8294; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 8295; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 8296; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v8, vcc 8297; GFX9-NEXT: v_mul_lo_u32 v8, v1, v4 8298; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 8299; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8 8300; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc 8301; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc 8302; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 8303; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 8304; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v4, vcc 8305; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3] 8306; GFX9-NEXT: v_mul_lo_u32 v4, v2, s8 8307; GFX9-NEXT: v_mul_hi_u32 v6, s8, v0 8308; GFX9-NEXT: v_mul_lo_u32 v8, v0, s8 8309; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 8310; GFX9-NEXT: v_add_u32_e32 v4, v6, v4 8311; GFX9-NEXT: v_sub_u32_e32 v4, v4, v0 8312; GFX9-NEXT: v_mul_lo_u32 v10, v0, v4 8313; GFX9-NEXT: v_mul_hi_u32 v11, v0, v8 8314; GFX9-NEXT: v_mul_hi_u32 v12, v0, v4 8315; GFX9-NEXT: v_mul_hi_u32 v9, v2, v8 8316; GFX9-NEXT: v_mul_lo_u32 v8, v2, v8 8317; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 8318; GFX9-NEXT: v_mul_hi_u32 v6, v2, v4 8319; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v7, v12, vcc 8320; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 8321; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v10, v8 8322; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v9, vcc 8323; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v5, vcc 8324; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 8325; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc 8326; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3] 8327; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8328; GFX9-NEXT: s_ashr_i32 s2, s7, 31 8329; GFX9-NEXT: s_add_u32 s0, s6, s2 8330; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 8331; GFX9-NEXT: s_mov_b32 s3, s2 8332; GFX9-NEXT: s_addc_u32 s1, s7, s2 8333; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] 8334; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 8335; GFX9-NEXT: v_mul_lo_u32 v2, s0, v1 8336; GFX9-NEXT: v_mul_hi_u32 v3, s0, v0 8337; GFX9-NEXT: v_mul_hi_u32 v4, s0, v1 8338; GFX9-NEXT: v_mul_hi_u32 v6, s1, v1 8339; GFX9-NEXT: v_mul_lo_u32 v1, s1, v1 8340; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 8341; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v4, vcc 8342; GFX9-NEXT: v_mul_lo_u32 v4, s1, v0 8343; GFX9-NEXT: v_mul_hi_u32 v0, s1, v0 8344; GFX9-NEXT: s_mov_b32 s3, 0x12d8fb 8345; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 8346; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 8347; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v5, vcc 8348; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 8349; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v2, vcc 8350; GFX9-NEXT: v_mul_lo_u32 v4, v0, s3 8351; GFX9-NEXT: v_mul_lo_u32 v2, v1, s3 8352; GFX9-NEXT: v_mul_hi_u32 v3, s3, v0 8353; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, s0, v4 8354; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 8355; GFX9-NEXT: v_mov_b32_e32 v3, s1 8356; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v3, v2, vcc 8357; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s3, v4 8358; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc 8359; GFX9-NEXT: s_mov_b32 s3, 0x12d8fa 8360; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s3, v3 8361; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 8362; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 8363; GFX9-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc 8364; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 2, v0 8365; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc 8366; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 1, v0 8367; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc 8368; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s3, v4 8369; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc 8370; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 8371; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3 8372; GFX9-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc 8373; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 8374; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, v6, s[0:1] 8375; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 8376; GFX9-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[0:1] 8377; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 8378; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 8379; GFX9-NEXT: v_xor_b32_e32 v1, s2, v1 8380; GFX9-NEXT: v_mov_b32_e32 v2, s2 8381; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s2, v0 8382; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc 8383; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] 8384; GFX9-NEXT: s_endpgm 8385 %r = sdiv i64 %x, 1235195 8386 store i64 %r, i64 addrspace(1)* %out 8387 ret void 8388} 8389 8390define amdgpu_kernel void @sdiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 8391; CHECK-LABEL: @sdiv_i64_pow2k_denom( 8392; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], 4096 8393; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 8394; CHECK-NEXT: ret void 8395; 8396; GFX6-LABEL: sdiv_i64_pow2k_denom: 8397; GFX6: ; %bb.0: 8398; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 8399; GFX6-NEXT: s_mov_b32 s7, 0xf000 8400; GFX6-NEXT: s_mov_b32 s6, -1 8401; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8402; GFX6-NEXT: s_mov_b32 s4, s0 8403; GFX6-NEXT: s_ashr_i32 s0, s3, 31 8404; GFX6-NEXT: s_lshr_b32 s0, s0, 20 8405; GFX6-NEXT: s_add_u32 s0, s2, s0 8406; GFX6-NEXT: s_mov_b32 s5, s1 8407; GFX6-NEXT: s_addc_u32 s1, s3, 0 8408; GFX6-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 8409; GFX6-NEXT: v_mov_b32_e32 v0, s0 8410; GFX6-NEXT: v_mov_b32_e32 v1, s1 8411; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 8412; GFX6-NEXT: s_endpgm 8413; GFX9-LABEL: sdiv_i64_pow2k_denom: 8414; GFX9: ; %bb.0: 8415; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 8416; GFX9-NEXT: v_mov_b32_e32 v2, 0 8417; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8418; GFX9-NEXT: s_ashr_i32 s4, s3, 31 8419; GFX9-NEXT: s_lshr_b32 s4, s4, 20 8420; GFX9-NEXT: s_add_u32 s2, s2, s4 8421; GFX9-NEXT: s_addc_u32 s3, s3, 0 8422; GFX9-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 8423; GFX9-NEXT: v_mov_b32_e32 v0, s2 8424; GFX9-NEXT: v_mov_b32_e32 v1, s3 8425; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 8426; GFX9-NEXT: s_endpgm 8427 %r = sdiv i64 %x, 4096 8428 store i64 %r, i64 addrspace(1)* %out 8429 ret void 8430} 8431 8432define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 8433; CHECK-LABEL: @sdiv_i64_pow2_shl_denom( 8434; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 8435; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], [[SHL_Y]] 8436; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 8437; CHECK-NEXT: ret void 8438; 8439; GFX6-LABEL: sdiv_i64_pow2_shl_denom: 8440; GFX6: ; %bb.0: 8441; GFX6-NEXT: s_load_dword s4, s[0:1], 0xd 8442; GFX6-NEXT: s_mov_b32 s3, 0 8443; GFX6-NEXT: s_movk_i32 s2, 0x1000 8444; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 8445; GFX6-NEXT: s_mov_b32 s7, 0xf000 8446; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8447; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 8448; GFX6-NEXT: s_ashr_i32 s12, s3, 31 8449; GFX6-NEXT: s_add_u32 s2, s2, s12 8450; GFX6-NEXT: s_mov_b32 s13, s12 8451; GFX6-NEXT: s_addc_u32 s3, s3, s12 8452; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[12:13] 8453; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 8454; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 8455; GFX6-NEXT: s_sub_u32 s4, 0, s2 8456; GFX6-NEXT: s_subb_u32 s5, 0, s3 8457; GFX6-NEXT: s_ashr_i32 s14, s11, 31 8458; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 8459; GFX6-NEXT: v_rcp_f32_e32 v0, v0 8460; GFX6-NEXT: s_mov_b32 s15, s14 8461; GFX6-NEXT: s_mov_b32 s6, -1 8462; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 8463; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 8464; GFX6-NEXT: v_trunc_f32_e32 v1, v1 8465; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 8466; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 8467; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 8468; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 8469; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 8470; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0 8471; GFX6-NEXT: v_mul_lo_u32 v4, s4, v0 8472; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8473; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 8474; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 8475; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 8476; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 8477; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 8478; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 8479; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 8480; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 8481; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 8482; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 8483; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 8484; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc 8485; GFX6-NEXT: v_mov_b32_e32 v4, 0 8486; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc 8487; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8488; GFX6-NEXT: v_mov_b32_e32 v6, 0 8489; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 8490; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc 8491; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 8492; GFX6-NEXT: v_mul_lo_u32 v5, s4, v2 8493; GFX6-NEXT: v_mul_hi_u32 v7, s4, v0 8494; GFX6-NEXT: v_mul_lo_u32 v8, s5, v0 8495; GFX6-NEXT: s_mov_b32 s5, s9 8496; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5 8497; GFX6-NEXT: v_mul_lo_u32 v7, s4, v0 8498; GFX6-NEXT: v_add_i32_e32 v5, vcc, v8, v5 8499; GFX6-NEXT: v_mul_lo_u32 v10, v0, v5 8500; GFX6-NEXT: v_mul_hi_u32 v12, v0, v5 8501; GFX6-NEXT: v_mul_hi_u32 v11, v0, v7 8502; GFX6-NEXT: v_mul_hi_u32 v9, v2, v7 8503; GFX6-NEXT: v_mul_lo_u32 v7, v2, v7 8504; GFX6-NEXT: v_mul_hi_u32 v8, v2, v5 8505; GFX6-NEXT: v_add_i32_e32 v10, vcc, v11, v10 8506; GFX6-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc 8507; GFX6-NEXT: v_mul_lo_u32 v2, v2, v5 8508; GFX6-NEXT: v_add_i32_e32 v7, vcc, v10, v7 8509; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc 8510; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc 8511; GFX6-NEXT: v_add_i32_e32 v2, vcc, v7, v2 8512; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc 8513; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 8514; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[0:1] 8515; GFX6-NEXT: s_add_u32 s0, s10, s14 8516; GFX6-NEXT: s_addc_u32 s1, s11, s14 8517; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 8518; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] 8519; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 8520; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 8521; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 8522; GFX6-NEXT: v_mul_hi_u32 v5, s10, v1 8523; GFX6-NEXT: v_mul_hi_u32 v7, s11, v1 8524; GFX6-NEXT: v_mul_lo_u32 v1, s11, v1 8525; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8526; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 8527; GFX6-NEXT: v_mul_lo_u32 v5, s11, v0 8528; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0 8529; GFX6-NEXT: s_mov_b32 s4, s8 8530; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 8531; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 8532; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc 8533; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 8534; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc 8535; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 8536; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 8537; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 8538; GFX6-NEXT: v_mov_b32_e32 v5, s3 8539; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8540; GFX6-NEXT: v_mul_lo_u32 v3, s2, v0 8541; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 8542; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s11, v2 8543; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s10, v3 8544; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc 8545; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s2, v3 8546; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] 8547; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v4 8548; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 8549; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v5 8550; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 8551; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v4 8552; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] 8553; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 8554; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] 8555; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 1, v0 8556; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] 8557; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 8558; GFX6-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] 8559; GFX6-NEXT: v_mov_b32_e32 v6, s11 8560; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc 8561; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 8562; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 8563; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 8564; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 8565; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s3, v2 8566; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc 8567; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 8568; GFX6-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] 8569; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 8570; GFX6-NEXT: s_xor_b64 s[0:1], s[14:15], s[12:13] 8571; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 8572; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 8573; GFX6-NEXT: v_xor_b32_e32 v1, s1, v1 8574; GFX6-NEXT: v_mov_b32_e32 v2, s1 8575; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 8576; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 8577; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 8578; GFX6-NEXT: s_endpgm 8579; GFX9-LABEL: sdiv_i64_pow2_shl_denom: 8580; GFX9: ; %bb.0: 8581; GFX9-NEXT: s_load_dword s4, s[0:1], 0x34 8582; GFX9-NEXT: s_mov_b32 s3, 0 8583; GFX9-NEXT: s_movk_i32 s2, 0x1000 8584; GFX9-NEXT: v_mov_b32_e32 v2, 0 8585; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8586; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 8587; GFX9-NEXT: s_ashr_i32 s8, s3, 31 8588; GFX9-NEXT: s_add_u32 s2, s2, s8 8589; GFX9-NEXT: s_mov_b32 s9, s8 8590; GFX9-NEXT: s_addc_u32 s3, s3, s8 8591; GFX9-NEXT: s_xor_b64 s[10:11], s[2:3], s[8:9] 8592; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s10 8593; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s11 8594; GFX9-NEXT: s_sub_u32 s12, 0, s10 8595; GFX9-NEXT: s_subb_u32 s4, 0, s11 8596; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 8597; GFX9-NEXT: v_rcp_f32_e32 v0, v0 8598; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 8599; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 8600; GFX9-NEXT: v_trunc_f32_e32 v1, v1 8601; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 8602; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 8603; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 8604; GFX9-NEXT: v_mul_hi_u32 v4, s12, v0 8605; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1 8606; GFX9-NEXT: v_mul_lo_u32 v6, s4, v0 8607; GFX9-NEXT: v_mul_lo_u32 v5, s12, v0 8608; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 8609; GFX9-NEXT: v_add_u32_e32 v3, v3, v6 8610; GFX9-NEXT: v_mul_hi_u32 v4, v0, v5 8611; GFX9-NEXT: v_mul_lo_u32 v6, v0, v3 8612; GFX9-NEXT: v_mul_hi_u32 v8, v0, v3 8613; GFX9-NEXT: v_mul_hi_u32 v7, v1, v5 8614; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 8615; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 8616; GFX9-NEXT: v_mul_hi_u32 v9, v1, v3 8617; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc 8618; GFX9-NEXT: v_mul_lo_u32 v3, v1, v3 8619; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 8620; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v7, vcc 8621; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v2, vcc 8622; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 8623; GFX9-NEXT: v_mov_b32_e32 v6, 0 8624; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v3 8625; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v5, vcc 8626; GFX9-NEXT: v_addc_co_u32_e64 v3, vcc, v1, v4, s[2:3] 8627; GFX9-NEXT: v_mul_lo_u32 v5, s12, v3 8628; GFX9-NEXT: v_mul_hi_u32 v7, s12, v0 8629; GFX9-NEXT: v_mul_lo_u32 v8, s4, v0 8630; GFX9-NEXT: v_mul_lo_u32 v9, s12, v0 8631; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 8632; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 8633; GFX9-NEXT: v_add_u32_e32 v5, v5, v8 8634; GFX9-NEXT: v_mul_lo_u32 v10, v0, v5 8635; GFX9-NEXT: v_mul_hi_u32 v11, v0, v9 8636; GFX9-NEXT: v_mul_hi_u32 v12, v0, v5 8637; GFX9-NEXT: v_mul_hi_u32 v8, v3, v9 8638; GFX9-NEXT: v_mul_lo_u32 v9, v3, v9 8639; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 8640; GFX9-NEXT: v_mul_hi_u32 v7, v3, v5 8641; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v12, vcc 8642; GFX9-NEXT: v_mul_lo_u32 v3, v3, v5 8643; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 8644; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v8, vcc 8645; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc 8646; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v8, v3 8647; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8648; GFX9-NEXT: s_ashr_i32 s12, s7, 31 8649; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v5, vcc 8650; GFX9-NEXT: v_add_u32_e32 v1, v1, v4 8651; GFX9-NEXT: s_add_u32 s0, s6, s12 8652; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[2:3] 8653; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 8654; GFX9-NEXT: s_mov_b32 s13, s12 8655; GFX9-NEXT: s_addc_u32 s1, s7, s12 8656; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13] 8657; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 8658; GFX9-NEXT: v_mul_lo_u32 v3, s6, v1 8659; GFX9-NEXT: v_mul_hi_u32 v4, s6, v0 8660; GFX9-NEXT: v_mul_hi_u32 v5, s6, v1 8661; GFX9-NEXT: v_mul_hi_u32 v7, s7, v1 8662; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 8663; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 8664; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc 8665; GFX9-NEXT: v_mul_lo_u32 v5, s7, v0 8666; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 8667; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 8668; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v4, v0, vcc 8669; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v2, vcc 8670; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 8671; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v3, vcc 8672; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1 8673; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0 8674; GFX9-NEXT: v_mul_lo_u32 v5, s11, v0 8675; GFX9-NEXT: v_mov_b32_e32 v6, s11 8676; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 8677; GFX9-NEXT: v_mul_lo_u32 v4, s10, v0 8678; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 8679; GFX9-NEXT: v_sub_u32_e32 v5, s7, v3 8680; GFX9-NEXT: v_sub_co_u32_e64 v4, s[0:1], s6, v4 8681; GFX9-NEXT: v_subb_co_u32_e64 v5, vcc, v5, v6, s[0:1] 8682; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s10, v4 8683; GFX9-NEXT: v_subbrev_co_u32_e32 v5, vcc, 0, v5, vcc 8684; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v5 8685; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 8686; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v6 8687; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 8688; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s11, v5 8689; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc 8690; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 2, v0 8691; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc 8692; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 1, v0 8693; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc 8694; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, v5 8695; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v7, s[2:3] 8696; GFX9-NEXT: v_mov_b32_e32 v7, s7 8697; GFX9-NEXT: v_subb_co_u32_e64 v3, vcc, v7, v3, s[0:1] 8698; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 8699; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 8700; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v4 8701; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc 8702; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s11, v3 8703; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v4, vcc 8704; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 8705; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v6, s[2:3] 8706; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 8707; GFX9-NEXT: s_xor_b64 s[0:1], s[12:13], s[8:9] 8708; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 8709; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 8710; GFX9-NEXT: v_xor_b32_e32 v1, s1, v1 8711; GFX9-NEXT: v_mov_b32_e32 v3, s1 8712; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 8713; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc 8714; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 8715; GFX9-NEXT: s_endpgm 8716 %shl.y = shl i64 4096, %y 8717 %r = sdiv i64 %x, %shl.y 8718 store i64 %r, i64 addrspace(1)* %out 8719 ret void 8720} 8721 8722define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 8723; CHECK-LABEL: @sdiv_v2i64_pow2k_denom( 8724; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 8725; CHECK-NEXT: [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096 8726; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 8727; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 8728; CHECK-NEXT: [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4096 8729; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 8730; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 8731; CHECK-NEXT: ret void 8732; 8733; GFX6-LABEL: sdiv_v2i64_pow2k_denom: 8734; GFX6: ; %bb.0: 8735; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 8736; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 8737; GFX6-NEXT: s_mov_b32 s7, 0xf000 8738; GFX6-NEXT: s_mov_b32 s6, -1 8739; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8740; GFX6-NEXT: s_ashr_i32 s8, s1, 31 8741; GFX6-NEXT: s_lshr_b32 s8, s8, 20 8742; GFX6-NEXT: s_add_u32 s0, s0, s8 8743; GFX6-NEXT: s_addc_u32 s1, s1, 0 8744; GFX6-NEXT: s_ashr_i32 s8, s3, 31 8745; GFX6-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 8746; GFX6-NEXT: s_lshr_b32 s8, s8, 20 8747; GFX6-NEXT: s_add_u32 s2, s2, s8 8748; GFX6-NEXT: s_addc_u32 s3, s3, 0 8749; GFX6-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 8750; GFX6-NEXT: v_mov_b32_e32 v0, s0 8751; GFX6-NEXT: v_mov_b32_e32 v1, s1 8752; GFX6-NEXT: v_mov_b32_e32 v2, s2 8753; GFX6-NEXT: v_mov_b32_e32 v3, s3 8754; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 8755; GFX6-NEXT: s_endpgm 8756; GFX9-LABEL: sdiv_v2i64_pow2k_denom: 8757; GFX9: ; %bb.0: 8758; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8759; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 8760; GFX9-NEXT: v_mov_b32_e32 v4, 0 8761; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8762; GFX9-NEXT: s_ashr_i32 s0, s5, 31 8763; GFX9-NEXT: s_lshr_b32 s0, s0, 20 8764; GFX9-NEXT: s_add_u32 s0, s4, s0 8765; GFX9-NEXT: s_addc_u32 s1, s5, 0 8766; GFX9-NEXT: s_ashr_i32 s4, s7, 31 8767; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 8768; GFX9-NEXT: s_lshr_b32 s4, s4, 20 8769; GFX9-NEXT: s_add_u32 s4, s6, s4 8770; GFX9-NEXT: s_addc_u32 s5, s7, 0 8771; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 12 8772; GFX9-NEXT: v_mov_b32_e32 v0, s0 8773; GFX9-NEXT: v_mov_b32_e32 v1, s1 8774; GFX9-NEXT: v_mov_b32_e32 v2, s4 8775; GFX9-NEXT: v_mov_b32_e32 v3, s5 8776; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 8777; GFX9-NEXT: s_endpgm 8778 %r = sdiv <2 x i64> %x, <i64 4096, i64 4096> 8779 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 8780 ret void 8781} 8782 8783define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 8784; CHECK-LABEL: @ssdiv_v2i64_mixed_pow2k_denom( 8785; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 8786; CHECK-NEXT: [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096 8787; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 8788; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 8789; CHECK-NEXT: [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4095 8790; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 8791; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 8792; CHECK-NEXT: ret void 8793; 8794; GFX6-LABEL: ssdiv_v2i64_mixed_pow2k_denom: 8795; GFX6: ; %bb.0: 8796; GFX6-NEXT: v_mov_b32_e32 v0, 0x457ff000 8797; GFX6-NEXT: v_mov_b32_e32 v1, 0x4f800000 8798; GFX6-NEXT: v_mac_f32_e32 v0, 0, v1 8799; GFX6-NEXT: v_rcp_f32_e32 v0, v0 8800; GFX6-NEXT: s_movk_i32 s6, 0xf001 8801; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 8802; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 8803; GFX6-NEXT: s_mov_b32 s7, 0xf000 8804; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 8805; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 8806; GFX6-NEXT: v_trunc_f32_e32 v1, v1 8807; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 8808; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 8809; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 8810; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8811; GFX6-NEXT: s_ashr_i32 s0, s9, 31 8812; GFX6-NEXT: s_lshr_b32 s0, s0, 20 8813; GFX6-NEXT: v_mul_hi_u32 v2, s6, v0 8814; GFX6-NEXT: v_mul_lo_u32 v3, v1, s6 8815; GFX6-NEXT: s_add_u32 s2, s8, s0 8816; GFX6-NEXT: s_addc_u32 s3, s9, 0 8817; GFX6-NEXT: s_ashr_i32 s8, s11, 31 8818; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 8819; GFX6-NEXT: v_mul_lo_u32 v3, v0, s6 8820; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 8821; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 8822; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 8823; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 8824; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 8825; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 8826; GFX6-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 8827; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 8828; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 8829; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 8830; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 8831; GFX6-NEXT: s_mov_b32 s9, s8 8832; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 8833; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc 8834; GFX6-NEXT: v_mov_b32_e32 v4, 0 8835; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc 8836; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8837; GFX6-NEXT: v_mov_b32_e32 v6, 0 8838; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 8839; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc 8840; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 8841; GFX6-NEXT: v_mul_lo_u32 v5, v2, s6 8842; GFX6-NEXT: v_mul_hi_u32 v7, s6, v0 8843; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5 8844; GFX6-NEXT: v_mul_lo_u32 v7, v0, s6 8845; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, v0, v5 8846; GFX6-NEXT: v_mul_lo_u32 v10, v0, v5 8847; GFX6-NEXT: v_mul_hi_u32 v12, v0, v5 8848; GFX6-NEXT: v_mul_hi_u32 v11, v0, v7 8849; GFX6-NEXT: v_mul_hi_u32 v9, v2, v7 8850; GFX6-NEXT: v_mul_lo_u32 v7, v2, v7 8851; GFX6-NEXT: v_mul_hi_u32 v8, v2, v5 8852; GFX6-NEXT: v_add_i32_e32 v10, vcc, v11, v10 8853; GFX6-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc 8854; GFX6-NEXT: v_mul_lo_u32 v2, v2, v5 8855; GFX6-NEXT: v_add_i32_e32 v7, vcc, v10, v7 8856; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc 8857; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc 8858; GFX6-NEXT: v_add_i32_e32 v2, vcc, v7, v2 8859; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc 8860; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 8861; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[0:1] 8862; GFX6-NEXT: s_add_u32 s0, s10, s8 8863; GFX6-NEXT: s_addc_u32 s1, s11, s8 8864; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 8865; GFX6-NEXT: s_xor_b64 s[0:1], s[0:1], s[8:9] 8866; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 8867; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 8868; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 8869; GFX6-NEXT: v_mul_hi_u32 v5, s0, v1 8870; GFX6-NEXT: v_mul_hi_u32 v7, s1, v1 8871; GFX6-NEXT: v_mul_lo_u32 v1, s1, v1 8872; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8873; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 8874; GFX6-NEXT: v_mul_lo_u32 v5, s1, v0 8875; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 8876; GFX6-NEXT: s_movk_i32 s9, 0xfff 8877; GFX6-NEXT: s_mov_b32 s6, -1 8878; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 8879; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 8880; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc 8881; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 8882; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc 8883; GFX6-NEXT: v_mul_lo_u32 v2, v1, s9 8884; GFX6-NEXT: v_mul_hi_u32 v3, s9, v0 8885; GFX6-NEXT: v_mul_lo_u32 v4, v0, s9 8886; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8887; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s0, v4 8888; GFX6-NEXT: v_mov_b32_e32 v3, s1 8889; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v3, v2, vcc 8890; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s9, v4 8891; GFX6-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v2, vcc 8892; GFX6-NEXT: s_movk_i32 s0, 0xffe 8893; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s0, v3 8894; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 8895; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 8896; GFX6-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc 8897; GFX6-NEXT: v_add_i32_e32 v5, vcc, 2, v0 8898; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc 8899; GFX6-NEXT: v_add_i32_e32 v7, vcc, 1, v0 8900; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v4 8901; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc 8902; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] 8903; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 8904; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 8905; GFX6-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[0:1] 8906; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 8907; GFX6-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc 8908; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 8909; GFX6-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc 8910; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 8911; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 8912; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s8, v0 8913; GFX6-NEXT: v_xor_b32_e32 v1, s8, v1 8914; GFX6-NEXT: v_mov_b32_e32 v3, s8 8915; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc 8916; GFX6-NEXT: v_mov_b32_e32 v0, s2 8917; GFX6-NEXT: v_mov_b32_e32 v1, s3 8918; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 8919; GFX6-NEXT: s_endpgm 8920; GFX9-LABEL: ssdiv_v2i64_mixed_pow2k_denom: 8921; GFX9: ; %bb.0: 8922; GFX9-NEXT: v_mov_b32_e32 v0, 0x457ff000 8923; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000 8924; GFX9-NEXT: v_mac_f32_e32 v0, 0, v1 8925; GFX9-NEXT: v_rcp_f32_e32 v0, v0 8926; GFX9-NEXT: s_movk_i32 s8, 0xf001 8927; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 8928; GFX9-NEXT: v_mov_b32_e32 v4, 0 8929; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 8930; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 8931; GFX9-NEXT: v_trunc_f32_e32 v1, v1 8932; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 8933; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 8934; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 8935; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8936; GFX9-NEXT: s_ashr_i32 s2, s5, 31 8937; GFX9-NEXT: s_lshr_b32 s2, s2, 20 8938; GFX9-NEXT: v_mul_hi_u32 v2, s8, v0 8939; GFX9-NEXT: v_mul_lo_u32 v3, v1, s8 8940; GFX9-NEXT: v_mul_lo_u32 v5, v0, s8 8941; GFX9-NEXT: s_add_u32 s4, s4, s2 8942; GFX9-NEXT: s_addc_u32 s5, s5, 0 8943; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 8944; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 8945; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 8946; GFX9-NEXT: v_mul_hi_u32 v6, v0, v5 8947; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 8948; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 8949; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 8950; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v6, v3 8951; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc 8952; GFX9-NEXT: v_mul_lo_u32 v7, v1, v5 8953; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5 8954; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 12 8955; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 8956; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc 8957; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v4, vcc 8958; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 8959; GFX9-NEXT: v_mov_b32_e32 v6, 0 8960; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 8961; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc 8962; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3] 8963; GFX9-NEXT: v_mul_lo_u32 v5, v2, s8 8964; GFX9-NEXT: v_mul_hi_u32 v7, s8, v0 8965; GFX9-NEXT: v_mul_lo_u32 v8, v0, s8 8966; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 8967; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 8968; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 8969; GFX9-NEXT: v_sub_u32_e32 v5, v5, v0 8970; GFX9-NEXT: v_mul_lo_u32 v10, v0, v5 8971; GFX9-NEXT: v_mul_hi_u32 v11, v0, v8 8972; GFX9-NEXT: v_mul_hi_u32 v12, v0, v5 8973; GFX9-NEXT: v_mul_hi_u32 v9, v2, v8 8974; GFX9-NEXT: v_mul_lo_u32 v8, v2, v8 8975; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 8976; GFX9-NEXT: v_mul_hi_u32 v7, v2, v5 8977; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v12, vcc 8978; GFX9-NEXT: v_mul_lo_u32 v2, v2, v5 8979; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v10, v8 8980; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v9, vcc 8981; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v4, vcc 8982; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 8983; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v5, vcc 8984; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[2:3] 8985; GFX9-NEXT: s_ashr_i32 s2, s7, 31 8986; GFX9-NEXT: s_add_u32 s6, s6, s2 8987; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 8988; GFX9-NEXT: s_mov_b32 s3, s2 8989; GFX9-NEXT: s_addc_u32 s7, s7, s2 8990; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] 8991; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 8992; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 8993; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 8994; GFX9-NEXT: v_mul_hi_u32 v5, s6, v1 8995; GFX9-NEXT: v_mul_hi_u32 v7, s7, v1 8996; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 8997; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 8998; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc 8999; GFX9-NEXT: v_mul_lo_u32 v5, s7, v0 9000; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 9001; GFX9-NEXT: s_movk_i32 s3, 0xfff 9002; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 9003; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 9004; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v7, v4, vcc 9005; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 9006; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v2, vcc 9007; GFX9-NEXT: v_mul_lo_u32 v5, v0, s3 9008; GFX9-NEXT: v_mul_lo_u32 v2, v1, s3 9009; GFX9-NEXT: v_mul_hi_u32 v3, s3, v0 9010; GFX9-NEXT: v_sub_co_u32_e32 v5, vcc, s6, v5 9011; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 9012; GFX9-NEXT: v_mov_b32_e32 v3, s7 9013; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v3, v2, vcc 9014; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s3, v5 9015; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc 9016; GFX9-NEXT: s_movk_i32 s3, 0xffe 9017; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s3, v3 9018; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 9019; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 9020; GFX9-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc 9021; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 2, v0 9022; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc 9023; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 1, v0 9024; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc 9025; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s3, v5 9026; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 9027; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 9028; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3 9029; GFX9-NEXT: v_cndmask_b32_e32 v2, -1, v5, vcc 9030; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 9031; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, v6, s[0:1] 9032; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 9033; GFX9-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[0:1] 9034; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 9035; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 9036; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s2, v0 9037; GFX9-NEXT: v_xor_b32_e32 v1, s2, v1 9038; GFX9-NEXT: v_mov_b32_e32 v3, s2 9039; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc 9040; GFX9-NEXT: v_mov_b32_e32 v0, s4 9041; GFX9-NEXT: v_mov_b32_e32 v1, s5 9042; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9043; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] 9044; GFX9-NEXT: s_endpgm 9045 %r = sdiv <2 x i64> %x, <i64 4096, i64 4095> 9046 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 9047 ret void 9048} 9049 9050define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 9051; CHECK-LABEL: @sdiv_v2i64_pow2_shl_denom( 9052; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 9053; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 9054; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 9055; CHECK-NEXT: [[TMP3:%.*]] = sdiv i64 [[TMP1]], [[TMP2]] 9056; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 9057; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 9058; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 9059; CHECK-NEXT: [[TMP7:%.*]] = sdiv i64 [[TMP5]], [[TMP6]] 9060; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 9061; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 9062; CHECK-NEXT: ret void 9063; 9064; GFX6-LABEL: sdiv_v2i64_pow2_shl_denom: 9065; GFX6: ; %bb.0: 9066; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11 9067; GFX6-NEXT: s_mov_b32 s3, 0 9068; GFX6-NEXT: s_movk_i32 s2, 0x1000 9069; GFX6-NEXT: s_mov_b32 s18, 0x4f800000 9070; GFX6-NEXT: s_mov_b32 s19, 0x5f7ffffc 9071; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9072; GFX6-NEXT: s_lshl_b64 s[12:13], s[2:3], s6 9073; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 9074; GFX6-NEXT: s_ashr_i32 s16, s3, 31 9075; GFX6-NEXT: s_add_u32 s2, s2, s16 9076; GFX6-NEXT: s_mov_b32 s17, s16 9077; GFX6-NEXT: s_addc_u32 s3, s3, s16 9078; GFX6-NEXT: s_xor_b64 s[14:15], s[2:3], s[16:17] 9079; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s14 9080; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s15 9081; GFX6-NEXT: s_mov_b32 s20, 0x2f800000 9082; GFX6-NEXT: s_mov_b32 s21, 0xcf800000 9083; GFX6-NEXT: s_sub_u32 s6, 0, s14 9084; GFX6-NEXT: v_mac_f32_e32 v0, s18, v1 9085; GFX6-NEXT: v_rcp_f32_e32 v0, v0 9086; GFX6-NEXT: s_subb_u32 s7, 0, s15 9087; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 9088; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 9089; GFX6-NEXT: v_mul_f32_e32 v0, s19, v0 9090; GFX6-NEXT: v_mul_f32_e32 v1, s20, v0 9091; GFX6-NEXT: v_trunc_f32_e32 v1, v1 9092; GFX6-NEXT: v_mac_f32_e32 v0, s21, v1 9093; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 9094; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 9095; GFX6-NEXT: v_mul_hi_u32 v3, s6, v0 9096; GFX6-NEXT: v_mul_lo_u32 v2, s6, v1 9097; GFX6-NEXT: v_mul_lo_u32 v4, s7, v0 9098; GFX6-NEXT: v_mul_lo_u32 v5, s6, v0 9099; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9100; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 9101; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 9102; GFX6-NEXT: v_mul_hi_u32 v4, v0, v5 9103; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 9104; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 9105; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 9106; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 9107; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc 9108; GFX6-NEXT: v_mul_lo_u32 v6, v1, v5 9109; GFX6-NEXT: v_mul_hi_u32 v5, v1, v5 9110; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 9111; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc 9112; GFX6-NEXT: v_mov_b32_e32 v4, 0 9113; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc 9114; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9115; GFX6-NEXT: v_mov_b32_e32 v6, 0 9116; GFX6-NEXT: v_add_i32_e64 v0, s[2:3], v0, v2 9117; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc 9118; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[2:3] 9119; GFX6-NEXT: v_mul_lo_u32 v5, s6, v2 9120; GFX6-NEXT: v_mul_hi_u32 v7, s6, v0 9121; GFX6-NEXT: v_mul_lo_u32 v8, s7, v0 9122; GFX6-NEXT: s_mov_b32 s7, 0xf000 9123; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5 9124; GFX6-NEXT: v_mul_lo_u32 v7, s6, v0 9125; GFX6-NEXT: v_add_i32_e32 v5, vcc, v8, v5 9126; GFX6-NEXT: v_mul_lo_u32 v10, v0, v5 9127; GFX6-NEXT: v_mul_hi_u32 v12, v0, v5 9128; GFX6-NEXT: v_mul_hi_u32 v11, v0, v7 9129; GFX6-NEXT: v_mul_hi_u32 v9, v2, v7 9130; GFX6-NEXT: v_mul_lo_u32 v7, v2, v7 9131; GFX6-NEXT: v_mul_hi_u32 v8, v2, v5 9132; GFX6-NEXT: v_add_i32_e32 v10, vcc, v11, v10 9133; GFX6-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc 9134; GFX6-NEXT: v_mul_lo_u32 v2, v2, v5 9135; GFX6-NEXT: v_add_i32_e32 v7, vcc, v10, v7 9136; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc 9137; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc 9138; GFX6-NEXT: v_add_i32_e32 v2, vcc, v7, v2 9139; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc 9140; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 9141; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[2:3] 9142; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9143; GFX6-NEXT: s_ashr_i32 s2, s9, 31 9144; GFX6-NEXT: s_add_u32 s0, s8, s2 9145; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 9146; GFX6-NEXT: s_mov_b32 s3, s2 9147; GFX6-NEXT: s_addc_u32 s1, s9, s2 9148; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[2:3] 9149; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 9150; GFX6-NEXT: v_mul_lo_u32 v2, s8, v1 9151; GFX6-NEXT: v_mul_hi_u32 v3, s8, v0 9152; GFX6-NEXT: v_mul_hi_u32 v5, s8, v1 9153; GFX6-NEXT: v_mul_hi_u32 v7, s9, v1 9154; GFX6-NEXT: v_mul_lo_u32 v1, s9, v1 9155; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9156; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 9157; GFX6-NEXT: v_mul_lo_u32 v5, s9, v0 9158; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0 9159; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[16:17] 9160; GFX6-NEXT: s_mov_b32 s6, -1 9161; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 9162; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 9163; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc 9164; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 9165; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc 9166; GFX6-NEXT: v_mul_lo_u32 v2, s14, v1 9167; GFX6-NEXT: v_mul_hi_u32 v3, s14, v0 9168; GFX6-NEXT: v_mul_lo_u32 v5, s15, v0 9169; GFX6-NEXT: v_mov_b32_e32 v7, s15 9170; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9171; GFX6-NEXT: v_mul_lo_u32 v3, s14, v0 9172; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 9173; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s9, v2 9174; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s8, v3 9175; GFX6-NEXT: v_subb_u32_e64 v5, s[0:1], v5, v7, vcc 9176; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s14, v3 9177; GFX6-NEXT: v_subbrev_u32_e64 v5, s[0:1], 0, v5, s[0:1] 9178; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v5 9179; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] 9180; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v7 9181; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 9182; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v5 9183; GFX6-NEXT: v_cndmask_b32_e64 v5, v8, v7, s[0:1] 9184; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 2, v0 9185; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] 9186; GFX6-NEXT: v_add_i32_e64 v9, s[0:1], 1, v0 9187; GFX6-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v1, s[0:1] 9188; GFX6-NEXT: s_ashr_i32 s8, s13, 31 9189; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 9190; GFX6-NEXT: s_add_u32 s12, s12, s8 9191; GFX6-NEXT: v_cndmask_b32_e64 v5, v10, v8, s[0:1] 9192; GFX6-NEXT: v_mov_b32_e32 v8, s9 9193; GFX6-NEXT: s_mov_b32 s9, s8 9194; GFX6-NEXT: s_addc_u32 s13, s13, s8 9195; GFX6-NEXT: s_xor_b64 s[12:13], s[12:13], s[8:9] 9196; GFX6-NEXT: v_cvt_f32_u32_e32 v10, s12 9197; GFX6-NEXT: v_cvt_f32_u32_e32 v11, s13 9198; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v8, v2, vcc 9199; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s15, v2 9200; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 9201; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s14, v3 9202; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 9203; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s15, v2 9204; GFX6-NEXT: v_mac_f32_e32 v10, s18, v11 9205; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc 9206; GFX6-NEXT: v_rcp_f32_e32 v3, v10 9207; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 9208; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 9209; GFX6-NEXT: s_sub_u32 s14, 0, s12 9210; GFX6-NEXT: v_mul_f32_e32 v3, s19, v3 9211; GFX6-NEXT: v_mul_f32_e32 v5, s20, v3 9212; GFX6-NEXT: v_trunc_f32_e32 v5, v5 9213; GFX6-NEXT: v_mac_f32_e32 v3, s21, v5 9214; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 9215; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v5 9216; GFX6-NEXT: v_cndmask_b32_e64 v2, v9, v7, s[0:1] 9217; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 9218; GFX6-NEXT: v_mul_hi_u32 v2, s14, v3 9219; GFX6-NEXT: v_mul_lo_u32 v7, s14, v5 9220; GFX6-NEXT: s_subb_u32 s15, 0, s13 9221; GFX6-NEXT: v_mul_lo_u32 v8, s15, v3 9222; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 9223; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v7 9224; GFX6-NEXT: v_mul_lo_u32 v7, s14, v3 9225; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v8 9226; GFX6-NEXT: v_mul_lo_u32 v8, v3, v2 9227; GFX6-NEXT: v_mul_hi_u32 v10, v3, v2 9228; GFX6-NEXT: v_mul_hi_u32 v9, v3, v7 9229; GFX6-NEXT: v_mul_hi_u32 v11, v5, v2 9230; GFX6-NEXT: v_mul_lo_u32 v2, v5, v2 9231; GFX6-NEXT: v_xor_b32_e32 v1, s3, v1 9232; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 9233; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc 9234; GFX6-NEXT: v_mul_lo_u32 v10, v5, v7 9235; GFX6-NEXT: v_mul_hi_u32 v7, v5, v7 9236; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v10 9237; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc 9238; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v11, v4, vcc 9239; GFX6-NEXT: v_add_i32_e32 v2, vcc, v7, v2 9240; GFX6-NEXT: v_add_i32_e64 v2, s[0:1], v3, v2 9241; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v6, v8, vcc 9242; GFX6-NEXT: v_addc_u32_e64 v3, vcc, v5, v7, s[0:1] 9243; GFX6-NEXT: v_mul_lo_u32 v8, s14, v3 9244; GFX6-NEXT: v_mul_hi_u32 v9, s14, v2 9245; GFX6-NEXT: v_mul_lo_u32 v10, s15, v2 9246; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 9247; GFX6-NEXT: v_mul_lo_u32 v9, s14, v2 9248; GFX6-NEXT: v_add_i32_e32 v8, vcc, v10, v8 9249; GFX6-NEXT: v_mul_lo_u32 v12, v2, v8 9250; GFX6-NEXT: v_mul_hi_u32 v14, v2, v8 9251; GFX6-NEXT: v_mul_hi_u32 v13, v2, v9 9252; GFX6-NEXT: v_mul_hi_u32 v11, v3, v9 9253; GFX6-NEXT: v_mul_lo_u32 v9, v3, v9 9254; GFX6-NEXT: v_mul_hi_u32 v10, v3, v8 9255; GFX6-NEXT: v_add_i32_e32 v12, vcc, v13, v12 9256; GFX6-NEXT: v_addc_u32_e32 v13, vcc, 0, v14, vcc 9257; GFX6-NEXT: v_mul_lo_u32 v3, v3, v8 9258; GFX6-NEXT: v_add_i32_e32 v9, vcc, v12, v9 9259; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v13, v11, vcc 9260; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v10, v4, vcc 9261; GFX6-NEXT: v_add_i32_e32 v3, vcc, v9, v3 9262; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v6, v8, vcc 9263; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v7 9264; GFX6-NEXT: s_ashr_i32 s14, s11, 31 9265; GFX6-NEXT: v_addc_u32_e64 v5, vcc, v5, v8, s[0:1] 9266; GFX6-NEXT: s_add_u32 s0, s10, s14 9267; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 9268; GFX6-NEXT: s_mov_b32 s15, s14 9269; GFX6-NEXT: s_addc_u32 s1, s11, s14 9270; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] 9271; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 9272; GFX6-NEXT: v_mul_lo_u32 v5, s10, v3 9273; GFX6-NEXT: v_mul_hi_u32 v7, s10, v2 9274; GFX6-NEXT: v_mul_hi_u32 v9, s10, v3 9275; GFX6-NEXT: v_mul_hi_u32 v10, s11, v3 9276; GFX6-NEXT: v_mul_lo_u32 v3, s11, v3 9277; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5 9278; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc 9279; GFX6-NEXT: v_mul_lo_u32 v9, s11, v2 9280; GFX6-NEXT: v_mul_hi_u32 v2, s11, v2 9281; GFX6-NEXT: v_mov_b32_e32 v8, s3 9282; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v9 9283; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v7, v2, vcc 9284; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v10, v4, vcc 9285; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 9286; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v4, vcc 9287; GFX6-NEXT: v_mul_lo_u32 v4, s12, v3 9288; GFX6-NEXT: v_mul_hi_u32 v5, s12, v2 9289; GFX6-NEXT: v_mul_lo_u32 v6, s13, v2 9290; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 9291; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc 9292; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 9293; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2 9294; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 9295; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s11, v4 9296; GFX6-NEXT: v_mov_b32_e32 v7, s13 9297; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s10, v5 9298; GFX6-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v7, vcc 9299; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s12, v5 9300; GFX6-NEXT: v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1] 9301; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v6 9302; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] 9303; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v7 9304; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 9305; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v6 9306; GFX6-NEXT: v_cndmask_b32_e64 v6, v8, v7, s[0:1] 9307; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 2, v2 9308; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1] 9309; GFX6-NEXT: v_add_i32_e64 v9, s[0:1], 1, v2 9310; GFX6-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] 9311; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 9312; GFX6-NEXT: v_cndmask_b32_e64 v6, v10, v8, s[0:1] 9313; GFX6-NEXT: v_mov_b32_e32 v8, s11 9314; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v8, v4, vcc 9315; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v4 9316; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 9317; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v5 9318; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 9319; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s13, v4 9320; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc 9321; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 9322; GFX6-NEXT: v_cndmask_b32_e64 v4, v9, v7, s[0:1] 9323; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 9324; GFX6-NEXT: s_xor_b64 s[0:1], s[14:15], s[8:9] 9325; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 9326; GFX6-NEXT: v_xor_b32_e32 v2, s0, v2 9327; GFX6-NEXT: v_xor_b32_e32 v3, s1, v3 9328; GFX6-NEXT: v_mov_b32_e32 v4, s1 9329; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s0, v2 9330; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc 9331; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 9332; GFX6-NEXT: s_endpgm 9333; GFX9-LABEL: sdiv_v2i64_pow2_shl_denom: 9334; GFX9: ; %bb.0: 9335; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 9336; GFX9-NEXT: s_mov_b32 s3, 0 9337; GFX9-NEXT: s_movk_i32 s2, 0x1000 9338; GFX9-NEXT: s_mov_b32 s18, 0x4f800000 9339; GFX9-NEXT: s_mov_b32 s19, 0x5f7ffffc 9340; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9341; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s6 9342; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 9343; GFX9-NEXT: s_ashr_i32 s12, s3, 31 9344; GFX9-NEXT: s_add_u32 s2, s2, s12 9345; GFX9-NEXT: s_mov_b32 s13, s12 9346; GFX9-NEXT: s_addc_u32 s3, s3, s12 9347; GFX9-NEXT: s_xor_b64 s[10:11], s[2:3], s[12:13] 9348; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s10 9349; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s11 9350; GFX9-NEXT: s_mov_b32 s20, 0x2f800000 9351; GFX9-NEXT: s_mov_b32 s21, 0xcf800000 9352; GFX9-NEXT: s_sub_u32 s14, 0, s10 9353; GFX9-NEXT: v_mac_f32_e32 v0, s18, v1 9354; GFX9-NEXT: v_rcp_f32_e32 v0, v0 9355; GFX9-NEXT: s_subb_u32 s4, 0, s11 9356; GFX9-NEXT: v_mov_b32_e32 v6, 0 9357; GFX9-NEXT: v_mul_f32_e32 v0, s19, v0 9358; GFX9-NEXT: v_mul_f32_e32 v1, s20, v0 9359; GFX9-NEXT: v_trunc_f32_e32 v1, v1 9360; GFX9-NEXT: v_mac_f32_e32 v0, s21, v1 9361; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 9362; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 9363; GFX9-NEXT: v_mul_hi_u32 v3, s14, v0 9364; GFX9-NEXT: v_mul_lo_u32 v2, s14, v1 9365; GFX9-NEXT: v_mul_lo_u32 v5, s4, v0 9366; GFX9-NEXT: v_mul_lo_u32 v4, s14, v0 9367; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 9368; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 9369; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 9370; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 9371; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 9372; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 9373; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 9374; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 9375; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc 9376; GFX9-NEXT: v_mul_lo_u32 v7, v1, v4 9377; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 9378; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 9379; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc 9380; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v6, vcc 9381; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 9382; GFX9-NEXT: v_mov_b32_e32 v5, 0 9383; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 9384; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc 9385; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3] 9386; GFX9-NEXT: v_mul_lo_u32 v4, s14, v2 9387; GFX9-NEXT: v_mul_hi_u32 v7, s14, v0 9388; GFX9-NEXT: v_mul_lo_u32 v8, s4, v0 9389; GFX9-NEXT: v_mul_lo_u32 v9, s14, v0 9390; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 9391; GFX9-NEXT: v_add_u32_e32 v4, v7, v4 9392; GFX9-NEXT: v_add_u32_e32 v4, v4, v8 9393; GFX9-NEXT: v_mul_lo_u32 v10, v0, v4 9394; GFX9-NEXT: v_mul_hi_u32 v11, v0, v9 9395; GFX9-NEXT: v_mul_hi_u32 v12, v0, v4 9396; GFX9-NEXT: v_mul_hi_u32 v8, v2, v9 9397; GFX9-NEXT: v_mul_lo_u32 v9, v2, v9 9398; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 9399; GFX9-NEXT: v_mul_hi_u32 v7, v2, v4 9400; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v12, vcc 9401; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 9402; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 9403; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v8, vcc 9404; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v6, vcc 9405; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 9406; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9407; GFX9-NEXT: s_ashr_i32 s14, s5, 31 9408; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v5, v4, vcc 9409; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 9410; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3] 9411; GFX9-NEXT: s_add_u32 s2, s4, s14 9412; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 9413; GFX9-NEXT: s_addc_u32 s3, s5, s14 9414; GFX9-NEXT: s_mov_b32 s15, s14 9415; GFX9-NEXT: s_xor_b64 s[16:17], s[2:3], s[14:15] 9416; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 9417; GFX9-NEXT: v_mul_lo_u32 v2, s16, v1 9418; GFX9-NEXT: v_mul_hi_u32 v3, s16, v0 9419; GFX9-NEXT: v_mul_hi_u32 v4, s16, v1 9420; GFX9-NEXT: v_mul_hi_u32 v7, s17, v1 9421; GFX9-NEXT: v_mul_lo_u32 v1, s17, v1 9422; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 9423; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 9424; GFX9-NEXT: v_mul_lo_u32 v4, s17, v0 9425; GFX9-NEXT: v_mul_hi_u32 v0, s17, v0 9426; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 9427; GFX9-NEXT: s_xor_b64 s[12:13], s[14:15], s[12:13] 9428; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 9429; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 9430; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v7, v6, vcc 9431; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 9432; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v2, vcc 9433; GFX9-NEXT: v_mul_lo_u32 v2, s10, v1 9434; GFX9-NEXT: v_mul_hi_u32 v3, s10, v0 9435; GFX9-NEXT: v_mul_lo_u32 v4, s11, v0 9436; GFX9-NEXT: v_mov_b32_e32 v7, s11 9437; GFX9-NEXT: s_ashr_i32 s14, s9, 31 9438; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 9439; GFX9-NEXT: v_mul_lo_u32 v3, s10, v0 9440; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 9441; GFX9-NEXT: v_sub_u32_e32 v4, s17, v2 9442; GFX9-NEXT: s_mov_b32 s15, s14 9443; GFX9-NEXT: v_sub_co_u32_e64 v3, s[0:1], s16, v3 9444; GFX9-NEXT: v_subb_co_u32_e64 v4, vcc, v4, v7, s[0:1] 9445; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s10, v3 9446; GFX9-NEXT: v_subbrev_co_u32_e32 v4, vcc, 0, v4, vcc 9447; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v4 9448; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 9449; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v7 9450; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 9451; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s11, v4 9452; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v7, vcc 9453; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 2, v0 9454; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc 9455; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, 1, v0 9456; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, 0, v1, vcc 9457; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, v4 9458; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, v8, s[2:3] 9459; GFX9-NEXT: v_mov_b32_e32 v8, s17 9460; GFX9-NEXT: v_subb_co_u32_e64 v2, vcc, v8, v2, s[0:1] 9461; GFX9-NEXT: s_add_u32 s0, s8, s14 9462; GFX9-NEXT: s_addc_u32 s1, s9, s14 9463; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[14:15] 9464; GFX9-NEXT: v_cvt_f32_u32_e32 v10, s8 9465; GFX9-NEXT: v_cvt_f32_u32_e32 v11, s9 9466; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v2 9467; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 9468; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 9469; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 9470; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s11, v2 9471; GFX9-NEXT: v_mac_f32_e32 v10, s18, v11 9472; GFX9-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc 9473; GFX9-NEXT: v_rcp_f32_e32 v3, v10 9474; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 9475; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 9476; GFX9-NEXT: v_cndmask_b32_e64 v2, v9, v7, s[2:3] 9477; GFX9-NEXT: v_mul_f32_e32 v3, s19, v3 9478; GFX9-NEXT: v_mul_f32_e32 v4, s20, v3 9479; GFX9-NEXT: v_trunc_f32_e32 v4, v4 9480; GFX9-NEXT: v_mac_f32_e32 v3, s21, v4 9481; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 9482; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 9483; GFX9-NEXT: s_sub_u32 s2, 0, s8 9484; GFX9-NEXT: s_subb_u32 s3, 0, s9 9485; GFX9-NEXT: v_mul_hi_u32 v7, s2, v3 9486; GFX9-NEXT: v_mul_lo_u32 v8, s2, v4 9487; GFX9-NEXT: v_mul_lo_u32 v9, s3, v3 9488; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 9489; GFX9-NEXT: v_mul_lo_u32 v2, s2, v3 9490; GFX9-NEXT: v_add_u32_e32 v7, v7, v8 9491; GFX9-NEXT: v_add_u32_e32 v7, v7, v9 9492; GFX9-NEXT: v_mul_lo_u32 v8, v3, v7 9493; GFX9-NEXT: v_mul_hi_u32 v9, v3, v2 9494; GFX9-NEXT: v_mul_hi_u32 v10, v3, v7 9495; GFX9-NEXT: v_mul_hi_u32 v11, v4, v7 9496; GFX9-NEXT: v_mul_lo_u32 v7, v4, v7 9497; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 9498; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v10, vcc 9499; GFX9-NEXT: v_mul_lo_u32 v10, v4, v2 9500; GFX9-NEXT: v_mul_hi_u32 v2, v4, v2 9501; GFX9-NEXT: s_ashr_i32 s10, s7, 31 9502; GFX9-NEXT: s_mov_b32 s11, s10 9503; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10 9504; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v9, v2, vcc 9505; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v6, vcc 9506; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 9507; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], v3, v2 9508; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v5, v8, vcc 9509; GFX9-NEXT: v_addc_co_u32_e64 v3, vcc, v4, v7, s[0:1] 9510; GFX9-NEXT: v_mul_lo_u32 v8, s2, v3 9511; GFX9-NEXT: v_mul_hi_u32 v9, s2, v2 9512; GFX9-NEXT: v_mul_lo_u32 v10, s3, v2 9513; GFX9-NEXT: v_mul_lo_u32 v11, s2, v2 9514; GFX9-NEXT: v_add_u32_e32 v4, v4, v7 9515; GFX9-NEXT: v_add_u32_e32 v8, v9, v8 9516; GFX9-NEXT: v_add_u32_e32 v8, v8, v10 9517; GFX9-NEXT: v_mul_lo_u32 v12, v2, v8 9518; GFX9-NEXT: v_mul_hi_u32 v13, v2, v11 9519; GFX9-NEXT: v_mul_hi_u32 v14, v2, v8 9520; GFX9-NEXT: v_mul_hi_u32 v10, v3, v11 9521; GFX9-NEXT: v_mul_lo_u32 v11, v3, v11 9522; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v13, v12 9523; GFX9-NEXT: v_mul_hi_u32 v9, v3, v8 9524; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v14, vcc 9525; GFX9-NEXT: v_mul_lo_u32 v3, v3, v8 9526; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v12, v11 9527; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v13, v10, vcc 9528; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v9, v6, vcc 9529; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v10, v3 9530; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v5, v8, vcc 9531; GFX9-NEXT: v_addc_co_u32_e64 v4, vcc, v4, v8, s[0:1] 9532; GFX9-NEXT: s_add_u32 s0, s6, s10 9533; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 9534; GFX9-NEXT: s_addc_u32 s1, s7, s10 9535; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] 9536; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 9537; GFX9-NEXT: v_mul_lo_u32 v4, s6, v3 9538; GFX9-NEXT: v_mul_hi_u32 v7, s6, v2 9539; GFX9-NEXT: v_mul_hi_u32 v9, s6, v3 9540; GFX9-NEXT: v_mul_hi_u32 v10, s7, v3 9541; GFX9-NEXT: v_mul_lo_u32 v3, s7, v3 9542; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v7, v4 9543; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v9, vcc 9544; GFX9-NEXT: v_mul_lo_u32 v9, s7, v2 9545; GFX9-NEXT: v_mul_hi_u32 v2, s7, v2 9546; GFX9-NEXT: v_xor_b32_e32 v0, s12, v0 9547; GFX9-NEXT: v_xor_b32_e32 v1, s13, v1 9548; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9 9549; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v7, v2, vcc 9550; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v10, v6, vcc 9551; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 9552; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc 9553; GFX9-NEXT: v_mul_lo_u32 v4, s8, v3 9554; GFX9-NEXT: v_mul_hi_u32 v5, s8, v2 9555; GFX9-NEXT: v_mul_lo_u32 v7, s9, v2 9556; GFX9-NEXT: v_mov_b32_e32 v8, s13 9557; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s12, v0 9558; GFX9-NEXT: v_add_u32_e32 v4, v5, v4 9559; GFX9-NEXT: v_mul_lo_u32 v5, s8, v2 9560; GFX9-NEXT: v_add_u32_e32 v4, v4, v7 9561; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v8, vcc 9562; GFX9-NEXT: v_sub_u32_e32 v7, s7, v4 9563; GFX9-NEXT: v_mov_b32_e32 v8, s9 9564; GFX9-NEXT: v_sub_co_u32_e64 v5, s[0:1], s6, v5 9565; GFX9-NEXT: v_subb_co_u32_e64 v7, vcc, v7, v8, s[0:1] 9566; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s8, v5 9567; GFX9-NEXT: v_subbrev_co_u32_e32 v7, vcc, 0, v7, vcc 9568; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v7 9569; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc 9570; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v8 9571; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 9572; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s9, v7 9573; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v8, vcc 9574; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 2, v2 9575; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v3, vcc 9576; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 1, v2 9577; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v3, vcc 9578; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, v7 9579; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v9, s[2:3] 9580; GFX9-NEXT: v_mov_b32_e32 v9, s7 9581; GFX9-NEXT: v_subb_co_u32_e64 v4, vcc, v9, v4, s[0:1] 9582; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v4 9583; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc 9584; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v5 9585; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 9586; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s9, v4 9587; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc 9588; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 9589; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, v8, s[2:3] 9590; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 9591; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], s[14:15] 9592; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc 9593; GFX9-NEXT: v_xor_b32_e32 v2, s0, v2 9594; GFX9-NEXT: v_xor_b32_e32 v3, s1, v3 9595; GFX9-NEXT: v_mov_b32_e32 v4, s1 9596; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v2 9597; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc 9598; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9599; GFX9-NEXT: global_store_dwordx4 v6, v[0:3], s[4:5] 9600; GFX9-NEXT: s_endpgm 9601 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 9602 %r = sdiv <2 x i64> %x, %shl.y 9603 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 9604 ret void 9605} 9606 9607define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 9608; CHECK-LABEL: @srem_i64_oddk_denom( 9609; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], 1235195 9610; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 9611; CHECK-NEXT: ret void 9612; 9613; GFX6-LABEL: srem_i64_oddk_denom: 9614; GFX6: ; %bb.0: 9615; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f800000 9616; GFX6-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 9617; GFX6-NEXT: v_rcp_f32_e32 v0, v0 9618; GFX6-NEXT: s_mov_b32 s2, 0xffed2705 9619; GFX6-NEXT: v_mov_b32_e32 v8, 0 9620; GFX6-NEXT: v_mov_b32_e32 v7, 0 9621; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 9622; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 9623; GFX6-NEXT: v_trunc_f32_e32 v1, v1 9624; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 9625; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 9626; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 9627; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 9628; GFX6-NEXT: s_mov_b32 s7, 0xf000 9629; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 9630; GFX6-NEXT: v_mul_lo_u32 v2, v1, s2 9631; GFX6-NEXT: v_mul_lo_u32 v4, v0, s2 9632; GFX6-NEXT: s_mov_b32 s6, -1 9633; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9634; GFX6-NEXT: s_mov_b32 s4, s8 9635; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9636; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 9637; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 9638; GFX6-NEXT: v_mul_hi_u32 v6, v0, v4 9639; GFX6-NEXT: v_mul_hi_u32 v3, v0, v2 9640; GFX6-NEXT: v_mul_hi_u32 v9, v1, v2 9641; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 9642; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 9643; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 9644; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 9645; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v3, vcc 9646; GFX6-NEXT: s_mov_b32 s5, s9 9647; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 9648; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc 9649; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc 9650; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9651; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 9652; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 9653; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 9654; GFX6-NEXT: v_mul_lo_u32 v4, v2, s2 9655; GFX6-NEXT: v_mul_hi_u32 v5, s2, v0 9656; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 9657; GFX6-NEXT: v_mul_lo_u32 v5, v0, s2 9658; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, v0, v4 9659; GFX6-NEXT: v_mul_lo_u32 v10, v0, v4 9660; GFX6-NEXT: v_mul_hi_u32 v12, v0, v4 9661; GFX6-NEXT: v_mul_hi_u32 v11, v0, v5 9662; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5 9663; GFX6-NEXT: v_mul_lo_u32 v5, v2, v5 9664; GFX6-NEXT: v_mul_hi_u32 v6, v2, v4 9665; GFX6-NEXT: v_add_i32_e32 v10, vcc, v11, v10 9666; GFX6-NEXT: v_addc_u32_e32 v11, vcc, v8, v12, vcc 9667; GFX6-NEXT: v_mul_lo_u32 v2, v2, v4 9668; GFX6-NEXT: v_add_i32_e32 v5, vcc, v10, v5 9669; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v11, v9, vcc 9670; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v6, v7, vcc 9671; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 9672; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc 9673; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 9674; GFX6-NEXT: s_ashr_i32 s2, s11, 31 9675; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] 9676; GFX6-NEXT: s_add_u32 s0, s10, s2 9677; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 9678; GFX6-NEXT: s_mov_b32 s3, s2 9679; GFX6-NEXT: s_addc_u32 s1, s11, s2 9680; GFX6-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] 9681; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 9682; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 9683; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 9684; GFX6-NEXT: v_mul_hi_u32 v4, s0, v1 9685; GFX6-NEXT: v_mul_hi_u32 v5, s1, v1 9686; GFX6-NEXT: v_mul_lo_u32 v1, s1, v1 9687; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9688; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 9689; GFX6-NEXT: v_mul_lo_u32 v4, s1, v0 9690; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 9691; GFX6-NEXT: s_mov_b32 s3, 0x12d8fb 9692; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 9693; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 9694; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc 9695; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 9696; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc 9697; GFX6-NEXT: v_mul_hi_u32 v2, s3, v0 9698; GFX6-NEXT: v_mul_lo_u32 v1, v1, s3 9699; GFX6-NEXT: v_mul_lo_u32 v0, v0, s3 9700; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 9701; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 9702; GFX6-NEXT: v_mov_b32_e32 v2, s1 9703; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 9704; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v0 9705; GFX6-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v1, vcc 9706; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s3, v2 9707; GFX6-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v3, vcc 9708; GFX6-NEXT: s_mov_b32 s0, 0x12d8fa 9709; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s0, v2 9710; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 9711; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 9712; GFX6-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc 9713; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 9714; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v0 9715; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 9716; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 9717; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 9718; GFX6-NEXT: v_cndmask_b32_e64 v5, -1, v5, s[0:1] 9719; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 9720; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 9721; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 9722; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 9723; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 9724; GFX6-NEXT: v_xor_b32_e32 v1, s2, v1 9725; GFX6-NEXT: v_mov_b32_e32 v2, s2 9726; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 9727; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 9728; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 9729; GFX6-NEXT: s_endpgm 9730; GFX9-LABEL: srem_i64_oddk_denom: 9731; GFX9: ; %bb.0: 9732; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f800000 9733; GFX9-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 9734; GFX9-NEXT: v_rcp_f32_e32 v0, v0 9735; GFX9-NEXT: s_mov_b32 s8, 0xffed2705 9736; GFX9-NEXT: v_mov_b32_e32 v7, 0 9737; GFX9-NEXT: v_mov_b32_e32 v5, 0 9738; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 9739; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 9740; GFX9-NEXT: v_trunc_f32_e32 v1, v1 9741; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 9742; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 9743; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 9744; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 9745; GFX9-NEXT: v_mul_hi_u32 v3, s8, v0 9746; GFX9-NEXT: v_mul_lo_u32 v2, v1, s8 9747; GFX9-NEXT: v_mul_lo_u32 v4, v0, s8 9748; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 9749; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 9750; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 9751; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 9752; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 9753; GFX9-NEXT: v_mul_hi_u32 v9, v1, v2 9754; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 9755; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 9756; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v8, vcc 9757; GFX9-NEXT: v_mul_lo_u32 v8, v1, v4 9758; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 9759; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8 9760; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc 9761; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc 9762; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 9763; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 9764; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v4, vcc 9765; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3] 9766; GFX9-NEXT: v_mul_lo_u32 v4, v2, s8 9767; GFX9-NEXT: v_mul_hi_u32 v6, s8, v0 9768; GFX9-NEXT: v_mul_lo_u32 v8, v0, s8 9769; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 9770; GFX9-NEXT: v_add_u32_e32 v4, v6, v4 9771; GFX9-NEXT: v_sub_u32_e32 v4, v4, v0 9772; GFX9-NEXT: v_mul_lo_u32 v10, v0, v4 9773; GFX9-NEXT: v_mul_hi_u32 v11, v0, v8 9774; GFX9-NEXT: v_mul_hi_u32 v12, v0, v4 9775; GFX9-NEXT: v_mul_hi_u32 v9, v2, v8 9776; GFX9-NEXT: v_mul_lo_u32 v8, v2, v8 9777; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 9778; GFX9-NEXT: v_mul_hi_u32 v6, v2, v4 9779; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v7, v12, vcc 9780; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 9781; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v10, v8 9782; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v9, vcc 9783; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v5, vcc 9784; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 9785; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc 9786; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3] 9787; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9788; GFX9-NEXT: s_ashr_i32 s2, s7, 31 9789; GFX9-NEXT: s_add_u32 s0, s6, s2 9790; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 9791; GFX9-NEXT: s_mov_b32 s3, s2 9792; GFX9-NEXT: s_addc_u32 s1, s7, s2 9793; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] 9794; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 9795; GFX9-NEXT: v_mul_lo_u32 v2, s0, v1 9796; GFX9-NEXT: v_mul_hi_u32 v3, s0, v0 9797; GFX9-NEXT: v_mul_hi_u32 v4, s0, v1 9798; GFX9-NEXT: v_mul_hi_u32 v6, s1, v1 9799; GFX9-NEXT: v_mul_lo_u32 v1, s1, v1 9800; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 9801; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v4, vcc 9802; GFX9-NEXT: v_mul_lo_u32 v4, s1, v0 9803; GFX9-NEXT: v_mul_hi_u32 v0, s1, v0 9804; GFX9-NEXT: s_mov_b32 s3, 0x12d8fb 9805; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 9806; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 9807; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v5, vcc 9808; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 9809; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v2, vcc 9810; GFX9-NEXT: v_mul_hi_u32 v2, s3, v0 9811; GFX9-NEXT: v_mul_lo_u32 v1, v1, s3 9812; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 9813; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 9814; GFX9-NEXT: v_mov_b32_e32 v2, s1 9815; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 9816; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 9817; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s3, v0 9818; GFX9-NEXT: v_subbrev_co_u32_e32 v3, vcc, 0, v1, vcc 9819; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s3, v2 9820; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v3, vcc 9821; GFX9-NEXT: s_mov_b32 s3, 0x12d8fa 9822; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s3, v2 9823; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 9824; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 9825; GFX9-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc 9826; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 9827; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s3, v0 9828; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] 9829; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 9830; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 9831; GFX9-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc 9832; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 9833; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] 9834; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 9835; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 9836; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 9837; GFX9-NEXT: v_xor_b32_e32 v1, s2, v1 9838; GFX9-NEXT: v_mov_b32_e32 v2, s2 9839; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s2, v0 9840; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc 9841; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] 9842; GFX9-NEXT: s_endpgm 9843 %r = srem i64 %x, 1235195 9844 store i64 %r, i64 addrspace(1)* %out 9845 ret void 9846} 9847 9848define amdgpu_kernel void @srem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 9849; CHECK-LABEL: @srem_i64_pow2k_denom( 9850; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], 4096 9851; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 9852; CHECK-NEXT: ret void 9853; 9854; GFX6-LABEL: srem_i64_pow2k_denom: 9855; GFX6: ; %bb.0: 9856; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 9857; GFX6-NEXT: s_mov_b32 s3, 0xf000 9858; GFX6-NEXT: s_mov_b32 s2, -1 9859; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9860; GFX6-NEXT: s_mov_b32 s0, s4 9861; GFX6-NEXT: s_ashr_i32 s4, s7, 31 9862; GFX6-NEXT: s_lshr_b32 s4, s4, 20 9863; GFX6-NEXT: s_add_u32 s4, s6, s4 9864; GFX6-NEXT: s_mov_b32 s1, s5 9865; GFX6-NEXT: s_addc_u32 s5, s7, 0 9866; GFX6-NEXT: s_and_b32 s4, s4, 0xfffff000 9867; GFX6-NEXT: s_sub_u32 s4, s6, s4 9868; GFX6-NEXT: s_subb_u32 s5, s7, s5 9869; GFX6-NEXT: v_mov_b32_e32 v0, s4 9870; GFX6-NEXT: v_mov_b32_e32 v1, s5 9871; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 9872; GFX6-NEXT: s_endpgm 9873; GFX9-LABEL: srem_i64_pow2k_denom: 9874; GFX9: ; %bb.0: 9875; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 9876; GFX9-NEXT: v_mov_b32_e32 v2, 0 9877; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9878; GFX9-NEXT: s_ashr_i32 s4, s3, 31 9879; GFX9-NEXT: s_lshr_b32 s4, s4, 20 9880; GFX9-NEXT: s_add_u32 s4, s2, s4 9881; GFX9-NEXT: s_addc_u32 s5, s3, 0 9882; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000 9883; GFX9-NEXT: s_sub_u32 s2, s2, s4 9884; GFX9-NEXT: s_subb_u32 s3, s3, s5 9885; GFX9-NEXT: v_mov_b32_e32 v0, s2 9886; GFX9-NEXT: v_mov_b32_e32 v1, s3 9887; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 9888; GFX9-NEXT: s_endpgm 9889 %r = srem i64 %x, 4096 9890 store i64 %r, i64 addrspace(1)* %out 9891 ret void 9892} 9893 9894define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 9895; CHECK-LABEL: @srem_i64_pow2_shl_denom( 9896; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 9897; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], [[SHL_Y]] 9898; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 9899; CHECK-NEXT: ret void 9900; 9901; GFX6-LABEL: srem_i64_pow2_shl_denom: 9902; GFX6: ; %bb.0: 9903; GFX6-NEXT: s_load_dword s4, s[0:1], 0xd 9904; GFX6-NEXT: s_mov_b32 s3, 0 9905; GFX6-NEXT: s_movk_i32 s2, 0x1000 9906; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 9907; GFX6-NEXT: s_mov_b32 s7, 0xf000 9908; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9909; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 9910; GFX6-NEXT: s_ashr_i32 s4, s3, 31 9911; GFX6-NEXT: s_add_u32 s2, s2, s4 9912; GFX6-NEXT: s_mov_b32 s5, s4 9913; GFX6-NEXT: s_addc_u32 s3, s3, s4 9914; GFX6-NEXT: s_xor_b64 s[12:13], s[2:3], s[4:5] 9915; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s12 9916; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s13 9917; GFX6-NEXT: s_sub_u32 s2, 0, s12 9918; GFX6-NEXT: s_subb_u32 s3, 0, s13 9919; GFX6-NEXT: s_ashr_i32 s14, s11, 31 9920; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 9921; GFX6-NEXT: v_rcp_f32_e32 v0, v0 9922; GFX6-NEXT: s_mov_b32 s15, s14 9923; GFX6-NEXT: s_mov_b32 s6, -1 9924; GFX6-NEXT: s_mov_b32 s4, s8 9925; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 9926; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 9927; GFX6-NEXT: v_trunc_f32_e32 v1, v1 9928; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 9929; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 9930; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 9931; GFX6-NEXT: s_mov_b32 s5, s9 9932; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 9933; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 9934; GFX6-NEXT: v_mul_lo_u32 v5, s3, v0 9935; GFX6-NEXT: v_mul_lo_u32 v4, s2, v0 9936; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9937; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 9938; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 9939; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 9940; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 9941; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 9942; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 9943; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 9944; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 9945; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 9946; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 9947; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 9948; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc 9949; GFX6-NEXT: v_mov_b32_e32 v4, 0 9950; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc 9951; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9952; GFX6-NEXT: v_mov_b32_e32 v6, 0 9953; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 9954; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc 9955; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 9956; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 9957; GFX6-NEXT: v_mul_hi_u32 v7, s2, v0 9958; GFX6-NEXT: v_mul_lo_u32 v8, s3, v0 9959; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5 9960; GFX6-NEXT: v_mul_lo_u32 v7, s2, v0 9961; GFX6-NEXT: v_add_i32_e32 v5, vcc, v8, v5 9962; GFX6-NEXT: v_mul_lo_u32 v10, v0, v5 9963; GFX6-NEXT: v_mul_hi_u32 v12, v0, v5 9964; GFX6-NEXT: v_mul_hi_u32 v11, v0, v7 9965; GFX6-NEXT: v_mul_hi_u32 v9, v2, v7 9966; GFX6-NEXT: v_mul_lo_u32 v7, v2, v7 9967; GFX6-NEXT: v_mul_hi_u32 v8, v2, v5 9968; GFX6-NEXT: v_add_i32_e32 v10, vcc, v11, v10 9969; GFX6-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc 9970; GFX6-NEXT: v_mul_lo_u32 v2, v2, v5 9971; GFX6-NEXT: v_add_i32_e32 v7, vcc, v10, v7 9972; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc 9973; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc 9974; GFX6-NEXT: v_add_i32_e32 v2, vcc, v7, v2 9975; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc 9976; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 9977; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[0:1] 9978; GFX6-NEXT: s_add_u32 s0, s10, s14 9979; GFX6-NEXT: s_addc_u32 s1, s11, s14 9980; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 9981; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] 9982; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 9983; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 9984; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 9985; GFX6-NEXT: v_mul_hi_u32 v5, s10, v1 9986; GFX6-NEXT: v_mul_hi_u32 v7, s11, v1 9987; GFX6-NEXT: v_mul_lo_u32 v1, s11, v1 9988; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9989; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 9990; GFX6-NEXT: v_mul_lo_u32 v5, s11, v0 9991; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0 9992; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 9993; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 9994; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc 9995; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 9996; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc 9997; GFX6-NEXT: v_mul_lo_u32 v1, s12, v1 9998; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0 9999; GFX6-NEXT: v_mul_lo_u32 v3, s13, v0 10000; GFX6-NEXT: v_mul_lo_u32 v0, s12, v0 10001; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 10002; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 10003; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s11, v1 10004; GFX6-NEXT: v_mov_b32_e32 v3, s13 10005; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s10, v0 10006; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc 10007; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s12, v0 10008; GFX6-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] 10009; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s13, v5 10010; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] 10011; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] 10012; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v4 10013; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s12, v4 10014; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 10015; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s13, v5 10016; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] 10017; GFX6-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] 10018; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 10019; GFX6-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] 10020; GFX6-NEXT: v_mov_b32_e32 v5, s11 10021; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc 10022; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v1 10023; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 10024; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 10025; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 10026; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s13, v1 10027; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 10028; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 10029; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 10030; GFX6-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] 10031; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 10032; GFX6-NEXT: v_xor_b32_e32 v0, s14, v0 10033; GFX6-NEXT: v_xor_b32_e32 v1, s14, v1 10034; GFX6-NEXT: v_mov_b32_e32 v2, s14 10035; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s14, v0 10036; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 10037; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 10038; GFX6-NEXT: s_endpgm 10039; GFX9-LABEL: srem_i64_pow2_shl_denom: 10040; GFX9: ; %bb.0: 10041; GFX9-NEXT: s_load_dword s4, s[0:1], 0x34 10042; GFX9-NEXT: s_mov_b32 s3, 0 10043; GFX9-NEXT: s_movk_i32 s2, 0x1000 10044; GFX9-NEXT: v_mov_b32_e32 v2, 0 10045; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10046; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 10047; GFX9-NEXT: s_ashr_i32 s4, s3, 31 10048; GFX9-NEXT: s_add_u32 s2, s2, s4 10049; GFX9-NEXT: s_mov_b32 s5, s4 10050; GFX9-NEXT: s_addc_u32 s3, s3, s4 10051; GFX9-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5] 10052; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 10053; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 10054; GFX9-NEXT: s_sub_u32 s10, 0, s8 10055; GFX9-NEXT: s_subb_u32 s4, 0, s9 10056; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 10057; GFX9-NEXT: v_rcp_f32_e32 v0, v0 10058; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 10059; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 10060; GFX9-NEXT: v_trunc_f32_e32 v1, v1 10061; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 10062; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 10063; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 10064; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0 10065; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1 10066; GFX9-NEXT: v_mul_lo_u32 v6, s4, v0 10067; GFX9-NEXT: v_mul_lo_u32 v5, s10, v0 10068; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 10069; GFX9-NEXT: v_add_u32_e32 v3, v3, v6 10070; GFX9-NEXT: v_mul_hi_u32 v4, v0, v5 10071; GFX9-NEXT: v_mul_lo_u32 v6, v0, v3 10072; GFX9-NEXT: v_mul_hi_u32 v8, v0, v3 10073; GFX9-NEXT: v_mul_hi_u32 v7, v1, v5 10074; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 10075; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 10076; GFX9-NEXT: v_mul_hi_u32 v9, v1, v3 10077; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc 10078; GFX9-NEXT: v_mul_lo_u32 v3, v1, v3 10079; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 10080; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v7, vcc 10081; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v2, vcc 10082; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 10083; GFX9-NEXT: v_mov_b32_e32 v6, 0 10084; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v3 10085; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v5, vcc 10086; GFX9-NEXT: v_addc_co_u32_e64 v3, vcc, v1, v4, s[2:3] 10087; GFX9-NEXT: v_mul_lo_u32 v5, s10, v3 10088; GFX9-NEXT: v_mul_hi_u32 v7, s10, v0 10089; GFX9-NEXT: v_mul_lo_u32 v8, s4, v0 10090; GFX9-NEXT: v_mul_lo_u32 v9, s10, v0 10091; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 10092; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 10093; GFX9-NEXT: v_add_u32_e32 v5, v5, v8 10094; GFX9-NEXT: v_mul_lo_u32 v10, v0, v5 10095; GFX9-NEXT: v_mul_hi_u32 v11, v0, v9 10096; GFX9-NEXT: v_mul_hi_u32 v12, v0, v5 10097; GFX9-NEXT: v_mul_hi_u32 v8, v3, v9 10098; GFX9-NEXT: v_mul_lo_u32 v9, v3, v9 10099; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 10100; GFX9-NEXT: v_mul_hi_u32 v7, v3, v5 10101; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v12, vcc 10102; GFX9-NEXT: v_mul_lo_u32 v3, v3, v5 10103; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 10104; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v8, vcc 10105; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc 10106; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v8, v3 10107; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10108; GFX9-NEXT: s_ashr_i32 s10, s7, 31 10109; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v5, vcc 10110; GFX9-NEXT: v_add_u32_e32 v1, v1, v4 10111; GFX9-NEXT: s_add_u32 s0, s6, s10 10112; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[2:3] 10113; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 10114; GFX9-NEXT: s_mov_b32 s11, s10 10115; GFX9-NEXT: s_addc_u32 s1, s7, s10 10116; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] 10117; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 10118; GFX9-NEXT: v_mul_lo_u32 v3, s6, v1 10119; GFX9-NEXT: v_mul_hi_u32 v4, s6, v0 10120; GFX9-NEXT: v_mul_hi_u32 v5, s6, v1 10121; GFX9-NEXT: v_mul_hi_u32 v7, s7, v1 10122; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 10123; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 10124; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc 10125; GFX9-NEXT: v_mul_lo_u32 v5, s7, v0 10126; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 10127; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 10128; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v4, v0, vcc 10129; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v2, vcc 10130; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 10131; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v3, vcc 10132; GFX9-NEXT: v_mul_lo_u32 v1, s8, v1 10133; GFX9-NEXT: v_mul_hi_u32 v3, s8, v0 10134; GFX9-NEXT: v_mul_lo_u32 v4, s9, v0 10135; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 10136; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 10137; GFX9-NEXT: v_add_u32_e32 v1, v1, v4 10138; GFX9-NEXT: v_sub_co_u32_e64 v0, s[0:1], s6, v0 10139; GFX9-NEXT: v_sub_u32_e32 v3, s7, v1 10140; GFX9-NEXT: v_mov_b32_e32 v4, s9 10141; GFX9-NEXT: v_subb_co_u32_e64 v3, vcc, v3, v4, s[0:1] 10142; GFX9-NEXT: v_subrev_co_u32_e64 v5, s[2:3], s8, v0 10143; GFX9-NEXT: v_subbrev_co_u32_e64 v6, vcc, 0, v3, s[2:3] 10144; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v6 10145; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 10146; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v5 10147; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 10148; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s9, v6 10149; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc 10150; GFX9-NEXT: v_subb_co_u32_e64 v3, vcc, v3, v4, s[2:3] 10151; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s8, v5 10152; GFX9-NEXT: v_subbrev_co_u32_e32 v3, vcc, 0, v3, vcc 10153; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, v7 10154; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[2:3] 10155; GFX9-NEXT: v_mov_b32_e32 v6, s7 10156; GFX9-NEXT: v_subb_co_u32_e64 v1, vcc, v6, v1, s[0:1] 10157; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 10158; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 10159; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 10160; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 10161; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s9, v1 10162; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc 10163; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 10164; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 10165; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[2:3] 10166; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 10167; GFX9-NEXT: v_xor_b32_e32 v0, s10, v0 10168; GFX9-NEXT: v_xor_b32_e32 v1, s10, v1 10169; GFX9-NEXT: v_mov_b32_e32 v3, s10 10170; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s10, v0 10171; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc 10172; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 10173; GFX9-NEXT: s_endpgm 10174 %shl.y = shl i64 4096, %y 10175 %r = srem i64 %x, %shl.y 10176 store i64 %r, i64 addrspace(1)* %out 10177 ret void 10178} 10179 10180define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 10181; CHECK-LABEL: @srem_v2i64_pow2k_denom( 10182; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 10183; CHECK-NEXT: [[TMP2:%.*]] = srem i64 [[TMP1]], 4096 10184; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 10185; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 10186; CHECK-NEXT: [[TMP5:%.*]] = srem i64 [[TMP4]], 4096 10187; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 10188; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 10189; CHECK-NEXT: ret void 10190; 10191; GFX6-LABEL: srem_v2i64_pow2k_denom: 10192; GFX6: ; %bb.0: 10193; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 10194; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 10195; GFX6-NEXT: s_movk_i32 s8, 0xf000 10196; GFX6-NEXT: s_mov_b32 s7, 0xf000 10197; GFX6-NEXT: s_mov_b32 s6, -1 10198; GFX6-NEXT: s_waitcnt lgkmcnt(0) 10199; GFX6-NEXT: s_ashr_i32 s9, s1, 31 10200; GFX6-NEXT: s_lshr_b32 s9, s9, 20 10201; GFX6-NEXT: s_add_u32 s9, s0, s9 10202; GFX6-NEXT: s_addc_u32 s10, s1, 0 10203; GFX6-NEXT: s_and_b32 s9, s9, s8 10204; GFX6-NEXT: s_sub_u32 s0, s0, s9 10205; GFX6-NEXT: s_subb_u32 s1, s1, s10 10206; GFX6-NEXT: s_ashr_i32 s9, s3, 31 10207; GFX6-NEXT: s_lshr_b32 s9, s9, 20 10208; GFX6-NEXT: s_add_u32 s9, s2, s9 10209; GFX6-NEXT: s_addc_u32 s10, s3, 0 10210; GFX6-NEXT: s_and_b32 s8, s9, s8 10211; GFX6-NEXT: s_sub_u32 s2, s2, s8 10212; GFX6-NEXT: s_subb_u32 s3, s3, s10 10213; GFX6-NEXT: v_mov_b32_e32 v0, s0 10214; GFX6-NEXT: v_mov_b32_e32 v1, s1 10215; GFX6-NEXT: v_mov_b32_e32 v2, s2 10216; GFX6-NEXT: v_mov_b32_e32 v3, s3 10217; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 10218; GFX6-NEXT: s_endpgm 10219; GFX9-LABEL: srem_v2i64_pow2k_denom: 10220; GFX9: ; %bb.0: 10221; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 10222; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 10223; GFX9-NEXT: s_movk_i32 s8, 0xf000 10224; GFX9-NEXT: v_mov_b32_e32 v4, 0 10225; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10226; GFX9-NEXT: s_ashr_i32 s0, s5, 31 10227; GFX9-NEXT: s_lshr_b32 s0, s0, 20 10228; GFX9-NEXT: s_add_u32 s0, s4, s0 10229; GFX9-NEXT: s_addc_u32 s1, s5, 0 10230; GFX9-NEXT: s_and_b32 s0, s0, s8 10231; GFX9-NEXT: s_sub_u32 s0, s4, s0 10232; GFX9-NEXT: s_subb_u32 s1, s5, s1 10233; GFX9-NEXT: s_ashr_i32 s4, s7, 31 10234; GFX9-NEXT: s_lshr_b32 s4, s4, 20 10235; GFX9-NEXT: s_add_u32 s4, s6, s4 10236; GFX9-NEXT: s_addc_u32 s5, s7, 0 10237; GFX9-NEXT: s_and_b32 s4, s4, s8 10238; GFX9-NEXT: s_sub_u32 s4, s6, s4 10239; GFX9-NEXT: s_subb_u32 s5, s7, s5 10240; GFX9-NEXT: v_mov_b32_e32 v0, s0 10241; GFX9-NEXT: v_mov_b32_e32 v1, s1 10242; GFX9-NEXT: v_mov_b32_e32 v2, s4 10243; GFX9-NEXT: v_mov_b32_e32 v3, s5 10244; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 10245; GFX9-NEXT: s_endpgm 10246 %r = srem <2 x i64> %x, <i64 4096, i64 4096> 10247 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 10248 ret void 10249} 10250 10251define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 10252; CHECK-LABEL: @srem_v2i64_pow2_shl_denom( 10253; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 10254; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 10255; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 10256; CHECK-NEXT: [[TMP3:%.*]] = srem i64 [[TMP1]], [[TMP2]] 10257; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 10258; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 10259; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 10260; CHECK-NEXT: [[TMP7:%.*]] = srem i64 [[TMP5]], [[TMP6]] 10261; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 10262; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 10263; CHECK-NEXT: ret void 10264; 10265; GFX6-LABEL: srem_v2i64_pow2_shl_denom: 10266; GFX6: ; %bb.0: 10267; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11 10268; GFX6-NEXT: s_mov_b32 s3, 0 10269; GFX6-NEXT: s_movk_i32 s2, 0x1000 10270; GFX6-NEXT: s_mov_b32 s18, 0x4f800000 10271; GFX6-NEXT: s_mov_b32 s19, 0x5f7ffffc 10272; GFX6-NEXT: s_waitcnt lgkmcnt(0) 10273; GFX6-NEXT: s_lshl_b64 s[14:15], s[2:3], s6 10274; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 10275; GFX6-NEXT: s_ashr_i32 s4, s3, 31 10276; GFX6-NEXT: s_add_u32 s2, s2, s4 10277; GFX6-NEXT: s_mov_b32 s5, s4 10278; GFX6-NEXT: s_addc_u32 s3, s3, s4 10279; GFX6-NEXT: s_xor_b64 s[16:17], s[2:3], s[4:5] 10280; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s16 10281; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s17 10282; GFX6-NEXT: s_mov_b32 s20, 0x2f800000 10283; GFX6-NEXT: s_mov_b32 s21, 0xcf800000 10284; GFX6-NEXT: s_sub_u32 s6, 0, s16 10285; GFX6-NEXT: v_mac_f32_e32 v0, s18, v1 10286; GFX6-NEXT: v_rcp_f32_e32 v0, v0 10287; GFX6-NEXT: s_subb_u32 s7, 0, s17 10288; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 10289; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 10290; GFX6-NEXT: v_mul_f32_e32 v0, s19, v0 10291; GFX6-NEXT: v_mul_f32_e32 v1, s20, v0 10292; GFX6-NEXT: v_trunc_f32_e32 v1, v1 10293; GFX6-NEXT: v_mac_f32_e32 v0, s21, v1 10294; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 10295; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 10296; GFX6-NEXT: s_waitcnt lgkmcnt(0) 10297; GFX6-NEXT: s_ashr_i32 s12, s9, 31 10298; GFX6-NEXT: s_add_u32 s0, s8, s12 10299; GFX6-NEXT: v_mul_hi_u32 v3, s6, v0 10300; GFX6-NEXT: v_mul_lo_u32 v2, s6, v1 10301; GFX6-NEXT: v_mul_lo_u32 v4, s7, v0 10302; GFX6-NEXT: v_mul_lo_u32 v5, s6, v0 10303; GFX6-NEXT: s_mov_b32 s13, s12 10304; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10305; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 10306; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 10307; GFX6-NEXT: v_mul_hi_u32 v4, v0, v5 10308; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 10309; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 10310; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 10311; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 10312; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc 10313; GFX6-NEXT: v_mul_lo_u32 v6, v1, v5 10314; GFX6-NEXT: v_mul_hi_u32 v5, v1, v5 10315; GFX6-NEXT: s_addc_u32 s1, s9, s12 10316; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[12:13] 10317; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 10318; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc 10319; GFX6-NEXT: v_mov_b32_e32 v4, 0 10320; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc 10321; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10322; GFX6-NEXT: v_mov_b32_e32 v6, 0 10323; GFX6-NEXT: v_add_i32_e64 v0, s[2:3], v0, v2 10324; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc 10325; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[2:3] 10326; GFX6-NEXT: v_mul_lo_u32 v5, s6, v2 10327; GFX6-NEXT: v_mul_hi_u32 v7, s6, v0 10328; GFX6-NEXT: v_mul_lo_u32 v8, s7, v0 10329; GFX6-NEXT: s_mov_b32 s7, 0xf000 10330; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5 10331; GFX6-NEXT: v_mul_lo_u32 v7, s6, v0 10332; GFX6-NEXT: v_add_i32_e32 v5, vcc, v8, v5 10333; GFX6-NEXT: v_mul_lo_u32 v10, v0, v5 10334; GFX6-NEXT: v_mul_hi_u32 v12, v0, v5 10335; GFX6-NEXT: v_mul_hi_u32 v11, v0, v7 10336; GFX6-NEXT: v_mul_hi_u32 v9, v2, v7 10337; GFX6-NEXT: v_mul_lo_u32 v7, v2, v7 10338; GFX6-NEXT: v_mul_hi_u32 v8, v2, v5 10339; GFX6-NEXT: v_add_i32_e32 v10, vcc, v11, v10 10340; GFX6-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc 10341; GFX6-NEXT: v_mul_lo_u32 v2, v2, v5 10342; GFX6-NEXT: v_add_i32_e32 v7, vcc, v10, v7 10343; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc 10344; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc 10345; GFX6-NEXT: v_add_i32_e32 v2, vcc, v7, v2 10346; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc 10347; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 10348; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[2:3] 10349; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 10350; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 10351; GFX6-NEXT: v_mul_lo_u32 v2, s8, v1 10352; GFX6-NEXT: v_mul_hi_u32 v3, s8, v0 10353; GFX6-NEXT: v_mul_hi_u32 v5, s8, v1 10354; GFX6-NEXT: v_mul_hi_u32 v7, s9, v1 10355; GFX6-NEXT: v_mul_lo_u32 v1, s9, v1 10356; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10357; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 10358; GFX6-NEXT: v_mul_lo_u32 v5, s9, v0 10359; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0 10360; GFX6-NEXT: s_mov_b32 s6, -1 10361; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 10362; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 10363; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc 10364; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 10365; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc 10366; GFX6-NEXT: v_mul_lo_u32 v1, s16, v1 10367; GFX6-NEXT: v_mul_hi_u32 v2, s16, v0 10368; GFX6-NEXT: v_mul_lo_u32 v3, s17, v0 10369; GFX6-NEXT: v_mul_lo_u32 v0, s16, v0 10370; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 10371; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 10372; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s9, v1 10373; GFX6-NEXT: v_mov_b32_e32 v3, s17 10374; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 10375; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc 10376; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s16, v0 10377; GFX6-NEXT: v_subbrev_u32_e64 v7, s[2:3], 0, v2, s[0:1] 10378; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s17, v7 10379; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] 10380; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] 10381; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s16, v5 10382; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s16, v5 10383; GFX6-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] 10384; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s17, v7 10385; GFX6-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[2:3] 10386; GFX6-NEXT: s_ashr_i32 s2, s15, 31 10387; GFX6-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] 10388; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 10389; GFX6-NEXT: s_add_u32 s8, s14, s2 10390; GFX6-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[0:1] 10391; GFX6-NEXT: v_mov_b32_e32 v7, s9 10392; GFX6-NEXT: s_mov_b32 s3, s2 10393; GFX6-NEXT: s_addc_u32 s9, s15, s2 10394; GFX6-NEXT: s_xor_b64 s[8:9], s[8:9], s[2:3] 10395; GFX6-NEXT: v_cvt_f32_u32_e32 v8, s8 10396; GFX6-NEXT: v_cvt_f32_u32_e32 v9, s9 10397; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v7, v1, vcc 10398; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s17, v1 10399; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 10400; GFX6-NEXT: v_mac_f32_e32 v8, s18, v9 10401; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s16, v0 10402; GFX6-NEXT: v_rcp_f32_e32 v8, v8 10403; GFX6-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc 10404; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s17, v1 10405; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc 10406; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 10407; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 10408; GFX6-NEXT: v_cndmask_b32_e64 v2, v5, v3, s[0:1] 10409; GFX6-NEXT: v_mul_f32_e32 v3, s19, v8 10410; GFX6-NEXT: v_mul_f32_e32 v5, s20, v3 10411; GFX6-NEXT: v_trunc_f32_e32 v5, v5 10412; GFX6-NEXT: v_mac_f32_e32 v3, s21, v5 10413; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 10414; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v5 10415; GFX6-NEXT: s_sub_u32 s2, 0, s8 10416; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 10417; GFX6-NEXT: v_mul_hi_u32 v2, s2, v3 10418; GFX6-NEXT: v_mul_lo_u32 v7, s2, v5 10419; GFX6-NEXT: s_subb_u32 s3, 0, s9 10420; GFX6-NEXT: v_mul_lo_u32 v8, s3, v3 10421; GFX6-NEXT: s_ashr_i32 s14, s11, 31 10422; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v7 10423; GFX6-NEXT: v_mul_lo_u32 v7, s2, v3 10424; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v8 10425; GFX6-NEXT: v_mul_lo_u32 v8, v3, v2 10426; GFX6-NEXT: v_mul_hi_u32 v10, v3, v2 10427; GFX6-NEXT: v_mul_hi_u32 v9, v3, v7 10428; GFX6-NEXT: v_mul_hi_u32 v11, v5, v2 10429; GFX6-NEXT: v_mul_lo_u32 v2, v5, v2 10430; GFX6-NEXT: s_mov_b32 s15, s14 10431; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 10432; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc 10433; GFX6-NEXT: v_mul_lo_u32 v10, v5, v7 10434; GFX6-NEXT: v_mul_hi_u32 v7, v5, v7 10435; GFX6-NEXT: v_xor_b32_e32 v0, s12, v0 10436; GFX6-NEXT: v_xor_b32_e32 v1, s12, v1 10437; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v10 10438; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc 10439; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v11, v4, vcc 10440; GFX6-NEXT: v_add_i32_e32 v2, vcc, v7, v2 10441; GFX6-NEXT: v_add_i32_e64 v2, s[0:1], v3, v2 10442; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v6, v8, vcc 10443; GFX6-NEXT: v_addc_u32_e64 v3, vcc, v5, v7, s[0:1] 10444; GFX6-NEXT: v_mul_lo_u32 v8, s2, v3 10445; GFX6-NEXT: v_mul_hi_u32 v9, s2, v2 10446; GFX6-NEXT: v_mul_lo_u32 v10, s3, v2 10447; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 10448; GFX6-NEXT: v_mul_lo_u32 v9, s2, v2 10449; GFX6-NEXT: v_add_i32_e32 v8, vcc, v10, v8 10450; GFX6-NEXT: v_mul_lo_u32 v12, v2, v8 10451; GFX6-NEXT: v_mul_hi_u32 v14, v2, v8 10452; GFX6-NEXT: v_mul_hi_u32 v13, v2, v9 10453; GFX6-NEXT: v_mul_hi_u32 v11, v3, v9 10454; GFX6-NEXT: v_mul_lo_u32 v9, v3, v9 10455; GFX6-NEXT: v_mul_hi_u32 v10, v3, v8 10456; GFX6-NEXT: v_add_i32_e32 v12, vcc, v13, v12 10457; GFX6-NEXT: v_addc_u32_e32 v13, vcc, 0, v14, vcc 10458; GFX6-NEXT: v_mul_lo_u32 v3, v3, v8 10459; GFX6-NEXT: v_add_i32_e32 v9, vcc, v12, v9 10460; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v13, v11, vcc 10461; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v10, v4, vcc 10462; GFX6-NEXT: v_add_i32_e32 v3, vcc, v9, v3 10463; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v6, v8, vcc 10464; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v7 10465; GFX6-NEXT: v_addc_u32_e64 v5, vcc, v5, v8, s[0:1] 10466; GFX6-NEXT: s_add_u32 s0, s10, s14 10467; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 10468; GFX6-NEXT: s_addc_u32 s1, s11, s14 10469; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] 10470; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 10471; GFX6-NEXT: v_mul_lo_u32 v5, s10, v3 10472; GFX6-NEXT: v_mul_hi_u32 v7, s10, v2 10473; GFX6-NEXT: v_mul_hi_u32 v9, s10, v3 10474; GFX6-NEXT: v_mul_hi_u32 v10, s11, v3 10475; GFX6-NEXT: v_mul_lo_u32 v3, s11, v3 10476; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5 10477; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc 10478; GFX6-NEXT: v_mul_lo_u32 v9, s11, v2 10479; GFX6-NEXT: v_mul_hi_u32 v2, s11, v2 10480; GFX6-NEXT: v_mov_b32_e32 v8, s12 10481; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v9 10482; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v7, v2, vcc 10483; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v10, v4, vcc 10484; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 10485; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v4, vcc 10486; GFX6-NEXT: v_mul_lo_u32 v3, s8, v3 10487; GFX6-NEXT: v_mul_hi_u32 v4, s8, v2 10488; GFX6-NEXT: v_mul_lo_u32 v5, s9, v2 10489; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 10490; GFX6-NEXT: v_mul_lo_u32 v2, s8, v2 10491; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc 10492; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 10493; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 10494; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s11, v3 10495; GFX6-NEXT: v_mov_b32_e32 v5, s9 10496; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s10, v2 10497; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc 10498; GFX6-NEXT: v_subrev_i32_e64 v6, s[0:1], s8, v2 10499; GFX6-NEXT: v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1] 10500; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v7 10501; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, s[0:1] 10502; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] 10503; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v6 10504; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s8, v6 10505; GFX6-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] 10506; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v7 10507; GFX6-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[2:3] 10508; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] 10509; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 10510; GFX6-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[0:1] 10511; GFX6-NEXT: v_mov_b32_e32 v7, s11 10512; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc 10513; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 10514; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 10515; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 10516; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 10517; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s9, v3 10518; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc 10519; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 10520; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 10521; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] 10522; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 10523; GFX6-NEXT: v_xor_b32_e32 v2, s14, v2 10524; GFX6-NEXT: v_xor_b32_e32 v3, s14, v3 10525; GFX6-NEXT: v_mov_b32_e32 v4, s14 10526; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s14, v2 10527; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc 10528; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 10529; GFX6-NEXT: s_endpgm 10530; GFX9-LABEL: srem_v2i64_pow2_shl_denom: 10531; GFX9: ; %bb.0: 10532; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 10533; GFX9-NEXT: s_mov_b32 s3, 0 10534; GFX9-NEXT: s_movk_i32 s2, 0x1000 10535; GFX9-NEXT: s_mov_b32 s16, 0x4f800000 10536; GFX9-NEXT: s_mov_b32 s17, 0x5f7ffffc 10537; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10538; GFX9-NEXT: s_lshl_b64 s[12:13], s[2:3], s6 10539; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 10540; GFX9-NEXT: s_ashr_i32 s4, s3, 31 10541; GFX9-NEXT: s_add_u32 s2, s2, s4 10542; GFX9-NEXT: s_mov_b32 s5, s4 10543; GFX9-NEXT: s_addc_u32 s3, s3, s4 10544; GFX9-NEXT: s_xor_b64 s[14:15], s[2:3], s[4:5] 10545; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s14 10546; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s15 10547; GFX9-NEXT: s_mov_b32 s18, 0x2f800000 10548; GFX9-NEXT: s_mov_b32 s19, 0xcf800000 10549; GFX9-NEXT: s_sub_u32 s4, 0, s14 10550; GFX9-NEXT: v_mac_f32_e32 v0, s16, v1 10551; GFX9-NEXT: v_rcp_f32_e32 v0, v0 10552; GFX9-NEXT: s_subb_u32 s5, 0, s15 10553; GFX9-NEXT: v_mov_b32_e32 v6, 0 10554; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 10555; GFX9-NEXT: v_mul_f32_e32 v0, s17, v0 10556; GFX9-NEXT: v_mul_f32_e32 v1, s18, v0 10557; GFX9-NEXT: v_trunc_f32_e32 v1, v1 10558; GFX9-NEXT: v_mac_f32_e32 v0, s19, v1 10559; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 10560; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 10561; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10562; GFX9-NEXT: s_ashr_i32 s6, s9, 31 10563; GFX9-NEXT: s_mov_b32 s7, s6 10564; GFX9-NEXT: v_mul_hi_u32 v3, s4, v0 10565; GFX9-NEXT: v_mul_lo_u32 v2, s4, v1 10566; GFX9-NEXT: v_mul_lo_u32 v5, s5, v0 10567; GFX9-NEXT: v_mul_lo_u32 v4, s4, v0 10568; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 10569; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 10570; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 10571; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 10572; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 10573; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 10574; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 10575; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 10576; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc 10577; GFX9-NEXT: v_mul_lo_u32 v7, v1, v4 10578; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 10579; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 10580; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc 10581; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v6, vcc 10582; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 10583; GFX9-NEXT: v_mov_b32_e32 v5, 0 10584; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 10585; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc 10586; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3] 10587; GFX9-NEXT: v_mul_lo_u32 v4, s4, v2 10588; GFX9-NEXT: v_mul_hi_u32 v7, s4, v0 10589; GFX9-NEXT: v_mul_lo_u32 v8, s5, v0 10590; GFX9-NEXT: v_mul_lo_u32 v9, s4, v0 10591; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 10592; GFX9-NEXT: v_add_u32_e32 v4, v7, v4 10593; GFX9-NEXT: v_add_u32_e32 v4, v4, v8 10594; GFX9-NEXT: v_mul_lo_u32 v10, v0, v4 10595; GFX9-NEXT: v_mul_hi_u32 v11, v0, v9 10596; GFX9-NEXT: v_mul_hi_u32 v12, v0, v4 10597; GFX9-NEXT: v_mul_hi_u32 v8, v2, v9 10598; GFX9-NEXT: v_mul_lo_u32 v9, v2, v9 10599; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 10600; GFX9-NEXT: v_mul_hi_u32 v7, v2, v4 10601; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v12, vcc 10602; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 10603; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 10604; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v8, vcc 10605; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v6, vcc 10606; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 10607; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v5, v4, vcc 10608; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3] 10609; GFX9-NEXT: s_add_u32 s2, s8, s6 10610; GFX9-NEXT: s_addc_u32 s3, s9, s6 10611; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 10612; GFX9-NEXT: s_xor_b64 s[8:9], s[2:3], s[6:7] 10613; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 10614; GFX9-NEXT: v_mul_lo_u32 v2, s8, v1 10615; GFX9-NEXT: v_mul_hi_u32 v3, s8, v0 10616; GFX9-NEXT: v_mul_hi_u32 v4, s8, v1 10617; GFX9-NEXT: v_mul_hi_u32 v7, s9, v1 10618; GFX9-NEXT: v_mul_lo_u32 v1, s9, v1 10619; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 10620; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 10621; GFX9-NEXT: v_mul_lo_u32 v4, s9, v0 10622; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 10623; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 10624; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 10625; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 10626; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v7, v6, vcc 10627; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 10628; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v2, vcc 10629; GFX9-NEXT: v_mul_lo_u32 v1, s14, v1 10630; GFX9-NEXT: v_mul_hi_u32 v2, s14, v0 10631; GFX9-NEXT: v_mul_lo_u32 v3, s15, v0 10632; GFX9-NEXT: v_mul_lo_u32 v0, s14, v0 10633; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 10634; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 10635; GFX9-NEXT: v_sub_co_u32_e64 v0, s[0:1], s8, v0 10636; GFX9-NEXT: v_sub_u32_e32 v2, s9, v1 10637; GFX9-NEXT: v_mov_b32_e32 v3, s15 10638; GFX9-NEXT: v_subb_co_u32_e64 v2, vcc, v2, v3, s[0:1] 10639; GFX9-NEXT: v_subrev_co_u32_e64 v4, s[2:3], s14, v0 10640; GFX9-NEXT: v_subbrev_co_u32_e64 v7, vcc, 0, v2, s[2:3] 10641; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s15, v7 10642; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 10643; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s14, v4 10644; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc 10645; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s15, v7 10646; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc 10647; GFX9-NEXT: v_subb_co_u32_e64 v2, vcc, v2, v3, s[2:3] 10648; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s14, v4 10649; GFX9-NEXT: v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc 10650; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, v8 10651; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[2:3] 10652; GFX9-NEXT: v_mov_b32_e32 v7, s9 10653; GFX9-NEXT: v_subb_co_u32_e64 v1, vcc, v7, v1, s[0:1] 10654; GFX9-NEXT: s_ashr_i32 s0, s13, 31 10655; GFX9-NEXT: s_add_u32 s8, s12, s0 10656; GFX9-NEXT: s_mov_b32 s1, s0 10657; GFX9-NEXT: s_addc_u32 s9, s13, s0 10658; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[0:1] 10659; GFX9-NEXT: v_cvt_f32_u32_e32 v9, s8 10660; GFX9-NEXT: v_cvt_f32_u32_e32 v10, s9 10661; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s15, v1 10662; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 10663; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s14, v0 10664; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 10665; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s15, v1 10666; GFX9-NEXT: v_mac_f32_e32 v9, s16, v10 10667; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc 10668; GFX9-NEXT: v_rcp_f32_e32 v8, v9 10669; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 10670; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 10671; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[2:3] 10672; GFX9-NEXT: v_mul_f32_e32 v3, s17, v8 10673; GFX9-NEXT: v_mul_f32_e32 v4, s18, v3 10674; GFX9-NEXT: v_trunc_f32_e32 v4, v4 10675; GFX9-NEXT: v_mac_f32_e32 v3, s19, v4 10676; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 10677; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 10678; GFX9-NEXT: s_sub_u32 s2, 0, s8 10679; GFX9-NEXT: s_subb_u32 s3, 0, s9 10680; GFX9-NEXT: v_mul_hi_u32 v7, s2, v3 10681; GFX9-NEXT: v_mul_lo_u32 v8, s2, v4 10682; GFX9-NEXT: v_mul_lo_u32 v9, s3, v3 10683; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 10684; GFX9-NEXT: v_mul_lo_u32 v2, s2, v3 10685; GFX9-NEXT: v_add_u32_e32 v7, v7, v8 10686; GFX9-NEXT: v_add_u32_e32 v7, v7, v9 10687; GFX9-NEXT: v_mul_lo_u32 v8, v3, v7 10688; GFX9-NEXT: v_mul_hi_u32 v9, v3, v2 10689; GFX9-NEXT: v_mul_hi_u32 v10, v3, v7 10690; GFX9-NEXT: v_mul_hi_u32 v11, v4, v7 10691; GFX9-NEXT: v_mul_lo_u32 v7, v4, v7 10692; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 10693; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v10, vcc 10694; GFX9-NEXT: v_mul_lo_u32 v10, v4, v2 10695; GFX9-NEXT: v_mul_hi_u32 v2, v4, v2 10696; GFX9-NEXT: s_ashr_i32 s12, s11, 31 10697; GFX9-NEXT: s_mov_b32 s13, s12 10698; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10 10699; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v9, v2, vcc 10700; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v6, vcc 10701; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 10702; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], v3, v2 10703; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v5, v8, vcc 10704; GFX9-NEXT: v_addc_co_u32_e64 v3, vcc, v4, v7, s[0:1] 10705; GFX9-NEXT: v_mul_lo_u32 v8, s2, v3 10706; GFX9-NEXT: v_mul_hi_u32 v9, s2, v2 10707; GFX9-NEXT: v_mul_lo_u32 v10, s3, v2 10708; GFX9-NEXT: v_mul_lo_u32 v11, s2, v2 10709; GFX9-NEXT: v_add_u32_e32 v4, v4, v7 10710; GFX9-NEXT: v_add_u32_e32 v8, v9, v8 10711; GFX9-NEXT: v_add_u32_e32 v8, v8, v10 10712; GFX9-NEXT: v_mul_lo_u32 v12, v2, v8 10713; GFX9-NEXT: v_mul_hi_u32 v13, v2, v11 10714; GFX9-NEXT: v_mul_hi_u32 v14, v2, v8 10715; GFX9-NEXT: v_mul_hi_u32 v10, v3, v11 10716; GFX9-NEXT: v_mul_lo_u32 v11, v3, v11 10717; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v13, v12 10718; GFX9-NEXT: v_mul_hi_u32 v9, v3, v8 10719; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v14, vcc 10720; GFX9-NEXT: v_mul_lo_u32 v3, v3, v8 10721; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v12, v11 10722; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v13, v10, vcc 10723; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v9, v6, vcc 10724; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v10, v3 10725; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v5, v8, vcc 10726; GFX9-NEXT: v_addc_co_u32_e64 v4, vcc, v4, v8, s[0:1] 10727; GFX9-NEXT: s_add_u32 s0, s10, s12 10728; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 10729; GFX9-NEXT: s_addc_u32 s1, s11, s12 10730; GFX9-NEXT: s_xor_b64 s[10:11], s[0:1], s[12:13] 10731; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 10732; GFX9-NEXT: v_mul_lo_u32 v4, s10, v3 10733; GFX9-NEXT: v_mul_hi_u32 v7, s10, v2 10734; GFX9-NEXT: v_mul_hi_u32 v9, s10, v3 10735; GFX9-NEXT: v_mul_hi_u32 v10, s11, v3 10736; GFX9-NEXT: v_mul_lo_u32 v3, s11, v3 10737; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v7, v4 10738; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v9, vcc 10739; GFX9-NEXT: v_mul_lo_u32 v9, s11, v2 10740; GFX9-NEXT: v_mul_hi_u32 v2, s11, v2 10741; GFX9-NEXT: v_xor_b32_e32 v0, s6, v0 10742; GFX9-NEXT: v_xor_b32_e32 v1, s6, v1 10743; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9 10744; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v7, v2, vcc 10745; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v10, v6, vcc 10746; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 10747; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc 10748; GFX9-NEXT: v_mul_lo_u32 v3, s8, v3 10749; GFX9-NEXT: v_mul_hi_u32 v4, s8, v2 10750; GFX9-NEXT: v_mul_lo_u32 v5, s9, v2 10751; GFX9-NEXT: v_mul_lo_u32 v2, s8, v2 10752; GFX9-NEXT: v_mov_b32_e32 v8, s6 10753; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 10754; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 10755; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v0 10756; GFX9-NEXT: v_sub_co_u32_e64 v2, s[0:1], s10, v2 10757; GFX9-NEXT: v_sub_u32_e32 v4, s11, v3 10758; GFX9-NEXT: v_mov_b32_e32 v5, s9 10759; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v8, vcc 10760; GFX9-NEXT: v_subb_co_u32_e64 v4, vcc, v4, v5, s[0:1] 10761; GFX9-NEXT: v_subrev_co_u32_e64 v7, s[2:3], s8, v2 10762; GFX9-NEXT: v_subbrev_co_u32_e64 v8, vcc, 0, v4, s[2:3] 10763; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v8 10764; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc 10765; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v7 10766; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc 10767; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s9, v8 10768; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v10, vcc 10769; GFX9-NEXT: v_subb_co_u32_e64 v4, vcc, v4, v5, s[2:3] 10770; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s8, v7 10771; GFX9-NEXT: v_subbrev_co_u32_e32 v4, vcc, 0, v4, vcc 10772; GFX9-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, v9 10773; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[2:3] 10774; GFX9-NEXT: v_mov_b32_e32 v8, s11 10775; GFX9-NEXT: v_subb_co_u32_e64 v3, vcc, v8, v3, s[0:1] 10776; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 10777; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 10778; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 10779; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc 10780; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s9, v3 10781; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc 10782; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 10783; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 10784; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v5, s[2:3] 10785; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 10786; GFX9-NEXT: v_xor_b32_e32 v2, s12, v2 10787; GFX9-NEXT: v_xor_b32_e32 v3, s12, v3 10788; GFX9-NEXT: v_mov_b32_e32 v4, s12 10789; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s12, v2 10790; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc 10791; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10792; GFX9-NEXT: global_store_dwordx4 v6, v[0:3], s[4:5] 10793; GFX9-NEXT: s_endpgm 10794 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 10795 %r = srem <2 x i64> %x, %shl.y 10796 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 10797 ret void 10798} 10799