; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare -amdgpu-bypass-slow-div=0 %s | FileCheck %s
; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX6 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX9 %s

; Each kernel below performs a single integer division or remainder on its
; arguments and stores the result through %out. The opt run verifies the IR
; produced by -amdgpu-codegenprepare; the two llc runs verify the final ISA
; for tahiti and gfx900. The assertion blocks are generated; regenerate them
; with the scripts named in the NOTE lines instead of editing them by hand.

; Unsigned 32-bit division. The expected IR expands the udiv inline: a float
; reciprocal estimate of %y (uitofp, llvm.amdgcn.rcp.f32, fmul, fptoui), one
; refinement step that uses the high half of 64-bit multiplies, and two
; compare/select steps that adjust both the quotient and the remainder.
define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
; CHECK-LABEL: @udiv_i32(
; CHECK-NEXT: [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float
; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
; CHECK-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000
; CHECK-NEXT: [[TMP4:%.*]] = fptoui float [[TMP3]] to i32
; CHECK-NEXT: [[TMP5:%.*]] = sub i32 0, [[Y]]
; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]]
; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP4]] to i64
; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
; CHECK-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP9]], 32
; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]]
; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64
; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
; CHECK-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
; CHECK-NEXT: [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]]
; CHECK-NEXT: [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]]
; CHECK-NEXT: [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]]
; CHECK-NEXT: [[TMP23:%.*]] = add i32 [[TMP19]], 1
; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP19]]
; CHECK-NEXT: [[TMP25:%.*]] = sub i32 [[TMP21]], [[Y]]
; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP22]], i32 [[TMP25]], i32 [[TMP21]]
; CHECK-NEXT: [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[Y]]
; CHECK-NEXT: [[TMP28:%.*]] = add i32 [[TMP24]], 1
; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP24]]
; CHECK-NEXT: store i32 [[TMP29]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
; GFX6-LABEL: udiv_i32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3
; GFX6-NEXT: s_sub_i32 s4, 0, s3
; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0
; GFX6-NEXT: v_mul_lo_u32 v1, v0, s3
; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s2, v1
; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1
; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1
; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0
; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: udiv_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
; GFX9-NEXT: s_sub_i32 s4, 0, s3
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_mul_lo_u32 v1, s4, v0
; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0
; GFX9-NEXT: v_mul_lo_u32 v1, v0, s3
; GFX9-NEXT: v_add_u32_e32 v3, 1, v0
; GFX9-NEXT: v_sub_u32_e32 v1, s2, v1
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX9-NEXT: v_add_u32_e32 v3, 1, v0
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
  %r = udiv i32 %x, %y
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; Unsigned 32-bit remainder. Same reciprocal-based expansion as udiv_i32, but
; the two compare/select steps in the expected IR adjust only the remainder.
define amdgpu_kernel void @urem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
; CHECK-LABEL: @urem_i32(
; CHECK-NEXT: [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float
; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]])
; CHECK-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000
; CHECK-NEXT: [[TMP4:%.*]] = fptoui float [[TMP3]] to i32
; CHECK-NEXT: [[TMP5:%.*]] = sub i32 0, [[Y]]
; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]]
; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP4]] to i64
; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
; CHECK-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP9]], 32
; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]]
; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64
; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
; CHECK-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
; CHECK-NEXT: [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]]
; CHECK-NEXT: [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]]
; CHECK-NEXT: [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]]
; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP21]], [[Y]]
; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP21]]
; CHECK-NEXT: [[TMP25:%.*]] = icmp uge i32 [[TMP24]], [[Y]]
; CHECK-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], [[Y]]
; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP25]], i32 [[TMP26]], i32 [[TMP24]]
; CHECK-NEXT: store i32 [[TMP27]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
; GFX6-LABEL: urem_i32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5
; GFX6-NEXT: s_sub_i32 s2, 0, s5
; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0
; GFX6-NEXT: v_mul_lo_u32 v0, v0, s5
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s5, v0
; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v0
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s5, v0
; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v0
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: urem_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
; GFX9-NEXT: s_sub_i32 s4, 0, s3
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_mul_lo_u32 v1, s4, v0
; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3
; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v0
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v0
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
  %r = urem i32 %x, %y
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; Signed 32-bit division. The expected IR first applies ashr-31 / add / xor to
; both operands, runs the unsigned expansion on the results, and finally
; applies xor / sub with the xor of the two ashr-31 masks.
define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
; CHECK-LABEL: @sdiv_i32(
; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
; CHECK-NEXT: [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[X]], [[TMP1]]
; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[Y]], [[TMP2]]
; CHECK-NEXT: [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP1]]
; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP2]]
; CHECK-NEXT: [[TMP8:%.*]] = uitofp i32 [[TMP7]] to float
; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP8]])
; CHECK-NEXT: [[TMP10:%.*]] = fmul fast float [[TMP9]], 0x41EFFFFFC0000000
; CHECK-NEXT: [[TMP11:%.*]] = fptoui float [[TMP10]] to i32
; CHECK-NEXT: [[TMP12:%.*]] = sub i32 0, [[TMP7]]
; CHECK-NEXT: [[TMP13:%.*]] = mul i32 [[TMP12]], [[TMP11]]
; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP11]] to i64
; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]]
; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
; CHECK-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP16]], 32
; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP11]], [[TMP19]]
; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP6]] to i64
; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP20]] to i64
; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP21]], [[TMP22]]
; CHECK-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32
; CHECK-NEXT: [[TMP25:%.*]] = lshr i64 [[TMP23]], 32
; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32
; CHECK-NEXT: [[TMP27:%.*]] = mul i32 [[TMP26]], [[TMP7]]
; CHECK-NEXT: [[TMP28:%.*]] = sub i32 [[TMP6]], [[TMP27]]
; CHECK-NEXT: [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP7]]
; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP26]], 1
; CHECK-NEXT: [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]]
; CHECK-NEXT: [[TMP32:%.*]] = sub i32 [[TMP28]], [[TMP7]]
; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP29]], i32 [[TMP32]], i32 [[TMP28]]
; CHECK-NEXT: [[TMP34:%.*]] = icmp uge i32 [[TMP33]], [[TMP7]]
; CHECK-NEXT: [[TMP35:%.*]] = add i32 [[TMP31]], 1
; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP34]], i32 [[TMP35]], i32 [[TMP31]]
; CHECK-NEXT: [[TMP37:%.*]] = xor i32 [[TMP36]], [[TMP3]]
; CHECK-NEXT: [[TMP38:%.*]] = sub i32 [[TMP37]], [[TMP3]]
; CHECK-NEXT: store i32 [[TMP38]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
; GFX6-LABEL: sdiv_i32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_ashr_i32 s8, s3, 31
; GFX6-NEXT: s_add_i32 s3, s3, s8
; GFX6-NEXT: s_xor_b32 s3, s3, s8
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3
; GFX6-NEXT: s_sub_i32 s4, 0, s3
; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT: s_ashr_i32 s0, s2, 31
; GFX6-NEXT: s_add_i32 s1, s2, s0
; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX6-NEXT: s_xor_b32 s1, s1, s0
; GFX6-NEXT: s_xor_b32 s2, s0, s8
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0
; GFX6-NEXT: v_mul_lo_u32 v1, v0, s3
; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1
; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1
; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1
; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0
; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0
; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s4, s3, 31
; GFX9-NEXT: s_add_i32 s3, s3, s4
; GFX9-NEXT: s_xor_b32 s3, s3, s4
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
; GFX9-NEXT: s_sub_i32 s5, 0, s3
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_mul_lo_u32 v1, s5, v0
; GFX9-NEXT: s_ashr_i32 s5, s2, 31
; GFX9-NEXT: s_add_i32 s2, s2, s5
; GFX9-NEXT: s_xor_b32 s2, s2, s5
; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX9-NEXT: s_xor_b32 s4, s5, s4
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0
; GFX9-NEXT: v_mul_lo_u32 v1, v0, s3
; GFX9-NEXT: v_add_u32_e32 v3, 1, v0
; GFX9-NEXT: v_sub_u32_e32 v1, s2, v1
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX9-NEXT: v_add_u32_e32 v3, 1, v0
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0
; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0
; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
  %r = sdiv i32 %x, %y
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; Signed 32-bit remainder. Same operand preparation as sdiv_i32 followed by
; the unsigned remainder expansion; the final xor / sub in the expected IR
; uses the ashr-31 mask of %x.
define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) {
; CHECK-LABEL: @srem_i32(
; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31
; CHECK-NEXT: [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[X]], [[TMP1]]
; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[Y]], [[TMP2]]
; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP1]]
; CHECK-NEXT: [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP2]]
; CHECK-NEXT: [[TMP7:%.*]] = uitofp i32 [[TMP6]] to float
; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP8]], 0x41EFFFFFC0000000
; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP9]] to i32
; CHECK-NEXT: [[TMP11:%.*]] = sub i32 0, [[TMP6]]
; CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], [[TMP10]]
; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP10]] to i64
; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP12]] to i64
; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP13]], [[TMP14]]
; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
; CHECK-NEXT: [[TMP17:%.*]] = lshr i64 [[TMP15]], 32
; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP10]], [[TMP18]]
; CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP5]] to i64
; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP19]] to i64
; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP20]], [[TMP21]]
; CHECK-NEXT: [[TMP23:%.*]] = trunc i64 [[TMP22]] to i32
; CHECK-NEXT: [[TMP24:%.*]] = lshr i64 [[TMP22]], 32
; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32
; CHECK-NEXT: [[TMP26:%.*]] = mul i32 [[TMP25]], [[TMP6]]
; CHECK-NEXT: [[TMP27:%.*]] = sub i32 [[TMP5]], [[TMP26]]
; CHECK-NEXT: [[TMP28:%.*]] = icmp uge i32 [[TMP27]], [[TMP6]]
; CHECK-NEXT: [[TMP29:%.*]] = sub i32 [[TMP27]], [[TMP6]]
; CHECK-NEXT: [[TMP30:%.*]] = select i1 [[TMP28]], i32 [[TMP29]], i32 [[TMP27]]
; CHECK-NEXT: [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP6]]
; CHECK-NEXT: [[TMP32:%.*]] = sub i32 [[TMP30]], [[TMP6]]
; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP30]]
; CHECK-NEXT: [[TMP34:%.*]] = xor i32 [[TMP33]], [[TMP1]]
; CHECK-NEXT: [[TMP35:%.*]] = sub i32 [[TMP34]], [[TMP1]]
; CHECK-NEXT: store i32 [[TMP35]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT: ret void
;
; GFX6-LABEL: srem_i32:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_ashr_i32 s4, s3, 31
; GFX6-NEXT: s_add_i32 s3, s3, s4
; GFX6-NEXT: s_xor_b32 s4, s3, s4
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s4
; GFX6-NEXT: s_sub_i32 s3, 0, s4
; GFX6-NEXT: s_ashr_i32 s5, s2, 31
; GFX6-NEXT: s_add_i32 s2, s2, s5
; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX6-NEXT: s_xor_b32 s6, s2, s5
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0
; GFX6-NEXT: v_mul_lo_u32 v0, v0, s4
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0
; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0
; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v0
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0
; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v0
; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX6-NEXT: v_xor_b32_e32 v0, s5, v0
; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s5, v0
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: srem_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s4, s3, 31
; GFX9-NEXT: s_add_i32 s3, s3, s4
; GFX9-NEXT: s_xor_b32 s3, s3, s4
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
; GFX9-NEXT: s_sub_i32 s4, 0, s3
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_mul_lo_u32 v1, s4, v0
; GFX9-NEXT: s_ashr_i32 s4, s2, 31
; GFX9-NEXT: s_add_i32 s2, s2, s4
; GFX9-NEXT: s_xor_b32 s2, s2, s4
; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3
; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v0
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v0
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0
; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
  %r = srem i32 %x, %y
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; Unsigned 16-bit division. The expected IR zero-extends both operands to
; i32, converts them to float, forms trunc(x * rcp(y)), computes a residual
; with llvm.amdgcn.fmad.ftz.f32, adds 1 when |residual| >= |y|, then masks the
; result with 65535 and truncates back to i16.
define amdgpu_kernel void @udiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
; CHECK-LABEL: @udiv_i16(
; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 65535
; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
; CHECK-NEXT: store i16 [[TMP17]], i16 addrspace(1)* [[OUT:%.*]], align 2
; CHECK-NEXT: ret void
;
; GFX6-LABEL: udiv_i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_lshr_b32 s3, s2, 16
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3
; GFX6-NEXT: s_and_b32 s2, s2, 0xffff
; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s2
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2
; GFX6-NEXT: v_trunc_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2
; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1
; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: udiv_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshr_b32 s3, s2, 16
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
; GFX9-NEXT: s_and_b32 s2, s2, 0xffff
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2
; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0
; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2
; GFX9-NEXT: v_trunc_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2
; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
; GFX9-NEXT: global_store_short v3, v0, s[0:1]
; GFX9-NEXT: s_endpgm
  %r = udiv i16 %x, %y
  store i16 %r, i16 addrspace(1)* %out
  ret void
}

; Unsigned 16-bit remainder. The expected IR computes the quotient exactly as
; in udiv_i16 and then derives the remainder with an i32 mul / sub before
; masking with 65535 and truncating to i16.
define amdgpu_kernel void @urem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
; CHECK-LABEL: @urem_i16(
; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32
; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32
; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]]
; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]]
; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 65535
; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
; CHECK-NEXT: store i16 [[TMP19]], i16 addrspace(1)* [[OUT:%.*]], align 2
; CHECK-NEXT: ret void
;
; GFX6-LABEL: urem_i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_lshr_b32 s2, s4, 16
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2
; GFX6-NEXT: s_and_b32 s3, s4, 0xffff
; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0
; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2
; GFX6-NEXT: v_trunc_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2
; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1
; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: urem_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshr_b32 s3, s2, 16
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
; GFX9-NEXT: s_and_b32 s4, s2, 0xffff
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s4
; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0
; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2
; GFX9-NEXT: v_trunc_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v2
; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3
; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
; GFX9-NEXT: global_store_short v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
  %r = urem i16 %x, %y
  store i16 %r, i16 addrspace(1)* %out
  ret void
}

; Signed 16-bit division. The expected IR sign-extends both operands to i32,
; uses sitofp / fptosi around the float quotient, selects a correction term of
; (ashr(x xor y, 30) | 1) when |residual| >= |y|, and re-sign-extends the
; result with shl 16 / ashr 16 before truncating to i16.
define amdgpu_kernel void @sdiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
; CHECK-LABEL: @sdiv_i16(
; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
; CHECK-NEXT: [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1
; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
; CHECK-NEXT: [[TMP19:%.*]] = shl i32 [[TMP18]], 16
; CHECK-NEXT: [[TMP20:%.*]] = ashr i32 [[TMP19]], 16
; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
; CHECK-NEXT: store i16 [[TMP21]], i16 addrspace(1)* [[OUT:%.*]], align 2
; CHECK-NEXT: ret void
;
; GFX6-LABEL: sdiv_i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_ashr_i32 s1, s0, 16
; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s1
; GFX6-NEXT: s_sext_i32_i16 s0, s0
; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s0
; GFX6-NEXT: s_xor_b32 s0, s0, s1
; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0
; GFX6-NEXT: s_ashr_i32 s0, s0, 30
; GFX6-NEXT: s_or_b32 s0, s0, 1
; GFX6-NEXT: v_mov_b32_e32 v3, s0
; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2
; GFX6-NEXT: v_trunc_f32_e32 v2, v2
; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1
; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0|
; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s0, s4, 16
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0
; GFX9-NEXT: s_sext_i32_i16 s1, s4
; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1
; GFX9-NEXT: s_xor_b32 s0, s1, s0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0
; GFX9-NEXT: s_ashr_i32 s0, s0, 30
; GFX9-NEXT: s_or_b32 s4, s0, 1
; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3
; GFX9-NEXT: v_trunc_f32_e32 v3, v3
; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2
; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3
; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9-NEXT: s_cselect_b32 s0, s4, 0
; GFX9-NEXT: v_add_u32_e32 v0, s0, v3
; GFX9-NEXT: global_store_short v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
  %r = sdiv i16 %x, %y
  store i16 %r, i16 addrspace(1)* %out
  ret void
}

; Signed 16-bit remainder. The expected IR computes the quotient exactly as in
; sdiv_i16 and then derives the remainder with an i32 mul / sub before the
; shl 16 / ashr 16 re-extension and the truncation to i16.
define amdgpu_kernel void @srem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
; CHECK-LABEL: @srem_i16(
; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32
; CHECK-NEXT: [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32
; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30
; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1
; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float
; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float
; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]])
; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]])
; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]])
; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32
; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]])
; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]])
; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
; CHECK-NEXT: [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 16
; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
; CHECK-NEXT: store i16 [[TMP23]], i16 addrspace(1)* [[OUT:%.*]], align 2
; CHECK-NEXT: ret void
;
; GFX6-LABEL: srem_i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_ashr_i32 s2, s4, 16
; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s2
; GFX6-NEXT: s_sext_i32_i16 s3, s4
; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s3
; GFX6-NEXT: s_xor_b32 s3, s3, s2
; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0
; GFX6-NEXT: s_ashr_i32 s3, s3, 30
; GFX6-NEXT: s_or_b32 s3, s3, 1
; GFX6-NEXT: v_mov_b32_e32 v3, s3
; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2
; GFX6-NEXT: v_trunc_f32_e32 v2, v2
; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1
; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0|
; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: srem_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s5, s4, 16
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s5
; GFX9-NEXT: s_sext_i32_i16 s2, s4
; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s2
; GFX9-NEXT: s_xor_b32 s2, s2, s5
; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0
; GFX9-NEXT: s_ashr_i32 s2, s2, 30
; GFX9-NEXT: s_or_b32 s6, s2, 1
; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2
; GFX9-NEXT: v_trunc_f32_e32 v2, v2
; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1
; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9-NEXT: s_cselect_b32 s2, s6, 0
; GFX9-NEXT: v_add_u32_e32 v0, s2, v2
; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
; GFX9-NEXT: global_store_short v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
  %r = srem i16 %x, %y
  store i16 %r, i16 addrspace(1)* %out
  ret void
}

; Unsigned 8-bit division. The expected IR follows the same float-based
; sequence as udiv_i16, with i8 operands zero-extended to i32 and the result
; masked with 255 before truncating to i8.
define amdgpu_kernel void @udiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
; CHECK-LABEL: @udiv_i8(
; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32
; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32
; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float
; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float
; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]])
; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]])
; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]])
; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32
; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]])
; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]])
; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]]
; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0
; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]]
; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 255
; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8
; CHECK-NEXT: store i8 [[TMP17]], i8 addrspace(1)* [[OUT:%.*]], align 1
; CHECK-NEXT: ret void
;
; GFX6-LABEL: udiv_i8:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_ubyte1_e32 v0, s0
; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0
; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0
; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1
; GFX6-NEXT: v_trunc_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1
; GFX6-NEXT:
v_mad_f32 v1, -v1, v0, v2 776; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 777; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 778; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 779; GFX6-NEXT: s_endpgm 780; 781; GFX9-LABEL: udiv_i8: 782; GFX9: ; %bb.0: 783; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 784; GFX9-NEXT: v_mov_b32_e32 v2, 0 785; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 786; GFX9-NEXT: s_waitcnt lgkmcnt(0) 787; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s2 788; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 789; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 790; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 791; GFX9-NEXT: v_trunc_f32_e32 v1, v1 792; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v1 793; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v3 794; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 795; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc 796; GFX9-NEXT: global_store_byte v2, v0, s[0:1] 797; GFX9-NEXT: s_endpgm 798 %r = udiv i8 %x, %y 799 store i8 %r, i8 addrspace(1)* %out 800 ret void 801} 802 803define amdgpu_kernel void @urem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) { 804; CHECK-LABEL: @urem_i8( 805; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32 806; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32 807; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 808; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 809; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 810; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 811; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 812; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 813; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 814; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 815; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 816; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 817; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge 
float [[TMP11]], [[TMP12]] 818; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 819; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 820; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]] 821; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]] 822; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 255 823; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i8 824; CHECK-NEXT: store i8 [[TMP19]], i8 addrspace(1)* [[OUT:%.*]], align 1 825; CHECK-NEXT: ret void 826; 827; GFX6-LABEL: urem_i8: 828; GFX6: ; %bb.0: 829; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 830; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 831; GFX6-NEXT: s_mov_b32 s3, 0xf000 832; GFX6-NEXT: s_waitcnt lgkmcnt(0) 833; GFX6-NEXT: v_cvt_f32_ubyte1_e32 v0, s4 834; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 835; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 836; GFX6-NEXT: s_lshr_b32 s2, s4, 8 837; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 838; GFX6-NEXT: v_trunc_f32_e32 v1, v1 839; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 840; GFX6-NEXT: v_mad_f32 v1, -v1, v0, v2 841; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 842; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 843; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 844; GFX6-NEXT: s_mov_b32 s2, -1 845; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 846; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 847; GFX6-NEXT: s_endpgm 848; 849; GFX9-LABEL: urem_i8: 850; GFX9: ; %bb.0: 851; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 852; GFX9-NEXT: s_nop 0 853; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 854; GFX9-NEXT: s_waitcnt lgkmcnt(0) 855; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s2 856; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 857; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 858; GFX9-NEXT: s_lshr_b32 s3, s2, 8 859; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1 860; GFX9-NEXT: v_trunc_f32_e32 v1, v1 861; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v1 862; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v2 863; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 864; GFX9-NEXT: v_mov_b32_e32 v1, 0 865; GFX9-NEXT: v_addc_co_u32_e32 
v0, vcc, 0, v3, vcc 866; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 867; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 868; GFX9-NEXT: global_store_byte v1, v0, s[0:1] 869; GFX9-NEXT: s_endpgm 870 %r = urem i8 %x, %y 871 store i8 %r, i8 addrspace(1)* %out 872 ret void 873} 874 875define amdgpu_kernel void @sdiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) { 876; CHECK-LABEL: @sdiv_i8( 877; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32 878; CHECK-NEXT: [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32 879; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 880; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 881; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 882; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 883; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 884; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 885; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 886; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 887; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 888; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 889; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 890; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 891; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 892; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 893; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 894; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 895; CHECK-NEXT: [[TMP19:%.*]] = shl i32 [[TMP18]], 24 896; CHECK-NEXT: [[TMP20:%.*]] = ashr i32 [[TMP19]], 24 897; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i8 898; CHECK-NEXT: store i8 [[TMP21]], i8 addrspace(1)* [[OUT:%.*]], align 1 899; CHECK-NEXT: ret void 900; 901; GFX6-LABEL: sdiv_i8: 902; GFX6: ; %bb.0: 903; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 904; 
GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 905; GFX6-NEXT: s_mov_b32 s7, 0xf000 906; GFX6-NEXT: s_mov_b32 s6, -1 907; GFX6-NEXT: s_waitcnt lgkmcnt(0) 908; GFX6-NEXT: s_bfe_i32 s1, s0, 0x80008 909; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s1 910; GFX6-NEXT: s_sext_i32_i8 s0, s0 911; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s0 912; GFX6-NEXT: s_xor_b32 s0, s0, s1 913; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 914; GFX6-NEXT: s_ashr_i32 s0, s0, 30 915; GFX6-NEXT: s_or_b32 s0, s0, 1 916; GFX6-NEXT: v_mov_b32_e32 v3, s0 917; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 918; GFX6-NEXT: v_trunc_f32_e32 v2, v2 919; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 920; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 921; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 922; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 923; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 924; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 925; GFX6-NEXT: s_endpgm 926; 927; GFX9-LABEL: sdiv_i8: 928; GFX9: ; %bb.0: 929; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 930; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 931; GFX9-NEXT: v_mov_b32_e32 v1, 0 932; GFX9-NEXT: s_waitcnt lgkmcnt(0) 933; GFX9-NEXT: s_bfe_i32 s0, s4, 0x80008 934; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 935; GFX9-NEXT: s_sext_i32_i8 s1, s4 936; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 937; GFX9-NEXT: s_xor_b32 s0, s1, s0 938; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 939; GFX9-NEXT: s_ashr_i32 s0, s0, 30 940; GFX9-NEXT: s_or_b32 s4, s0, 1 941; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 942; GFX9-NEXT: v_trunc_f32_e32 v3, v3 943; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 944; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 945; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 946; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 947; GFX9-NEXT: s_cselect_b32 s0, s4, 0 948; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 949; GFX9-NEXT: global_store_byte v1, v0, s[2:3] 950; GFX9-NEXT: s_endpgm 951 %r = sdiv i8 %x, %y 952 store i8 %r, i8 addrspace(1)* %out 953 ret void 954} 955 956define amdgpu_kernel void @srem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) { 957; 
CHECK-LABEL: @srem_i8( 958; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32 959; CHECK-NEXT: [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32 960; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 961; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 962; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 963; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 964; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 965; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 966; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 967; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 968; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 969; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 970; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 971; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 972; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 973; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 974; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 975; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 976; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]] 977; CHECK-NEXT: [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]] 978; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 24 979; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 24 980; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i8 981; CHECK-NEXT: store i8 [[TMP23]], i8 addrspace(1)* [[OUT:%.*]], align 1 982; CHECK-NEXT: ret void 983; 984; GFX6-LABEL: srem_i8: 985; GFX6: ; %bb.0: 986; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 987; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 988; GFX6-NEXT: s_mov_b32 s7, 0xf000 989; GFX6-NEXT: s_mov_b32 s6, -1 990; GFX6-NEXT: s_waitcnt lgkmcnt(0) 991; GFX6-NEXT: s_bfe_i32 s1, s0, 0x80008 992; GFX6-NEXT: v_cvt_f32_i32_e32 v0, 
s1 993; GFX6-NEXT: s_sext_i32_i8 s3, s0 994; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s3 995; GFX6-NEXT: s_xor_b32 s1, s3, s1 996; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 997; GFX6-NEXT: s_ashr_i32 s1, s1, 30 998; GFX6-NEXT: s_or_b32 s1, s1, 1 999; GFX6-NEXT: v_mov_b32_e32 v3, s1 1000; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 1001; GFX6-NEXT: v_trunc_f32_e32 v2, v2 1002; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 1003; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 1004; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 1005; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 1006; GFX6-NEXT: s_lshr_b32 s2, s0, 8 1007; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 1008; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 1009; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 1010; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 1011; GFX6-NEXT: s_endpgm 1012; 1013; GFX9-LABEL: srem_i8: 1014; GFX9: ; %bb.0: 1015; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 1016; GFX9-NEXT: s_nop 0 1017; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1018; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1019; GFX9-NEXT: s_bfe_i32 s2, s4, 0x80008 1020; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 1021; GFX9-NEXT: s_sext_i32_i8 s3, s4 1022; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s3 1023; GFX9-NEXT: s_xor_b32 s2, s3, s2 1024; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 1025; GFX9-NEXT: s_ashr_i32 s2, s2, 30 1026; GFX9-NEXT: s_lshr_b32 s5, s4, 8 1027; GFX9-NEXT: s_or_b32 s6, s2, 1 1028; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 1029; GFX9-NEXT: v_trunc_f32_e32 v2, v2 1030; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 1031; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 1032; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| 1033; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 1034; GFX9-NEXT: s_cselect_b32 s2, s6, 0 1035; GFX9-NEXT: v_add_u32_e32 v0, s2, v2 1036; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 1037; GFX9-NEXT: v_mov_b32_e32 v1, 0 1038; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 1039; GFX9-NEXT: global_store_byte v1, v0, s[0:1] 1040; GFX9-NEXT: s_endpgm 1041 %r = srem i8 %x, %y 1042 store i8 %r, i8 addrspace(1)* %out 1043 ret void 1044} 1045 
1046define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 1047; CHECK-LABEL: @udiv_v4i32( 1048; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 1049; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 1050; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 1051; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 1052; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 1053; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 1054; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 1055; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 1056; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 1057; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 1058; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 1059; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 1060; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 1061; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 1062; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 1063; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 1064; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 1065; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 1066; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 1067; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 1068; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 1069; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 1070; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 1071; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 1072; CHECK-NEXT: [[TMP25:%.*]] = add i32 [[TMP21]], 1 1073; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]] 1074; CHECK-NEXT: [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]] 1075; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]] 1076; CHECK-NEXT: [[TMP29:%.*]] = icmp uge i32 [[TMP28]], 
[[TMP2]] 1077; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP26]], 1 1078; CHECK-NEXT: [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]] 1079; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> undef, i32 [[TMP31]], i64 0 1080; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i32> [[X]], i64 1 1081; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[Y]], i64 1 1082; CHECK-NEXT: [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float 1083; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 1084; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000 1085; CHECK-NEXT: [[TMP38:%.*]] = fptoui float [[TMP37]] to i32 1086; CHECK-NEXT: [[TMP39:%.*]] = sub i32 0, [[TMP34]] 1087; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]] 1088; CHECK-NEXT: [[TMP41:%.*]] = zext i32 [[TMP38]] to i64 1089; CHECK-NEXT: [[TMP42:%.*]] = zext i32 [[TMP40]] to i64 1090; CHECK-NEXT: [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]] 1091; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 1092; CHECK-NEXT: [[TMP45:%.*]] = lshr i64 [[TMP43]], 32 1093; CHECK-NEXT: [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32 1094; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]] 1095; CHECK-NEXT: [[TMP48:%.*]] = zext i32 [[TMP33]] to i64 1096; CHECK-NEXT: [[TMP49:%.*]] = zext i32 [[TMP47]] to i64 1097; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]] 1098; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 1099; CHECK-NEXT: [[TMP52:%.*]] = lshr i64 [[TMP50]], 32 1100; CHECK-NEXT: [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32 1101; CHECK-NEXT: [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]] 1102; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]] 1103; CHECK-NEXT: [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]] 1104; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP53]], 1 1105; CHECK-NEXT: [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]] 1106; CHECK-NEXT: [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]] 1107; CHECK-NEXT: 
[[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]] 1108; CHECK-NEXT: [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]] 1109; CHECK-NEXT: [[TMP62:%.*]] = add i32 [[TMP58]], 1 1110; CHECK-NEXT: [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]] 1111; CHECK-NEXT: [[TMP64:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP63]], i64 1 1112; CHECK-NEXT: [[TMP65:%.*]] = extractelement <4 x i32> [[X]], i64 2 1113; CHECK-NEXT: [[TMP66:%.*]] = extractelement <4 x i32> [[Y]], i64 2 1114; CHECK-NEXT: [[TMP67:%.*]] = uitofp i32 [[TMP66]] to float 1115; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP67]]) 1116; CHECK-NEXT: [[TMP69:%.*]] = fmul fast float [[TMP68]], 0x41EFFFFFC0000000 1117; CHECK-NEXT: [[TMP70:%.*]] = fptoui float [[TMP69]] to i32 1118; CHECK-NEXT: [[TMP71:%.*]] = sub i32 0, [[TMP66]] 1119; CHECK-NEXT: [[TMP72:%.*]] = mul i32 [[TMP71]], [[TMP70]] 1120; CHECK-NEXT: [[TMP73:%.*]] = zext i32 [[TMP70]] to i64 1121; CHECK-NEXT: [[TMP74:%.*]] = zext i32 [[TMP72]] to i64 1122; CHECK-NEXT: [[TMP75:%.*]] = mul i64 [[TMP73]], [[TMP74]] 1123; CHECK-NEXT: [[TMP76:%.*]] = trunc i64 [[TMP75]] to i32 1124; CHECK-NEXT: [[TMP77:%.*]] = lshr i64 [[TMP75]], 32 1125; CHECK-NEXT: [[TMP78:%.*]] = trunc i64 [[TMP77]] to i32 1126; CHECK-NEXT: [[TMP79:%.*]] = add i32 [[TMP70]], [[TMP78]] 1127; CHECK-NEXT: [[TMP80:%.*]] = zext i32 [[TMP65]] to i64 1128; CHECK-NEXT: [[TMP81:%.*]] = zext i32 [[TMP79]] to i64 1129; CHECK-NEXT: [[TMP82:%.*]] = mul i64 [[TMP80]], [[TMP81]] 1130; CHECK-NEXT: [[TMP83:%.*]] = trunc i64 [[TMP82]] to i32 1131; CHECK-NEXT: [[TMP84:%.*]] = lshr i64 [[TMP82]], 32 1132; CHECK-NEXT: [[TMP85:%.*]] = trunc i64 [[TMP84]] to i32 1133; CHECK-NEXT: [[TMP86:%.*]] = mul i32 [[TMP85]], [[TMP66]] 1134; CHECK-NEXT: [[TMP87:%.*]] = sub i32 [[TMP65]], [[TMP86]] 1135; CHECK-NEXT: [[TMP88:%.*]] = icmp uge i32 [[TMP87]], [[TMP66]] 1136; CHECK-NEXT: [[TMP89:%.*]] = add i32 [[TMP85]], 1 1137; CHECK-NEXT: [[TMP90:%.*]] = select 
i1 [[TMP88]], i32 [[TMP89]], i32 [[TMP85]] 1138; CHECK-NEXT: [[TMP91:%.*]] = sub i32 [[TMP87]], [[TMP66]] 1139; CHECK-NEXT: [[TMP92:%.*]] = select i1 [[TMP88]], i32 [[TMP91]], i32 [[TMP87]] 1140; CHECK-NEXT: [[TMP93:%.*]] = icmp uge i32 [[TMP92]], [[TMP66]] 1141; CHECK-NEXT: [[TMP94:%.*]] = add i32 [[TMP90]], 1 1142; CHECK-NEXT: [[TMP95:%.*]] = select i1 [[TMP93]], i32 [[TMP94]], i32 [[TMP90]] 1143; CHECK-NEXT: [[TMP96:%.*]] = insertelement <4 x i32> [[TMP64]], i32 [[TMP95]], i64 2 1144; CHECK-NEXT: [[TMP97:%.*]] = extractelement <4 x i32> [[X]], i64 3 1145; CHECK-NEXT: [[TMP98:%.*]] = extractelement <4 x i32> [[Y]], i64 3 1146; CHECK-NEXT: [[TMP99:%.*]] = uitofp i32 [[TMP98]] to float 1147; CHECK-NEXT: [[TMP100:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP99]]) 1148; CHECK-NEXT: [[TMP101:%.*]] = fmul fast float [[TMP100]], 0x41EFFFFFC0000000 1149; CHECK-NEXT: [[TMP102:%.*]] = fptoui float [[TMP101]] to i32 1150; CHECK-NEXT: [[TMP103:%.*]] = sub i32 0, [[TMP98]] 1151; CHECK-NEXT: [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP102]] 1152; CHECK-NEXT: [[TMP105:%.*]] = zext i32 [[TMP102]] to i64 1153; CHECK-NEXT: [[TMP106:%.*]] = zext i32 [[TMP104]] to i64 1154; CHECK-NEXT: [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]] 1155; CHECK-NEXT: [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32 1156; CHECK-NEXT: [[TMP109:%.*]] = lshr i64 [[TMP107]], 32 1157; CHECK-NEXT: [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32 1158; CHECK-NEXT: [[TMP111:%.*]] = add i32 [[TMP102]], [[TMP110]] 1159; CHECK-NEXT: [[TMP112:%.*]] = zext i32 [[TMP97]] to i64 1160; CHECK-NEXT: [[TMP113:%.*]] = zext i32 [[TMP111]] to i64 1161; CHECK-NEXT: [[TMP114:%.*]] = mul i64 [[TMP112]], [[TMP113]] 1162; CHECK-NEXT: [[TMP115:%.*]] = trunc i64 [[TMP114]] to i32 1163; CHECK-NEXT: [[TMP116:%.*]] = lshr i64 [[TMP114]], 32 1164; CHECK-NEXT: [[TMP117:%.*]] = trunc i64 [[TMP116]] to i32 1165; CHECK-NEXT: [[TMP118:%.*]] = mul i32 [[TMP117]], [[TMP98]] 1166; CHECK-NEXT: [[TMP119:%.*]] = sub i32 [[TMP97]], [[TMP118]] 
1167; CHECK-NEXT: [[TMP120:%.*]] = icmp uge i32 [[TMP119]], [[TMP98]] 1168; CHECK-NEXT: [[TMP121:%.*]] = add i32 [[TMP117]], 1 1169; CHECK-NEXT: [[TMP122:%.*]] = select i1 [[TMP120]], i32 [[TMP121]], i32 [[TMP117]] 1170; CHECK-NEXT: [[TMP123:%.*]] = sub i32 [[TMP119]], [[TMP98]] 1171; CHECK-NEXT: [[TMP124:%.*]] = select i1 [[TMP120]], i32 [[TMP123]], i32 [[TMP119]] 1172; CHECK-NEXT: [[TMP125:%.*]] = icmp uge i32 [[TMP124]], [[TMP98]] 1173; CHECK-NEXT: [[TMP126:%.*]] = add i32 [[TMP122]], 1 1174; CHECK-NEXT: [[TMP127:%.*]] = select i1 [[TMP125]], i32 [[TMP126]], i32 [[TMP122]] 1175; CHECK-NEXT: [[TMP128:%.*]] = insertelement <4 x i32> [[TMP96]], i32 [[TMP127]], i64 3 1176; CHECK-NEXT: store <4 x i32> [[TMP128]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 1177; CHECK-NEXT: ret void 1178; 1179; GFX6-LABEL: udiv_v4i32: 1180; GFX6: ; %bb.0: 1181; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 1182; GFX6-NEXT: s_mov_b32 s3, 0x4f7ffffe 1183; GFX6-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 1184; GFX6-NEXT: s_mov_b32 s15, 0xf000 1185; GFX6-NEXT: s_mov_b32 s14, -1 1186; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1187; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 1188; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 1189; GFX6-NEXT: s_sub_i32 s2, 0, s8 1190; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s10 1191; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 1192; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 1193; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s11 1194; GFX6-NEXT: v_mul_f32_e32 v0, s3, v0 1195; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 1196; GFX6-NEXT: v_mul_f32_e32 v1, s3, v1 1197; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 1198; GFX6-NEXT: v_mul_lo_u32 v2, s2, v0 1199; GFX6-NEXT: s_sub_i32 s2, 0, s9 1200; GFX6-NEXT: v_mul_lo_u32 v3, s2, v1 1201; GFX6-NEXT: s_sub_i32 s2, 0, s10 1202; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 1203; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 1204; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 1205; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 1206; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 1207; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 1208; 
GFX6-NEXT: v_mul_lo_u32 v2, v0, s8 1209; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 1210; GFX6-NEXT: v_mul_lo_u32 v5, v1, s9 1211; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s4, v2 1212; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v2 1213; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] 1214; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v2 1215; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 1216; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 1217; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 1218; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v4 1219; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1220; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s5, v5 1221; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v1 1222; GFX6-NEXT: v_mul_f32_e32 v2, s3, v2 1223; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 1224; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v3 1225; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] 1226; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s9, v3 1227; GFX6-NEXT: v_mul_lo_u32 v4, s2, v2 1228; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 1229; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v1 1230; GFX6-NEXT: s_sub_i32 s0, 0, s11 1231; GFX6-NEXT: v_mul_hi_u32 v4, v2, v4 1232; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 1233; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v6 1234; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 1235; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 1236; GFX6-NEXT: v_mul_hi_u32 v2, s6, v2 1237; GFX6-NEXT: v_mul_f32_e32 v4, s3, v4 1238; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 1239; GFX6-NEXT: v_mul_lo_u32 v3, v2, s10 1240; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v2 1241; GFX6-NEXT: v_mul_lo_u32 v5, s0, v4 1242; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s6, v3 1243; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v3 1244; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5 1245; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] 1246; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, s10, v3 1247; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] 1248; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 1249; GFX6-NEXT: v_mul_hi_u32 v4, s7, v4 1250; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v2 1251; 
GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 1252; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 1253; GFX6-NEXT: v_mul_lo_u32 v6, v4, s11 1254; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 1255; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s7, v6 1256; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v3 1257; GFX6-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] 1258; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s11, v3 1259; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 1260; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 1261; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1262; GFX6-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 1263; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 1264; GFX6-NEXT: s_endpgm 1265; 1266; GFX9-LABEL: udiv_v4i32: 1267; GFX9: ; %bb.0: 1268; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 1269; GFX9-NEXT: s_mov_b32 s12, 0x4f7ffffe 1270; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1271; GFX9-NEXT: v_mov_b32_e32 v4, 0 1272; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1273; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 1274; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 1275; GFX9-NEXT: s_sub_i32 s2, 0, s8 1276; GFX9-NEXT: s_sub_i32 s3, 0, s9 1277; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 1278; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 1279; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s10 1280; GFX9-NEXT: v_mul_f32_e32 v0, s12, v0 1281; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 1282; GFX9-NEXT: v_mul_f32_e32 v1, s12, v1 1283; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 1284; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v5 1285; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 1286; GFX9-NEXT: s_sub_i32 s2, 0, s10 1287; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1 1288; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 1289; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 1290; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 1291; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 1292; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 1293; GFX9-NEXT: v_mul_f32_e32 v3, s12, v5 1294; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 1295; GFX9-NEXT: v_mul_lo_u32 v5, v0, s8 1296; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s11 1297; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 1298; 
GFX9-NEXT: v_add_u32_e32 v7, 1, v0 1299; GFX9-NEXT: v_sub_u32_e32 v5, s4, v5 1300; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v5 1301; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc 1302; GFX9-NEXT: v_subrev_u32_e32 v7, s8, v5 1303; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc 1304; GFX9-NEXT: v_mul_lo_u32 v6, v1, s9 1305; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 1306; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v5 1307; GFX9-NEXT: v_add_u32_e32 v7, 1, v0 1308; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc 1309; GFX9-NEXT: v_mul_lo_u32 v7, s2, v3 1310; GFX9-NEXT: v_sub_u32_e32 v6, s5, v6 1311; GFX9-NEXT: v_mul_f32_e32 v2, s12, v2 1312; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 1313; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v6 1314; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 1315; GFX9-NEXT: v_mul_hi_u32 v5, v3, v7 1316; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 1317; GFX9-NEXT: s_sub_i32 s2, 0, s11 1318; GFX9-NEXT: v_subrev_u32_e32 v7, s9, v6 1319; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 1320; GFX9-NEXT: v_mul_lo_u32 v5, s2, v2 1321; GFX9-NEXT: v_mul_hi_u32 v3, s6, v3 1322; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc 1323; GFX9-NEXT: v_add_u32_e32 v7, 1, v1 1324; GFX9-NEXT: v_mul_hi_u32 v5, v2, v5 1325; GFX9-NEXT: v_mul_lo_u32 v8, v3, s10 1326; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v6 1327; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc 1328; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 1329; GFX9-NEXT: v_mul_hi_u32 v5, s7, v2 1330; GFX9-NEXT: v_sub_u32_e32 v6, s6, v8 1331; GFX9-NEXT: v_add_u32_e32 v7, 1, v3 1332; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v6 1333; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc 1334; GFX9-NEXT: v_subrev_u32_e32 v3, s10, v6 1335; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc 1336; GFX9-NEXT: v_mul_lo_u32 v6, v5, s11 1337; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 1338; GFX9-NEXT: v_add_u32_e32 v7, 1, v2 1339; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc 1340; GFX9-NEXT: v_sub_u32_e32 v3, s7, v6 1341; GFX9-NEXT: v_add_u32_e32 v6, 1, v5 1342; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1343; 
GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 1344; GFX9-NEXT: v_subrev_u32_e32 v6, s11, v3 1345; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 1346; GFX9-NEXT: v_add_u32_e32 v6, 1, v5 1347; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1348; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc 1349; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 1350; GFX9-NEXT: s_endpgm 1351 %r = udiv <4 x i32> %x, %y 1352 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 1353 ret void 1354} 1355 1356define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 1357; CHECK-LABEL: @urem_v4i32( 1358; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 1359; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 1360; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 1361; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 1362; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 1363; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 1364; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 1365; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 1366; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 1367; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 1368; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 1369; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 1370; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 1371; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 1372; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 1373; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 1374; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 1375; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 1376; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 1377; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 1378; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 1379; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], 
[[TMP2]] 1380; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 1381; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 1382; CHECK-NEXT: [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]] 1383; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]] 1384; CHECK-NEXT: [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]] 1385; CHECK-NEXT: [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]] 1386; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]] 1387; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x i32> undef, i32 [[TMP29]], i64 0 1388; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[X]], i64 1 1389; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[Y]], i64 1 1390; CHECK-NEXT: [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float 1391; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 1392; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000 1393; CHECK-NEXT: [[TMP36:%.*]] = fptoui float [[TMP35]] to i32 1394; CHECK-NEXT: [[TMP37:%.*]] = sub i32 0, [[TMP32]] 1395; CHECK-NEXT: [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]] 1396; CHECK-NEXT: [[TMP39:%.*]] = zext i32 [[TMP36]] to i64 1397; CHECK-NEXT: [[TMP40:%.*]] = zext i32 [[TMP38]] to i64 1398; CHECK-NEXT: [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]] 1399; CHECK-NEXT: [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32 1400; CHECK-NEXT: [[TMP43:%.*]] = lshr i64 [[TMP41]], 32 1401; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 1402; CHECK-NEXT: [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]] 1403; CHECK-NEXT: [[TMP46:%.*]] = zext i32 [[TMP31]] to i64 1404; CHECK-NEXT: [[TMP47:%.*]] = zext i32 [[TMP45]] to i64 1405; CHECK-NEXT: [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]] 1406; CHECK-NEXT: [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32 1407; CHECK-NEXT: [[TMP50:%.*]] = lshr i64 [[TMP48]], 32 1408; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 1409; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]] 1410; 
CHECK-NEXT: [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]] 1411; CHECK-NEXT: [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]] 1412; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]] 1413; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]] 1414; CHECK-NEXT: [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]] 1415; CHECK-NEXT: [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]] 1416; CHECK-NEXT: [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]] 1417; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP59]], i64 1 1418; CHECK-NEXT: [[TMP61:%.*]] = extractelement <4 x i32> [[X]], i64 2 1419; CHECK-NEXT: [[TMP62:%.*]] = extractelement <4 x i32> [[Y]], i64 2 1420; CHECK-NEXT: [[TMP63:%.*]] = uitofp i32 [[TMP62]] to float 1421; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP63]]) 1422; CHECK-NEXT: [[TMP65:%.*]] = fmul fast float [[TMP64]], 0x41EFFFFFC0000000 1423; CHECK-NEXT: [[TMP66:%.*]] = fptoui float [[TMP65]] to i32 1424; CHECK-NEXT: [[TMP67:%.*]] = sub i32 0, [[TMP62]] 1425; CHECK-NEXT: [[TMP68:%.*]] = mul i32 [[TMP67]], [[TMP66]] 1426; CHECK-NEXT: [[TMP69:%.*]] = zext i32 [[TMP66]] to i64 1427; CHECK-NEXT: [[TMP70:%.*]] = zext i32 [[TMP68]] to i64 1428; CHECK-NEXT: [[TMP71:%.*]] = mul i64 [[TMP69]], [[TMP70]] 1429; CHECK-NEXT: [[TMP72:%.*]] = trunc i64 [[TMP71]] to i32 1430; CHECK-NEXT: [[TMP73:%.*]] = lshr i64 [[TMP71]], 32 1431; CHECK-NEXT: [[TMP74:%.*]] = trunc i64 [[TMP73]] to i32 1432; CHECK-NEXT: [[TMP75:%.*]] = add i32 [[TMP66]], [[TMP74]] 1433; CHECK-NEXT: [[TMP76:%.*]] = zext i32 [[TMP61]] to i64 1434; CHECK-NEXT: [[TMP77:%.*]] = zext i32 [[TMP75]] to i64 1435; CHECK-NEXT: [[TMP78:%.*]] = mul i64 [[TMP76]], [[TMP77]] 1436; CHECK-NEXT: [[TMP79:%.*]] = trunc i64 [[TMP78]] to i32 1437; CHECK-NEXT: [[TMP80:%.*]] = lshr i64 [[TMP78]], 32 1438; CHECK-NEXT: [[TMP81:%.*]] = trunc i64 [[TMP80]] to i32 1439; CHECK-NEXT: [[TMP82:%.*]] = mul i32 [[TMP81]], [[TMP62]] 1440; 
CHECK-NEXT: [[TMP83:%.*]] = sub i32 [[TMP61]], [[TMP82]] 1441; CHECK-NEXT: [[TMP84:%.*]] = icmp uge i32 [[TMP83]], [[TMP62]] 1442; CHECK-NEXT: [[TMP85:%.*]] = sub i32 [[TMP83]], [[TMP62]] 1443; CHECK-NEXT: [[TMP86:%.*]] = select i1 [[TMP84]], i32 [[TMP85]], i32 [[TMP83]] 1444; CHECK-NEXT: [[TMP87:%.*]] = icmp uge i32 [[TMP86]], [[TMP62]] 1445; CHECK-NEXT: [[TMP88:%.*]] = sub i32 [[TMP86]], [[TMP62]] 1446; CHECK-NEXT: [[TMP89:%.*]] = select i1 [[TMP87]], i32 [[TMP88]], i32 [[TMP86]] 1447; CHECK-NEXT: [[TMP90:%.*]] = insertelement <4 x i32> [[TMP60]], i32 [[TMP89]], i64 2 1448; CHECK-NEXT: [[TMP91:%.*]] = extractelement <4 x i32> [[X]], i64 3 1449; CHECK-NEXT: [[TMP92:%.*]] = extractelement <4 x i32> [[Y]], i64 3 1450; CHECK-NEXT: [[TMP93:%.*]] = uitofp i32 [[TMP92]] to float 1451; CHECK-NEXT: [[TMP94:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP93]]) 1452; CHECK-NEXT: [[TMP95:%.*]] = fmul fast float [[TMP94]], 0x41EFFFFFC0000000 1453; CHECK-NEXT: [[TMP96:%.*]] = fptoui float [[TMP95]] to i32 1454; CHECK-NEXT: [[TMP97:%.*]] = sub i32 0, [[TMP92]] 1455; CHECK-NEXT: [[TMP98:%.*]] = mul i32 [[TMP97]], [[TMP96]] 1456; CHECK-NEXT: [[TMP99:%.*]] = zext i32 [[TMP96]] to i64 1457; CHECK-NEXT: [[TMP100:%.*]] = zext i32 [[TMP98]] to i64 1458; CHECK-NEXT: [[TMP101:%.*]] = mul i64 [[TMP99]], [[TMP100]] 1459; CHECK-NEXT: [[TMP102:%.*]] = trunc i64 [[TMP101]] to i32 1460; CHECK-NEXT: [[TMP103:%.*]] = lshr i64 [[TMP101]], 32 1461; CHECK-NEXT: [[TMP104:%.*]] = trunc i64 [[TMP103]] to i32 1462; CHECK-NEXT: [[TMP105:%.*]] = add i32 [[TMP96]], [[TMP104]] 1463; CHECK-NEXT: [[TMP106:%.*]] = zext i32 [[TMP91]] to i64 1464; CHECK-NEXT: [[TMP107:%.*]] = zext i32 [[TMP105]] to i64 1465; CHECK-NEXT: [[TMP108:%.*]] = mul i64 [[TMP106]], [[TMP107]] 1466; CHECK-NEXT: [[TMP109:%.*]] = trunc i64 [[TMP108]] to i32 1467; CHECK-NEXT: [[TMP110:%.*]] = lshr i64 [[TMP108]], 32 1468; CHECK-NEXT: [[TMP111:%.*]] = trunc i64 [[TMP110]] to i32 1469; CHECK-NEXT: [[TMP112:%.*]] = mul i32 [[TMP111]], 
[[TMP92]] 1470; CHECK-NEXT: [[TMP113:%.*]] = sub i32 [[TMP91]], [[TMP112]] 1471; CHECK-NEXT: [[TMP114:%.*]] = icmp uge i32 [[TMP113]], [[TMP92]] 1472; CHECK-NEXT: [[TMP115:%.*]] = sub i32 [[TMP113]], [[TMP92]] 1473; CHECK-NEXT: [[TMP116:%.*]] = select i1 [[TMP114]], i32 [[TMP115]], i32 [[TMP113]] 1474; CHECK-NEXT: [[TMP117:%.*]] = icmp uge i32 [[TMP116]], [[TMP92]] 1475; CHECK-NEXT: [[TMP118:%.*]] = sub i32 [[TMP116]], [[TMP92]] 1476; CHECK-NEXT: [[TMP119:%.*]] = select i1 [[TMP117]], i32 [[TMP118]], i32 [[TMP116]] 1477; CHECK-NEXT: [[TMP120:%.*]] = insertelement <4 x i32> [[TMP90]], i32 [[TMP119]], i64 3 1478; CHECK-NEXT: store <4 x i32> [[TMP120]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 1479; CHECK-NEXT: ret void 1480; 1481; GFX6-LABEL: urem_v4i32: 1482; GFX6: ; %bb.0: 1483; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 1484; GFX6-NEXT: s_mov_b32 s13, 0x4f7ffffe 1485; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1486; GFX6-NEXT: s_mov_b32 s3, 0xf000 1487; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1488; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 1489; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 1490; GFX6-NEXT: s_sub_i32 s2, 0, s8 1491; GFX6-NEXT: s_sub_i32 s12, 0, s9 1492; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 1493; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 1494; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s10 1495; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s11 1496; GFX6-NEXT: v_mul_f32_e32 v0, s13, v0 1497; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 1498; GFX6-NEXT: v_mul_f32_e32 v1, s13, v1 1499; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 1500; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 1501; GFX6-NEXT: v_mul_lo_u32 v2, s2, v0 1502; GFX6-NEXT: s_mov_b32 s2, -1 1503; GFX6-NEXT: v_mul_lo_u32 v4, s12, v1 1504; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 1505; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 1506; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 1507; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 1508; GFX6-NEXT: v_add_i32_e32 v1, vcc, v4, v1 1509; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 1510; GFX6-NEXT: v_mul_f32_e32 v2, s13, v3 1511; GFX6-NEXT: 
v_mul_lo_u32 v0, v0, s8 1512; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 1513; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9 1514; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 1515; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 1516; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 1517; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1518; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 1519; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 1520; GFX6-NEXT: s_sub_i32 s4, 0, s10 1521; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1522; GFX6-NEXT: v_mul_lo_u32 v3, s4, v2 1523; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 1524; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s9, v1 1525; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 1526; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 1527; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 1528; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 1529; GFX6-NEXT: s_sub_i32 s4, 0, s11 1530; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 1531; GFX6-NEXT: v_mul_f32_e32 v3, s13, v4 1532; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 1533; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s9, v1 1534; GFX6-NEXT: v_mul_hi_u32 v2, s6, v2 1535; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 1536; GFX6-NEXT: v_mul_lo_u32 v5, s4, v3 1537; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 1538; GFX6-NEXT: v_mul_lo_u32 v2, v2, s10 1539; GFX6-NEXT: v_mul_hi_u32 v4, v3, v5 1540; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 1541; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s10, v2 1542; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 1543; GFX6-NEXT: v_mul_hi_u32 v3, s7, v3 1544; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 1545; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 1546; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s10, v2 1547; GFX6-NEXT: v_mul_lo_u32 v3, v3, s11 1548; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 1549; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 1550; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s7, v3 1551; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s11, v3 1552; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1553; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 1554; GFX6-NEXT: v_subrev_i32_e32 v4, 
vcc, s11, v3 1555; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1556; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 1557; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1558; GFX6-NEXT: s_endpgm 1559; 1560; GFX9-LABEL: urem_v4i32: 1561; GFX9: ; %bb.0: 1562; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 1563; GFX9-NEXT: s_mov_b32 s12, 0x4f7ffffe 1564; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1565; GFX9-NEXT: v_mov_b32_e32 v4, 0 1566; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1567; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 1568; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 1569; GFX9-NEXT: s_sub_i32 s2, 0, s8 1570; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s10 1571; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 1572; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 1573; GFX9-NEXT: s_sub_i32 s3, 0, s9 1574; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v5 1575; GFX9-NEXT: v_mul_f32_e32 v0, s12, v0 1576; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 1577; GFX9-NEXT: v_mul_f32_e32 v1, s12, v1 1578; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 1579; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s11 1580; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 1581; GFX9-NEXT: s_sub_i32 s2, 0, s10 1582; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1 1583; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 1584; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 1585; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 1586; GFX9-NEXT: v_mul_f32_e32 v2, s12, v5 1587; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 1588; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 1589; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v6 1590; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 1591; GFX9-NEXT: v_mul_lo_u32 v5, s2, v2 1592; GFX9-NEXT: s_sub_i32 s2, 0, s11 1593; GFX9-NEXT: v_mul_f32_e32 v3, s12, v3 1594; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 1595; GFX9-NEXT: v_mul_hi_u32 v5, v2, v5 1596; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 1597; GFX9-NEXT: v_mul_lo_u32 v0, v0, s8 1598; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 1599; GFX9-NEXT: v_mul_lo_u32 v5, s2, v3 1600; GFX9-NEXT: v_mul_hi_u32 v2, s6, v2 1601; GFX9-NEXT: v_mul_lo_u32 v1, v1, s9 1602; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 1603; GFX9-NEXT: 
v_mul_hi_u32 v5, v3, v5 1604; GFX9-NEXT: v_subrev_u32_e32 v6, s8, v0 1605; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 1606; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc 1607; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 1608; GFX9-NEXT: v_mul_hi_u32 v3, s7, v3 1609; GFX9-NEXT: v_mul_lo_u32 v2, v2, s10 1610; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 1611; GFX9-NEXT: v_subrev_u32_e32 v6, s8, v0 1612; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 1613; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc 1614; GFX9-NEXT: v_subrev_u32_e32 v6, s9, v1 1615; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 1616; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 1617; GFX9-NEXT: v_mul_lo_u32 v3, v3, s11 1618; GFX9-NEXT: v_sub_u32_e32 v2, s6, v2 1619; GFX9-NEXT: v_subrev_u32_e32 v6, s9, v1 1620; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 1621; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 1622; GFX9-NEXT: v_subrev_u32_e32 v5, s10, v2 1623; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 1624; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 1625; GFX9-NEXT: v_subrev_u32_e32 v5, s10, v2 1626; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 1627; GFX9-NEXT: v_sub_u32_e32 v3, s7, v3 1628; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 1629; GFX9-NEXT: v_subrev_u32_e32 v5, s11, v3 1630; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1631; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 1632; GFX9-NEXT: v_subrev_u32_e32 v5, s11, v3 1633; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1634; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 1635; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 1636; GFX9-NEXT: s_endpgm 1637 %r = urem <4 x i32> %x, %y 1638 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 1639 ret void 1640} 1641 1642define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 1643; CHECK-LABEL: @sdiv_v4i32( 1644; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 1645; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 1646; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 1647; 
CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 1648; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 1649; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]] 1650; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]] 1651; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]] 1652; CHECK-NEXT: [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]] 1653; CHECK-NEXT: [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float 1654; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]]) 1655; CHECK-NEXT: [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000 1656; CHECK-NEXT: [[TMP13:%.*]] = fptoui float [[TMP12]] to i32 1657; CHECK-NEXT: [[TMP14:%.*]] = sub i32 0, [[TMP9]] 1658; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]] 1659; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP13]] to i64 1660; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 1661; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 1662; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 1663; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 1664; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 1665; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]] 1666; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP8]] to i64 1667; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[TMP22]] to i64 1668; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]] 1669; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32 1670; CHECK-NEXT: [[TMP27:%.*]] = lshr i64 [[TMP25]], 32 1671; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 1672; CHECK-NEXT: [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]] 1673; CHECK-NEXT: [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]] 1674; CHECK-NEXT: [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]] 1675; CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP28]], 1 1676; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]] 1677; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]] 1678; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP31]], i32 
[[TMP34]], i32 [[TMP30]] 1679; CHECK-NEXT: [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]] 1680; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP33]], 1 1681; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]] 1682; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]] 1683; CHECK-NEXT: [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]] 1684; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x i32> undef, i32 [[TMP40]], i64 0 1685; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i32> [[X]], i64 1 1686; CHECK-NEXT: [[TMP43:%.*]] = extractelement <4 x i32> [[Y]], i64 1 1687; CHECK-NEXT: [[TMP44:%.*]] = ashr i32 [[TMP42]], 31 1688; CHECK-NEXT: [[TMP45:%.*]] = ashr i32 [[TMP43]], 31 1689; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]] 1690; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]] 1691; CHECK-NEXT: [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]] 1692; CHECK-NEXT: [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]] 1693; CHECK-NEXT: [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]] 1694; CHECK-NEXT: [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float 1695; CHECK-NEXT: [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]]) 1696; CHECK-NEXT: [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000 1697; CHECK-NEXT: [[TMP54:%.*]] = fptoui float [[TMP53]] to i32 1698; CHECK-NEXT: [[TMP55:%.*]] = sub i32 0, [[TMP50]] 1699; CHECK-NEXT: [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]] 1700; CHECK-NEXT: [[TMP57:%.*]] = zext i32 [[TMP54]] to i64 1701; CHECK-NEXT: [[TMP58:%.*]] = zext i32 [[TMP56]] to i64 1702; CHECK-NEXT: [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]] 1703; CHECK-NEXT: [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32 1704; CHECK-NEXT: [[TMP61:%.*]] = lshr i64 [[TMP59]], 32 1705; CHECK-NEXT: [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32 1706; CHECK-NEXT: [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]] 1707; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[TMP49]] to i64 1708; CHECK-NEXT: [[TMP65:%.*]] = zext i32 [[TMP63]] to i64 1709; CHECK-NEXT: 
[[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]] 1710; CHECK-NEXT: [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32 1711; CHECK-NEXT: [[TMP68:%.*]] = lshr i64 [[TMP66]], 32 1712; CHECK-NEXT: [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32 1713; CHECK-NEXT: [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]] 1714; CHECK-NEXT: [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]] 1715; CHECK-NEXT: [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]] 1716; CHECK-NEXT: [[TMP73:%.*]] = add i32 [[TMP69]], 1 1717; CHECK-NEXT: [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]] 1718; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]] 1719; CHECK-NEXT: [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]] 1720; CHECK-NEXT: [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]] 1721; CHECK-NEXT: [[TMP78:%.*]] = add i32 [[TMP74]], 1 1722; CHECK-NEXT: [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]] 1723; CHECK-NEXT: [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]] 1724; CHECK-NEXT: [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]] 1725; CHECK-NEXT: [[TMP82:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[TMP81]], i64 1 1726; CHECK-NEXT: [[TMP83:%.*]] = extractelement <4 x i32> [[X]], i64 2 1727; CHECK-NEXT: [[TMP84:%.*]] = extractelement <4 x i32> [[Y]], i64 2 1728; CHECK-NEXT: [[TMP85:%.*]] = ashr i32 [[TMP83]], 31 1729; CHECK-NEXT: [[TMP86:%.*]] = ashr i32 [[TMP84]], 31 1730; CHECK-NEXT: [[TMP87:%.*]] = xor i32 [[TMP85]], [[TMP86]] 1731; CHECK-NEXT: [[TMP88:%.*]] = add i32 [[TMP83]], [[TMP85]] 1732; CHECK-NEXT: [[TMP89:%.*]] = add i32 [[TMP84]], [[TMP86]] 1733; CHECK-NEXT: [[TMP90:%.*]] = xor i32 [[TMP88]], [[TMP85]] 1734; CHECK-NEXT: [[TMP91:%.*]] = xor i32 [[TMP89]], [[TMP86]] 1735; CHECK-NEXT: [[TMP92:%.*]] = uitofp i32 [[TMP91]] to float 1736; CHECK-NEXT: [[TMP93:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP92]]) 1737; CHECK-NEXT: [[TMP94:%.*]] = fmul fast float [[TMP93]], 0x41EFFFFFC0000000 1738; CHECK-NEXT: [[TMP95:%.*]] = fptoui float [[TMP94]] to i32 
1739; CHECK-NEXT: [[TMP96:%.*]] = sub i32 0, [[TMP91]] 1740; CHECK-NEXT: [[TMP97:%.*]] = mul i32 [[TMP96]], [[TMP95]] 1741; CHECK-NEXT: [[TMP98:%.*]] = zext i32 [[TMP95]] to i64 1742; CHECK-NEXT: [[TMP99:%.*]] = zext i32 [[TMP97]] to i64 1743; CHECK-NEXT: [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]] 1744; CHECK-NEXT: [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32 1745; CHECK-NEXT: [[TMP102:%.*]] = lshr i64 [[TMP100]], 32 1746; CHECK-NEXT: [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32 1747; CHECK-NEXT: [[TMP104:%.*]] = add i32 [[TMP95]], [[TMP103]] 1748; CHECK-NEXT: [[TMP105:%.*]] = zext i32 [[TMP90]] to i64 1749; CHECK-NEXT: [[TMP106:%.*]] = zext i32 [[TMP104]] to i64 1750; CHECK-NEXT: [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]] 1751; CHECK-NEXT: [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32 1752; CHECK-NEXT: [[TMP109:%.*]] = lshr i64 [[TMP107]], 32 1753; CHECK-NEXT: [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32 1754; CHECK-NEXT: [[TMP111:%.*]] = mul i32 [[TMP110]], [[TMP91]] 1755; CHECK-NEXT: [[TMP112:%.*]] = sub i32 [[TMP90]], [[TMP111]] 1756; CHECK-NEXT: [[TMP113:%.*]] = icmp uge i32 [[TMP112]], [[TMP91]] 1757; CHECK-NEXT: [[TMP114:%.*]] = add i32 [[TMP110]], 1 1758; CHECK-NEXT: [[TMP115:%.*]] = select i1 [[TMP113]], i32 [[TMP114]], i32 [[TMP110]] 1759; CHECK-NEXT: [[TMP116:%.*]] = sub i32 [[TMP112]], [[TMP91]] 1760; CHECK-NEXT: [[TMP117:%.*]] = select i1 [[TMP113]], i32 [[TMP116]], i32 [[TMP112]] 1761; CHECK-NEXT: [[TMP118:%.*]] = icmp uge i32 [[TMP117]], [[TMP91]] 1762; CHECK-NEXT: [[TMP119:%.*]] = add i32 [[TMP115]], 1 1763; CHECK-NEXT: [[TMP120:%.*]] = select i1 [[TMP118]], i32 [[TMP119]], i32 [[TMP115]] 1764; CHECK-NEXT: [[TMP121:%.*]] = xor i32 [[TMP120]], [[TMP87]] 1765; CHECK-NEXT: [[TMP122:%.*]] = sub i32 [[TMP121]], [[TMP87]] 1766; CHECK-NEXT: [[TMP123:%.*]] = insertelement <4 x i32> [[TMP82]], i32 [[TMP122]], i64 2 1767; CHECK-NEXT: [[TMP124:%.*]] = extractelement <4 x i32> [[X]], i64 3 1768; CHECK-NEXT: [[TMP125:%.*]] = extractelement <4 x i32> 
[[Y]], i64 3 1769; CHECK-NEXT: [[TMP126:%.*]] = ashr i32 [[TMP124]], 31 1770; CHECK-NEXT: [[TMP127:%.*]] = ashr i32 [[TMP125]], 31 1771; CHECK-NEXT: [[TMP128:%.*]] = xor i32 [[TMP126]], [[TMP127]] 1772; CHECK-NEXT: [[TMP129:%.*]] = add i32 [[TMP124]], [[TMP126]] 1773; CHECK-NEXT: [[TMP130:%.*]] = add i32 [[TMP125]], [[TMP127]] 1774; CHECK-NEXT: [[TMP131:%.*]] = xor i32 [[TMP129]], [[TMP126]] 1775; CHECK-NEXT: [[TMP132:%.*]] = xor i32 [[TMP130]], [[TMP127]] 1776; CHECK-NEXT: [[TMP133:%.*]] = uitofp i32 [[TMP132]] to float 1777; CHECK-NEXT: [[TMP134:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP133]]) 1778; CHECK-NEXT: [[TMP135:%.*]] = fmul fast float [[TMP134]], 0x41EFFFFFC0000000 1779; CHECK-NEXT: [[TMP136:%.*]] = fptoui float [[TMP135]] to i32 1780; CHECK-NEXT: [[TMP137:%.*]] = sub i32 0, [[TMP132]] 1781; CHECK-NEXT: [[TMP138:%.*]] = mul i32 [[TMP137]], [[TMP136]] 1782; CHECK-NEXT: [[TMP139:%.*]] = zext i32 [[TMP136]] to i64 1783; CHECK-NEXT: [[TMP140:%.*]] = zext i32 [[TMP138]] to i64 1784; CHECK-NEXT: [[TMP141:%.*]] = mul i64 [[TMP139]], [[TMP140]] 1785; CHECK-NEXT: [[TMP142:%.*]] = trunc i64 [[TMP141]] to i32 1786; CHECK-NEXT: [[TMP143:%.*]] = lshr i64 [[TMP141]], 32 1787; CHECK-NEXT: [[TMP144:%.*]] = trunc i64 [[TMP143]] to i32 1788; CHECK-NEXT: [[TMP145:%.*]] = add i32 [[TMP136]], [[TMP144]] 1789; CHECK-NEXT: [[TMP146:%.*]] = zext i32 [[TMP131]] to i64 1790; CHECK-NEXT: [[TMP147:%.*]] = zext i32 [[TMP145]] to i64 1791; CHECK-NEXT: [[TMP148:%.*]] = mul i64 [[TMP146]], [[TMP147]] 1792; CHECK-NEXT: [[TMP149:%.*]] = trunc i64 [[TMP148]] to i32 1793; CHECK-NEXT: [[TMP150:%.*]] = lshr i64 [[TMP148]], 32 1794; CHECK-NEXT: [[TMP151:%.*]] = trunc i64 [[TMP150]] to i32 1795; CHECK-NEXT: [[TMP152:%.*]] = mul i32 [[TMP151]], [[TMP132]] 1796; CHECK-NEXT: [[TMP153:%.*]] = sub i32 [[TMP131]], [[TMP152]] 1797; CHECK-NEXT: [[TMP154:%.*]] = icmp uge i32 [[TMP153]], [[TMP132]] 1798; CHECK-NEXT: [[TMP155:%.*]] = add i32 [[TMP151]], 1 1799; CHECK-NEXT: [[TMP156:%.*]] = 
select i1 [[TMP154]], i32 [[TMP155]], i32 [[TMP151]] 1800; CHECK-NEXT: [[TMP157:%.*]] = sub i32 [[TMP153]], [[TMP132]] 1801; CHECK-NEXT: [[TMP158:%.*]] = select i1 [[TMP154]], i32 [[TMP157]], i32 [[TMP153]] 1802; CHECK-NEXT: [[TMP159:%.*]] = icmp uge i32 [[TMP158]], [[TMP132]] 1803; CHECK-NEXT: [[TMP160:%.*]] = add i32 [[TMP156]], 1 1804; CHECK-NEXT: [[TMP161:%.*]] = select i1 [[TMP159]], i32 [[TMP160]], i32 [[TMP156]] 1805; CHECK-NEXT: [[TMP162:%.*]] = xor i32 [[TMP161]], [[TMP128]] 1806; CHECK-NEXT: [[TMP163:%.*]] = sub i32 [[TMP162]], [[TMP128]] 1807; CHECK-NEXT: [[TMP164:%.*]] = insertelement <4 x i32> [[TMP123]], i32 [[TMP163]], i64 3 1808; CHECK-NEXT: store <4 x i32> [[TMP164]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 1809; CHECK-NEXT: ret void 1810; 1811; GFX6-LABEL: sdiv_v4i32: 1812; GFX6: ; %bb.0: 1813; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 1814; GFX6-NEXT: s_mov_b32 s16, 0x4f7ffffe 1815; GFX6-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 1816; GFX6-NEXT: s_mov_b32 s15, 0xf000 1817; GFX6-NEXT: s_mov_b32 s14, -1 1818; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1819; GFX6-NEXT: s_ashr_i32 s2, s8, 31 1820; GFX6-NEXT: s_add_i32 s3, s8, s2 1821; GFX6-NEXT: s_xor_b32 s3, s3, s2 1822; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 1823; GFX6-NEXT: s_ashr_i32 s8, s9, 31 1824; GFX6-NEXT: s_add_i32 s0, s9, s8 1825; GFX6-NEXT: s_xor_b32 s9, s0, s8 1826; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 1827; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 1828; GFX6-NEXT: s_sub_i32 s1, 0, s3 1829; GFX6-NEXT: s_ashr_i32 s0, s4, 31 1830; GFX6-NEXT: v_mul_f32_e32 v0, s16, v0 1831; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 1832; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 1833; GFX6-NEXT: s_xor_b32 s2, s0, s2 1834; GFX6-NEXT: v_mul_lo_u32 v2, s1, v0 1835; GFX6-NEXT: s_add_i32 s1, s4, s0 1836; GFX6-NEXT: v_mul_f32_e32 v1, s16, v1 1837; GFX6-NEXT: s_xor_b32 s1, s1, s0 1838; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 1839; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 1840; GFX6-NEXT: s_sub_i32 s0, 0, s9 1841; GFX6-NEXT: v_add_i32_e32 
v0, vcc, v2, v0 1842; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 1843; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 1844; GFX6-NEXT: v_mul_lo_u32 v3, v0, s3 1845; GFX6-NEXT: v_mul_hi_u32 v2, v1, v2 1846; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v0 1847; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s1, v3 1848; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v3 1849; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] 1850; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s3, v3 1851; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] 1852; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v0 1853; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 1854; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 1855; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 1856; GFX6-NEXT: s_ashr_i32 s0, s5, 31 1857; GFX6-NEXT: s_add_i32 s1, s5, s0 1858; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 1859; GFX6-NEXT: s_ashr_i32 s3, s10, 31 1860; GFX6-NEXT: s_xor_b32 s1, s1, s0 1861; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 1862; GFX6-NEXT: s_xor_b32 s2, s0, s8 1863; GFX6-NEXT: s_add_i32 s0, s10, s3 1864; GFX6-NEXT: s_xor_b32 s4, s0, s3 1865; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s4 1866; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 1867; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 1868; GFX6-NEXT: v_mul_lo_u32 v2, v1, s9 1869; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v1 1870; GFX6-NEXT: v_mul_f32_e32 v3, s16, v3 1871; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 1872; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 1873; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2 1874; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] 1875; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s9, v2 1876; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] 1877; GFX6-NEXT: s_sub_i32 s0, 0, s4 1878; GFX6-NEXT: v_mul_lo_u32 v5, s0, v3 1879; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v1 1880; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 1881; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 1882; GFX6-NEXT: v_mul_hi_u32 v2, v3, v5 1883; GFX6-NEXT: v_xor_b32_e32 v1, s2, v1 1884; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s2, v1 1885; GFX6-NEXT: s_ashr_i32 s2, s11, 31 1886; GFX6-NEXT: 
s_ashr_i32 s0, s6, 31 1887; GFX6-NEXT: s_add_i32 s5, s11, s2 1888; GFX6-NEXT: s_add_i32 s1, s6, s0 1889; GFX6-NEXT: s_xor_b32 s5, s5, s2 1890; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s5 1891; GFX6-NEXT: s_xor_b32 s1, s1, s0 1892; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 1893; GFX6-NEXT: v_mul_hi_u32 v2, s1, v2 1894; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v4 1895; GFX6-NEXT: s_xor_b32 s3, s0, s3 1896; GFX6-NEXT: v_mul_lo_u32 v3, v2, s4 1897; GFX6-NEXT: v_mul_f32_e32 v4, s16, v4 1898; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 1899; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v2 1900; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s1, v3 1901; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v3 1902; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] 1903; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s4, v3 1904; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 1905; GFX6-NEXT: s_sub_i32 s0, 0, s5 1906; GFX6-NEXT: v_mul_lo_u32 v5, s0, v4 1907; GFX6-NEXT: s_ashr_i32 s0, s7, 31 1908; GFX6-NEXT: s_add_i32 s1, s7, s0 1909; GFX6-NEXT: s_xor_b32 s1, s1, s0 1910; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5 1911; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v2 1912; GFX6-NEXT: s_xor_b32 s2, s0, s2 1913; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 1914; GFX6-NEXT: v_mul_hi_u32 v4, s1, v4 1915; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 1916; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 1917; GFX6-NEXT: v_xor_b32_e32 v2, s3, v2 1918; GFX6-NEXT: v_mul_lo_u32 v3, v4, s5 1919; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 1920; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v2 1921; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s1, v3 1922; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v3 1923; GFX6-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] 1924; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v3 1925; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 1926; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 1927; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 1928; GFX6-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 1929; GFX6-NEXT: v_xor_b32_e32 v3, s2, v3 1930; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v3 1931; 
GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 1932; GFX6-NEXT: s_endpgm 1933; 1934; GFX9-LABEL: sdiv_v4i32: 1935; GFX9: ; %bb.0: 1936; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 1937; GFX9-NEXT: s_mov_b32 s15, 0x4f7ffffe 1938; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1939; GFX9-NEXT: v_mov_b32_e32 v4, 0 1940; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1941; GFX9-NEXT: s_ashr_i32 s2, s8, 31 1942; GFX9-NEXT: s_add_i32 s3, s8, s2 1943; GFX9-NEXT: s_xor_b32 s3, s3, s2 1944; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 1945; GFX9-NEXT: s_ashr_i32 s12, s9, 31 1946; GFX9-NEXT: s_add_i32 s9, s9, s12 1947; GFX9-NEXT: s_xor_b32 s9, s9, s12 1948; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 1949; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 1950; GFX9-NEXT: s_sub_i32 s14, 0, s3 1951; GFX9-NEXT: s_ashr_i32 s8, s4, 31 1952; GFX9-NEXT: v_mul_f32_e32 v0, s15, v0 1953; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 1954; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 1955; GFX9-NEXT: s_add_i32 s4, s4, s8 1956; GFX9-NEXT: s_xor_b32 s4, s4, s8 1957; GFX9-NEXT: v_mul_lo_u32 v2, s14, v0 1958; GFX9-NEXT: v_mul_f32_e32 v1, s15, v1 1959; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 1960; GFX9-NEXT: s_sub_i32 s14, 0, s9 1961; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 1962; GFX9-NEXT: s_ashr_i32 s13, s5, 31 1963; GFX9-NEXT: v_mul_lo_u32 v3, s14, v1 1964; GFX9-NEXT: s_add_i32 s5, s5, s13 1965; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 1966; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 1967; GFX9-NEXT: v_mul_hi_u32 v2, v1, v3 1968; GFX9-NEXT: s_xor_b32 s5, s5, s13 1969; GFX9-NEXT: s_xor_b32 s2, s8, s2 1970; GFX9-NEXT: v_mul_lo_u32 v3, v0, s3 1971; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 1972; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 1973; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 1974; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 1975; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 1976; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 1977; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v3 1978; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc 1979; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 1980; GFX9-NEXT: 
s_ashr_i32 s3, s10, 31 1981; GFX9-NEXT: s_add_i32 s4, s10, s3 1982; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 1983; GFX9-NEXT: s_xor_b32 s4, s4, s3 1984; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1985; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 1986; GFX9-NEXT: v_mul_lo_u32 v2, v1, s9 1987; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 1988; GFX9-NEXT: s_ashr_i32 s8, s11, 31 1989; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 1990; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 1991; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 1992; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 1993; GFX9-NEXT: v_mul_f32_e32 v3, s15, v3 1994; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 1995; GFX9-NEXT: v_subrev_u32_e32 v5, s9, v2 1996; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 1997; GFX9-NEXT: s_sub_i32 s5, 0, s4 1998; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 1999; GFX9-NEXT: v_mul_lo_u32 v2, s5, v3 2000; GFX9-NEXT: s_add_i32 s9, s11, s8 2001; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 2002; GFX9-NEXT: s_xor_b32 s9, s9, s8 2003; GFX9-NEXT: v_mul_hi_u32 v2, v3, v2 2004; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 2005; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s9 2006; GFX9-NEXT: s_ashr_i32 s5, s6, 31 2007; GFX9-NEXT: s_add_i32 s6, s6, s5 2008; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 2009; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v5 2010; GFX9-NEXT: s_xor_b32 s6, s6, s5 2011; GFX9-NEXT: v_mul_hi_u32 v2, s6, v2 2012; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 2013; GFX9-NEXT: v_mul_f32_e32 v3, s15, v3 2014; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 2015; GFX9-NEXT: v_mul_lo_u32 v5, v2, s4 2016; GFX9-NEXT: v_subrev_u32_e32 v0, s2, v0 2017; GFX9-NEXT: s_xor_b32 s2, s13, s12 2018; GFX9-NEXT: v_xor_b32_e32 v1, s2, v1 2019; GFX9-NEXT: v_subrev_u32_e32 v1, s2, v1 2020; GFX9-NEXT: s_xor_b32 s2, s5, s3 2021; GFX9-NEXT: s_sub_i32 s3, 0, s9 2022; GFX9-NEXT: v_mul_lo_u32 v7, s3, v3 2023; GFX9-NEXT: v_sub_u32_e32 v5, s6, v5 2024; GFX9-NEXT: v_add_u32_e32 v6, 1, v2 2025; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v5 2026; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 2027; GFX9-NEXT: 
v_subrev_u32_e32 v6, s4, v5 2028; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 2029; GFX9-NEXT: v_mul_hi_u32 v6, v3, v7 2030; GFX9-NEXT: s_ashr_i32 s3, s7, 31 2031; GFX9-NEXT: s_add_i32 s5, s7, s3 2032; GFX9-NEXT: s_xor_b32 s5, s5, s3 2033; GFX9-NEXT: v_add_u32_e32 v3, v3, v6 2034; GFX9-NEXT: v_mul_hi_u32 v3, s5, v3 2035; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v5 2036; GFX9-NEXT: v_add_u32_e32 v6, 1, v2 2037; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 2038; GFX9-NEXT: v_mul_lo_u32 v5, v3, s9 2039; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 2040; GFX9-NEXT: v_xor_b32_e32 v2, s2, v2 2041; GFX9-NEXT: v_subrev_u32_e32 v2, s2, v2 2042; GFX9-NEXT: v_sub_u32_e32 v5, s5, v5 2043; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v5 2044; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 2045; GFX9-NEXT: v_subrev_u32_e32 v6, s9, v5 2046; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 2047; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 2048; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v5 2049; GFX9-NEXT: s_xor_b32 s2, s3, s8 2050; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 2051; GFX9-NEXT: v_xor_b32_e32 v3, s2, v3 2052; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v3 2053; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 2054; GFX9-NEXT: s_endpgm 2055 %r = sdiv <4 x i32> %x, %y 2056 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 2057 ret void 2058} 2059 2060define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 2061; CHECK-LABEL: @srem_v4i32( 2062; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 2063; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 2064; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 2065; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 2066; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]] 2067; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]] 2068; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]] 2069; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]] 2070; CHECK-NEXT: [[TMP9:%.*]] = uitofp i32 
[[TMP8]] to float 2071; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 2072; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000 2073; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP11]] to i32 2074; CHECK-NEXT: [[TMP13:%.*]] = sub i32 0, [[TMP8]] 2075; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]] 2076; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP12]] to i64 2077; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 2078; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]] 2079; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 2080; CHECK-NEXT: [[TMP19:%.*]] = lshr i64 [[TMP17]], 32 2081; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 2082; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]] 2083; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP7]] to i64 2084; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP21]] to i64 2085; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]] 2086; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32 2087; CHECK-NEXT: [[TMP26:%.*]] = lshr i64 [[TMP24]], 32 2088; CHECK-NEXT: [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32 2089; CHECK-NEXT: [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]] 2090; CHECK-NEXT: [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]] 2091; CHECK-NEXT: [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]] 2092; CHECK-NEXT: [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]] 2093; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP30]], i32 [[TMP31]], i32 [[TMP29]] 2094; CHECK-NEXT: [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]] 2095; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]] 2096; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]] 2097; CHECK-NEXT: [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]] 2098; CHECK-NEXT: [[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]] 2099; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i32> undef, i32 [[TMP37]], i64 0 2100; CHECK-NEXT: [[TMP39:%.*]] = extractelement <4 x i32> [[X]], i64 1 2101; 
CHECK-NEXT: [[TMP40:%.*]] = extractelement <4 x i32> [[Y]], i64 1 2102; CHECK-NEXT: [[TMP41:%.*]] = ashr i32 [[TMP39]], 31 2103; CHECK-NEXT: [[TMP42:%.*]] = ashr i32 [[TMP40]], 31 2104; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]] 2105; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]] 2106; CHECK-NEXT: [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]] 2107; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]] 2108; CHECK-NEXT: [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float 2109; CHECK-NEXT: [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]]) 2110; CHECK-NEXT: [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000 2111; CHECK-NEXT: [[TMP50:%.*]] = fptoui float [[TMP49]] to i32 2112; CHECK-NEXT: [[TMP51:%.*]] = sub i32 0, [[TMP46]] 2113; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]] 2114; CHECK-NEXT: [[TMP53:%.*]] = zext i32 [[TMP50]] to i64 2115; CHECK-NEXT: [[TMP54:%.*]] = zext i32 [[TMP52]] to i64 2116; CHECK-NEXT: [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]] 2117; CHECK-NEXT: [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32 2118; CHECK-NEXT: [[TMP57:%.*]] = lshr i64 [[TMP55]], 32 2119; CHECK-NEXT: [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32 2120; CHECK-NEXT: [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]] 2121; CHECK-NEXT: [[TMP60:%.*]] = zext i32 [[TMP45]] to i64 2122; CHECK-NEXT: [[TMP61:%.*]] = zext i32 [[TMP59]] to i64 2123; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]] 2124; CHECK-NEXT: [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32 2125; CHECK-NEXT: [[TMP64:%.*]] = lshr i64 [[TMP62]], 32 2126; CHECK-NEXT: [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32 2127; CHECK-NEXT: [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]] 2128; CHECK-NEXT: [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]] 2129; CHECK-NEXT: [[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]] 2130; CHECK-NEXT: [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]] 2131; CHECK-NEXT: [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]] 2132; CHECK-NEXT: 
[[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]] 2133; CHECK-NEXT: [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]] 2134; CHECK-NEXT: [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]] 2135; CHECK-NEXT: [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]] 2136; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]] 2137; CHECK-NEXT: [[TMP76:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP75]], i64 1 2138; CHECK-NEXT: [[TMP77:%.*]] = extractelement <4 x i32> [[X]], i64 2 2139; CHECK-NEXT: [[TMP78:%.*]] = extractelement <4 x i32> [[Y]], i64 2 2140; CHECK-NEXT: [[TMP79:%.*]] = ashr i32 [[TMP77]], 31 2141; CHECK-NEXT: [[TMP80:%.*]] = ashr i32 [[TMP78]], 31 2142; CHECK-NEXT: [[TMP81:%.*]] = add i32 [[TMP77]], [[TMP79]] 2143; CHECK-NEXT: [[TMP82:%.*]] = add i32 [[TMP78]], [[TMP80]] 2144; CHECK-NEXT: [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP79]] 2145; CHECK-NEXT: [[TMP84:%.*]] = xor i32 [[TMP82]], [[TMP80]] 2146; CHECK-NEXT: [[TMP85:%.*]] = uitofp i32 [[TMP84]] to float 2147; CHECK-NEXT: [[TMP86:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP85]]) 2148; CHECK-NEXT: [[TMP87:%.*]] = fmul fast float [[TMP86]], 0x41EFFFFFC0000000 2149; CHECK-NEXT: [[TMP88:%.*]] = fptoui float [[TMP87]] to i32 2150; CHECK-NEXT: [[TMP89:%.*]] = sub i32 0, [[TMP84]] 2151; CHECK-NEXT: [[TMP90:%.*]] = mul i32 [[TMP89]], [[TMP88]] 2152; CHECK-NEXT: [[TMP91:%.*]] = zext i32 [[TMP88]] to i64 2153; CHECK-NEXT: [[TMP92:%.*]] = zext i32 [[TMP90]] to i64 2154; CHECK-NEXT: [[TMP93:%.*]] = mul i64 [[TMP91]], [[TMP92]] 2155; CHECK-NEXT: [[TMP94:%.*]] = trunc i64 [[TMP93]] to i32 2156; CHECK-NEXT: [[TMP95:%.*]] = lshr i64 [[TMP93]], 32 2157; CHECK-NEXT: [[TMP96:%.*]] = trunc i64 [[TMP95]] to i32 2158; CHECK-NEXT: [[TMP97:%.*]] = add i32 [[TMP88]], [[TMP96]] 2159; CHECK-NEXT: [[TMP98:%.*]] = zext i32 [[TMP83]] to i64 2160; CHECK-NEXT: [[TMP99:%.*]] = zext i32 [[TMP97]] to i64 2161; CHECK-NEXT: [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]] 2162; CHECK-NEXT: [[TMP101:%.*]] = trunc i64 
[[TMP100]] to i32 2163; CHECK-NEXT: [[TMP102:%.*]] = lshr i64 [[TMP100]], 32 2164; CHECK-NEXT: [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32 2165; CHECK-NEXT: [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP84]] 2166; CHECK-NEXT: [[TMP105:%.*]] = sub i32 [[TMP83]], [[TMP104]] 2167; CHECK-NEXT: [[TMP106:%.*]] = icmp uge i32 [[TMP105]], [[TMP84]] 2168; CHECK-NEXT: [[TMP107:%.*]] = sub i32 [[TMP105]], [[TMP84]] 2169; CHECK-NEXT: [[TMP108:%.*]] = select i1 [[TMP106]], i32 [[TMP107]], i32 [[TMP105]] 2170; CHECK-NEXT: [[TMP109:%.*]] = icmp uge i32 [[TMP108]], [[TMP84]] 2171; CHECK-NEXT: [[TMP110:%.*]] = sub i32 [[TMP108]], [[TMP84]] 2172; CHECK-NEXT: [[TMP111:%.*]] = select i1 [[TMP109]], i32 [[TMP110]], i32 [[TMP108]] 2173; CHECK-NEXT: [[TMP112:%.*]] = xor i32 [[TMP111]], [[TMP79]] 2174; CHECK-NEXT: [[TMP113:%.*]] = sub i32 [[TMP112]], [[TMP79]] 2175; CHECK-NEXT: [[TMP114:%.*]] = insertelement <4 x i32> [[TMP76]], i32 [[TMP113]], i64 2 2176; CHECK-NEXT: [[TMP115:%.*]] = extractelement <4 x i32> [[X]], i64 3 2177; CHECK-NEXT: [[TMP116:%.*]] = extractelement <4 x i32> [[Y]], i64 3 2178; CHECK-NEXT: [[TMP117:%.*]] = ashr i32 [[TMP115]], 31 2179; CHECK-NEXT: [[TMP118:%.*]] = ashr i32 [[TMP116]], 31 2180; CHECK-NEXT: [[TMP119:%.*]] = add i32 [[TMP115]], [[TMP117]] 2181; CHECK-NEXT: [[TMP120:%.*]] = add i32 [[TMP116]], [[TMP118]] 2182; CHECK-NEXT: [[TMP121:%.*]] = xor i32 [[TMP119]], [[TMP117]] 2183; CHECK-NEXT: [[TMP122:%.*]] = xor i32 [[TMP120]], [[TMP118]] 2184; CHECK-NEXT: [[TMP123:%.*]] = uitofp i32 [[TMP122]] to float 2185; CHECK-NEXT: [[TMP124:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP123]]) 2186; CHECK-NEXT: [[TMP125:%.*]] = fmul fast float [[TMP124]], 0x41EFFFFFC0000000 2187; CHECK-NEXT: [[TMP126:%.*]] = fptoui float [[TMP125]] to i32 2188; CHECK-NEXT: [[TMP127:%.*]] = sub i32 0, [[TMP122]] 2189; CHECK-NEXT: [[TMP128:%.*]] = mul i32 [[TMP127]], [[TMP126]] 2190; CHECK-NEXT: [[TMP129:%.*]] = zext i32 [[TMP126]] to i64 2191; CHECK-NEXT: [[TMP130:%.*]] = zext i32 
[[TMP128]] to i64 2192; CHECK-NEXT: [[TMP131:%.*]] = mul i64 [[TMP129]], [[TMP130]] 2193; CHECK-NEXT: [[TMP132:%.*]] = trunc i64 [[TMP131]] to i32 2194; CHECK-NEXT: [[TMP133:%.*]] = lshr i64 [[TMP131]], 32 2195; CHECK-NEXT: [[TMP134:%.*]] = trunc i64 [[TMP133]] to i32 2196; CHECK-NEXT: [[TMP135:%.*]] = add i32 [[TMP126]], [[TMP134]] 2197; CHECK-NEXT: [[TMP136:%.*]] = zext i32 [[TMP121]] to i64 2198; CHECK-NEXT: [[TMP137:%.*]] = zext i32 [[TMP135]] to i64 2199; CHECK-NEXT: [[TMP138:%.*]] = mul i64 [[TMP136]], [[TMP137]] 2200; CHECK-NEXT: [[TMP139:%.*]] = trunc i64 [[TMP138]] to i32 2201; CHECK-NEXT: [[TMP140:%.*]] = lshr i64 [[TMP138]], 32 2202; CHECK-NEXT: [[TMP141:%.*]] = trunc i64 [[TMP140]] to i32 2203; CHECK-NEXT: [[TMP142:%.*]] = mul i32 [[TMP141]], [[TMP122]] 2204; CHECK-NEXT: [[TMP143:%.*]] = sub i32 [[TMP121]], [[TMP142]] 2205; CHECK-NEXT: [[TMP144:%.*]] = icmp uge i32 [[TMP143]], [[TMP122]] 2206; CHECK-NEXT: [[TMP145:%.*]] = sub i32 [[TMP143]], [[TMP122]] 2207; CHECK-NEXT: [[TMP146:%.*]] = select i1 [[TMP144]], i32 [[TMP145]], i32 [[TMP143]] 2208; CHECK-NEXT: [[TMP147:%.*]] = icmp uge i32 [[TMP146]], [[TMP122]] 2209; CHECK-NEXT: [[TMP148:%.*]] = sub i32 [[TMP146]], [[TMP122]] 2210; CHECK-NEXT: [[TMP149:%.*]] = select i1 [[TMP147]], i32 [[TMP148]], i32 [[TMP146]] 2211; CHECK-NEXT: [[TMP150:%.*]] = xor i32 [[TMP149]], [[TMP117]] 2212; CHECK-NEXT: [[TMP151:%.*]] = sub i32 [[TMP150]], [[TMP117]] 2213; CHECK-NEXT: [[TMP152:%.*]] = insertelement <4 x i32> [[TMP114]], i32 [[TMP151]], i64 3 2214; CHECK-NEXT: store <4 x i32> [[TMP152]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 2215; CHECK-NEXT: ret void 2216; 2217; GFX6-LABEL: srem_v4i32: 2218; GFX6: ; %bb.0: 2219; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 2220; GFX6-NEXT: s_mov_b32 s14, 0x4f7ffffe 2221; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2222; GFX6-NEXT: s_mov_b32 s3, 0xf000 2223; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2224; GFX6-NEXT: s_ashr_i32 s2, s8, 31 2225; GFX6-NEXT: s_add_i32 s8, s8, s2 2226; 
GFX6-NEXT: s_xor_b32 s8, s8, s2 2227; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 2228; GFX6-NEXT: s_ashr_i32 s12, s9, 31 2229; GFX6-NEXT: s_add_i32 s9, s9, s12 2230; GFX6-NEXT: s_xor_b32 s9, s9, s12 2231; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 2232; GFX6-NEXT: s_sub_i32 s13, 0, s8 2233; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 2234; GFX6-NEXT: s_ashr_i32 s12, s4, 31 2235; GFX6-NEXT: v_mul_f32_e32 v0, s14, v0 2236; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 2237; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 2238; GFX6-NEXT: s_add_i32 s4, s4, s12 2239; GFX6-NEXT: s_xor_b32 s4, s4, s12 2240; GFX6-NEXT: v_mul_lo_u32 v2, s13, v0 2241; GFX6-NEXT: v_mul_f32_e32 v1, s14, v1 2242; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 2243; GFX6-NEXT: s_sub_i32 s13, 0, s9 2244; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 2245; GFX6-NEXT: s_mov_b32 s2, -1 2246; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 2247; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 2248; GFX6-NEXT: v_mul_lo_u32 v2, s13, v1 2249; GFX6-NEXT: s_ashr_i32 s13, s5, 31 2250; GFX6-NEXT: s_add_i32 s5, s5, s13 2251; GFX6-NEXT: v_mul_lo_u32 v0, v0, s8 2252; GFX6-NEXT: v_mul_hi_u32 v2, v1, v2 2253; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 2254; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 2255; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 2256; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 2257; GFX6-NEXT: s_xor_b32 s4, s5, s13 2258; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 2259; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 2260; GFX6-NEXT: s_ashr_i32 s5, s10, 31 2261; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 2262; GFX6-NEXT: s_add_i32 s8, s10, s5 2263; GFX6-NEXT: s_xor_b32 s5, s8, s5 2264; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 2265; GFX6-NEXT: v_mul_hi_u32 v1, s4, v1 2266; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 2267; GFX6-NEXT: v_xor_b32_e32 v0, s12, v0 2268; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 2269; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9 2270; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 2271; GFX6-NEXT: s_ashr_i32 s8, s11, 31 2272; GFX6-NEXT: v_mul_f32_e32 v2, s14, v2 2273; GFX6-NEXT: 
v_cvt_u32_f32_e32 v2, v2 2274; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v1 2275; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s9, v1 2276; GFX6-NEXT: s_sub_i32 s4, 0, s5 2277; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 2278; GFX6-NEXT: v_mul_lo_u32 v4, s4, v2 2279; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 2280; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s9, v1 2281; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 2282; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 2283; GFX6-NEXT: v_mul_hi_u32 v3, v2, v4 2284; GFX6-NEXT: s_add_i32 s9, s11, s8 2285; GFX6-NEXT: s_ashr_i32 s4, s6, 31 2286; GFX6-NEXT: s_xor_b32 s8, s9, s8 2287; GFX6-NEXT: s_add_i32 s6, s6, s4 2288; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 2289; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s8 2290; GFX6-NEXT: s_xor_b32 s6, s6, s4 2291; GFX6-NEXT: v_mul_hi_u32 v2, s6, v2 2292; GFX6-NEXT: v_xor_b32_e32 v1, s13, v1 2293; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 2294; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s13, v1 2295; GFX6-NEXT: v_mul_lo_u32 v2, v2, s5 2296; GFX6-NEXT: v_mul_f32_e32 v3, s14, v3 2297; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 2298; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 2299; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s5, v2 2300; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 2301; GFX6-NEXT: s_sub_i32 s6, 0, s8 2302; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 2303; GFX6-NEXT: v_mul_lo_u32 v4, s6, v3 2304; GFX6-NEXT: s_ashr_i32 s6, s7, 31 2305; GFX6-NEXT: s_add_i32 s7, s7, s6 2306; GFX6-NEXT: s_xor_b32 s7, s7, s6 2307; GFX6-NEXT: v_mul_hi_u32 v4, v3, v4 2308; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v2 2309; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 2310; GFX6-NEXT: v_mul_hi_u32 v3, s7, v3 2311; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 2312; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 2313; GFX6-NEXT: v_xor_b32_e32 v2, s4, v2 2314; GFX6-NEXT: v_mul_lo_u32 v3, v3, s8 2315; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s4, v2 2316; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s7, v3 2317; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s8, v3 2318; GFX6-NEXT: v_cmp_le_u32_e32 
vcc, s8, v3 2319; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 2320; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s8, v3 2321; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 2322; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 2323; GFX6-NEXT: v_xor_b32_e32 v3, s6, v3 2324; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s6, v3 2325; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 2326; GFX6-NEXT: s_endpgm 2327; 2328; GFX9-LABEL: srem_v4i32: 2329; GFX9: ; %bb.0: 2330; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 2331; GFX9-NEXT: s_mov_b32 s13, 0x4f7ffffe 2332; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2333; GFX9-NEXT: v_mov_b32_e32 v4, 0 2334; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2335; GFX9-NEXT: s_ashr_i32 s2, s8, 31 2336; GFX9-NEXT: s_add_i32 s8, s8, s2 2337; GFX9-NEXT: s_xor_b32 s2, s8, s2 2338; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 2339; GFX9-NEXT: s_ashr_i32 s3, s9, 31 2340; GFX9-NEXT: s_sub_i32 s12, 0, s2 2341; GFX9-NEXT: s_add_i32 s8, s9, s3 2342; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 2343; GFX9-NEXT: s_xor_b32 s3, s8, s3 2344; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 2345; GFX9-NEXT: s_ashr_i32 s8, s4, 31 2346; GFX9-NEXT: v_mul_f32_e32 v0, s13, v0 2347; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 2348; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 2349; GFX9-NEXT: s_add_i32 s4, s4, s8 2350; GFX9-NEXT: s_xor_b32 s4, s4, s8 2351; GFX9-NEXT: v_mul_lo_u32 v2, s12, v0 2352; GFX9-NEXT: v_mul_f32_e32 v1, s13, v1 2353; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 2354; GFX9-NEXT: s_sub_i32 s12, 0, s3 2355; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 2356; GFX9-NEXT: s_ashr_i32 s9, s5, 31 2357; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1 2358; GFX9-NEXT: s_add_i32 s5, s5, s9 2359; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 2360; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 2361; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 2362; GFX9-NEXT: s_xor_b32 s5, s5, s9 2363; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2 2364; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 2365; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 2366; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 2367; GFX9-NEXT: v_subrev_u32_e32 
v2, s2, v0 2368; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 2369; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 2370; GFX9-NEXT: v_subrev_u32_e32 v2, s2, v0 2371; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 2372; GFX9-NEXT: s_ashr_i32 s2, s10, 31 2373; GFX9-NEXT: s_add_i32 s4, s10, s2 2374; GFX9-NEXT: s_xor_b32 s2, s4, s2 2375; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 2376; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 2377; GFX9-NEXT: v_mul_lo_u32 v1, v1, s3 2378; GFX9-NEXT: v_xor_b32_e32 v0, s8, v0 2379; GFX9-NEXT: v_subrev_u32_e32 v0, s8, v0 2380; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 2381; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 2382; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v1 2383; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 2384; GFX9-NEXT: v_mul_f32_e32 v2, s13, v2 2385; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 2386; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 2387; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v1 2388; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 2389; GFX9-NEXT: s_sub_i32 s3, 0, s2 2390; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 2391; GFX9-NEXT: v_mul_lo_u32 v3, s3, v2 2392; GFX9-NEXT: s_ashr_i32 s3, s11, 31 2393; GFX9-NEXT: s_add_i32 s4, s11, s3 2394; GFX9-NEXT: s_xor_b32 s3, s4, s3 2395; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s3 2396; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 2397; GFX9-NEXT: s_ashr_i32 s4, s6, 31 2398; GFX9-NEXT: s_add_i32 s5, s6, s4 2399; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v5 2400; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 2401; GFX9-NEXT: s_xor_b32 s5, s5, s4 2402; GFX9-NEXT: v_mul_hi_u32 v2, s5, v2 2403; GFX9-NEXT: v_mul_f32_e32 v3, s13, v5 2404; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 2405; GFX9-NEXT: s_sub_i32 s6, 0, s3 2406; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 2407; GFX9-NEXT: v_xor_b32_e32 v1, s9, v1 2408; GFX9-NEXT: v_mul_lo_u32 v5, s6, v3 2409; GFX9-NEXT: v_subrev_u32_e32 v1, s9, v1 2410; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 2411; GFX9-NEXT: s_ashr_i32 s5, s7, 31 2412; GFX9-NEXT: v_mul_hi_u32 v5, v3, v5 2413; GFX9-NEXT: s_add_i32 s6, s7, s5 2414; GFX9-NEXT: s_xor_b32 s6, s6, 
s5 2415; GFX9-NEXT: v_subrev_u32_e32 v6, s2, v2 2416; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 2417; GFX9-NEXT: v_mul_hi_u32 v3, s6, v3 2418; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 2419; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 2420; GFX9-NEXT: v_subrev_u32_e32 v5, s2, v2 2421; GFX9-NEXT: v_mul_lo_u32 v3, v3, s3 2422; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 2423; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 2424; GFX9-NEXT: v_xor_b32_e32 v2, s4, v2 2425; GFX9-NEXT: v_sub_u32_e32 v3, s6, v3 2426; GFX9-NEXT: v_subrev_u32_e32 v5, s3, v3 2427; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 2428; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 2429; GFX9-NEXT: v_subrev_u32_e32 v5, s3, v3 2430; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 2431; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 2432; GFX9-NEXT: v_xor_b32_e32 v3, s5, v3 2433; GFX9-NEXT: v_subrev_u32_e32 v2, s4, v2 2434; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v3 2435; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 2436; GFX9-NEXT: s_endpgm 2437 %r = srem <4 x i32> %x, %y 2438 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 2439 ret void 2440} 2441 2442define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 2443; CHECK-LABEL: @udiv_v4i16( 2444; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 2445; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 2446; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 2447; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 2448; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 2449; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 2450; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 2451; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 2452; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 2453; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 2454; CHECK-NEXT: [[TMP11:%.*]] = call fast float 
@llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 2455; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 2456; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 2457; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 2458; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 2459; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 2460; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 2461; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 65535 2462; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16 2463; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i16> undef, i16 [[TMP19]], i64 0 2464; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i16> [[X]], i64 1 2465; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i16> [[Y]], i64 1 2466; CHECK-NEXT: [[TMP23:%.*]] = zext i16 [[TMP21]] to i32 2467; CHECK-NEXT: [[TMP24:%.*]] = zext i16 [[TMP22]] to i32 2468; CHECK-NEXT: [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float 2469; CHECK-NEXT: [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float 2470; CHECK-NEXT: [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]]) 2471; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]] 2472; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]]) 2473; CHECK-NEXT: [[TMP30:%.*]] = fneg fast float [[TMP29]] 2474; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]]) 2475; CHECK-NEXT: [[TMP32:%.*]] = fptoui float [[TMP29]] to i32 2476; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]]) 2477; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]]) 2478; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]] 2479; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0 2480; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]] 2481; CHECK-NEXT: [[TMP38:%.*]] 
= and i32 [[TMP37]], 65535 2482; CHECK-NEXT: [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16 2483; CHECK-NEXT: [[TMP40:%.*]] = insertelement <4 x i16> [[TMP20]], i16 [[TMP39]], i64 1 2484; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i16> [[X]], i64 2 2485; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i16> [[Y]], i64 2 2486; CHECK-NEXT: [[TMP43:%.*]] = zext i16 [[TMP41]] to i32 2487; CHECK-NEXT: [[TMP44:%.*]] = zext i16 [[TMP42]] to i32 2488; CHECK-NEXT: [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float 2489; CHECK-NEXT: [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float 2490; CHECK-NEXT: [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]]) 2491; CHECK-NEXT: [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]] 2492; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]]) 2493; CHECK-NEXT: [[TMP50:%.*]] = fneg fast float [[TMP49]] 2494; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]]) 2495; CHECK-NEXT: [[TMP52:%.*]] = fptoui float [[TMP49]] to i32 2496; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]]) 2497; CHECK-NEXT: [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]]) 2498; CHECK-NEXT: [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]] 2499; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0 2500; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]] 2501; CHECK-NEXT: [[TMP58:%.*]] = and i32 [[TMP57]], 65535 2502; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16 2503; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i16> [[TMP40]], i16 [[TMP59]], i64 2 2504; CHECK-NEXT: [[TMP61:%.*]] = extractelement <4 x i16> [[X]], i64 3 2505; CHECK-NEXT: [[TMP62:%.*]] = extractelement <4 x i16> [[Y]], i64 3 2506; CHECK-NEXT: [[TMP63:%.*]] = zext i16 [[TMP61]] to i32 2507; CHECK-NEXT: [[TMP64:%.*]] = zext i16 [[TMP62]] to i32 2508; CHECK-NEXT: [[TMP65:%.*]] = uitofp i32 [[TMP63]] to float 2509; CHECK-NEXT: 
[[TMP66:%.*]] = uitofp i32 [[TMP64]] to float 2510; CHECK-NEXT: [[TMP67:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP66]]) 2511; CHECK-NEXT: [[TMP68:%.*]] = fmul fast float [[TMP65]], [[TMP67]] 2512; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.trunc.f32(float [[TMP68]]) 2513; CHECK-NEXT: [[TMP70:%.*]] = fneg fast float [[TMP69]] 2514; CHECK-NEXT: [[TMP71:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP70]], float [[TMP66]], float [[TMP65]]) 2515; CHECK-NEXT: [[TMP72:%.*]] = fptoui float [[TMP69]] to i32 2516; CHECK-NEXT: [[TMP73:%.*]] = call fast float @llvm.fabs.f32(float [[TMP71]]) 2517; CHECK-NEXT: [[TMP74:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 2518; CHECK-NEXT: [[TMP75:%.*]] = fcmp fast oge float [[TMP73]], [[TMP74]] 2519; CHECK-NEXT: [[TMP76:%.*]] = select i1 [[TMP75]], i32 1, i32 0 2520; CHECK-NEXT: [[TMP77:%.*]] = add i32 [[TMP72]], [[TMP76]] 2521; CHECK-NEXT: [[TMP78:%.*]] = and i32 [[TMP77]], 65535 2522; CHECK-NEXT: [[TMP79:%.*]] = trunc i32 [[TMP78]] to i16 2523; CHECK-NEXT: [[TMP80:%.*]] = insertelement <4 x i16> [[TMP60]], i16 [[TMP79]], i64 3 2524; CHECK-NEXT: store <4 x i16> [[TMP80]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 2525; CHECK-NEXT: ret void 2526; 2527; GFX6-LABEL: udiv_v4i16: 2528; GFX6: ; %bb.0: 2529; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2530; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 2531; GFX6-NEXT: s_mov_b32 s8, 0xffff 2532; GFX6-NEXT: s_mov_b32 s7, 0xf000 2533; GFX6-NEXT: s_mov_b32 s6, -1 2534; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2535; GFX6-NEXT: s_and_b32 s9, s2, s8 2536; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 2537; GFX6-NEXT: s_lshr_b32 s9, s0, 16 2538; GFX6-NEXT: s_and_b32 s0, s0, s8 2539; GFX6-NEXT: s_lshr_b32 s2, s2, 16 2540; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s2 2541; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s0 2542; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 2543; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s9 2544; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 2545; GFX6-NEXT: s_and_b32 s2, s3, s8 2546; 
GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 2547; GFX6-NEXT: v_trunc_f32_e32 v2, v2 2548; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 2549; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 2550; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 2551; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 2552; GFX6-NEXT: v_trunc_f32_e32 v1, v1 2553; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc 2554; GFX6-NEXT: v_mad_f32 v2, -v1, v3, v4 2555; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s2 2556; GFX6-NEXT: s_lshr_b32 s0, s1, 16 2557; GFX6-NEXT: s_and_b32 s1, s1, s8 2558; GFX6-NEXT: s_lshr_b32 s10, s3, 16 2559; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 2560; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 2561; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s10 2562; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s1 2563; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 2564; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v1, vcc 2565; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v3 2566; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 2567; GFX6-NEXT: v_mul_f32_e32 v1, v5, v6 2568; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s0 2569; GFX6-NEXT: v_trunc_f32_e32 v1, v1 2570; GFX6-NEXT: v_mad_f32 v5, -v1, v4, v5 2571; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 2572; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 2573; GFX6-NEXT: v_mul_f32_e32 v4, v6, v7 2574; GFX6-NEXT: v_trunc_f32_e32 v4, v4 2575; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v4 2576; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2577; GFX6-NEXT: v_mad_f32 v4, -v4, v3, v6 2578; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v3 2579; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 2580; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 2581; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 2582; GFX6-NEXT: v_and_b32_e32 v1, s8, v1 2583; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 2584; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 2585; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2586; GFX6-NEXT: s_endpgm 2587; 2588; GFX9-LABEL: udiv_v4i16: 2589; GFX9: ; %bb.0: 2590; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 2591; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 2592; GFX9-NEXT: s_mov_b32 s8, 0xffff 2593; GFX9-NEXT: 
v_mov_b32_e32 v2, 0 2594; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2595; GFX9-NEXT: s_and_b32 s1, s6, s8 2596; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 2597; GFX9-NEXT: s_lshr_b32 s0, s4, 16 2598; GFX9-NEXT: s_and_b32 s4, s4, s8 2599; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s4 2600; GFX9-NEXT: s_lshr_b32 s4, s6, 16 2601; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 2602; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s4 2603; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 2604; GFX9-NEXT: s_and_b32 s0, s7, s8 2605; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 2606; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 2607; GFX9-NEXT: v_trunc_f32_e32 v3, v3 2608; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 2609; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 2610; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 2611; GFX9-NEXT: v_mul_f32_e32 v1, v5, v6 2612; GFX9-NEXT: v_trunc_f32_e32 v1, v1 2613; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 2614; GFX9-NEXT: v_mad_f32 v3, -v1, v4, v5 2615; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 2616; GFX9-NEXT: s_and_b32 s0, s5, s8 2617; GFX9-NEXT: s_lshr_b32 s6, s7, 16 2618; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 2619; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 2620; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s6 2621; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 2622; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 2623; GFX9-NEXT: s_lshr_b32 s1, s5, 16 2624; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc 2625; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v4 2626; GFX9-NEXT: v_mul_f32_e32 v1, v6, v7 2627; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 2628; GFX9-NEXT: v_trunc_f32_e32 v1, v1 2629; GFX9-NEXT: v_mad_f32 v6, -v1, v5, v6 2630; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 2631; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 2632; GFX9-NEXT: v_mul_f32_e32 v5, v7, v8 2633; GFX9-NEXT: v_trunc_f32_e32 v5, v5 2634; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v5 2635; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 2636; GFX9-NEXT: v_mad_f32 v5, -v5, v4, v7 2637; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 2638; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff 2639; GFX9-NEXT: v_and_b32_e32 v0, v5, v0 2640; 
GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc 2641; GFX9-NEXT: v_and_b32_e32 v1, v5, v1 2642; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 2643; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 2644; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 2645; GFX9-NEXT: s_endpgm 2646 %r = udiv <4 x i16> %x, %y 2647 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 2648 ret void 2649} 2650 2651define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 2652; CHECK-LABEL: @urem_v4i16( 2653; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 2654; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 2655; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 2656; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 2657; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 2658; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 2659; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 2660; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 2661; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 2662; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 2663; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 2664; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 2665; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 2666; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 2667; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 2668; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 2669; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 2670; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]] 2671; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]] 2672; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 65535 2673; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16 
2674; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i16> undef, i16 [[TMP21]], i64 0 2675; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i16> [[X]], i64 1 2676; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i16> [[Y]], i64 1 2677; CHECK-NEXT: [[TMP25:%.*]] = zext i16 [[TMP23]] to i32 2678; CHECK-NEXT: [[TMP26:%.*]] = zext i16 [[TMP24]] to i32 2679; CHECK-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float 2680; CHECK-NEXT: [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float 2681; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]]) 2682; CHECK-NEXT: [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]] 2683; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]]) 2684; CHECK-NEXT: [[TMP32:%.*]] = fneg fast float [[TMP31]] 2685; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]]) 2686; CHECK-NEXT: [[TMP34:%.*]] = fptoui float [[TMP31]] to i32 2687; CHECK-NEXT: [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 2688; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]]) 2689; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]] 2690; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0 2691; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]] 2692; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]] 2693; CHECK-NEXT: [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]] 2694; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], 65535 2695; CHECK-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16 2696; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i16> [[TMP22]], i16 [[TMP43]], i64 1 2697; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i16> [[X]], i64 2 2698; CHECK-NEXT: [[TMP46:%.*]] = extractelement <4 x i16> [[Y]], i64 2 2699; CHECK-NEXT: [[TMP47:%.*]] = zext i16 [[TMP45]] to i32 2700; CHECK-NEXT: [[TMP48:%.*]] = zext i16 [[TMP46]] to i32 2701; CHECK-NEXT: [[TMP49:%.*]] = uitofp i32 [[TMP47]] to 
float 2702; CHECK-NEXT: [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float 2703; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]]) 2704; CHECK-NEXT: [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]] 2705; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]]) 2706; CHECK-NEXT: [[TMP54:%.*]] = fneg fast float [[TMP53]] 2707; CHECK-NEXT: [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]]) 2708; CHECK-NEXT: [[TMP56:%.*]] = fptoui float [[TMP53]] to i32 2709; CHECK-NEXT: [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]]) 2710; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]]) 2711; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]] 2712; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0 2713; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]] 2714; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]] 2715; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]] 2716; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 65535 2717; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16 2718; CHECK-NEXT: [[TMP66:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[TMP65]], i64 2 2719; CHECK-NEXT: [[TMP67:%.*]] = extractelement <4 x i16> [[X]], i64 3 2720; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x i16> [[Y]], i64 3 2721; CHECK-NEXT: [[TMP69:%.*]] = zext i16 [[TMP67]] to i32 2722; CHECK-NEXT: [[TMP70:%.*]] = zext i16 [[TMP68]] to i32 2723; CHECK-NEXT: [[TMP71:%.*]] = uitofp i32 [[TMP69]] to float 2724; CHECK-NEXT: [[TMP72:%.*]] = uitofp i32 [[TMP70]] to float 2725; CHECK-NEXT: [[TMP73:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP72]]) 2726; CHECK-NEXT: [[TMP74:%.*]] = fmul fast float [[TMP71]], [[TMP73]] 2727; CHECK-NEXT: [[TMP75:%.*]] = call fast float @llvm.trunc.f32(float [[TMP74]]) 2728; CHECK-NEXT: [[TMP76:%.*]] = fneg fast float [[TMP75]] 2729; CHECK-NEXT: [[TMP77:%.*]] = 
call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP76]], float [[TMP72]], float [[TMP71]]) 2730; CHECK-NEXT: [[TMP78:%.*]] = fptoui float [[TMP75]] to i32 2731; CHECK-NEXT: [[TMP79:%.*]] = call fast float @llvm.fabs.f32(float [[TMP77]]) 2732; CHECK-NEXT: [[TMP80:%.*]] = call fast float @llvm.fabs.f32(float [[TMP72]]) 2733; CHECK-NEXT: [[TMP81:%.*]] = fcmp fast oge float [[TMP79]], [[TMP80]] 2734; CHECK-NEXT: [[TMP82:%.*]] = select i1 [[TMP81]], i32 1, i32 0 2735; CHECK-NEXT: [[TMP83:%.*]] = add i32 [[TMP78]], [[TMP82]] 2736; CHECK-NEXT: [[TMP84:%.*]] = mul i32 [[TMP83]], [[TMP70]] 2737; CHECK-NEXT: [[TMP85:%.*]] = sub i32 [[TMP69]], [[TMP84]] 2738; CHECK-NEXT: [[TMP86:%.*]] = and i32 [[TMP85]], 65535 2739; CHECK-NEXT: [[TMP87:%.*]] = trunc i32 [[TMP86]] to i16 2740; CHECK-NEXT: [[TMP88:%.*]] = insertelement <4 x i16> [[TMP66]], i16 [[TMP87]], i64 3 2741; CHECK-NEXT: store <4 x i16> [[TMP88]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 2742; CHECK-NEXT: ret void 2743; 2744; GFX6-LABEL: urem_v4i16: 2745; GFX6: ; %bb.0: 2746; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2747; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 2748; GFX6-NEXT: s_mov_b32 s8, 0xffff 2749; GFX6-NEXT: s_mov_b32 s7, 0xf000 2750; GFX6-NEXT: s_mov_b32 s6, -1 2751; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2752; GFX6-NEXT: s_and_b32 s9, s2, s8 2753; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 2754; GFX6-NEXT: s_and_b32 s10, s0, s8 2755; GFX6-NEXT: s_lshr_b32 s11, s2, 16 2756; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s10 2757; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 2758; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s11 2759; GFX6-NEXT: s_lshr_b32 s9, s0, 16 2760; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s9 2761; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 2762; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 2763; GFX6-NEXT: v_trunc_f32_e32 v2, v2 2764; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 2765; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 2766; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 2767; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 2768; GFX6-NEXT: v_trunc_f32_e32 v1, v1 
2769; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc 2770; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v1 2771; GFX6-NEXT: v_mad_f32 v1, -v1, v3, v4 2772; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v3 2773; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 2774; GFX6-NEXT: s_and_b32 s2, s3, s8 2775; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 2776; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s2 2777; GFX6-NEXT: s_and_b32 s2, s1, s8 2778; GFX6-NEXT: v_mul_lo_u32 v1, v1, s11 2779; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s2 2780; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 2781; GFX6-NEXT: s_lshr_b32 s12, s3, 16 2782; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s9, v1 2783; GFX6-NEXT: s_lshr_b32 s10, s1, 16 2784; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 2785; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s12 2786; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s10 2787; GFX6-NEXT: v_trunc_f32_e32 v1, v1 2788; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 2789; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v4 2790; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v3 2791; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 2792; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 2793; GFX6-NEXT: v_mul_f32_e32 v2, v6, v7 2794; GFX6-NEXT: v_trunc_f32_e32 v2, v2 2795; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 2796; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 2797; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v6 2798; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 2799; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc 2800; GFX6-NEXT: v_mul_lo_u32 v1, v1, s3 2801; GFX6-NEXT: v_mul_lo_u32 v2, v2, s12 2802; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 2803; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 2804; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s10, v2 2805; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 2806; GFX6-NEXT: v_and_b32_e32 v1, s8, v1 2807; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 2808; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 2809; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 2810; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 2811; GFX6-NEXT: s_endpgm 2812; 2813; GFX9-LABEL: urem_v4i16: 2814; GFX9: ; %bb.0: 2815; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 
2816; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 2817; GFX9-NEXT: s_mov_b32 s8, 0xffff 2818; GFX9-NEXT: v_mov_b32_e32 v2, 0 2819; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2820; GFX9-NEXT: s_and_b32 s1, s6, s8 2821; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 2822; GFX9-NEXT: s_and_b32 s9, s4, s8 2823; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 2824; GFX9-NEXT: s_lshr_b32 s9, s6, 16 2825; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 2826; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s9 2827; GFX9-NEXT: s_lshr_b32 s0, s4, 16 2828; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 2829; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 2830; GFX9-NEXT: v_trunc_f32_e32 v3, v3 2831; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 2832; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 2833; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 2834; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 2835; GFX9-NEXT: s_lshr_b32 s10, s7, 16 2836; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 2837; GFX9-NEXT: v_mul_f32_e32 v1, v5, v6 2838; GFX9-NEXT: v_mul_lo_u32 v0, v0, s6 2839; GFX9-NEXT: v_trunc_f32_e32 v1, v1 2840; GFX9-NEXT: s_and_b32 s6, s7, s8 2841; GFX9-NEXT: v_mad_f32 v3, -v1, v4, v5 2842; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s6 2843; GFX9-NEXT: s_and_b32 s6, s5, s8 2844; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 2845; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s10 2846; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s6 2847; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 2848; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 2849; GFX9-NEXT: s_lshr_b32 s1, s5, 16 2850; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v4 2851; GFX9-NEXT: v_mul_f32_e32 v3, v6, v7 2852; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 2853; GFX9-NEXT: v_trunc_f32_e32 v3, v3 2854; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 2855; GFX9-NEXT: v_mad_f32 v6, -v3, v5, v6 2856; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 2857; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 2858; GFX9-NEXT: v_mul_f32_e32 v5, v7, v8 2859; GFX9-NEXT: v_trunc_f32_e32 v5, v5 2860; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v5 2861; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 2862; GFX9-NEXT: v_mad_f32 v5, -v5, 
v4, v7 2863; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 2864; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc 2865; GFX9-NEXT: v_mul_lo_u32 v1, v1, s9 2866; GFX9-NEXT: v_mul_lo_u32 v3, v3, s7 2867; GFX9-NEXT: v_mul_lo_u32 v4, v4, s10 2868; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 2869; GFX9-NEXT: v_sub_u32_e32 v5, s0, v1 2870; GFX9-NEXT: v_sub_u32_e32 v1, s5, v3 2871; GFX9-NEXT: v_sub_u32_e32 v3, s1, v4 2872; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff 2873; GFX9-NEXT: v_and_b32_e32 v1, v4, v1 2874; GFX9-NEXT: v_and_b32_e32 v0, v4, v0 2875; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 2876; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 2877; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 2878; GFX9-NEXT: s_endpgm 2879 %r = urem <4 x i16> %x, %y 2880 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 2881 ret void 2882} 2883; sdiv_v4i16: signed division of <4 x i16> kernel arguments, result stored to %out. Per the opt-side expectations below, each of the four lanes is extracted, sign-extended to i32 and divided through a float sequence (sitofp, amdgcn.rcp, fmul, trunc, fmad.ftz, fabs/fcmp oge) with a +1/-1 quotient correction derived from x xor y, then narrowed back to i16 and re-inserted. All assertions in this test are autogenerated (see the NOTE lines at the top of the file); regenerate them with the update scripts instead of editing by hand. 2884define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 2885; CHECK-LABEL: @sdiv_v4i16( 2886; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 2887; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 2888; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 2889; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 2890; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 2891; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 2892; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 2893; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 2894; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 2895; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 2896; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 2897; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 2898; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 2899; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 2900; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 2901; 
CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 2902; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 2903; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 2904; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 2905; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 2906; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 16 2907; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 16 2908; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16 2909; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x i16> undef, i16 [[TMP23]], i64 0 2910; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[X]], i64 1 2911; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[Y]], i64 1 2912; CHECK-NEXT: [[TMP27:%.*]] = sext i16 [[TMP25]] to i32 2913; CHECK-NEXT: [[TMP28:%.*]] = sext i16 [[TMP26]] to i32 2914; CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]] 2915; CHECK-NEXT: [[TMP30:%.*]] = ashr i32 [[TMP29]], 30 2916; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 1 2917; CHECK-NEXT: [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float 2918; CHECK-NEXT: [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float 2919; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 2920; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]] 2921; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]]) 2922; CHECK-NEXT: [[TMP37:%.*]] = fneg fast float [[TMP36]] 2923; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]]) 2924; CHECK-NEXT: [[TMP39:%.*]] = fptosi float [[TMP36]] to i32 2925; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]]) 2926; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 2927; CHECK-NEXT: [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]] 2928; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], 
i32 0 2929; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]] 2930; CHECK-NEXT: [[TMP45:%.*]] = shl i32 [[TMP44]], 16 2931; CHECK-NEXT: [[TMP46:%.*]] = ashr i32 [[TMP45]], 16 2932; CHECK-NEXT: [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16 2933; CHECK-NEXT: [[TMP48:%.*]] = insertelement <4 x i16> [[TMP24]], i16 [[TMP47]], i64 1 2934; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i16> [[X]], i64 2 2935; CHECK-NEXT: [[TMP50:%.*]] = extractelement <4 x i16> [[Y]], i64 2 2936; CHECK-NEXT: [[TMP51:%.*]] = sext i16 [[TMP49]] to i32 2937; CHECK-NEXT: [[TMP52:%.*]] = sext i16 [[TMP50]] to i32 2938; CHECK-NEXT: [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]] 2939; CHECK-NEXT: [[TMP54:%.*]] = ashr i32 [[TMP53]], 30 2940; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 1 2941; CHECK-NEXT: [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float 2942; CHECK-NEXT: [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float 2943; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]]) 2944; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]] 2945; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]]) 2946; CHECK-NEXT: [[TMP61:%.*]] = fneg fast float [[TMP60]] 2947; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]]) 2948; CHECK-NEXT: [[TMP63:%.*]] = fptosi float [[TMP60]] to i32 2949; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]]) 2950; CHECK-NEXT: [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]]) 2951; CHECK-NEXT: [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]] 2952; CHECK-NEXT: [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0 2953; CHECK-NEXT: [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]] 2954; CHECK-NEXT: [[TMP69:%.*]] = shl i32 [[TMP68]], 16 2955; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 16 2956; CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16 2957; CHECK-NEXT: [[TMP72:%.*]] = insertelement <4 x 
i16> [[TMP48]], i16 [[TMP71]], i64 2 2958; CHECK-NEXT: [[TMP73:%.*]] = extractelement <4 x i16> [[X]], i64 3 2959; CHECK-NEXT: [[TMP74:%.*]] = extractelement <4 x i16> [[Y]], i64 3 2960; CHECK-NEXT: [[TMP75:%.*]] = sext i16 [[TMP73]] to i32 2961; CHECK-NEXT: [[TMP76:%.*]] = sext i16 [[TMP74]] to i32 2962; CHECK-NEXT: [[TMP77:%.*]] = xor i32 [[TMP75]], [[TMP76]] 2963; CHECK-NEXT: [[TMP78:%.*]] = ashr i32 [[TMP77]], 30 2964; CHECK-NEXT: [[TMP79:%.*]] = or i32 [[TMP78]], 1 2965; CHECK-NEXT: [[TMP80:%.*]] = sitofp i32 [[TMP75]] to float 2966; CHECK-NEXT: [[TMP81:%.*]] = sitofp i32 [[TMP76]] to float 2967; CHECK-NEXT: [[TMP82:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP81]]) 2968; CHECK-NEXT: [[TMP83:%.*]] = fmul fast float [[TMP80]], [[TMP82]] 2969; CHECK-NEXT: [[TMP84:%.*]] = call fast float @llvm.trunc.f32(float [[TMP83]]) 2970; CHECK-NEXT: [[TMP85:%.*]] = fneg fast float [[TMP84]] 2971; CHECK-NEXT: [[TMP86:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP85]], float [[TMP81]], float [[TMP80]]) 2972; CHECK-NEXT: [[TMP87:%.*]] = fptosi float [[TMP84]] to i32 2973; CHECK-NEXT: [[TMP88:%.*]] = call fast float @llvm.fabs.f32(float [[TMP86]]) 2974; CHECK-NEXT: [[TMP89:%.*]] = call fast float @llvm.fabs.f32(float [[TMP81]]) 2975; CHECK-NEXT: [[TMP90:%.*]] = fcmp fast oge float [[TMP88]], [[TMP89]] 2976; CHECK-NEXT: [[TMP91:%.*]] = select i1 [[TMP90]], i32 [[TMP79]], i32 0 2977; CHECK-NEXT: [[TMP92:%.*]] = add i32 [[TMP87]], [[TMP91]] 2978; CHECK-NEXT: [[TMP93:%.*]] = shl i32 [[TMP92]], 16 2979; CHECK-NEXT: [[TMP94:%.*]] = ashr i32 [[TMP93]], 16 2980; CHECK-NEXT: [[TMP95:%.*]] = trunc i32 [[TMP94]] to i16 2981; CHECK-NEXT: [[TMP96:%.*]] = insertelement <4 x i16> [[TMP72]], i16 [[TMP95]], i64 3 2982; CHECK-NEXT: store <4 x i16> [[TMP96]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 2983; CHECK-NEXT: ret void 2984; Expected ISA from the llc -mcpu=tahiti run follows. 2985; GFX6-LABEL: sdiv_v4i16: 2986; GFX6: ; %bb.0: 2987; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 2988; GFX6-NEXT: s_load_dwordx4 
s[0:3], s[0:1], 0xb 2989; GFX6-NEXT: s_mov_b32 s7, 0xf000 2990; GFX6-NEXT: s_mov_b32 s6, -1 2991; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2992; GFX6-NEXT: s_sext_i32_i16 s8, s2 2993; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s8 2994; GFX6-NEXT: s_sext_i32_i16 s9, s0 2995; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9 2996; GFX6-NEXT: s_xor_b32 s8, s9, s8 2997; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 2998; GFX6-NEXT: s_ashr_i32 s2, s2, 16 2999; GFX6-NEXT: s_ashr_i32 s8, s8, 30 3000; GFX6-NEXT: s_or_b32 s8, s8, 1 3001; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 3002; GFX6-NEXT: v_trunc_f32_e32 v2, v2 3003; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 3004; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 3005; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 3006; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s2 3007; GFX6-NEXT: v_mov_b32_e32 v3, s8 3008; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 3009; GFX6-NEXT: s_ashr_i32 s0, s0, 16 3010; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 3011; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 3012; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 3013; GFX6-NEXT: s_xor_b32 s0, s0, s2 3014; GFX6-NEXT: s_ashr_i32 s0, s0, 30 3015; GFX6-NEXT: s_or_b32 s0, s0, 1 3016; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 3017; GFX6-NEXT: v_trunc_f32_e32 v3, v3 3018; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 3019; GFX6-NEXT: v_mov_b32_e32 v4, s0 3020; GFX6-NEXT: s_sext_i32_i16 s0, s3 3021; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| 3022; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 3023; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 3024; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc 3025; GFX6-NEXT: s_sext_i32_i16 s2, s1 3026; GFX6-NEXT: v_add_i32_e32 v3, vcc, v1, v3 3027; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s2 3028; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 3029; GFX6-NEXT: s_xor_b32 s0, s2, s0 3030; GFX6-NEXT: s_ashr_i32 s0, s0, 30 3031; GFX6-NEXT: s_or_b32 s0, s0, 1 3032; GFX6-NEXT: v_mul_f32_e32 v4, v1, v4 3033; GFX6-NEXT: v_trunc_f32_e32 v4, v4 3034; GFX6-NEXT: v_mad_f32 v1, -v4, v2, v1 3035; GFX6-NEXT: v_mov_b32_e32 v5, s0 3036; GFX6-NEXT: s_ashr_i32 s0, s3, 16 3037; 
GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2| 3038; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 3039; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 3040; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc 3041; GFX6-NEXT: s_ashr_i32 s1, s1, 16 3042; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 3043; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s1 3044; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 3045; GFX6-NEXT: s_xor_b32 s0, s1, s0 3046; GFX6-NEXT: s_ashr_i32 s0, s0, 30 3047; GFX6-NEXT: s_or_b32 s0, s0, 1 3048; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 3049; GFX6-NEXT: v_trunc_f32_e32 v5, v5 3050; GFX6-NEXT: v_mad_f32 v4, -v5, v2, v4 3051; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 3052; GFX6-NEXT: v_mov_b32_e32 v6, s0 3053; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| 3054; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc 3055; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 3056; GFX6-NEXT: s_mov_b32 s0, 0xffff 3057; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 3058; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 3059; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 3060; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 3061; GFX6-NEXT: v_and_b32_e32 v0, s0, v0 3062; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 3063; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 3064; GFX6-NEXT: s_endpgm 3065; Expected ISA from the llc -mcpu=gfx900 run follows. 3066; GFX9-LABEL: sdiv_v4i16: 3067; GFX9: ; %bb.0: 3068; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 3069; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 3070; GFX9-NEXT: v_mov_b32_e32 v2, 0 3071; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3072; GFX9-NEXT: s_sext_i32_i16 s0, s6 3073; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 3074; GFX9-NEXT: s_sext_i32_i16 s1, s4 3075; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s1 3076; GFX9-NEXT: s_xor_b32 s0, s1, s0 3077; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 3078; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3079; GFX9-NEXT: s_or_b32 s8, s0, 1 3080; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 3081; GFX9-NEXT: v_trunc_f32_e32 v3, v3 3082; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 3083; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 3084; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 3085; GFX9-NEXT: 
s_cselect_b32 s0, s8, 0 3086; GFX9-NEXT: s_ashr_i32 s1, s6, 16 3087; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 3088; GFX9-NEXT: s_ashr_i32 s4, s4, 16 3089; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s4 3090; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 3091; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 3092; GFX9-NEXT: v_add_u32_e32 v3, s0, v3 3093; GFX9-NEXT: v_mul_f32_e32 v4, v1, v4 3094; GFX9-NEXT: s_xor_b32 s0, s4, s1 3095; GFX9-NEXT: v_trunc_f32_e32 v4, v4 3096; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3097; GFX9-NEXT: v_mad_f32 v1, -v4, v0, v1 3098; GFX9-NEXT: s_or_b32 s4, s0, 1 3099; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 3100; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 3101; GFX9-NEXT: s_sext_i32_i16 s1, s7 3102; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 3103; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 3104; GFX9-NEXT: s_cselect_b32 s0, s4, 0 3105; GFX9-NEXT: v_add_u32_e32 v4, s0, v4 3106; GFX9-NEXT: s_sext_i32_i16 s0, s5 3107; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s0 3108; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 3109; GFX9-NEXT: s_xor_b32 s0, s0, s1 3110; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3111; GFX9-NEXT: s_or_b32 s4, s0, 1 3112; GFX9-NEXT: v_mul_f32_e32 v5, v1, v5 3113; GFX9-NEXT: v_trunc_f32_e32 v5, v5 3114; GFX9-NEXT: v_mad_f32 v1, -v5, v0, v1 3115; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 3116; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 3117; GFX9-NEXT: s_cselect_b32 s0, s4, 0 3118; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 3119; GFX9-NEXT: s_ashr_i32 s1, s7, 16 3120; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 3121; GFX9-NEXT: v_add_u32_e32 v1, s0, v5 3122; GFX9-NEXT: s_ashr_i32 s0, s5, 16 3123; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 3124; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v0 3125; GFX9-NEXT: s_xor_b32 s0, s0, s1 3126; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3127; GFX9-NEXT: s_or_b32 s4, s0, 1 3128; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 3129; GFX9-NEXT: v_trunc_f32_e32 v6, v6 3130; GFX9-NEXT: v_mad_f32 v5, -v6, v0, v5 3131; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 3132; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v0| 3133; GFX9-NEXT: 
s_cmp_lg_u64 s[0:1], 0 3134; GFX9-NEXT: s_cselect_b32 s0, s4, 0 3135; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff 3136; GFX9-NEXT: v_add_u32_e32 v0, s0, v6 3137; GFX9-NEXT: v_and_b32_e32 v1, v5, v1 3138; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1 3139; GFX9-NEXT: v_and_b32_e32 v0, v5, v3 3140; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 3141; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 3142; GFX9-NEXT: s_endpgm 3143 %r = sdiv <4 x i16> %x, %y 3144 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 3145 ret void 3146} 3147 3148define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 3149; CHECK-LABEL: @srem_v4i16( 3150; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 3151; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 3152; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 3153; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 3154; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 3155; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 3156; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 3157; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 3158; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 3159; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 3160; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 3161; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 3162; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 3163; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 3164; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 3165; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 3166; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 3167; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 3168; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 
[[TMP7]], i32 0 3169; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 3170; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]] 3171; CHECK-NEXT: [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]] 3172; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 16 3173; CHECK-NEXT: [[TMP24:%.*]] = ashr i32 [[TMP23]], 16 3174; CHECK-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16 3175; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i16> undef, i16 [[TMP25]], i64 0 3176; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[X]], i64 1 3177; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i16> [[Y]], i64 1 3178; CHECK-NEXT: [[TMP29:%.*]] = sext i16 [[TMP27]] to i32 3179; CHECK-NEXT: [[TMP30:%.*]] = sext i16 [[TMP28]] to i32 3180; CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]] 3181; CHECK-NEXT: [[TMP32:%.*]] = ashr i32 [[TMP31]], 30 3182; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP32]], 1 3183; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float 3184; CHECK-NEXT: [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float 3185; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 3186; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]] 3187; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]]) 3188; CHECK-NEXT: [[TMP39:%.*]] = fneg fast float [[TMP38]] 3189; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]]) 3190; CHECK-NEXT: [[TMP41:%.*]] = fptosi float [[TMP38]] to i32 3191; CHECK-NEXT: [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]]) 3192; CHECK-NEXT: [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]]) 3193; CHECK-NEXT: [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]] 3194; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0 3195; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]] 3196; CHECK-NEXT: [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]] 3197; CHECK-NEXT: [[TMP48:%.*]] = 
sub i32 [[TMP29]], [[TMP47]] 3198; CHECK-NEXT: [[TMP49:%.*]] = shl i32 [[TMP48]], 16 3199; CHECK-NEXT: [[TMP50:%.*]] = ashr i32 [[TMP49]], 16 3200; CHECK-NEXT: [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16 3201; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i16> [[TMP26]], i16 [[TMP51]], i64 1 3202; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i16> [[X]], i64 2 3203; CHECK-NEXT: [[TMP54:%.*]] = extractelement <4 x i16> [[Y]], i64 2 3204; CHECK-NEXT: [[TMP55:%.*]] = sext i16 [[TMP53]] to i32 3205; CHECK-NEXT: [[TMP56:%.*]] = sext i16 [[TMP54]] to i32 3206; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]] 3207; CHECK-NEXT: [[TMP58:%.*]] = ashr i32 [[TMP57]], 30 3208; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 1 3209; CHECK-NEXT: [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float 3210; CHECK-NEXT: [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float 3211; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]]) 3212; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]] 3213; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]]) 3214; CHECK-NEXT: [[TMP65:%.*]] = fneg fast float [[TMP64]] 3215; CHECK-NEXT: [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]]) 3216; CHECK-NEXT: [[TMP67:%.*]] = fptosi float [[TMP64]] to i32 3217; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 3218; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]]) 3219; CHECK-NEXT: [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]] 3220; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0 3221; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]] 3222; CHECK-NEXT: [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]] 3223; CHECK-NEXT: [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]] 3224; CHECK-NEXT: [[TMP75:%.*]] = shl i32 [[TMP74]], 16 3225; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 [[TMP75]], 16 3226; CHECK-NEXT: 
[[TMP77:%.*]] = trunc i32 [[TMP76]] to i16 3227; CHECK-NEXT: [[TMP78:%.*]] = insertelement <4 x i16> [[TMP52]], i16 [[TMP77]], i64 2 3228; CHECK-NEXT: [[TMP79:%.*]] = extractelement <4 x i16> [[X]], i64 3 3229; CHECK-NEXT: [[TMP80:%.*]] = extractelement <4 x i16> [[Y]], i64 3 3230; CHECK-NEXT: [[TMP81:%.*]] = sext i16 [[TMP79]] to i32 3231; CHECK-NEXT: [[TMP82:%.*]] = sext i16 [[TMP80]] to i32 3232; CHECK-NEXT: [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP82]] 3233; CHECK-NEXT: [[TMP84:%.*]] = ashr i32 [[TMP83]], 30 3234; CHECK-NEXT: [[TMP85:%.*]] = or i32 [[TMP84]], 1 3235; CHECK-NEXT: [[TMP86:%.*]] = sitofp i32 [[TMP81]] to float 3236; CHECK-NEXT: [[TMP87:%.*]] = sitofp i32 [[TMP82]] to float 3237; CHECK-NEXT: [[TMP88:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP87]]) 3238; CHECK-NEXT: [[TMP89:%.*]] = fmul fast float [[TMP86]], [[TMP88]] 3239; CHECK-NEXT: [[TMP90:%.*]] = call fast float @llvm.trunc.f32(float [[TMP89]]) 3240; CHECK-NEXT: [[TMP91:%.*]] = fneg fast float [[TMP90]] 3241; CHECK-NEXT: [[TMP92:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP91]], float [[TMP87]], float [[TMP86]]) 3242; CHECK-NEXT: [[TMP93:%.*]] = fptosi float [[TMP90]] to i32 3243; CHECK-NEXT: [[TMP94:%.*]] = call fast float @llvm.fabs.f32(float [[TMP92]]) 3244; CHECK-NEXT: [[TMP95:%.*]] = call fast float @llvm.fabs.f32(float [[TMP87]]) 3245; CHECK-NEXT: [[TMP96:%.*]] = fcmp fast oge float [[TMP94]], [[TMP95]] 3246; CHECK-NEXT: [[TMP97:%.*]] = select i1 [[TMP96]], i32 [[TMP85]], i32 0 3247; CHECK-NEXT: [[TMP98:%.*]] = add i32 [[TMP93]], [[TMP97]] 3248; CHECK-NEXT: [[TMP99:%.*]] = mul i32 [[TMP98]], [[TMP82]] 3249; CHECK-NEXT: [[TMP100:%.*]] = sub i32 [[TMP81]], [[TMP99]] 3250; CHECK-NEXT: [[TMP101:%.*]] = shl i32 [[TMP100]], 16 3251; CHECK-NEXT: [[TMP102:%.*]] = ashr i32 [[TMP101]], 16 3252; CHECK-NEXT: [[TMP103:%.*]] = trunc i32 [[TMP102]] to i16 3253; CHECK-NEXT: [[TMP104:%.*]] = insertelement <4 x i16> [[TMP78]], i16 [[TMP103]], i64 3 3254; CHECK-NEXT: store <4 x 
i16> [[TMP104]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 3255; CHECK-NEXT: ret void 3256; 3257; GFX6-LABEL: srem_v4i16: 3258; GFX6: ; %bb.0: 3259; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3260; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 3261; GFX6-NEXT: s_mov_b32 s7, 0xf000 3262; GFX6-NEXT: s_mov_b32 s6, -1 3263; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3264; GFX6-NEXT: s_sext_i32_i16 s8, s2 3265; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s8 3266; GFX6-NEXT: s_sext_i32_i16 s9, s0 3267; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9 3268; GFX6-NEXT: s_xor_b32 s8, s9, s8 3269; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 3270; GFX6-NEXT: s_ashr_i32 s8, s8, 30 3271; GFX6-NEXT: s_or_b32 s8, s8, 1 3272; GFX6-NEXT: v_mov_b32_e32 v3, s8 3273; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 3274; GFX6-NEXT: v_trunc_f32_e32 v2, v2 3275; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 3276; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 3277; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 3278; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 3279; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 3280; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 3281; GFX6-NEXT: s_ashr_i32 s2, s2, 16 3282; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s2 3283; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 3284; GFX6-NEXT: s_ashr_i32 s0, s0, 16 3285; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 3286; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 3287; GFX6-NEXT: s_xor_b32 s8, s0, s2 3288; GFX6-NEXT: s_ashr_i32 s8, s8, 30 3289; GFX6-NEXT: s_or_b32 s8, s8, 1 3290; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 3291; GFX6-NEXT: v_trunc_f32_e32 v3, v3 3292; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 3293; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 3294; GFX6-NEXT: v_mov_b32_e32 v4, s8 3295; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| 3296; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc 3297; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 3298; GFX6-NEXT: v_mul_lo_u32 v1, v1, s2 3299; GFX6-NEXT: s_sext_i32_i16 s2, s3 3300; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s2 3301; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s0, v1 3302; GFX6-NEXT: s_sext_i32_i16 s0, s1 
3303; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s0 3304; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 3305; GFX6-NEXT: s_xor_b32 s0, s0, s2 3306; GFX6-NEXT: s_ashr_i32 s0, s0, 30 3307; GFX6-NEXT: s_or_b32 s0, s0, 1 3308; GFX6-NEXT: v_mul_f32_e32 v4, v1, v4 3309; GFX6-NEXT: v_trunc_f32_e32 v4, v4 3310; GFX6-NEXT: v_mad_f32 v1, -v4, v2, v1 3311; GFX6-NEXT: v_mov_b32_e32 v5, s0 3312; GFX6-NEXT: s_ashr_i32 s0, s3, 16 3313; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2| 3314; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 3315; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 3316; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc 3317; GFX6-NEXT: s_ashr_i32 s2, s1, 16 3318; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 3319; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s2 3320; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 3321; GFX6-NEXT: v_mul_lo_u32 v1, v1, s3 3322; GFX6-NEXT: s_xor_b32 s3, s2, s0 3323; GFX6-NEXT: s_ashr_i32 s3, s3, 30 3324; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 3325; GFX6-NEXT: v_trunc_f32_e32 v5, v5 3326; GFX6-NEXT: v_mad_f32 v4, -v5, v2, v4 3327; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 3328; GFX6-NEXT: s_or_b32 s3, s3, 1 3329; GFX6-NEXT: v_mov_b32_e32 v6, s3 3330; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| 3331; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc 3332; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 3333; GFX6-NEXT: v_mul_lo_u32 v2, v2, s0 3334; GFX6-NEXT: s_mov_b32 s0, 0xffff 3335; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 3336; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 3337; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 3338; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 3339; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 3340; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 3341; GFX6-NEXT: v_and_b32_e32 v0, s0, v0 3342; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 3343; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 3344; GFX6-NEXT: s_endpgm 3345; 3346; GFX9-LABEL: srem_v4i16: 3347; GFX9: ; %bb.0: 3348; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 3349; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 3350; GFX9-NEXT: v_mov_b32_e32 v2, 0 3351; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) 3352; GFX9-NEXT: s_sext_i32_i16 s0, s6 3353; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 3354; GFX9-NEXT: s_sext_i32_i16 s1, s4 3355; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s1 3356; GFX9-NEXT: s_xor_b32 s0, s1, s0 3357; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 3358; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3359; GFX9-NEXT: s_or_b32 s8, s0, 1 3360; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 3361; GFX9-NEXT: v_trunc_f32_e32 v3, v3 3362; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 3363; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 3364; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 3365; GFX9-NEXT: s_cselect_b32 s0, s8, 0 3366; GFX9-NEXT: s_ashr_i32 s9, s6, 16 3367; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 3368; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s9 3369; GFX9-NEXT: s_ashr_i32 s8, s4, 16 3370; GFX9-NEXT: v_add_u32_e32 v1, s0, v3 3371; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s8 3372; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 3373; GFX9-NEXT: s_xor_b32 s0, s8, s9 3374; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3375; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 3376; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 3377; GFX9-NEXT: v_trunc_f32_e32 v4, v4 3378; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 3379; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 3380; GFX9-NEXT: s_or_b32 s6, s0, 1 3381; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v0| 3382; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 3383; GFX9-NEXT: s_cselect_b32 s0, s6, 0 3384; GFX9-NEXT: v_add_u32_e32 v0, s0, v4 3385; GFX9-NEXT: s_sext_i32_i16 s0, s7 3386; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 3387; GFX9-NEXT: s_sext_i32_i16 s1, s5 3388; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s1 3389; GFX9-NEXT: s_xor_b32 s0, s1, s0 3390; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 3391; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3392; GFX9-NEXT: s_or_b32 s6, s0, 1 3393; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9 3394; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 3395; GFX9-NEXT: v_trunc_f32_e32 v5, v5 3396; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 3397; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| 3398; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 3399; GFX9-NEXT: v_cvt_i32_f32_e32 
v5, v5 3400; GFX9-NEXT: s_cselect_b32 s0, s6, 0 3401; GFX9-NEXT: s_ashr_i32 s6, s7, 16 3402; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s6 3403; GFX9-NEXT: v_add_u32_e32 v3, s0, v5 3404; GFX9-NEXT: v_mul_lo_u32 v3, v3, s7 3405; GFX9-NEXT: s_ashr_i32 s7, s5, 16 3406; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s7 3407; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 3408; GFX9-NEXT: s_xor_b32 s0, s7, s6 3409; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3410; GFX9-NEXT: s_or_b32 s9, s0, 1 3411; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 3412; GFX9-NEXT: v_trunc_f32_e32 v6, v6 3413; GFX9-NEXT: v_mad_f32 v5, -v6, v4, v5 3414; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 3415; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4| 3416; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 3417; GFX9-NEXT: s_cselect_b32 s0, s9, 0 3418; GFX9-NEXT: v_add_u32_e32 v4, s0, v6 3419; GFX9-NEXT: v_mul_lo_u32 v4, v4, s6 3420; GFX9-NEXT: v_sub_u32_e32 v5, s4, v1 3421; GFX9-NEXT: v_sub_u32_e32 v1, s5, v3 3422; GFX9-NEXT: v_sub_u32_e32 v0, s8, v0 3423; GFX9-NEXT: v_sub_u32_e32 v3, s7, v4 3424; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff 3425; GFX9-NEXT: v_and_b32_e32 v1, v4, v1 3426; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 3427; GFX9-NEXT: v_and_b32_e32 v3, v4, v5 3428; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v3 3429; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 3430; GFX9-NEXT: s_endpgm 3431 %r = srem <4 x i16> %x, %y 3432 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 3433 ret void 3434} 3435 3436define amdgpu_kernel void @udiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 3437; CHECK-LABEL: @udiv_i3( 3438; CHECK-NEXT: [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32 3439; CHECK-NEXT: [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32 3440; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 3441; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 3442; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 3443; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 3444; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 
3445; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 3446; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 3447; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 3448; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 3449; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 3450; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 3451; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 3452; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 3453; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 7 3454; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i3 3455; CHECK-NEXT: store i3 [[TMP17]], i3 addrspace(1)* [[OUT:%.*]], align 1 3456; CHECK-NEXT: ret void 3457; 3458; GFX6-LABEL: udiv_i3: 3459; GFX6: ; %bb.0: 3460; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3461; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 3462; GFX6-NEXT: s_mov_b32 s7, 0xf000 3463; GFX6-NEXT: s_mov_b32 s6, -1 3464; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3465; GFX6-NEXT: s_bfe_u32 s1, s0, 0x30008 3466; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, s1 3467; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 3468; GFX6-NEXT: s_and_b32 s0, s0, 7 3469; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 3470; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 3471; GFX6-NEXT: v_trunc_f32_e32 v1, v1 3472; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 3473; GFX6-NEXT: v_mad_f32 v1, -v1, v0, v2 3474; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3475; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 3476; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 3477; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 3478; GFX6-NEXT: s_endpgm 3479; 3480; GFX9-LABEL: udiv_i3: 3481; GFX9: ; %bb.0: 3482; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 3483; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 3484; GFX9-NEXT: v_mov_b32_e32 v2, 0 3485; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3486; GFX9-NEXT: s_bfe_u32 s0, s4, 0x30008 3487; GFX9-NEXT: 
v_cvt_f32_ubyte0_e32 v0, s0 3488; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 3489; GFX9-NEXT: s_and_b32 s0, s4, 7 3490; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 3491; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 3492; GFX9-NEXT: v_trunc_f32_e32 v1, v1 3493; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v1 3494; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v3 3495; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3496; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc 3497; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 3498; GFX9-NEXT: global_store_byte v2, v0, s[2:3] 3499; GFX9-NEXT: s_endpgm 3500 %r = udiv i3 %x, %y 3501 store i3 %r, i3 addrspace(1)* %out 3502 ret void 3503} 3504 3505define amdgpu_kernel void @urem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 3506; CHECK-LABEL: @urem_i3( 3507; CHECK-NEXT: [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32 3508; CHECK-NEXT: [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32 3509; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 3510; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 3511; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 3512; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 3513; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 3514; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 3515; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 3516; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 3517; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 3518; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 3519; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 3520; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 3521; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 3522; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]] 3523; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]] 3524; CHECK-NEXT: [[TMP18:%.*]] = and i32 
[[TMP17]], 7 3525; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i3 3526; CHECK-NEXT: store i3 [[TMP19]], i3 addrspace(1)* [[OUT:%.*]], align 1 3527; CHECK-NEXT: ret void 3528; 3529; GFX6-LABEL: urem_i3: 3530; GFX6: ; %bb.0: 3531; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3532; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 3533; GFX6-NEXT: s_mov_b32 s7, 0xf000 3534; GFX6-NEXT: s_mov_b32 s6, -1 3535; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3536; GFX6-NEXT: s_bfe_u32 s1, s0, 0x30008 3537; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, s1 3538; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 3539; GFX6-NEXT: s_and_b32 s2, s0, 7 3540; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 3541; GFX6-NEXT: s_lshr_b32 s1, s0, 8 3542; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 3543; GFX6-NEXT: v_trunc_f32_e32 v1, v1 3544; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 3545; GFX6-NEXT: v_mad_f32 v1, -v1, v0, v2 3546; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3547; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 3548; GFX6-NEXT: v_mul_lo_u32 v0, v0, s1 3549; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 3550; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 3551; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 3552; GFX6-NEXT: s_endpgm 3553; 3554; GFX9-LABEL: urem_i3: 3555; GFX9: ; %bb.0: 3556; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 3557; GFX9-NEXT: s_nop 0 3558; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3559; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3560; GFX9-NEXT: s_bfe_u32 s3, s2, 0x30008 3561; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 3562; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 3563; GFX9-NEXT: s_and_b32 s4, s2, 7 3564; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 3565; GFX9-NEXT: s_lshr_b32 s3, s2, 8 3566; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1 3567; GFX9-NEXT: v_trunc_f32_e32 v1, v1 3568; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v1 3569; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v2 3570; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3571; GFX9-NEXT: v_mov_b32_e32 v1, 0 3572; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 3573; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 3574; GFX9-NEXT: 
v_sub_u32_e32 v0, s2, v0 3575; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 3576; GFX9-NEXT: global_store_byte v1, v0, s[0:1] 3577; GFX9-NEXT: s_endpgm 3578 %r = urem i3 %x, %y 3579 store i3 %r, i3 addrspace(1)* %out 3580 ret void 3581} 3582 3583define amdgpu_kernel void @sdiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 3584; CHECK-LABEL: @sdiv_i3( 3585; CHECK-NEXT: [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32 3586; CHECK-NEXT: [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32 3587; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 3588; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 3589; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 3590; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 3591; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 3592; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 3593; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 3594; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 3595; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 3596; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 3597; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 3598; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 3599; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 3600; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 3601; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 3602; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 3603; CHECK-NEXT: [[TMP19:%.*]] = shl i32 [[TMP18]], 29 3604; CHECK-NEXT: [[TMP20:%.*]] = ashr i32 [[TMP19]], 29 3605; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i3 3606; CHECK-NEXT: store i3 [[TMP21]], i3 addrspace(1)* [[OUT:%.*]], align 1 3607; CHECK-NEXT: ret void 3608; 3609; GFX6-LABEL: sdiv_i3: 3610; GFX6: ; %bb.0: 3611; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3612; 
GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 3613; GFX6-NEXT: s_mov_b32 s7, 0xf000 3614; GFX6-NEXT: s_mov_b32 s6, -1 3615; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3616; GFX6-NEXT: s_bfe_i32 s1, s0, 0x30008 3617; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s1 3618; GFX6-NEXT: s_bfe_i32 s0, s0, 0x30000 3619; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s0 3620; GFX6-NEXT: s_xor_b32 s0, s0, s1 3621; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 3622; GFX6-NEXT: s_ashr_i32 s0, s0, 30 3623; GFX6-NEXT: s_or_b32 s0, s0, 1 3624; GFX6-NEXT: v_mov_b32_e32 v3, s0 3625; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 3626; GFX6-NEXT: v_trunc_f32_e32 v2, v2 3627; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 3628; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 3629; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 3630; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 3631; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 3632; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 3633; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 3634; GFX6-NEXT: s_endpgm 3635; 3636; GFX9-LABEL: sdiv_i3: 3637; GFX9: ; %bb.0: 3638; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 3639; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 3640; GFX9-NEXT: v_mov_b32_e32 v1, 0 3641; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3642; GFX9-NEXT: s_bfe_i32 s0, s4, 0x30008 3643; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 3644; GFX9-NEXT: s_bfe_i32 s1, s4, 0x30000 3645; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 3646; GFX9-NEXT: s_xor_b32 s0, s1, s0 3647; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 3648; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3649; GFX9-NEXT: s_or_b32 s4, s0, 1 3650; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 3651; GFX9-NEXT: v_trunc_f32_e32 v3, v3 3652; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 3653; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 3654; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 3655; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 3656; GFX9-NEXT: s_cselect_b32 s0, s4, 0 3657; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 3658; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 3659; GFX9-NEXT: global_store_byte v1, v0, s[2:3] 3660; GFX9-NEXT: s_endpgm 3661 %r = sdiv i3 %x, %y 3662 
store i3 %r, i3 addrspace(1)* %out 3663 ret void 3664} 3665 3666define amdgpu_kernel void @srem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 3667; CHECK-LABEL: @srem_i3( 3668; CHECK-NEXT: [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32 3669; CHECK-NEXT: [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32 3670; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 3671; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 3672; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 3673; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 3674; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 3675; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 3676; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 3677; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 3678; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 3679; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 3680; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 3681; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 3682; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 3683; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 3684; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 3685; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 3686; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]] 3687; CHECK-NEXT: [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]] 3688; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 29 3689; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 29 3690; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i3 3691; CHECK-NEXT: store i3 [[TMP23]], i3 addrspace(1)* [[OUT:%.*]], align 1 3692; CHECK-NEXT: ret void 3693; 3694; GFX6-LABEL: srem_i3: 3695; GFX6: ; %bb.0: 3696; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3697; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 3698; 
GFX6-NEXT: s_mov_b32 s7, 0xf000 3699; GFX6-NEXT: s_mov_b32 s6, -1 3700; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3701; GFX6-NEXT: s_bfe_i32 s1, s0, 0x30008 3702; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s1 3703; GFX6-NEXT: s_bfe_i32 s3, s0, 0x30000 3704; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s3 3705; GFX6-NEXT: s_xor_b32 s1, s3, s1 3706; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 3707; GFX6-NEXT: s_ashr_i32 s1, s1, 30 3708; GFX6-NEXT: s_or_b32 s1, s1, 1 3709; GFX6-NEXT: v_mov_b32_e32 v3, s1 3710; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 3711; GFX6-NEXT: v_trunc_f32_e32 v2, v2 3712; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 3713; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 3714; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 3715; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 3716; GFX6-NEXT: s_lshr_b32 s2, s0, 8 3717; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 3718; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 3719; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 3720; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 3721; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 3722; GFX6-NEXT: s_endpgm 3723; 3724; GFX9-LABEL: srem_i3: 3725; GFX9: ; %bb.0: 3726; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 3727; GFX9-NEXT: s_nop 0 3728; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3729; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3730; GFX9-NEXT: s_bfe_i32 s2, s4, 0x30008 3731; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 3732; GFX9-NEXT: s_bfe_i32 s3, s4, 0x30000 3733; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s3 3734; GFX9-NEXT: s_xor_b32 s2, s3, s2 3735; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 3736; GFX9-NEXT: s_ashr_i32 s2, s2, 30 3737; GFX9-NEXT: s_lshr_b32 s5, s4, 8 3738; GFX9-NEXT: s_or_b32 s6, s2, 1 3739; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 3740; GFX9-NEXT: v_trunc_f32_e32 v2, v2 3741; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 3742; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 3743; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| 3744; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 3745; GFX9-NEXT: s_cselect_b32 s2, s6, 0 3746; GFX9-NEXT: v_add_u32_e32 v0, s2, v2 3747; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 
3748; GFX9-NEXT: v_mov_b32_e32 v1, 0 3749; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 3750; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 3751; GFX9-NEXT: global_store_byte v1, v0, s[0:1] 3752; GFX9-NEXT: s_endpgm 3753 %r = srem i3 %x, %y 3754 store i3 %r, i3 addrspace(1)* %out 3755 ret void 3756} 3757 3758define amdgpu_kernel void @udiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 3759; CHECK-LABEL: @udiv_v3i16( 3760; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 3761; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 3762; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 3763; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 3764; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 3765; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 3766; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 3767; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 3768; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 3769; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 3770; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 3771; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 3772; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 3773; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 3774; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 3775; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 3776; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 3777; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 65535 3778; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16 3779; CHECK-NEXT: [[TMP20:%.*]] = insertelement <3 x i16> undef, i16 [[TMP19]], i64 0 3780; CHECK-NEXT: [[TMP21:%.*]] = extractelement <3 x i16> [[X]], i64 1 3781; CHECK-NEXT: [[TMP22:%.*]] = extractelement <3 x i16> 
[[Y]], i64 1 3782; CHECK-NEXT: [[TMP23:%.*]] = zext i16 [[TMP21]] to i32 3783; CHECK-NEXT: [[TMP24:%.*]] = zext i16 [[TMP22]] to i32 3784; CHECK-NEXT: [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float 3785; CHECK-NEXT: [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float 3786; CHECK-NEXT: [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]]) 3787; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]] 3788; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]]) 3789; CHECK-NEXT: [[TMP30:%.*]] = fneg fast float [[TMP29]] 3790; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]]) 3791; CHECK-NEXT: [[TMP32:%.*]] = fptoui float [[TMP29]] to i32 3792; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]]) 3793; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]]) 3794; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]] 3795; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0 3796; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]] 3797; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 65535 3798; CHECK-NEXT: [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16 3799; CHECK-NEXT: [[TMP40:%.*]] = insertelement <3 x i16> [[TMP20]], i16 [[TMP39]], i64 1 3800; CHECK-NEXT: [[TMP41:%.*]] = extractelement <3 x i16> [[X]], i64 2 3801; CHECK-NEXT: [[TMP42:%.*]] = extractelement <3 x i16> [[Y]], i64 2 3802; CHECK-NEXT: [[TMP43:%.*]] = zext i16 [[TMP41]] to i32 3803; CHECK-NEXT: [[TMP44:%.*]] = zext i16 [[TMP42]] to i32 3804; CHECK-NEXT: [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float 3805; CHECK-NEXT: [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float 3806; CHECK-NEXT: [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]]) 3807; CHECK-NEXT: [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]] 3808; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]]) 3809; CHECK-NEXT: [[TMP50:%.*]] 
= fneg fast float [[TMP49]] 3810; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]]) 3811; CHECK-NEXT: [[TMP52:%.*]] = fptoui float [[TMP49]] to i32 3812; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]]) 3813; CHECK-NEXT: [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]]) 3814; CHECK-NEXT: [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]] 3815; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0 3816; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]] 3817; CHECK-NEXT: [[TMP58:%.*]] = and i32 [[TMP57]], 65535 3818; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16 3819; CHECK-NEXT: [[TMP60:%.*]] = insertelement <3 x i16> [[TMP40]], i16 [[TMP59]], i64 2 3820; CHECK-NEXT: store <3 x i16> [[TMP60]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 3821; CHECK-NEXT: ret void 3822; 3823; GFX6-LABEL: udiv_v3i16: 3824; GFX6: ; %bb.0: 3825; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3826; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 3827; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 3828; GFX6-NEXT: s_mov_b32 s8, 0xffff 3829; GFX6-NEXT: s_mov_b32 s7, 0xf000 3830; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3831; GFX6-NEXT: s_and_b32 s6, s0, s8 3832; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 3833; GFX6-NEXT: s_and_b32 s6, s2, s8 3834; GFX6-NEXT: s_lshr_b32 s0, s0, 16 3835; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s0 3836; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s6 3837; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 3838; GFX6-NEXT: s_lshr_b32 s0, s2, 16 3839; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s0 3840; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 3841; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 3842; GFX6-NEXT: v_trunc_f32_e32 v2, v2 3843; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 3844; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 3845; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3846; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 3847; GFX6-NEXT: v_trunc_f32_e32 v1, v1 3848; GFX6-NEXT: s_and_b32 s0, s1, s8 3849; 
GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc 3850; GFX6-NEXT: v_mad_f32 v2, -v1, v3, v4 3851; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s0 3852; GFX6-NEXT: s_and_b32 s0, s3, s8 3853; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s0 3854; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 3855; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 3856; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 3857; GFX6-NEXT: s_mov_b32 s6, -1 3858; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3859; GFX6-NEXT: v_mul_f32_e32 v2, v5, v6 3860; GFX6-NEXT: v_trunc_f32_e32 v2, v2 3861; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 3862; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v5 3863; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 3864; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 3865; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 3866; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc 3867; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 3868; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 3869; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 3870; GFX6-NEXT: s_endpgm 3871; 3872; GFX9-LABEL: udiv_v3i16: 3873; GFX9: ; %bb.0: 3874; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 3875; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 3876; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 3877; GFX9-NEXT: s_mov_b32 s8, 0xffff 3878; GFX9-NEXT: v_mov_b32_e32 v1, 0 3879; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3880; GFX9-NEXT: s_and_b32 s0, s6, s8 3881; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 3882; GFX9-NEXT: s_and_b32 s0, s4, s8 3883; GFX9-NEXT: s_lshr_b32 s1, s6, 16 3884; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s0 3885; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 3886; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s1 3887; GFX9-NEXT: s_lshr_b32 s0, s4, 16 3888; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 3889; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 3890; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 3891; GFX9-NEXT: v_trunc_f32_e32 v3, v3 3892; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 3893; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 3894; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 3895; GFX9-NEXT: v_mul_f32_e32 v2, v5, v6 3896; GFX9-NEXT: 
v_trunc_f32_e32 v2, v2 3897; GFX9-NEXT: s_and_b32 s0, s7, s8 3898; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 3899; GFX9-NEXT: v_mad_f32 v3, -v2, v4, v5 3900; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 3901; GFX9-NEXT: s_and_b32 s0, s5, s8 3902; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 3903; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 3904; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 3905; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 3906; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 3907; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc 3908; GFX9-NEXT: v_mul_f32_e32 v3, v6, v7 3909; GFX9-NEXT: v_trunc_f32_e32 v3, v3 3910; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3 3911; GFX9-NEXT: v_mad_f32 v3, -v3, v5, v6 3912; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 3913; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 3914; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 3915; GFX9-NEXT: global_store_short v1, v3, s[2:3] offset:4 3916; GFX9-NEXT: global_store_dword v1, v0, s[2:3] 3917; GFX9-NEXT: s_endpgm 3918 %r = udiv <3 x i16> %x, %y 3919 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 3920 ret void 3921} 3922 3923define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 3924; CHECK-LABEL: @urem_v3i16( 3925; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 3926; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 3927; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 3928; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 3929; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 3930; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 3931; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 3932; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 3933; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 3934; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 3935; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float 
[[TMP6]], float [[TMP5]]) 3936; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 3937; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 3938; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 3939; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 3940; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 3941; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 3942; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]] 3943; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]] 3944; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 65535 3945; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16 3946; CHECK-NEXT: [[TMP22:%.*]] = insertelement <3 x i16> undef, i16 [[TMP21]], i64 0 3947; CHECK-NEXT: [[TMP23:%.*]] = extractelement <3 x i16> [[X]], i64 1 3948; CHECK-NEXT: [[TMP24:%.*]] = extractelement <3 x i16> [[Y]], i64 1 3949; CHECK-NEXT: [[TMP25:%.*]] = zext i16 [[TMP23]] to i32 3950; CHECK-NEXT: [[TMP26:%.*]] = zext i16 [[TMP24]] to i32 3951; CHECK-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float 3952; CHECK-NEXT: [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float 3953; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]]) 3954; CHECK-NEXT: [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]] 3955; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]]) 3956; CHECK-NEXT: [[TMP32:%.*]] = fneg fast float [[TMP31]] 3957; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]]) 3958; CHECK-NEXT: [[TMP34:%.*]] = fptoui float [[TMP31]] to i32 3959; CHECK-NEXT: [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 3960; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]]) 3961; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]] 3962; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0 3963; CHECK-NEXT: 
[[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]] 3964; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]] 3965; CHECK-NEXT: [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]] 3966; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], 65535 3967; CHECK-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16 3968; CHECK-NEXT: [[TMP44:%.*]] = insertelement <3 x i16> [[TMP22]], i16 [[TMP43]], i64 1 3969; CHECK-NEXT: [[TMP45:%.*]] = extractelement <3 x i16> [[X]], i64 2 3970; CHECK-NEXT: [[TMP46:%.*]] = extractelement <3 x i16> [[Y]], i64 2 3971; CHECK-NEXT: [[TMP47:%.*]] = zext i16 [[TMP45]] to i32 3972; CHECK-NEXT: [[TMP48:%.*]] = zext i16 [[TMP46]] to i32 3973; CHECK-NEXT: [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float 3974; CHECK-NEXT: [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float 3975; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]]) 3976; CHECK-NEXT: [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]] 3977; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]]) 3978; CHECK-NEXT: [[TMP54:%.*]] = fneg fast float [[TMP53]] 3979; CHECK-NEXT: [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]]) 3980; CHECK-NEXT: [[TMP56:%.*]] = fptoui float [[TMP53]] to i32 3981; CHECK-NEXT: [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]]) 3982; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]]) 3983; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]] 3984; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0 3985; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]] 3986; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]] 3987; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]] 3988; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 65535 3989; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16 3990; CHECK-NEXT: [[TMP66:%.*]] = insertelement <3 x i16> [[TMP44]], i16 [[TMP65]], i64 2 3991; CHECK-NEXT: store <3 
x i16> [[TMP66]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 3992; CHECK-NEXT: ret void 3993; 3994; GFX6-LABEL: urem_v3i16: 3995; GFX6: ; %bb.0: 3996; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3997; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 3998; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 3999; GFX6-NEXT: s_mov_b32 s8, 0xffff 4000; GFX6-NEXT: s_mov_b32 s7, 0xf000 4001; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4002; GFX6-NEXT: v_mov_b32_e32 v1, s2 4003; GFX6-NEXT: s_and_b32 s6, s0, s8 4004; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 4005; GFX6-NEXT: s_and_b32 s6, s2, s8 4006; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 4007; GFX6-NEXT: v_mov_b32_e32 v4, s0 4008; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 4009; GFX6-NEXT: v_alignbit_b32 v4, s1, v4, 16 4010; GFX6-NEXT: v_and_b32_e32 v5, s8, v4 4011; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16 4012; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 4013; GFX6-NEXT: v_trunc_f32_e32 v3, v3 4014; GFX6-NEXT: v_mad_f32 v2, -v3, v0, v2 4015; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 4016; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 4017; GFX6-NEXT: v_cvt_f32_u32_e32 v2, v5 4018; GFX6-NEXT: v_and_b32_e32 v3, s8, v1 4019; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc 4020; GFX6-NEXT: v_mul_lo_u32 v0, v0, s0 4021; GFX6-NEXT: s_and_b32 s0, s1, s8 4022; GFX6-NEXT: v_cvt_f32_u32_e32 v3, v3 4023; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 4024; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s0 4025; GFX6-NEXT: s_and_b32 s0, s3, s8 4026; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s0 4027; GFX6-NEXT: v_mul_f32_e32 v5, v3, v5 4028; GFX6-NEXT: v_trunc_f32_e32 v5, v5 4029; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v6 4030; GFX6-NEXT: v_mad_f32 v3, -v5, v2, v3 4031; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v5 4032; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 4033; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 4034; GFX6-NEXT: v_mul_f32_e32 v3, v7, v8 4035; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc 4036; GFX6-NEXT: v_trunc_f32_e32 v3, v3 4037; GFX6-NEXT: v_mul_lo_u32 v2, v2, v4 4038; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v3 4039; 
GFX6-NEXT: v_mad_f32 v3, -v3, v6, v7 4040; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v6 4041; GFX6-NEXT: s_mov_b32 s6, -1 4042; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 4043; GFX6-NEXT: v_mul_lo_u32 v3, v3, s1 4044; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 4045; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 4046; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 4047; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s3, v3 4048; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 4049; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 4050; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 4051; GFX6-NEXT: s_endpgm 4052; 4053; GFX9-LABEL: urem_v3i16: 4054; GFX9: ; %bb.0: 4055; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4056; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 4057; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 4058; GFX9-NEXT: s_mov_b32 s8, 0xffff 4059; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4060; GFX9-NEXT: s_and_b32 s0, s4, s8 4061; GFX9-NEXT: s_and_b32 s1, s6, s8 4062; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 4063; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s0 4064; GFX9-NEXT: s_lshr_b32 s6, s6, 16 4065; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 4066; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 4067; GFX9-NEXT: s_lshr_b32 s4, s4, 16 4068; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s4 4069; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v2 4070; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 4071; GFX9-NEXT: v_trunc_f32_e32 v3, v3 4072; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v3 4073; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 4074; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 4075; GFX9-NEXT: v_mul_f32_e32 v1, v4, v5 4076; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc 4077; GFX9-NEXT: v_mul_lo_u32 v0, v0, s1 4078; GFX9-NEXT: v_trunc_f32_e32 v1, v1 4079; GFX9-NEXT: s_and_b32 s1, s7, s8 4080; GFX9-NEXT: v_mad_f32 v3, -v1, v2, v4 4081; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s1 4082; GFX9-NEXT: s_and_b32 s5, s5, s8 4083; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s5 4084; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 4085; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 4086; GFX9-NEXT: v_cmp_ge_f32_e64 
vcc, |v3|, v2 4087; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 4088; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 4089; GFX9-NEXT: v_mul_f32_e32 v2, v5, v6 4090; GFX9-NEXT: v_trunc_f32_e32 v2, v2 4091; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v2 4092; GFX9-NEXT: v_mad_f32 v2, -v2, v4, v5 4093; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 4094; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 4095; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc 4096; GFX9-NEXT: v_mul_lo_u32 v2, v2, s1 4097; GFX9-NEXT: v_mov_b32_e32 v3, 0 4098; GFX9-NEXT: v_sub_u32_e32 v1, s4, v1 4099; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 4100; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 4101; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 4102; GFX9-NEXT: global_store_short v3, v2, s[2:3] offset:4 4103; GFX9-NEXT: global_store_dword v3, v0, s[2:3] 4104; GFX9-NEXT: s_endpgm 4105 %r = urem <3 x i16> %x, %y 4106 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 4107 ret void 4108} 4109 4110define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 4111; CHECK-LABEL: @sdiv_v3i16( 4112; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 4113; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 4114; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 4115; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 4116; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 4117; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 4118; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 4119; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 4120; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 4121; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 4122; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 4123; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 4124; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 4125; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float 
[[TMP13]], float [[TMP9]], float [[TMP8]]) 4126; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 4127; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 4128; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 4129; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 4130; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 4131; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 4132; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 16 4133; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 16 4134; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16 4135; CHECK-NEXT: [[TMP24:%.*]] = insertelement <3 x i16> undef, i16 [[TMP23]], i64 0 4136; CHECK-NEXT: [[TMP25:%.*]] = extractelement <3 x i16> [[X]], i64 1 4137; CHECK-NEXT: [[TMP26:%.*]] = extractelement <3 x i16> [[Y]], i64 1 4138; CHECK-NEXT: [[TMP27:%.*]] = sext i16 [[TMP25]] to i32 4139; CHECK-NEXT: [[TMP28:%.*]] = sext i16 [[TMP26]] to i32 4140; CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]] 4141; CHECK-NEXT: [[TMP30:%.*]] = ashr i32 [[TMP29]], 30 4142; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 1 4143; CHECK-NEXT: [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float 4144; CHECK-NEXT: [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float 4145; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 4146; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]] 4147; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]]) 4148; CHECK-NEXT: [[TMP37:%.*]] = fneg fast float [[TMP36]] 4149; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]]) 4150; CHECK-NEXT: [[TMP39:%.*]] = fptosi float [[TMP36]] to i32 4151; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]]) 4152; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 4153; CHECK-NEXT: [[TMP42:%.*]] = 
fcmp fast oge float [[TMP40]], [[TMP41]] 4154; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0 4155; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]] 4156; CHECK-NEXT: [[TMP45:%.*]] = shl i32 [[TMP44]], 16 4157; CHECK-NEXT: [[TMP46:%.*]] = ashr i32 [[TMP45]], 16 4158; CHECK-NEXT: [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16 4159; CHECK-NEXT: [[TMP48:%.*]] = insertelement <3 x i16> [[TMP24]], i16 [[TMP47]], i64 1 4160; CHECK-NEXT: [[TMP49:%.*]] = extractelement <3 x i16> [[X]], i64 2 4161; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x i16> [[Y]], i64 2 4162; CHECK-NEXT: [[TMP51:%.*]] = sext i16 [[TMP49]] to i32 4163; CHECK-NEXT: [[TMP52:%.*]] = sext i16 [[TMP50]] to i32 4164; CHECK-NEXT: [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]] 4165; CHECK-NEXT: [[TMP54:%.*]] = ashr i32 [[TMP53]], 30 4166; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 1 4167; CHECK-NEXT: [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float 4168; CHECK-NEXT: [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float 4169; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]]) 4170; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]] 4171; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]]) 4172; CHECK-NEXT: [[TMP61:%.*]] = fneg fast float [[TMP60]] 4173; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]]) 4174; CHECK-NEXT: [[TMP63:%.*]] = fptosi float [[TMP60]] to i32 4175; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]]) 4176; CHECK-NEXT: [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]]) 4177; CHECK-NEXT: [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]] 4178; CHECK-NEXT: [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0 4179; CHECK-NEXT: [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]] 4180; CHECK-NEXT: [[TMP69:%.*]] = shl i32 [[TMP68]], 16 4181; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 16 4182; 
CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16 4183; CHECK-NEXT: [[TMP72:%.*]] = insertelement <3 x i16> [[TMP48]], i16 [[TMP71]], i64 2 4184; CHECK-NEXT: store <3 x i16> [[TMP72]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 4185; CHECK-NEXT: ret void 4186; 4187; GFX6-LABEL: sdiv_v3i16: 4188; GFX6: ; %bb.0: 4189; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4190; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 4191; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 4192; GFX6-NEXT: s_mov_b32 s7, 0xf000 4193; GFX6-NEXT: s_mov_b32 s6, -1 4194; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4195; GFX6-NEXT: s_sext_i32_i16 s9, s2 4196; GFX6-NEXT: s_sext_i32_i16 s8, s0 4197; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s8 4198; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9 4199; GFX6-NEXT: s_xor_b32 s8, s9, s8 4200; GFX6-NEXT: s_ashr_i32 s0, s0, 16 4201; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 4202; GFX6-NEXT: s_ashr_i32 s8, s8, 30 4203; GFX6-NEXT: s_or_b32 s8, s8, 1 4204; GFX6-NEXT: v_mov_b32_e32 v3, s8 4205; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 4206; GFX6-NEXT: v_trunc_f32_e32 v2, v2 4207; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 4208; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 4209; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 4210; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s0 4211; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 4212; GFX6-NEXT: s_ashr_i32 s2, s2, 16 4213; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 4214; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s2 4215; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 4216; GFX6-NEXT: s_xor_b32 s0, s2, s0 4217; GFX6-NEXT: s_ashr_i32 s0, s0, 30 4218; GFX6-NEXT: s_or_b32 s0, s0, 1 4219; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 4220; GFX6-NEXT: v_trunc_f32_e32 v3, v3 4221; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 4222; GFX6-NEXT: v_mov_b32_e32 v4, s0 4223; GFX6-NEXT: s_sext_i32_i16 s0, s1 4224; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| 4225; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 4226; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 4227; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc 4228; GFX6-NEXT: s_sext_i32_i16 s1, s3 
4229; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 4230; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s1 4231; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 4232; GFX6-NEXT: s_xor_b32 s0, s1, s0 4233; GFX6-NEXT: s_ashr_i32 s0, s0, 30 4234; GFX6-NEXT: s_or_b32 s0, s0, 1 4235; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 4236; GFX6-NEXT: v_trunc_f32_e32 v4, v4 4237; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 4238; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 4239; GFX6-NEXT: v_mov_b32_e32 v5, s0 4240; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| 4241; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc 4242; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 4243; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 4244; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 4245; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 4246; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 4247; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 4248; GFX6-NEXT: s_endpgm 4249; 4250; GFX9-LABEL: sdiv_v3i16: 4251; GFX9: ; %bb.0: 4252; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4253; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 4254; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 4255; GFX9-NEXT: v_mov_b32_e32 v1, 0 4256; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4257; GFX9-NEXT: s_sext_i32_i16 s1, s4 4258; GFX9-NEXT: s_sext_i32_i16 s0, s6 4259; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 4260; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 4261; GFX9-NEXT: s_xor_b32 s0, s1, s0 4262; GFX9-NEXT: s_ashr_i32 s0, s0, 30 4263; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 4264; GFX9-NEXT: s_or_b32 s8, s0, 1 4265; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 4266; GFX9-NEXT: v_trunc_f32_e32 v3, v3 4267; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 4268; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 4269; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 4270; GFX9-NEXT: s_cselect_b32 s0, s8, 0 4271; GFX9-NEXT: s_ashr_i32 s1, s6, 16 4272; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 4273; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 4274; GFX9-NEXT: s_ashr_i32 s4, s4, 16 4275; GFX9-NEXT: v_add_u32_e32 v2, s0, v3 4276; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s4 4277; 
GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 4278; GFX9-NEXT: s_xor_b32 s0, s4, s1 4279; GFX9-NEXT: s_ashr_i32 s0, s0, 30 4280; GFX9-NEXT: s_or_b32 s4, s0, 1 4281; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 4282; GFX9-NEXT: v_trunc_f32_e32 v4, v4 4283; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 4284; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v0| 4285; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 4286; GFX9-NEXT: s_sext_i32_i16 s1, s7 4287; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 4288; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 4289; GFX9-NEXT: s_cselect_b32 s0, s4, 0 4290; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 4291; GFX9-NEXT: v_add_u32_e32 v3, s0, v4 4292; GFX9-NEXT: s_sext_i32_i16 s0, s5 4293; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s0 4294; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 4295; GFX9-NEXT: s_xor_b32 s0, s0, s1 4296; GFX9-NEXT: s_ashr_i32 s0, s0, 30 4297; GFX9-NEXT: s_or_b32 s4, s0, 1 4298; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 4299; GFX9-NEXT: v_trunc_f32_e32 v5, v5 4300; GFX9-NEXT: v_mad_f32 v4, -v5, v0, v4 4301; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 4302; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v0| 4303; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 4304; GFX9-NEXT: s_cselect_b32 s0, s4, 0 4305; GFX9-NEXT: v_add_u32_e32 v0, s0, v5 4306; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 4307; GFX9-NEXT: global_store_short v1, v0, s[2:3] offset:4 4308; GFX9-NEXT: global_store_dword v1, v2, s[2:3] 4309; GFX9-NEXT: s_endpgm 4310 %r = sdiv <3 x i16> %x, %y 4311 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 4312 ret void 4313} 4314 4315define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 4316; CHECK-LABEL: @srem_v3i16( 4317; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 4318; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 4319; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 4320; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 4321; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 4322; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 
[[TMP5]], 30 4323; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 4324; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 4325; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 4326; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 4327; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 4328; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 4329; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 4330; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 4331; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 4332; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 4333; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 4334; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 4335; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 4336; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 4337; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]] 4338; CHECK-NEXT: [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]] 4339; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 16 4340; CHECK-NEXT: [[TMP24:%.*]] = ashr i32 [[TMP23]], 16 4341; CHECK-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16 4342; CHECK-NEXT: [[TMP26:%.*]] = insertelement <3 x i16> undef, i16 [[TMP25]], i64 0 4343; CHECK-NEXT: [[TMP27:%.*]] = extractelement <3 x i16> [[X]], i64 1 4344; CHECK-NEXT: [[TMP28:%.*]] = extractelement <3 x i16> [[Y]], i64 1 4345; CHECK-NEXT: [[TMP29:%.*]] = sext i16 [[TMP27]] to i32 4346; CHECK-NEXT: [[TMP30:%.*]] = sext i16 [[TMP28]] to i32 4347; CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]] 4348; CHECK-NEXT: [[TMP32:%.*]] = ashr i32 [[TMP31]], 30 4349; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP32]], 1 4350; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float 4351; CHECK-NEXT: [[TMP35:%.*]] = sitofp i32 [[TMP30]] to 
float 4352; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 4353; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]] 4354; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]]) 4355; CHECK-NEXT: [[TMP39:%.*]] = fneg fast float [[TMP38]] 4356; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]]) 4357; CHECK-NEXT: [[TMP41:%.*]] = fptosi float [[TMP38]] to i32 4358; CHECK-NEXT: [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]]) 4359; CHECK-NEXT: [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]]) 4360; CHECK-NEXT: [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]] 4361; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0 4362; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]] 4363; CHECK-NEXT: [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]] 4364; CHECK-NEXT: [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]] 4365; CHECK-NEXT: [[TMP49:%.*]] = shl i32 [[TMP48]], 16 4366; CHECK-NEXT: [[TMP50:%.*]] = ashr i32 [[TMP49]], 16 4367; CHECK-NEXT: [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16 4368; CHECK-NEXT: [[TMP52:%.*]] = insertelement <3 x i16> [[TMP26]], i16 [[TMP51]], i64 1 4369; CHECK-NEXT: [[TMP53:%.*]] = extractelement <3 x i16> [[X]], i64 2 4370; CHECK-NEXT: [[TMP54:%.*]] = extractelement <3 x i16> [[Y]], i64 2 4371; CHECK-NEXT: [[TMP55:%.*]] = sext i16 [[TMP53]] to i32 4372; CHECK-NEXT: [[TMP56:%.*]] = sext i16 [[TMP54]] to i32 4373; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]] 4374; CHECK-NEXT: [[TMP58:%.*]] = ashr i32 [[TMP57]], 30 4375; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 1 4376; CHECK-NEXT: [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float 4377; CHECK-NEXT: [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float 4378; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]]) 4379; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]] 4380; 
CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]]) 4381; CHECK-NEXT: [[TMP65:%.*]] = fneg fast float [[TMP64]] 4382; CHECK-NEXT: [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]]) 4383; CHECK-NEXT: [[TMP67:%.*]] = fptosi float [[TMP64]] to i32 4384; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 4385; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]]) 4386; CHECK-NEXT: [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]] 4387; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0 4388; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]] 4389; CHECK-NEXT: [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]] 4390; CHECK-NEXT: [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]] 4391; CHECK-NEXT: [[TMP75:%.*]] = shl i32 [[TMP74]], 16 4392; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 [[TMP75]], 16 4393; CHECK-NEXT: [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16 4394; CHECK-NEXT: [[TMP78:%.*]] = insertelement <3 x i16> [[TMP52]], i16 [[TMP77]], i64 2 4395; CHECK-NEXT: store <3 x i16> [[TMP78]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 4396; CHECK-NEXT: ret void 4397; 4398; GFX6-LABEL: srem_v3i16: 4399; GFX6: ; %bb.0: 4400; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4401; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 4402; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 4403; GFX6-NEXT: s_mov_b32 s7, 0xf000 4404; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4405; GFX6-NEXT: s_sext_i32_i16 s8, s2 4406; GFX6-NEXT: s_sext_i32_i16 s6, s0 4407; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s6 4408; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s8 4409; GFX6-NEXT: s_xor_b32 s6, s8, s6 4410; GFX6-NEXT: s_ashr_i32 s6, s6, 30 4411; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 4412; GFX6-NEXT: s_or_b32 s6, s6, 1 4413; GFX6-NEXT: v_mov_b32_e32 v3, s6 4414; GFX6-NEXT: s_mov_b32 s6, -1 4415; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 4416; GFX6-NEXT: v_trunc_f32_e32 v2, v2 4417; GFX6-NEXT: 
v_mad_f32 v1, -v2, v0, v1 4418; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 4419; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 4420; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 4421; GFX6-NEXT: v_mov_b32_e32 v1, s2 4422; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 4423; GFX6-NEXT: v_mov_b32_e32 v2, s0 4424; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 16 4425; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16 4426; GFX6-NEXT: v_cvt_f32_i32_e32 v4, v3 4427; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16 4428; GFX6-NEXT: v_bfe_i32 v5, v1, 0, 16 4429; GFX6-NEXT: v_cvt_f32_i32_e32 v6, v5 4430; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v4 4431; GFX6-NEXT: v_mul_lo_u32 v0, v0, s0 4432; GFX6-NEXT: v_xor_b32_e32 v3, v5, v3 4433; GFX6-NEXT: s_sext_i32_i16 s0, s1 4434; GFX6-NEXT: v_mul_f32_e32 v5, v6, v7 4435; GFX6-NEXT: v_trunc_f32_e32 v5, v5 4436; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 4437; GFX6-NEXT: v_mad_f32 v6, -v5, v4, v6 4438; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 4439; GFX6-NEXT: v_ashrrev_i32_e32 v3, 30, v3 4440; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, |v4| 4441; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s0 4442; GFX6-NEXT: v_or_b32_e32 v3, 1, v3 4443; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc 4444; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 4445; GFX6-NEXT: s_sext_i32_i16 s2, s3 4446; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 4447; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s2 4448; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v4 4449; GFX6-NEXT: s_xor_b32 s0, s2, s0 4450; GFX6-NEXT: s_ashr_i32 s0, s0, 30 4451; GFX6-NEXT: s_or_b32 s0, s0, 1 4452; GFX6-NEXT: v_mul_f32_e32 v5, v3, v5 4453; GFX6-NEXT: v_trunc_f32_e32 v5, v5 4454; GFX6-NEXT: v_mad_f32 v3, -v5, v4, v3 4455; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 4456; GFX6-NEXT: v_mov_b32_e32 v6, s0 4457; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v4| 4458; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc 4459; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 4460; GFX6-NEXT: v_mul_lo_u32 v3, v3, s1 4461; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 4462; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 4463; GFX6-NEXT: 
v_and_b32_e32 v0, 0xffff, v0 4464; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s3, v3 4465; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 4466; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 4467; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 4468; GFX6-NEXT: s_endpgm 4469; 4470; GFX9-LABEL: srem_v3i16: 4471; GFX9: ; %bb.0: 4472; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 4473; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 4474; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c 4475; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4476; GFX9-NEXT: s_sext_i32_i16 s8, s2 4477; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s8 4478; GFX9-NEXT: s_sext_i32_i16 s9, s6 4479; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s9 4480; GFX9-NEXT: s_xor_b32 s0, s9, s8 4481; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 4482; GFX9-NEXT: s_ashr_i32 s0, s0, 30 4483; GFX9-NEXT: s_or_b32 s10, s0, 1 4484; GFX9-NEXT: s_sext_i32_i16 s3, s3 4485; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 4486; GFX9-NEXT: v_trunc_f32_e32 v2, v2 4487; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 4488; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 4489; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 4490; GFX9-NEXT: s_cselect_b32 s0, s10, 0 4491; GFX9-NEXT: s_ashr_i32 s2, s2, 16 4492; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 4493; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 4494; GFX9-NEXT: s_ashr_i32 s6, s6, 16 4495; GFX9-NEXT: v_add_u32_e32 v1, s0, v2 4496; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s6 4497; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 4498; GFX9-NEXT: s_xor_b32 s0, s6, s2 4499; GFX9-NEXT: s_ashr_i32 s0, s0, 30 4500; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8 4501; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 4502; GFX9-NEXT: v_trunc_f32_e32 v3, v3 4503; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 4504; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 4505; GFX9-NEXT: s_or_b32 s8, s0, 1 4506; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 4507; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 4508; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s3 4509; GFX9-NEXT: s_cselect_b32 s0, s8, 0 4510; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 4511; GFX9-NEXT: v_mul_lo_u32 v0, 
v0, s2 4512; GFX9-NEXT: s_sext_i32_i16 s2, s7 4513; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s2 4514; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2 4515; GFX9-NEXT: s_xor_b32 s0, s2, s3 4516; GFX9-NEXT: s_ashr_i32 s0, s0, 30 4517; GFX9-NEXT: s_or_b32 s7, s0, 1 4518; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 4519; GFX9-NEXT: v_trunc_f32_e32 v4, v4 4520; GFX9-NEXT: v_mad_f32 v3, -v4, v2, v3 4521; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 4522; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v2| 4523; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 4524; GFX9-NEXT: s_cselect_b32 s0, s7, 0 4525; GFX9-NEXT: v_add_u32_e32 v2, s0, v4 4526; GFX9-NEXT: v_mul_lo_u32 v2, v2, s3 4527; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 4528; GFX9-NEXT: v_mov_b32_e32 v3, 0 4529; GFX9-NEXT: v_sub_u32_e32 v0, s6, v0 4530; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 4531; GFX9-NEXT: v_sub_u32_e32 v2, s2, v2 4532; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 4533; GFX9-NEXT: global_store_short v3, v2, s[4:5] offset:4 4534; GFX9-NEXT: global_store_dword v3, v0, s[4:5] 4535; GFX9-NEXT: s_endpgm 4536 %r = srem <3 x i16> %x, %y 4537 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 4538 ret void 4539} 4540 4541define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 4542; CHECK-LABEL: @udiv_v3i15( 4543; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 4544; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 4545; CHECK-NEXT: [[TMP3:%.*]] = zext i15 [[TMP1]] to i32 4546; CHECK-NEXT: [[TMP4:%.*]] = zext i15 [[TMP2]] to i32 4547; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 4548; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 4549; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 4550; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 4551; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 4552; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 4553; CHECK-NEXT: [[TMP11:%.*]] = call fast float 
@llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 4554; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 4555; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 4556; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 4557; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 4558; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 4559; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 4560; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 32767 4561; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i15 4562; CHECK-NEXT: [[TMP20:%.*]] = insertelement <3 x i15> undef, i15 [[TMP19]], i64 0 4563; CHECK-NEXT: [[TMP21:%.*]] = extractelement <3 x i15> [[X]], i64 1 4564; CHECK-NEXT: [[TMP22:%.*]] = extractelement <3 x i15> [[Y]], i64 1 4565; CHECK-NEXT: [[TMP23:%.*]] = zext i15 [[TMP21]] to i32 4566; CHECK-NEXT: [[TMP24:%.*]] = zext i15 [[TMP22]] to i32 4567; CHECK-NEXT: [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float 4568; CHECK-NEXT: [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float 4569; CHECK-NEXT: [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]]) 4570; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]] 4571; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]]) 4572; CHECK-NEXT: [[TMP30:%.*]] = fneg fast float [[TMP29]] 4573; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]]) 4574; CHECK-NEXT: [[TMP32:%.*]] = fptoui float [[TMP29]] to i32 4575; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]]) 4576; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]]) 4577; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]] 4578; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0 4579; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]] 4580; CHECK-NEXT: [[TMP38:%.*]] 
= and i32 [[TMP37]], 32767 4581; CHECK-NEXT: [[TMP39:%.*]] = trunc i32 [[TMP38]] to i15 4582; CHECK-NEXT: [[TMP40:%.*]] = insertelement <3 x i15> [[TMP20]], i15 [[TMP39]], i64 1 4583; CHECK-NEXT: [[TMP41:%.*]] = extractelement <3 x i15> [[X]], i64 2 4584; CHECK-NEXT: [[TMP42:%.*]] = extractelement <3 x i15> [[Y]], i64 2 4585; CHECK-NEXT: [[TMP43:%.*]] = zext i15 [[TMP41]] to i32 4586; CHECK-NEXT: [[TMP44:%.*]] = zext i15 [[TMP42]] to i32 4587; CHECK-NEXT: [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float 4588; CHECK-NEXT: [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float 4589; CHECK-NEXT: [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]]) 4590; CHECK-NEXT: [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]] 4591; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]]) 4592; CHECK-NEXT: [[TMP50:%.*]] = fneg fast float [[TMP49]] 4593; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]]) 4594; CHECK-NEXT: [[TMP52:%.*]] = fptoui float [[TMP49]] to i32 4595; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]]) 4596; CHECK-NEXT: [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]]) 4597; CHECK-NEXT: [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]] 4598; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0 4599; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]] 4600; CHECK-NEXT: [[TMP58:%.*]] = and i32 [[TMP57]], 32767 4601; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i15 4602; CHECK-NEXT: [[TMP60:%.*]] = insertelement <3 x i15> [[TMP40]], i15 [[TMP59]], i64 2 4603; CHECK-NEXT: store <3 x i15> [[TMP60]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 4604; CHECK-NEXT: ret void 4605; 4606; GFX6-LABEL: udiv_v3i15: 4607; GFX6: ; %bb.0: 4608; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4609; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 4610; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 4611; GFX6-NEXT: s_mov_b32 
s7, 0xf000 4612; GFX6-NEXT: s_mov_b32 s6, -1 4613; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4614; GFX6-NEXT: v_mov_b32_e32 v0, s2 4615; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 4616; GFX6-NEXT: s_movk_i32 s3, 0x7fff 4617; GFX6-NEXT: s_and_b32 s9, s0, s3 4618; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 4619; GFX6-NEXT: v_mov_b32_e32 v2, s0 4620; GFX6-NEXT: s_and_b32 s8, s2, s3 4621; GFX6-NEXT: s_bfe_u32 s0, s0, 0xf000f 4622; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s0 4623; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s8 4624; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 4625; GFX6-NEXT: s_bfe_u32 s2, s2, 0xf000f 4626; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 4627; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s2 4628; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 4629; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 4630; GFX6-NEXT: v_and_b32_e32 v2, s3, v2 4631; GFX6-NEXT: v_trunc_f32_e32 v4, v4 4632; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 4633; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 4634; GFX6-NEXT: v_cvt_f32_u32_e32 v2, v2 4635; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 4636; GFX6-NEXT: v_mul_f32_e32 v1, v6, v7 4637; GFX6-NEXT: v_and_b32_e32 v0, s3, v0 4638; GFX6-NEXT: v_trunc_f32_e32 v1, v1 4639; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 4640; GFX6-NEXT: v_mad_f32 v4, -v1, v5, v6 4641; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 4642; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0 4643; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v2 4644; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v5 4645; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 4646; GFX6-NEXT: v_mul_f32_e32 v1, v0, v6 4647; GFX6-NEXT: v_trunc_f32_e32 v1, v1 4648; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v1 4649; GFX6-NEXT: v_mad_f32 v0, -v1, v2, v0 4650; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v2 4651; GFX6-NEXT: v_and_b32_e32 v2, s3, v3 4652; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc 4653; GFX6-NEXT: v_and_b32_e32 v3, s3, v4 4654; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 4655; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 4656; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 4657; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 4658; GFX6-NEXT: 
buffer_store_dword v0, off, s[4:7], 0 4659; GFX6-NEXT: s_waitcnt expcnt(0) 4660; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 4661; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 4662; GFX6-NEXT: s_endpgm 4663; 4664; GFX9-LABEL: udiv_v3i15: 4665; GFX9: ; %bb.0: 4666; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4667; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 4668; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 4669; GFX9-NEXT: s_movk_i32 s8, 0x7fff 4670; GFX9-NEXT: v_mov_b32_e32 v2, 0 4671; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4672; GFX9-NEXT: s_and_b32 s0, s4, s8 4673; GFX9-NEXT: s_and_b32 s1, s6, s8 4674; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s1 4675; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0 4676; GFX9-NEXT: s_bfe_u32 s0, s6, 0xf000f 4677; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 4678; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 4679; GFX9-NEXT: v_mov_b32_e32 v3, s6 4680; GFX9-NEXT: s_bfe_u32 s1, s4, 0xf000f 4681; GFX9-NEXT: v_alignbit_b32 v3, s7, v3, 30 4682; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 4683; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 4684; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 4685; GFX9-NEXT: v_and_b32_e32 v3, s8, v3 4686; GFX9-NEXT: v_trunc_f32_e32 v5, v5 4687; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4 4688; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 4689; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v3 4690; GFX9-NEXT: v_mov_b32_e32 v0, s4 4691; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 4692; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 30 4693; GFX9-NEXT: v_mul_f32_e32 v1, v7, v8 4694; GFX9-NEXT: v_and_b32_e32 v0, s8, v0 4695; GFX9-NEXT: v_trunc_f32_e32 v1, v1 4696; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc 4697; GFX9-NEXT: v_mad_f32 v5, -v1, v6, v7 4698; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 4699; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 4700; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v3 4701; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v6 4702; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc 4703; GFX9-NEXT: v_mul_f32_e32 v1, v0, v7 4704; GFX9-NEXT: v_trunc_f32_e32 v1, v1 4705; GFX9-NEXT: v_cvt_u32_f32_e32 
v6, v1 4706; GFX9-NEXT: v_mad_f32 v0, -v1, v3, v0 4707; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v3 4708; GFX9-NEXT: v_and_b32_e32 v3, s8, v4 4709; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc 4710; GFX9-NEXT: v_and_b32_e32 v4, s8, v5 4711; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 4712; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 4713; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 4714; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 4715; GFX9-NEXT: global_store_dword v2, v0, s[2:3] 4716; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 4717; GFX9-NEXT: global_store_short v2, v0, s[2:3] offset:4 4718; GFX9-NEXT: s_endpgm 4719 %r = udiv <3 x i15> %x, %y 4720 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 4721 ret void 4722} 4723 4724define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 4725; CHECK-LABEL: @urem_v3i15( 4726; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 4727; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 4728; CHECK-NEXT: [[TMP3:%.*]] = zext i15 [[TMP1]] to i32 4729; CHECK-NEXT: [[TMP4:%.*]] = zext i15 [[TMP2]] to i32 4730; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 4731; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 4732; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 4733; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 4734; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 4735; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 4736; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 4737; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 4738; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 4739; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 4740; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 4741; CHECK-NEXT: [[TMP16:%.*]] = select i1 
[[TMP15]], i32 1, i32 0 4742; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 4743; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]] 4744; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]] 4745; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 32767 4746; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i15 4747; CHECK-NEXT: [[TMP22:%.*]] = insertelement <3 x i15> undef, i15 [[TMP21]], i64 0 4748; CHECK-NEXT: [[TMP23:%.*]] = extractelement <3 x i15> [[X]], i64 1 4749; CHECK-NEXT: [[TMP24:%.*]] = extractelement <3 x i15> [[Y]], i64 1 4750; CHECK-NEXT: [[TMP25:%.*]] = zext i15 [[TMP23]] to i32 4751; CHECK-NEXT: [[TMP26:%.*]] = zext i15 [[TMP24]] to i32 4752; CHECK-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float 4753; CHECK-NEXT: [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float 4754; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]]) 4755; CHECK-NEXT: [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]] 4756; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]]) 4757; CHECK-NEXT: [[TMP32:%.*]] = fneg fast float [[TMP31]] 4758; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]]) 4759; CHECK-NEXT: [[TMP34:%.*]] = fptoui float [[TMP31]] to i32 4760; CHECK-NEXT: [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 4761; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]]) 4762; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]] 4763; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0 4764; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]] 4765; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]] 4766; CHECK-NEXT: [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]] 4767; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], 32767 4768; CHECK-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i15 4769; CHECK-NEXT: [[TMP44:%.*]] = insertelement <3 x i15> [[TMP22]], i15 
[[TMP43]], i64 1 4770; CHECK-NEXT: [[TMP45:%.*]] = extractelement <3 x i15> [[X]], i64 2 4771; CHECK-NEXT: [[TMP46:%.*]] = extractelement <3 x i15> [[Y]], i64 2 4772; CHECK-NEXT: [[TMP47:%.*]] = zext i15 [[TMP45]] to i32 4773; CHECK-NEXT: [[TMP48:%.*]] = zext i15 [[TMP46]] to i32 4774; CHECK-NEXT: [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float 4775; CHECK-NEXT: [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float 4776; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]]) 4777; CHECK-NEXT: [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]] 4778; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]]) 4779; CHECK-NEXT: [[TMP54:%.*]] = fneg fast float [[TMP53]] 4780; CHECK-NEXT: [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]]) 4781; CHECK-NEXT: [[TMP56:%.*]] = fptoui float [[TMP53]] to i32 4782; CHECK-NEXT: [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]]) 4783; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]]) 4784; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]] 4785; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0 4786; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]] 4787; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]] 4788; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]] 4789; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 32767 4790; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i15 4791; CHECK-NEXT: [[TMP66:%.*]] = insertelement <3 x i15> [[TMP44]], i15 [[TMP65]], i64 2 4792; CHECK-NEXT: store <3 x i15> [[TMP66]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 4793; CHECK-NEXT: ret void 4794; 4795; GFX6-LABEL: urem_v3i15: 4796; GFX6: ; %bb.0: 4797; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4798; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 4799; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 4800; GFX6-NEXT: s_mov_b32 s7, 0xf000 4801; GFX6-NEXT: 
s_mov_b32 s6, -1 4802; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4803; GFX6-NEXT: v_mov_b32_e32 v0, s2 4804; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 4805; GFX6-NEXT: s_movk_i32 s3, 0x7fff 4806; GFX6-NEXT: s_and_b32 s10, s0, s3 4807; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s10 4808; GFX6-NEXT: s_and_b32 s9, s2, s3 4809; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s9 4810; GFX6-NEXT: v_mov_b32_e32 v2, s0 4811; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 4812; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 4813; GFX6-NEXT: s_bfe_u32 s1, s0, 0xf000f 4814; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s1 4815; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 4816; GFX6-NEXT: v_trunc_f32_e32 v4, v4 4817; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 4818; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 4819; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 4820; GFX6-NEXT: s_bfe_u32 s10, s2, 0xf000f 4821; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s10 4822; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc 4823; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 4824; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 4825; GFX6-NEXT: v_and_b32_e32 v2, s3, v2 4826; GFX6-NEXT: v_and_b32_e32 v0, s3, v0 4827; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s2, v1 4828; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 4829; GFX6-NEXT: v_cvt_f32_u32_e32 v4, v2 4830; GFX6-NEXT: v_cvt_f32_u32_e32 v7, v0 4831; GFX6-NEXT: v_trunc_f32_e32 v1, v1 4832; GFX6-NEXT: v_mad_f32 v3, -v1, v5, v3 4833; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v4 4834; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 4835; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 4836; GFX6-NEXT: s_lshr_b32 s0, s0, 15 4837; GFX6-NEXT: v_mul_f32_e32 v3, v7, v8 4838; GFX6-NEXT: v_trunc_f32_e32 v3, v3 4839; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v3 4840; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 4841; GFX6-NEXT: v_mad_f32 v3, -v3, v4, v7 4842; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 4843; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 4844; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 4845; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 4846; GFX6-NEXT: s_lshr_b32 s8, s2, 15 4847; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s8, v1 4848; 
GFX6-NEXT: v_subrev_i32_e32 v0, vcc, v2, v0 4849; GFX6-NEXT: v_and_b32_e32 v3, s3, v3 4850; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 4851; GFX6-NEXT: v_and_b32_e32 v2, s3, v6 4852; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 4853; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 4854; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 4855; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 4856; GFX6-NEXT: s_waitcnt expcnt(0) 4857; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 4858; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 4859; GFX6-NEXT: s_endpgm 4860; 4861; GFX9-LABEL: urem_v3i15: 4862; GFX9: ; %bb.0: 4863; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4864; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 4865; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 4866; GFX9-NEXT: s_movk_i32 s8, 0x7fff 4867; GFX9-NEXT: v_mov_b32_e32 v2, 0 4868; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4869; GFX9-NEXT: v_mov_b32_e32 v0, s4 4870; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 30 4871; GFX9-NEXT: s_and_b32 s5, s6, s8 4872; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 4873; GFX9-NEXT: s_and_b32 s0, s4, s8 4874; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0 4875; GFX9-NEXT: s_bfe_u32 s5, s6, 0xf000f 4876; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 4877; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s5 4878; GFX9-NEXT: v_mov_b32_e32 v3, s6 4879; GFX9-NEXT: v_alignbit_b32 v3, s7, v3, 30 4880; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 4881; GFX9-NEXT: v_trunc_f32_e32 v5, v5 4882; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4 4883; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 4884; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 4885; GFX9-NEXT: s_bfe_u32 s1, s4, 0xf000f 4886; GFX9-NEXT: v_and_b32_e32 v3, s8, v3 4887; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc 4888; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v3 4889; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 4890; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 4891; GFX9-NEXT: v_and_b32_e32 v0, s8, v0 4892; GFX9-NEXT: v_rcp_iflag_f32_e32 v9, v5 4893; GFX9-NEXT: s_lshr_b32 s0, s6, 15 4894; GFX9-NEXT: v_mul_f32_e32 v4, v7, v8 4895; GFX9-NEXT: 
v_cvt_f32_u32_e32 v8, v0 4896; GFX9-NEXT: v_trunc_f32_e32 v4, v4 4897; GFX9-NEXT: v_mad_f32 v7, -v4, v6, v7 4898; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, v6 4899; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 4900; GFX9-NEXT: v_mul_f32_e32 v6, v8, v9 4901; GFX9-NEXT: v_trunc_f32_e32 v6, v6 4902; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v6 4903; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc 4904; GFX9-NEXT: v_mad_f32 v6, -v6, v5, v8 4905; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 4906; GFX9-NEXT: v_mul_lo_u32 v4, v4, s0 4907; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc 4908; GFX9-NEXT: v_mul_lo_u32 v3, v5, v3 4909; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 4910; GFX9-NEXT: s_lshr_b32 s0, s4, 15 4911; GFX9-NEXT: v_sub_u32_e32 v4, s0, v4 4912; GFX9-NEXT: v_and_b32_e32 v4, s8, v4 4913; GFX9-NEXT: v_sub_u32_e32 v5, s4, v1 4914; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 4915; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 4916; GFX9-NEXT: v_and_b32_e32 v3, s8, v5 4917; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 4918; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 4919; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 4920; GFX9-NEXT: global_store_dword v2, v0, s[2:3] 4921; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 4922; GFX9-NEXT: global_store_short v2, v0, s[2:3] offset:4 4923; GFX9-NEXT: s_endpgm 4924 %r = urem <3 x i15> %x, %y 4925 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 4926 ret void 4927} 4928 4929define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 4930; CHECK-LABEL: @sdiv_v3i15( 4931; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 4932; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 4933; CHECK-NEXT: [[TMP3:%.*]] = sext i15 [[TMP1]] to i32 4934; CHECK-NEXT: [[TMP4:%.*]] = sext i15 [[TMP2]] to i32 4935; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 4936; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 4937; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 4938; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 
4939; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 4940; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 4941; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 4942; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 4943; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 4944; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 4945; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 4946; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 4947; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 4948; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 4949; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 4950; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 4951; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 17 4952; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 17 4953; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i15 4954; CHECK-NEXT: [[TMP24:%.*]] = insertelement <3 x i15> undef, i15 [[TMP23]], i64 0 4955; CHECK-NEXT: [[TMP25:%.*]] = extractelement <3 x i15> [[X]], i64 1 4956; CHECK-NEXT: [[TMP26:%.*]] = extractelement <3 x i15> [[Y]], i64 1 4957; CHECK-NEXT: [[TMP27:%.*]] = sext i15 [[TMP25]] to i32 4958; CHECK-NEXT: [[TMP28:%.*]] = sext i15 [[TMP26]] to i32 4959; CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]] 4960; CHECK-NEXT: [[TMP30:%.*]] = ashr i32 [[TMP29]], 30 4961; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 1 4962; CHECK-NEXT: [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float 4963; CHECK-NEXT: [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float 4964; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 4965; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]] 4966; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]]) 
4967; CHECK-NEXT: [[TMP37:%.*]] = fneg fast float [[TMP36]] 4968; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]]) 4969; CHECK-NEXT: [[TMP39:%.*]] = fptosi float [[TMP36]] to i32 4970; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]]) 4971; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 4972; CHECK-NEXT: [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]] 4973; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0 4974; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]] 4975; CHECK-NEXT: [[TMP45:%.*]] = shl i32 [[TMP44]], 17 4976; CHECK-NEXT: [[TMP46:%.*]] = ashr i32 [[TMP45]], 17 4977; CHECK-NEXT: [[TMP47:%.*]] = trunc i32 [[TMP46]] to i15 4978; CHECK-NEXT: [[TMP48:%.*]] = insertelement <3 x i15> [[TMP24]], i15 [[TMP47]], i64 1 4979; CHECK-NEXT: [[TMP49:%.*]] = extractelement <3 x i15> [[X]], i64 2 4980; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x i15> [[Y]], i64 2 4981; CHECK-NEXT: [[TMP51:%.*]] = sext i15 [[TMP49]] to i32 4982; CHECK-NEXT: [[TMP52:%.*]] = sext i15 [[TMP50]] to i32 4983; CHECK-NEXT: [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]] 4984; CHECK-NEXT: [[TMP54:%.*]] = ashr i32 [[TMP53]], 30 4985; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 1 4986; CHECK-NEXT: [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float 4987; CHECK-NEXT: [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float 4988; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]]) 4989; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]] 4990; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]]) 4991; CHECK-NEXT: [[TMP61:%.*]] = fneg fast float [[TMP60]] 4992; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]]) 4993; CHECK-NEXT: [[TMP63:%.*]] = fptosi float [[TMP60]] to i32 4994; CHECK-NEXT: [[TMP64:%.*]] = call fast 
float @llvm.fabs.f32(float [[TMP62]]) 4995; CHECK-NEXT: [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]]) 4996; CHECK-NEXT: [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]] 4997; CHECK-NEXT: [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0 4998; CHECK-NEXT: [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]] 4999; CHECK-NEXT: [[TMP69:%.*]] = shl i32 [[TMP68]], 17 5000; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 17 5001; CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i15 5002; CHECK-NEXT: [[TMP72:%.*]] = insertelement <3 x i15> [[TMP48]], i15 [[TMP71]], i64 2 5003; CHECK-NEXT: store <3 x i15> [[TMP72]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 5004; CHECK-NEXT: ret void 5005; 5006; GFX6-LABEL: sdiv_v3i15: 5007; GFX6: ; %bb.0: 5008; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5009; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 5010; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 5011; GFX6-NEXT: s_mov_b32 s7, 0xf000 5012; GFX6-NEXT: s_mov_b32 s6, -1 5013; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5014; GFX6-NEXT: v_mov_b32_e32 v0, s2 5015; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 5016; GFX6-NEXT: s_bfe_i32 s3, s0, 0xf0000 5017; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s3 5018; GFX6-NEXT: v_mov_b32_e32 v1, s0 5019; GFX6-NEXT: v_alignbit_b32 v1, s1, v1, 30 5020; GFX6-NEXT: s_bfe_i32 s1, s2, 0xf0000 5021; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s1 5022; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 5023; GFX6-NEXT: s_xor_b32 s1, s1, s3 5024; GFX6-NEXT: s_bfe_i32 s0, s0, 0xf000f 5025; GFX6-NEXT: s_ashr_i32 s1, s1, 30 5026; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 5027; GFX6-NEXT: v_trunc_f32_e32 v4, v4 5028; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 5029; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| 5030; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 5031; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s0 5032; GFX6-NEXT: s_or_b32 s1, s1, 1 5033; GFX6-NEXT: v_mov_b32_e32 v5, s1 5034; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc 5035; GFX6-NEXT: s_bfe_i32 s1, s2, 0xf000f 5036; GFX6-NEXT: v_add_i32_e32 
v2, vcc, v2, v4 5037; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s1 5038; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 5039; GFX6-NEXT: s_xor_b32 s0, s1, s0 5040; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 15 5041; GFX6-NEXT: s_ashr_i32 s0, s0, 30 5042; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 5043; GFX6-NEXT: v_trunc_f32_e32 v5, v5 5044; GFX6-NEXT: v_mad_f32 v4, -v5, v3, v4 5045; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v3| 5046; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 5047; GFX6-NEXT: v_cvt_f32_i32_e32 v4, v1 5048; GFX6-NEXT: s_or_b32 s0, s0, 1 5049; GFX6-NEXT: v_mov_b32_e32 v6, s0 5050; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc 5051; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15 5052; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 5053; GFX6-NEXT: v_cvt_f32_i32_e32 v5, v0 5054; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 5055; GFX6-NEXT: v_xor_b32_e32 v0, v0, v1 5056; GFX6-NEXT: v_ashrrev_i32_e32 v0, 30, v0 5057; GFX6-NEXT: v_or_b32_e32 v0, 1, v0 5058; GFX6-NEXT: v_mul_f32_e32 v1, v5, v6 5059; GFX6-NEXT: v_trunc_f32_e32 v1, v1 5060; GFX6-NEXT: v_mad_f32 v5, -v1, v4, v5 5061; GFX6-NEXT: v_cvt_i32_f32_e32 v1, v1 5062; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v4| 5063; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 5064; GFX6-NEXT: s_movk_i32 s0, 0x7fff 5065; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 5066; GFX6-NEXT: v_and_b32_e32 v3, s0, v3 5067; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 5068; GFX6-NEXT: v_and_b32_e32 v2, s0, v2 5069; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 5070; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 5071; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 5072; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 5073; GFX6-NEXT: s_waitcnt expcnt(0) 5074; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 5075; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 5076; GFX6-NEXT: s_endpgm 5077; 5078; GFX9-LABEL: sdiv_v3i15: 5079; GFX9: ; %bb.0: 5080; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5081; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 5082; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 5083; GFX9-NEXT: v_mov_b32_e32 v2, 0 
5084; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5085; GFX9-NEXT: s_bfe_i32 s1, s4, 0xf0000 5086; GFX9-NEXT: s_bfe_i32 s0, s6, 0xf0000 5087; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 5088; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s1 5089; GFX9-NEXT: s_xor_b32 s0, s1, s0 5090; GFX9-NEXT: v_mov_b32_e32 v0, s4 5091; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 5092; GFX9-NEXT: s_ashr_i32 s0, s0, 30 5093; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 30 5094; GFX9-NEXT: s_or_b32 s5, s0, 1 5095; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 5096; GFX9-NEXT: v_trunc_f32_e32 v5, v5 5097; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 5098; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| 5099; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 5100; GFX9-NEXT: s_cselect_b32 s0, s5, 0 5101; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 5102; GFX9-NEXT: s_bfe_i32 s1, s6, 0xf000f 5103; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s1 5104; GFX9-NEXT: v_mov_b32_e32 v1, s6 5105; GFX9-NEXT: v_add_u32_e32 v4, s0, v5 5106; GFX9-NEXT: s_bfe_i32 s0, s4, 0xf000f 5107; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 5108; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 5109; GFX9-NEXT: v_alignbit_b32 v1, s7, v1, 30 5110; GFX9-NEXT: s_xor_b32 s0, s0, s1 5111; GFX9-NEXT: s_ashr_i32 s0, s0, 30 5112; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 5113; GFX9-NEXT: v_trunc_f32_e32 v6, v6 5114; GFX9-NEXT: v_mad_f32 v5, -v6, v3, v5 5115; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 5116; GFX9-NEXT: s_or_b32 s4, s0, 1 5117; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v3| 5118; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v1 5119; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 5120; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 5121; GFX9-NEXT: s_cselect_b32 s0, s4, 0 5122; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 15 5123; GFX9-NEXT: v_add_u32_e32 v5, s0, v6 5124; GFX9-NEXT: v_cvt_f32_i32_e32 v6, v0 5125; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v3 5126; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1 5127; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v0 5128; GFX9-NEXT: v_or_b32_e32 v0, 1, v0 5129; GFX9-NEXT: v_mul_f32_e32 v1, v6, v7 5130; GFX9-NEXT: v_trunc_f32_e32 v1, v1 5131; GFX9-NEXT: 
v_cvt_i32_f32_e32 v7, v1 5132; GFX9-NEXT: v_mad_f32 v1, -v1, v3, v6 5133; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v3| 5134; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 5135; GFX9-NEXT: s_movk_i32 s0, 0x7fff 5136; GFX9-NEXT: v_add_u32_e32 v0, v7, v0 5137; GFX9-NEXT: v_and_b32_e32 v3, s0, v4 5138; GFX9-NEXT: v_and_b32_e32 v4, s0, v5 5139; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 5140; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 5141; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 5142; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 5143; GFX9-NEXT: global_store_dword v2, v0, s[2:3] 5144; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 5145; GFX9-NEXT: global_store_short v2, v0, s[2:3] offset:4 5146; GFX9-NEXT: s_endpgm 5147 %r = sdiv <3 x i15> %x, %y 5148 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 5149 ret void 5150} 5151 5152define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 5153; CHECK-LABEL: @srem_v3i15( 5154; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 5155; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 5156; CHECK-NEXT: [[TMP3:%.*]] = sext i15 [[TMP1]] to i32 5157; CHECK-NEXT: [[TMP4:%.*]] = sext i15 [[TMP2]] to i32 5158; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 5159; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 5160; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 5161; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 5162; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 5163; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 5164; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 5165; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 5166; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 5167; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 5168; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 5169; 
CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 5170; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 5171; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 5172; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 5173; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 5174; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]] 5175; CHECK-NEXT: [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]] 5176; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 17 5177; CHECK-NEXT: [[TMP24:%.*]] = ashr i32 [[TMP23]], 17 5178; CHECK-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i15 5179; CHECK-NEXT: [[TMP26:%.*]] = insertelement <3 x i15> undef, i15 [[TMP25]], i64 0 5180; CHECK-NEXT: [[TMP27:%.*]] = extractelement <3 x i15> [[X]], i64 1 5181; CHECK-NEXT: [[TMP28:%.*]] = extractelement <3 x i15> [[Y]], i64 1 5182; CHECK-NEXT: [[TMP29:%.*]] = sext i15 [[TMP27]] to i32 5183; CHECK-NEXT: [[TMP30:%.*]] = sext i15 [[TMP28]] to i32 5184; CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]] 5185; CHECK-NEXT: [[TMP32:%.*]] = ashr i32 [[TMP31]], 30 5186; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP32]], 1 5187; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float 5188; CHECK-NEXT: [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float 5189; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 5190; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]] 5191; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]]) 5192; CHECK-NEXT: [[TMP39:%.*]] = fneg fast float [[TMP38]] 5193; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]]) 5194; CHECK-NEXT: [[TMP41:%.*]] = fptosi float [[TMP38]] to i32 5195; CHECK-NEXT: [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]]) 5196; CHECK-NEXT: [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]]) 5197; CHECK-NEXT: 
[[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]] 5198; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0 5199; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]] 5200; CHECK-NEXT: [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]] 5201; CHECK-NEXT: [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]] 5202; CHECK-NEXT: [[TMP49:%.*]] = shl i32 [[TMP48]], 17 5203; CHECK-NEXT: [[TMP50:%.*]] = ashr i32 [[TMP49]], 17 5204; CHECK-NEXT: [[TMP51:%.*]] = trunc i32 [[TMP50]] to i15 5205; CHECK-NEXT: [[TMP52:%.*]] = insertelement <3 x i15> [[TMP26]], i15 [[TMP51]], i64 1 5206; CHECK-NEXT: [[TMP53:%.*]] = extractelement <3 x i15> [[X]], i64 2 5207; CHECK-NEXT: [[TMP54:%.*]] = extractelement <3 x i15> [[Y]], i64 2 5208; CHECK-NEXT: [[TMP55:%.*]] = sext i15 [[TMP53]] to i32 5209; CHECK-NEXT: [[TMP56:%.*]] = sext i15 [[TMP54]] to i32 5210; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]] 5211; CHECK-NEXT: [[TMP58:%.*]] = ashr i32 [[TMP57]], 30 5212; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 1 5213; CHECK-NEXT: [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float 5214; CHECK-NEXT: [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float 5215; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]]) 5216; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]] 5217; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]]) 5218; CHECK-NEXT: [[TMP65:%.*]] = fneg fast float [[TMP64]] 5219; CHECK-NEXT: [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]]) 5220; CHECK-NEXT: [[TMP67:%.*]] = fptosi float [[TMP64]] to i32 5221; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 5222; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]]) 5223; CHECK-NEXT: [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]] 5224; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0 5225; CHECK-NEXT: [[TMP72:%.*]] = add 
i32 [[TMP67]], [[TMP71]] 5226; CHECK-NEXT: [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]] 5227; CHECK-NEXT: [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]] 5228; CHECK-NEXT: [[TMP75:%.*]] = shl i32 [[TMP74]], 17 5229; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 [[TMP75]], 17 5230; CHECK-NEXT: [[TMP77:%.*]] = trunc i32 [[TMP76]] to i15 5231; CHECK-NEXT: [[TMP78:%.*]] = insertelement <3 x i15> [[TMP52]], i15 [[TMP77]], i64 2 5232; CHECK-NEXT: store <3 x i15> [[TMP78]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 5233; CHECK-NEXT: ret void 5234; 5235; GFX6-LABEL: srem_v3i15: 5236; GFX6: ; %bb.0: 5237; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5238; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 5239; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 5240; GFX6-NEXT: s_mov_b32 s7, 0xf000 5241; GFX6-NEXT: s_mov_b32 s6, -1 5242; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5243; GFX6-NEXT: v_mov_b32_e32 v0, s2 5244; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 5245; GFX6-NEXT: s_movk_i32 s3, 0x7fff 5246; GFX6-NEXT: s_and_b32 s11, s0, s3 5247; GFX6-NEXT: s_bfe_i32 s11, s11, 0xf0000 5248; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s11 5249; GFX6-NEXT: s_and_b32 s9, s2, s3 5250; GFX6-NEXT: s_bfe_i32 s9, s9, 0xf0000 5251; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s9 5252; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 5253; GFX6-NEXT: s_xor_b32 s9, s9, s11 5254; GFX6-NEXT: s_ashr_i32 s9, s9, 30 5255; GFX6-NEXT: s_or_b32 s9, s9, 1 5256; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 5257; GFX6-NEXT: v_trunc_f32_e32 v4, v4 5258; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 5259; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 5260; GFX6-NEXT: v_mov_b32_e32 v5, s9 5261; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| 5262; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc 5263; GFX6-NEXT: v_mov_b32_e32 v1, s0 5264; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 5265; GFX6-NEXT: s_bfe_u32 s12, s0, 0xf000f 5266; GFX6-NEXT: v_alignbit_b32 v1, s1, v1, 30 5267; GFX6-NEXT: v_mul_lo_u32 v2, v2, s0 5268; GFX6-NEXT: s_lshr_b32 s1, s0, 15 5269; GFX6-NEXT: s_bfe_i32 s0, s12, 0xf0000 5270; 
GFX6-NEXT: v_cvt_f32_i32_e32 v3, s0 5271; GFX6-NEXT: s_bfe_u32 s10, s2, 0xf000f 5272; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 5273; GFX6-NEXT: s_lshr_b32 s8, s2, 15 5274; GFX6-NEXT: s_bfe_i32 s2, s10, 0xf0000 5275; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s2 5276; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 5277; GFX6-NEXT: s_xor_b32 s0, s2, s0 5278; GFX6-NEXT: s_ashr_i32 s0, s0, 30 5279; GFX6-NEXT: s_or_b32 s0, s0, 1 5280; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 5281; GFX6-NEXT: v_trunc_f32_e32 v5, v5 5282; GFX6-NEXT: v_mad_f32 v4, -v5, v3, v4 5283; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 5284; GFX6-NEXT: v_and_b32_e32 v1, s3, v1 5285; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v3| 5286; GFX6-NEXT: v_mov_b32_e32 v6, s0 5287; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc 5288; GFX6-NEXT: v_bfe_i32 v4, v1, 0, 15 5289; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 5290; GFX6-NEXT: v_cvt_f32_i32_e32 v5, v4 5291; GFX6-NEXT: v_and_b32_e32 v0, s3, v0 5292; GFX6-NEXT: v_bfe_i32 v6, v0, 0, 15 5293; GFX6-NEXT: v_cvt_f32_i32_e32 v7, v6 5294; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v5 5295; GFX6-NEXT: v_xor_b32_e32 v4, v6, v4 5296; GFX6-NEXT: v_ashrrev_i32_e32 v4, 30, v4 5297; GFX6-NEXT: v_or_b32_e32 v4, 1, v4 5298; GFX6-NEXT: v_mul_f32_e32 v6, v7, v8 5299; GFX6-NEXT: v_trunc_f32_e32 v6, v6 5300; GFX6-NEXT: v_mad_f32 v7, -v6, v5, v7 5301; GFX6-NEXT: v_cvt_i32_f32_e32 v6, v6 5302; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v5| 5303; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc 5304; GFX6-NEXT: v_mul_lo_u32 v3, v3, s1 5305; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 5306; GFX6-NEXT: v_mul_lo_u32 v1, v4, v1 5307; GFX6-NEXT: v_and_b32_e32 v2, s3, v2 5308; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s8, v3 5309; GFX6-NEXT: v_and_b32_e32 v3, s3, v3 5310; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 5311; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 5312; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 5313; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 5314; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 5315; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 5316; 
GFX6-NEXT: s_waitcnt expcnt(0) 5317; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 5318; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 5319; GFX6-NEXT: s_endpgm 5320; 5321; GFX9-LABEL: srem_v3i15: 5322; GFX9: ; %bb.0: 5323; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5324; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 5325; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 5326; GFX9-NEXT: s_movk_i32 s8, 0x7fff 5327; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5328; GFX9-NEXT: s_and_b32 s0, s4, s8 5329; GFX9-NEXT: s_and_b32 s1, s6, s8 5330; GFX9-NEXT: s_bfe_i32 s1, s1, 0xf0000 5331; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 5332; GFX9-NEXT: s_bfe_i32 s0, s0, 0xf0000 5333; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 5334; GFX9-NEXT: s_xor_b32 s0, s0, s1 5335; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2 5336; GFX9-NEXT: v_mov_b32_e32 v0, s4 5337; GFX9-NEXT: v_mov_b32_e32 v1, s6 5338; GFX9-NEXT: s_ashr_i32 s0, s0, 30 5339; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 5340; GFX9-NEXT: v_trunc_f32_e32 v4, v4 5341; GFX9-NEXT: v_mad_f32 v3, -v4, v2, v3 5342; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 5343; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 30 5344; GFX9-NEXT: v_alignbit_b32 v1, s7, v1, 30 5345; GFX9-NEXT: s_or_b32 s11, s0, 1 5346; GFX9-NEXT: s_lshr_b32 s9, s4, 15 5347; GFX9-NEXT: s_bfe_u32 s5, s4, 0xf000f 5348; GFX9-NEXT: s_lshr_b32 s7, s6, 15 5349; GFX9-NEXT: s_bfe_u32 s10, s6, 0xf000f 5350; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v2| 5351; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 5352; GFX9-NEXT: s_cselect_b32 s0, s11, 0 5353; GFX9-NEXT: v_add_u32_e32 v2, s0, v4 5354; GFX9-NEXT: s_bfe_i32 s0, s10, 0xf0000 5355; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 5356; GFX9-NEXT: s_bfe_i32 s1, s5, 0xf0000 5357; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s1 5358; GFX9-NEXT: s_xor_b32 s0, s1, s0 5359; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 5360; GFX9-NEXT: s_ashr_i32 s0, s0, 30 5361; GFX9-NEXT: s_or_b32 s5, s0, 1 5362; GFX9-NEXT: v_and_b32_e32 v1, s8, v1 5363; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 5364; GFX9-NEXT: v_trunc_f32_e32 v5, v5 
5365; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 5366; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 5367; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| 5368; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 5369; GFX9-NEXT: s_cselect_b32 s0, s5, 0 5370; GFX9-NEXT: v_bfe_i32 v4, v1, 0, 15 5371; GFX9-NEXT: v_add_u32_e32 v3, s0, v5 5372; GFX9-NEXT: v_cvt_f32_i32_e32 v5, v4 5373; GFX9-NEXT: v_and_b32_e32 v0, s8, v0 5374; GFX9-NEXT: v_bfe_i32 v6, v0, 0, 15 5375; GFX9-NEXT: v_cvt_f32_i32_e32 v7, v6 5376; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5 5377; GFX9-NEXT: v_xor_b32_e32 v4, v6, v4 5378; GFX9-NEXT: v_ashrrev_i32_e32 v4, 30, v4 5379; GFX9-NEXT: v_or_b32_e32 v4, 1, v4 5380; GFX9-NEXT: v_mul_f32_e32 v6, v7, v8 5381; GFX9-NEXT: v_trunc_f32_e32 v6, v6 5382; GFX9-NEXT: v_cvt_i32_f32_e32 v8, v6 5383; GFX9-NEXT: v_mad_f32 v6, -v6, v5, v7 5384; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, |v5| 5385; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc 5386; GFX9-NEXT: v_mul_lo_u32 v3, v3, s7 5387; GFX9-NEXT: v_add_u32_e32 v4, v8, v4 5388; GFX9-NEXT: v_mul_lo_u32 v2, v2, s6 5389; GFX9-NEXT: v_mul_lo_u32 v1, v4, v1 5390; GFX9-NEXT: v_sub_u32_e32 v3, s9, v3 5391; GFX9-NEXT: v_and_b32_e32 v3, s8, v3 5392; GFX9-NEXT: v_sub_u32_e32 v2, s4, v2 5393; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1 5394; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 5395; GFX9-NEXT: v_and_b32_e32 v2, s8, v2 5396; GFX9-NEXT: v_lshlrev_b32_e32 v3, 15, v3 5397; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 5398; GFX9-NEXT: v_mov_b32_e32 v4, 0 5399; GFX9-NEXT: v_or_b32_e32 v0, v2, v0 5400; GFX9-NEXT: global_store_dword v4, v0, s[2:3] 5401; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 5402; GFX9-NEXT: global_store_short v4, v0, s[2:3] offset:4 5403; GFX9-NEXT: s_endpgm 5404 %r = srem <3 x i15> %x, %y 5405 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 5406 ret void 5407} 5408 5409define amdgpu_kernel void @udiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 5410; CHECK-LABEL: @udiv_i32_oddk_denom( 5411; CHECK-NEXT: [[R:%.*]] = udiv i32 [[X:%.*]], 1235195 5412; CHECK-NEXT: store 
i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 5413; CHECK-NEXT: ret void 5414; 5415; GFX6-LABEL: udiv_i32_oddk_denom: 5416; GFX6: ; %bb.0: 5417; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5418; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 5419; GFX6-NEXT: v_mov_b32_e32 v0, 0xb2a50881 5420; GFX6-NEXT: s_mov_b32 s7, 0xf000 5421; GFX6-NEXT: s_mov_b32 s6, -1 5422; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5423; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 5424; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s0, v0 5425; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 5426; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 5427; GFX6-NEXT: v_lshrrev_b32_e32 v0, 20, v0 5428; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 5429; GFX6-NEXT: s_endpgm 5430; 5431; GFX9-LABEL: udiv_i32_oddk_denom: 5432; GFX9: ; %bb.0: 5433; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5434; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 5435; GFX9-NEXT: v_mov_b32_e32 v0, 0 5436; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5437; GFX9-NEXT: s_mul_hi_u32 s0, s4, 0xb2a50881 5438; GFX9-NEXT: s_sub_i32 s1, s4, s0 5439; GFX9-NEXT: s_lshr_b32 s1, s1, 1 5440; GFX9-NEXT: s_add_i32 s1, s1, s0 5441; GFX9-NEXT: s_lshr_b32 s0, s1, 20 5442; GFX9-NEXT: v_mov_b32_e32 v1, s0 5443; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 5444; GFX9-NEXT: s_endpgm 5445 %r = udiv i32 %x, 1235195 5446 store i32 %r, i32 addrspace(1)* %out 5447 ret void 5448} 5449 5450define amdgpu_kernel void @udiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 5451; CHECK-LABEL: @udiv_i32_pow2k_denom( 5452; CHECK-NEXT: [[R:%.*]] = udiv i32 [[X:%.*]], 4096 5453; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 5454; CHECK-NEXT: ret void 5455; 5456; GFX6-LABEL: udiv_i32_pow2k_denom: 5457; GFX6: ; %bb.0: 5458; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5459; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 5460; GFX6-NEXT: s_mov_b32 s7, 0xf000 5461; GFX6-NEXT: s_mov_b32 s6, -1 5462; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5463; GFX6-NEXT: s_lshr_b32 s0, s0, 12 5464; GFX6-NEXT: v_mov_b32_e32 v0, s0 
5465; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 5466; GFX6-NEXT: s_endpgm 5467; 5468; GFX9-LABEL: udiv_i32_pow2k_denom: 5469; GFX9: ; %bb.0: 5470; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5471; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 5472; GFX9-NEXT: v_mov_b32_e32 v0, 0 5473; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5474; GFX9-NEXT: s_lshr_b32 s0, s4, 12 5475; GFX9-NEXT: v_mov_b32_e32 v1, s0 5476; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 5477; GFX9-NEXT: s_endpgm 5478 %r = udiv i32 %x, 4096 5479 store i32 %r, i32 addrspace(1)* %out 5480 ret void 5481} 5482 5483define amdgpu_kernel void @udiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 5484; CHECK-LABEL: @udiv_i32_pow2_shl_denom( 5485; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 5486; CHECK-NEXT: [[R:%.*]] = udiv i32 [[X:%.*]], [[SHL_Y]] 5487; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 5488; CHECK-NEXT: ret void 5489; 5490; GFX6-LABEL: udiv_i32_pow2_shl_denom: 5491; GFX6: ; %bb.0: 5492; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5493; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 5494; GFX6-NEXT: s_mov_b32 s7, 0xf000 5495; GFX6-NEXT: s_mov_b32 s6, -1 5496; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5497; GFX6-NEXT: s_add_i32 s1, s1, 12 5498; GFX6-NEXT: s_lshr_b32 s0, s0, s1 5499; GFX6-NEXT: v_mov_b32_e32 v0, s0 5500; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 5501; GFX6-NEXT: s_endpgm 5502; 5503; GFX9-LABEL: udiv_i32_pow2_shl_denom: 5504; GFX9: ; %bb.0: 5505; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5506; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 5507; GFX9-NEXT: v_mov_b32_e32 v0, 0 5508; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5509; GFX9-NEXT: s_add_i32 s0, s5, 12 5510; GFX9-NEXT: s_lshr_b32 s0, s4, s0 5511; GFX9-NEXT: v_mov_b32_e32 v1, s0 5512; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 5513; GFX9-NEXT: s_endpgm 5514 %shl.y = shl i32 4096, %y 5515 %r = udiv i32 %x, %shl.y 5516 store i32 %r, i32 addrspace(1)* %out 5517 ret void 5518} 5519 5520define 
amdgpu_kernel void @udiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 5521; CHECK-LABEL: @udiv_v2i32_pow2k_denom( 5522; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 5523; CHECK-NEXT: [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096 5524; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 5525; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 5526; CHECK-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP4]], 4096 5527; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 5528; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 5529; CHECK-NEXT: ret void 5530; 5531; GFX6-LABEL: udiv_v2i32_pow2k_denom: 5532; GFX6: ; %bb.0: 5533; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5534; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 5535; GFX6-NEXT: s_mov_b32 s7, 0xf000 5536; GFX6-NEXT: s_mov_b32 s6, -1 5537; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5538; GFX6-NEXT: s_lshr_b32 s0, s0, 12 5539; GFX6-NEXT: s_lshr_b32 s1, s1, 12 5540; GFX6-NEXT: v_mov_b32_e32 v0, s0 5541; GFX6-NEXT: v_mov_b32_e32 v1, s1 5542; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 5543; GFX6-NEXT: s_endpgm 5544; 5545; GFX9-LABEL: udiv_v2i32_pow2k_denom: 5546; GFX9: ; %bb.0: 5547; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5548; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 5549; GFX9-NEXT: v_mov_b32_e32 v2, 0 5550; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5551; GFX9-NEXT: s_lshr_b32 s0, s4, 12 5552; GFX9-NEXT: s_lshr_b32 s1, s5, 12 5553; GFX9-NEXT: v_mov_b32_e32 v0, s0 5554; GFX9-NEXT: v_mov_b32_e32 v1, s1 5555; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 5556; GFX9-NEXT: s_endpgm 5557 %r = udiv <2 x i32> %x, <i32 4096, i32 4096> 5558 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 5559 ret void 5560} 5561 5562define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 5563; CHECK-LABEL: @udiv_v2i32_mixed_pow2k_denom( 5564; CHECK-NEXT: 
[[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 5565; CHECK-NEXT: [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096 5566; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 5567; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 5568; CHECK-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP4]], 4095 5569; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 5570; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 5571; CHECK-NEXT: ret void 5572; 5573; GFX6-LABEL: udiv_v2i32_mixed_pow2k_denom: 5574; GFX6: ; %bb.0: 5575; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5576; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 5577; GFX6-NEXT: v_mov_b32_e32 v0, 0x100101 5578; GFX6-NEXT: s_mov_b32 s7, 0xf000 5579; GFX6-NEXT: s_mov_b32 s6, -1 5580; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5581; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 5582; GFX6-NEXT: s_lshr_b32 s0, s0, 12 5583; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v0 5584; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 5585; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 5586; GFX6-NEXT: v_lshrrev_b32_e32 v1, 11, v0 5587; GFX6-NEXT: v_mov_b32_e32 v0, s0 5588; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 5589; GFX6-NEXT: s_endpgm 5590; 5591; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom: 5592; GFX9: ; %bb.0: 5593; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5594; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 5595; GFX9-NEXT: v_mov_b32_e32 v2, 0 5596; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5597; GFX9-NEXT: s_mul_hi_u32 s1, s5, 0x100101 5598; GFX9-NEXT: s_lshr_b32 s0, s4, 12 5599; GFX9-NEXT: s_sub_i32 s4, s5, s1 5600; GFX9-NEXT: s_lshr_b32 s4, s4, 1 5601; GFX9-NEXT: s_add_i32 s4, s4, s1 5602; GFX9-NEXT: s_lshr_b32 s1, s4, 11 5603; GFX9-NEXT: v_mov_b32_e32 v0, s0 5604; GFX9-NEXT: v_mov_b32_e32 v1, s1 5605; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 5606; GFX9-NEXT: s_endpgm 5607 %r = udiv <2 x i32> %x, <i32 4096, i32 4095> 5608 store <2 x i32> %r, <2 x i32> 
addrspace(1)* %out 5609 ret void 5610} 5611 5612define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 5613; CHECK-LABEL: @udiv_v2i32_pow2_shl_denom( 5614; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 5615; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 5616; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 5617; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 5618; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 5619; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 5620; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 5621; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 5622; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 5623; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 5624; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 5625; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 5626; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 5627; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 5628; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 5629; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 5630; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 5631; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 5632; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 5633; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 5634; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 5635; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 5636; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 5637; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 5638; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 5639; CHECK-NEXT: [[TMP25:%.*]] = add i32 [[TMP21]], 1 5640; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]] 5641; CHECK-NEXT: [[TMP27:%.*]] = sub i32 
[[TMP23]], [[TMP2]] 5642; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]] 5643; CHECK-NEXT: [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]] 5644; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP26]], 1 5645; CHECK-NEXT: [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]] 5646; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x i32> undef, i32 [[TMP31]], i64 0 5647; CHECK-NEXT: [[TMP33:%.*]] = extractelement <2 x i32> [[X]], i64 1 5648; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 5649; CHECK-NEXT: [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float 5650; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 5651; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000 5652; CHECK-NEXT: [[TMP38:%.*]] = fptoui float [[TMP37]] to i32 5653; CHECK-NEXT: [[TMP39:%.*]] = sub i32 0, [[TMP34]] 5654; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]] 5655; CHECK-NEXT: [[TMP41:%.*]] = zext i32 [[TMP38]] to i64 5656; CHECK-NEXT: [[TMP42:%.*]] = zext i32 [[TMP40]] to i64 5657; CHECK-NEXT: [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]] 5658; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 5659; CHECK-NEXT: [[TMP45:%.*]] = lshr i64 [[TMP43]], 32 5660; CHECK-NEXT: [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32 5661; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]] 5662; CHECK-NEXT: [[TMP48:%.*]] = zext i32 [[TMP33]] to i64 5663; CHECK-NEXT: [[TMP49:%.*]] = zext i32 [[TMP47]] to i64 5664; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]] 5665; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 5666; CHECK-NEXT: [[TMP52:%.*]] = lshr i64 [[TMP50]], 32 5667; CHECK-NEXT: [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32 5668; CHECK-NEXT: [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]] 5669; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]] 5670; CHECK-NEXT: [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]] 5671; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP53]], 1 5672; 
CHECK-NEXT: [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]] 5673; CHECK-NEXT: [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]] 5674; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]] 5675; CHECK-NEXT: [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]] 5676; CHECK-NEXT: [[TMP62:%.*]] = add i32 [[TMP58]], 1 5677; CHECK-NEXT: [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]] 5678; CHECK-NEXT: [[TMP64:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP63]], i64 1 5679; CHECK-NEXT: store <2 x i32> [[TMP64]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 5680; CHECK-NEXT: ret void 5681; 5682; GFX6-LABEL: udiv_v2i32_pow2_shl_denom: 5683; GFX6: ; %bb.0: 5684; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 5685; GFX6-NEXT: s_movk_i32 s4, 0x1000 5686; GFX6-NEXT: s_mov_b32 s7, 0xf000 5687; GFX6-NEXT: s_mov_b32 s6, -1 5688; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5689; GFX6-NEXT: s_lshl_b32 s8, s4, s2 5690; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 5691; GFX6-NEXT: s_lshl_b32 s9, s4, s3 5692; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 5693; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5694; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 5695; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 5696; GFX6-NEXT: s_mov_b32 s0, 0x4f7ffffe 5697; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 5698; GFX6-NEXT: v_mul_f32_e32 v0, s0, v0 5699; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 5700; GFX6-NEXT: v_mul_f32_e32 v1, s0, v1 5701; GFX6-NEXT: s_sub_i32 s0, 0, s8 5702; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 5703; GFX6-NEXT: v_mul_lo_u32 v2, s0, v0 5704; GFX6-NEXT: s_sub_i32 s0, 0, s9 5705; GFX6-NEXT: v_mul_lo_u32 v3, s0, v1 5706; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 5707; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 5708; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 5709; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5710; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 5711; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 5712; GFX6-NEXT: v_mul_hi_u32 v1, s3, v1 5713; GFX6-NEXT: v_mul_lo_u32 v2, v0, s8 5714; GFX6-NEXT: v_add_i32_e32 
v3, vcc, 1, v0 5715; GFX6-NEXT: v_mul_lo_u32 v4, v1, s9 5716; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 5717; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v2 5718; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] 5719; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v2 5720; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 5721; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 5722; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 5723; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 5724; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s3, v4 5725; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 5726; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2 5727; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 5728; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s9, v2 5729; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 5730; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 5731; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 5732; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 5733; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 5734; GFX6-NEXT: s_endpgm 5735; 5736; GFX9-LABEL: udiv_v2i32_pow2_shl_denom: 5737; GFX9: ; %bb.0: 5738; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 5739; GFX9-NEXT: s_movk_i32 s4, 0x1000 5740; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5741; GFX9-NEXT: s_lshl_b32 s5, s4, s3 5742; GFX9-NEXT: s_lshl_b32 s4, s4, s2 5743; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 5744; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 5745; GFX9-NEXT: s_mov_b32 s2, 0x4f7ffffe 5746; GFX9-NEXT: s_sub_i32 s3, 0, s5 5747; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 5748; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 5749; GFX9-NEXT: v_mul_f32_e32 v0, s2, v0 5750; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 5751; GFX9-NEXT: v_mul_f32_e32 v1, s2, v1 5752; GFX9-NEXT: s_sub_i32 s2, 0, s4 5753; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 5754; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 5755; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1 5756; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 5757; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 5758; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 5759; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 5760; GFX9-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 5761; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5762; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 5763; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 5764; GFX9-NEXT: v_mul_hi_u32 v1, s3, v1 5765; GFX9-NEXT: v_mov_b32_e32 v2, 0 5766; GFX9-NEXT: v_mul_lo_u32 v3, v0, s4 5767; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 5768; GFX9-NEXT: v_mul_lo_u32 v4, v1, s5 5769; GFX9-NEXT: v_add_u32_e32 v6, 1, v1 5770; GFX9-NEXT: v_sub_u32_e32 v3, s2, v3 5771; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 5772; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc 5773; GFX9-NEXT: v_subrev_u32_e32 v5, s4, v3 5774; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 5775; GFX9-NEXT: v_sub_u32_e32 v4, s3, v4 5776; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 5777; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 5778; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc 5779; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v4 5780; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v4 5781; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 5782; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc 5783; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 5784; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 5785; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 5786; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 5787; GFX9-NEXT: s_endpgm 5788 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 5789 %r = udiv <2 x i32> %x, %shl.y 5790 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 5791 ret void 5792} 5793 5794define amdgpu_kernel void @urem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 5795; CHECK-LABEL: @urem_i32_oddk_denom( 5796; CHECK-NEXT: [[R:%.*]] = urem i32 [[X:%.*]], 1235195 5797; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 5798; CHECK-NEXT: ret void 5799; 5800; GFX6-LABEL: urem_i32_oddk_denom: 5801; GFX6: ; %bb.0: 5802; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 5803; GFX6-NEXT: v_mov_b32_e32 v0, 0xb2a50881 5804; GFX6-NEXT: s_mov_b32 s2, 0x12d8fb 5805; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5806; GFX6-NEXT: s_mov_b32 s3, 0xf000 5807; GFX6-NEXT: s_waitcnt 
lgkmcnt(0) 5808; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 5809; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v0 5810; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 5811; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 5812; GFX6-NEXT: v_lshrrev_b32_e32 v0, 20, v0 5813; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 5814; GFX6-NEXT: s_mov_b32 s2, -1 5815; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 5816; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 5817; GFX6-NEXT: s_endpgm 5818; 5819; GFX9-LABEL: urem_i32_oddk_denom: 5820; GFX9: ; %bb.0: 5821; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5822; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 5823; GFX9-NEXT: v_mov_b32_e32 v0, 0 5824; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5825; GFX9-NEXT: s_mul_hi_u32 s0, s4, 0xb2a50881 5826; GFX9-NEXT: s_sub_i32 s1, s4, s0 5827; GFX9-NEXT: s_lshr_b32 s1, s1, 1 5828; GFX9-NEXT: s_add_i32 s1, s1, s0 5829; GFX9-NEXT: s_lshr_b32 s0, s1, 20 5830; GFX9-NEXT: s_mul_i32 s0, s0, 0x12d8fb 5831; GFX9-NEXT: s_sub_i32 s0, s4, s0 5832; GFX9-NEXT: v_mov_b32_e32 v1, s0 5833; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 5834; GFX9-NEXT: s_endpgm 5835 %r = urem i32 %x, 1235195 5836 store i32 %r, i32 addrspace(1)* %out 5837 ret void 5838} 5839 5840define amdgpu_kernel void @urem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 5841; CHECK-LABEL: @urem_i32_pow2k_denom( 5842; CHECK-NEXT: [[R:%.*]] = urem i32 [[X:%.*]], 4096 5843; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 5844; CHECK-NEXT: ret void 5845; 5846; GFX6-LABEL: urem_i32_pow2k_denom: 5847; GFX6: ; %bb.0: 5848; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5849; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 5850; GFX6-NEXT: s_mov_b32 s7, 0xf000 5851; GFX6-NEXT: s_mov_b32 s6, -1 5852; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5853; GFX6-NEXT: s_and_b32 s0, s0, 0xfff 5854; GFX6-NEXT: v_mov_b32_e32 v0, s0 5855; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 5856; GFX6-NEXT: s_endpgm 5857; 5858; GFX9-LABEL: urem_i32_pow2k_denom: 5859; GFX9: ; %bb.0: 5860; GFX9-NEXT: s_load_dwordx2 
s[2:3], s[0:1], 0x24 5861; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 5862; GFX9-NEXT: v_mov_b32_e32 v0, 0 5863; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5864; GFX9-NEXT: s_and_b32 s0, s4, 0xfff 5865; GFX9-NEXT: v_mov_b32_e32 v1, s0 5866; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 5867; GFX9-NEXT: s_endpgm 5868 %r = urem i32 %x, 4096 5869 store i32 %r, i32 addrspace(1)* %out 5870 ret void 5871} 5872 5873define amdgpu_kernel void @urem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 5874; CHECK-LABEL: @urem_i32_pow2_shl_denom( 5875; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 5876; CHECK-NEXT: [[R:%.*]] = urem i32 [[X:%.*]], [[SHL_Y]] 5877; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 5878; CHECK-NEXT: ret void 5879; 5880; GFX6-LABEL: urem_i32_pow2_shl_denom: 5881; GFX6: ; %bb.0: 5882; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5883; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 5884; GFX6-NEXT: s_mov_b32 s7, 0xf000 5885; GFX6-NEXT: s_mov_b32 s6, -1 5886; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5887; GFX6-NEXT: s_lshl_b32 s1, 0x1000, s1 5888; GFX6-NEXT: s_add_i32 s1, s1, -1 5889; GFX6-NEXT: s_and_b32 s0, s0, s1 5890; GFX6-NEXT: v_mov_b32_e32 v0, s0 5891; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 5892; GFX6-NEXT: s_endpgm 5893; 5894; GFX9-LABEL: urem_i32_pow2_shl_denom: 5895; GFX9: ; %bb.0: 5896; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5897; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 5898; GFX9-NEXT: v_mov_b32_e32 v0, 0 5899; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5900; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s5 5901; GFX9-NEXT: s_add_i32 s0, s0, -1 5902; GFX9-NEXT: s_and_b32 s0, s4, s0 5903; GFX9-NEXT: v_mov_b32_e32 v1, s0 5904; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 5905; GFX9-NEXT: s_endpgm 5906 %shl.y = shl i32 4096, %y 5907 %r = urem i32 %x, %shl.y 5908 store i32 %r, i32 addrspace(1)* %out 5909 ret void 5910} 5911 5912define amdgpu_kernel void @urem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 5913; 
CHECK-LABEL: @urem_v2i32_pow2k_denom( 5914; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 5915; CHECK-NEXT: [[TMP2:%.*]] = urem i32 [[TMP1]], 4096 5916; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 5917; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 5918; CHECK-NEXT: [[TMP5:%.*]] = urem i32 [[TMP4]], 4096 5919; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 5920; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 5921; CHECK-NEXT: ret void 5922; 5923; GFX6-LABEL: urem_v2i32_pow2k_denom: 5924; GFX6: ; %bb.0: 5925; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5926; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 5927; GFX6-NEXT: s_movk_i32 s2, 0xfff 5928; GFX6-NEXT: s_mov_b32 s7, 0xf000 5929; GFX6-NEXT: s_mov_b32 s6, -1 5930; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5931; GFX6-NEXT: s_and_b32 s0, s0, s2 5932; GFX6-NEXT: s_and_b32 s1, s1, s2 5933; GFX6-NEXT: v_mov_b32_e32 v0, s0 5934; GFX6-NEXT: v_mov_b32_e32 v1, s1 5935; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 5936; GFX6-NEXT: s_endpgm 5937; 5938; GFX9-LABEL: urem_v2i32_pow2k_denom: 5939; GFX9: ; %bb.0: 5940; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5941; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 5942; GFX9-NEXT: s_movk_i32 s0, 0xfff 5943; GFX9-NEXT: v_mov_b32_e32 v2, 0 5944; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5945; GFX9-NEXT: s_and_b32 s1, s4, s0 5946; GFX9-NEXT: s_and_b32 s0, s5, s0 5947; GFX9-NEXT: v_mov_b32_e32 v0, s1 5948; GFX9-NEXT: v_mov_b32_e32 v1, s0 5949; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 5950; GFX9-NEXT: s_endpgm 5951 %r = urem <2 x i32> %x, <i32 4096, i32 4096> 5952 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 5953 ret void 5954} 5955 5956define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 5957; CHECK-LABEL: @urem_v2i32_pow2_shl_denom( 5958; CHECK-NEXT: [[SHL_Y:%.*]] = shl 
<2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 5959; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 5960; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 5961; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 5962; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 5963; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 5964; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 5965; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 5966; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 5967; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 5968; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 5969; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 5970; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 5971; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 5972; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 5973; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 5974; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 5975; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 5976; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 5977; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 5978; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 5979; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 5980; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 5981; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 5982; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 5983; CHECK-NEXT: [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]] 5984; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]] 5985; CHECK-NEXT: [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]] 5986; CHECK-NEXT: [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]] 5987; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]] 5988; CHECK-NEXT: [[TMP30:%.*]] = insertelement <2 x i32> undef, i32 
[[TMP29]], i64 0 5989; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x i32> [[X]], i64 1 5990; CHECK-NEXT: [[TMP32:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 5991; CHECK-NEXT: [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float 5992; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 5993; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000 5994; CHECK-NEXT: [[TMP36:%.*]] = fptoui float [[TMP35]] to i32 5995; CHECK-NEXT: [[TMP37:%.*]] = sub i32 0, [[TMP32]] 5996; CHECK-NEXT: [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]] 5997; CHECK-NEXT: [[TMP39:%.*]] = zext i32 [[TMP36]] to i64 5998; CHECK-NEXT: [[TMP40:%.*]] = zext i32 [[TMP38]] to i64 5999; CHECK-NEXT: [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]] 6000; CHECK-NEXT: [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32 6001; CHECK-NEXT: [[TMP43:%.*]] = lshr i64 [[TMP41]], 32 6002; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 6003; CHECK-NEXT: [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]] 6004; CHECK-NEXT: [[TMP46:%.*]] = zext i32 [[TMP31]] to i64 6005; CHECK-NEXT: [[TMP47:%.*]] = zext i32 [[TMP45]] to i64 6006; CHECK-NEXT: [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]] 6007; CHECK-NEXT: [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32 6008; CHECK-NEXT: [[TMP50:%.*]] = lshr i64 [[TMP48]], 32 6009; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 6010; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]] 6011; CHECK-NEXT: [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]] 6012; CHECK-NEXT: [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]] 6013; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]] 6014; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]] 6015; CHECK-NEXT: [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]] 6016; CHECK-NEXT: [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]] 6017; CHECK-NEXT: [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]] 6018; CHECK-NEXT: [[TMP60:%.*]] = insertelement <2 x i32> [[TMP30]], i32 
[[TMP59]], i64 1 6019; CHECK-NEXT: store <2 x i32> [[TMP60]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 6020; CHECK-NEXT: ret void 6021; 6022; GFX6-LABEL: urem_v2i32_pow2_shl_denom: 6023; GFX6: ; %bb.0: 6024; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 6025; GFX6-NEXT: s_movk_i32 s4, 0x1000 6026; GFX6-NEXT: s_mov_b32 s5, 0x4f7ffffe 6027; GFX6-NEXT: s_mov_b32 s7, 0xf000 6028; GFX6-NEXT: s_mov_b32 s6, -1 6029; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6030; GFX6-NEXT: s_lshl_b32 s2, s4, s2 6031; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 6032; GFX6-NEXT: s_lshl_b32 s3, s4, s3 6033; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 6034; GFX6-NEXT: s_sub_i32 s4, 0, s2 6035; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 6036; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 6037; GFX6-NEXT: v_mul_f32_e32 v0, s5, v0 6038; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 6039; GFX6-NEXT: v_mul_f32_e32 v1, s5, v1 6040; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 6041; GFX6-NEXT: v_mul_lo_u32 v2, s4, v0 6042; GFX6-NEXT: s_sub_i32 s4, 0, s3 6043; GFX6-NEXT: v_mul_lo_u32 v3, s4, v1 6044; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 6045; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 6046; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 6047; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 6048; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 6049; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6050; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 6051; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 6052; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 6053; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 6054; GFX6-NEXT: v_mul_lo_u32 v1, v1, s3 6055; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 6056; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s2, v0 6057; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 6058; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6059; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s2, v0 6060; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 6061; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6062; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 6063; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 6064; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 6065; 
GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 6066; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 6067; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 6068; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 6069; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 6070; GFX6-NEXT: s_endpgm 6071; 6072; GFX9-LABEL: urem_v2i32_pow2_shl_denom: 6073; GFX9: ; %bb.0: 6074; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 6075; GFX9-NEXT: s_movk_i32 s4, 0x1000 6076; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6077; GFX9-NEXT: s_lshl_b32 s5, s4, s3 6078; GFX9-NEXT: s_lshl_b32 s4, s4, s2 6079; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 6080; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 6081; GFX9-NEXT: s_mov_b32 s2, 0x4f7ffffe 6082; GFX9-NEXT: s_sub_i32 s3, 0, s5 6083; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 6084; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 6085; GFX9-NEXT: v_mul_f32_e32 v0, s2, v0 6086; GFX9-NEXT: v_mul_f32_e32 v1, s2, v1 6087; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 6088; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 6089; GFX9-NEXT: s_sub_i32 s2, 0, s4 6090; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 6091; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1 6092; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 6093; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 6094; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 6095; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6096; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 6097; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 6098; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6099; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 6100; GFX9-NEXT: v_mul_hi_u32 v1, s3, v1 6101; GFX9-NEXT: v_mov_b32_e32 v2, 0 6102; GFX9-NEXT: v_mul_lo_u32 v0, v0, s4 6103; GFX9-NEXT: v_mul_lo_u32 v1, v1, s5 6104; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 6105; GFX9-NEXT: v_sub_u32_e32 v1, s3, v1 6106; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v0 6107; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 6108; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 6109; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v1 6110; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 6111; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 6112; GFX9-NEXT: 
v_subrev_u32_e32 v3, s4, v0 6113; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 6114; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 6115; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v1 6116; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 6117; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 6118; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 6119; GFX9-NEXT: s_endpgm 6120 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 6121 %r = urem <2 x i32> %x, %shl.y 6122 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 6123 ret void 6124} 6125 6126define amdgpu_kernel void @sdiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 6127; CHECK-LABEL: @sdiv_i32_oddk_denom( 6128; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], 1235195 6129; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 6130; CHECK-NEXT: ret void 6131; 6132; GFX6-LABEL: sdiv_i32_oddk_denom: 6133; GFX6: ; %bb.0: 6134; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 6135; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 6136; GFX6-NEXT: v_mov_b32_e32 v0, 0xd9528441 6137; GFX6-NEXT: s_mov_b32 s7, 0xf000 6138; GFX6-NEXT: s_mov_b32 s6, -1 6139; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6140; GFX6-NEXT: v_mul_hi_i32 v0, s0, v0 6141; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 6142; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0 6143; GFX6-NEXT: v_ashrrev_i32_e32 v0, 20, v0 6144; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 6145; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 6146; GFX6-NEXT: s_endpgm 6147; 6148; GFX9-LABEL: sdiv_i32_oddk_denom: 6149; GFX9: ; %bb.0: 6150; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6151; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 6152; GFX9-NEXT: v_mov_b32_e32 v0, 0 6153; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6154; GFX9-NEXT: s_mul_hi_i32 s0, s4, 0xd9528441 6155; GFX9-NEXT: s_add_i32 s0, s0, s4 6156; GFX9-NEXT: s_lshr_b32 s1, s0, 31 6157; GFX9-NEXT: s_ashr_i32 s0, s0, 20 6158; GFX9-NEXT: s_add_i32 s0, s0, s1 6159; GFX9-NEXT: v_mov_b32_e32 v1, s0 6160; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 6161; GFX9-NEXT: s_endpgm 6162 %r = 
sdiv i32 %x, 1235195 6163 store i32 %r, i32 addrspace(1)* %out 6164 ret void 6165} 6166 6167define amdgpu_kernel void @sdiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 6168; CHECK-LABEL: @sdiv_i32_pow2k_denom( 6169; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], 4096 6170; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 6171; CHECK-NEXT: ret void 6172; 6173; GFX6-LABEL: sdiv_i32_pow2k_denom: 6174; GFX6: ; %bb.0: 6175; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 6176; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 6177; GFX6-NEXT: s_mov_b32 s7, 0xf000 6178; GFX6-NEXT: s_mov_b32 s6, -1 6179; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6180; GFX6-NEXT: s_ashr_i32 s1, s0, 31 6181; GFX6-NEXT: s_lshr_b32 s1, s1, 20 6182; GFX6-NEXT: s_add_i32 s0, s0, s1 6183; GFX6-NEXT: s_ashr_i32 s0, s0, 12 6184; GFX6-NEXT: v_mov_b32_e32 v0, s0 6185; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 6186; GFX6-NEXT: s_endpgm 6187; 6188; GFX9-LABEL: sdiv_i32_pow2k_denom: 6189; GFX9: ; %bb.0: 6190; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6191; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 6192; GFX9-NEXT: v_mov_b32_e32 v0, 0 6193; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6194; GFX9-NEXT: s_ashr_i32 s0, s4, 31 6195; GFX9-NEXT: s_lshr_b32 s0, s0, 20 6196; GFX9-NEXT: s_add_i32 s4, s4, s0 6197; GFX9-NEXT: s_ashr_i32 s0, s4, 12 6198; GFX9-NEXT: v_mov_b32_e32 v1, s0 6199; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 6200; GFX9-NEXT: s_endpgm 6201 %r = sdiv i32 %x, 4096 6202 store i32 %r, i32 addrspace(1)* %out 6203 ret void 6204} 6205 6206define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 6207; CHECK-LABEL: @sdiv_i32_pow2_shl_denom( 6208; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 6209; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], [[SHL_Y]] 6210; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 6211; CHECK-NEXT: ret void 6212; 6213; GFX6-LABEL: sdiv_i32_pow2_shl_denom: 6214; GFX6: ; %bb.0: 6215; GFX6-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0xb 6216; GFX6-NEXT: s_mov_b32 s7, 0xf000 6217; GFX6-NEXT: s_mov_b32 s6, -1 6218; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6219; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 6220; GFX6-NEXT: s_ashr_i32 s8, s3, 31 6221; GFX6-NEXT: s_add_i32 s3, s3, s8 6222; GFX6-NEXT: s_xor_b32 s3, s3, s8 6223; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 6224; GFX6-NEXT: s_sub_i32 s4, 0, s3 6225; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 6226; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 6227; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 6228; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 6229; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 6230; GFX6-NEXT: s_ashr_i32 s0, s2, 31 6231; GFX6-NEXT: s_add_i32 s1, s2, s0 6232; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 6233; GFX6-NEXT: s_xor_b32 s1, s1, s0 6234; GFX6-NEXT: s_xor_b32 s2, s0, s8 6235; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 6236; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 6237; GFX6-NEXT: v_mul_lo_u32 v1, v0, s3 6238; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 6239; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 6240; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1 6241; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 6242; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 6243; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 6244; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] 6245; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 6246; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 6247; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 6248; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 6249; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6250; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 6251; GFX6-NEXT: s_endpgm 6252; 6253; GFX9-LABEL: sdiv_i32_pow2_shl_denom: 6254; GFX9: ; %bb.0: 6255; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 6256; GFX9-NEXT: v_mov_b32_e32 v2, 0 6257; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6258; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6259; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 6260; GFX9-NEXT: s_ashr_i32 s4, s3, 31 6261; GFX9-NEXT: s_add_i32 s3, s3, s4 6262; GFX9-NEXT: s_xor_b32 s3, s3, s4 6263; GFX9-NEXT: 
v_cvt_f32_u32_e32 v0, s3 6264; GFX9-NEXT: s_sub_i32 s5, 0, s3 6265; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 6266; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 6267; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 6268; GFX9-NEXT: v_mul_lo_u32 v1, s5, v0 6269; GFX9-NEXT: s_ashr_i32 s5, s2, 31 6270; GFX9-NEXT: s_add_i32 s2, s2, s5 6271; GFX9-NEXT: s_xor_b32 s2, s2, s5 6272; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 6273; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 6274; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 6275; GFX9-NEXT: v_mul_lo_u32 v1, v0, s3 6276; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 6277; GFX9-NEXT: v_sub_u32_e32 v1, s2, v1 6278; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 6279; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 6280; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v1 6281; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 6282; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 6283; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 6284; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 6285; GFX9-NEXT: s_xor_b32 s2, s5, s4 6286; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 6287; GFX9-NEXT: v_subrev_u32_e32 v0, s2, v0 6288; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 6289; GFX9-NEXT: s_endpgm 6290 %shl.y = shl i32 4096, %y 6291 %r = sdiv i32 %x, %shl.y 6292 store i32 %r, i32 addrspace(1)* %out 6293 ret void 6294} 6295 6296define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 6297; CHECK-LABEL: @sdiv_v2i32_pow2k_denom( 6298; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 6299; CHECK-NEXT: [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096 6300; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 6301; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 6302; CHECK-NEXT: [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4096 6303; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 6304; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 6305; CHECK-NEXT: ret void 6306; 6307; GFX6-LABEL: 
sdiv_v2i32_pow2k_denom: 6308; GFX6: ; %bb.0: 6309; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 6310; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 6311; GFX6-NEXT: s_mov_b32 s7, 0xf000 6312; GFX6-NEXT: s_mov_b32 s6, -1 6313; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6314; GFX6-NEXT: s_ashr_i32 s2, s0, 31 6315; GFX6-NEXT: s_lshr_b32 s2, s2, 20 6316; GFX6-NEXT: s_ashr_i32 s3, s1, 31 6317; GFX6-NEXT: s_add_i32 s0, s0, s2 6318; GFX6-NEXT: s_lshr_b32 s2, s3, 20 6319; GFX6-NEXT: s_add_i32 s1, s1, s2 6320; GFX6-NEXT: s_ashr_i32 s0, s0, 12 6321; GFX6-NEXT: s_ashr_i32 s1, s1, 12 6322; GFX6-NEXT: v_mov_b32_e32 v0, s0 6323; GFX6-NEXT: v_mov_b32_e32 v1, s1 6324; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 6325; GFX6-NEXT: s_endpgm 6326; 6327; GFX9-LABEL: sdiv_v2i32_pow2k_denom: 6328; GFX9: ; %bb.0: 6329; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6330; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 6331; GFX9-NEXT: v_mov_b32_e32 v2, 0 6332; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6333; GFX9-NEXT: s_ashr_i32 s0, s4, 31 6334; GFX9-NEXT: s_ashr_i32 s1, s5, 31 6335; GFX9-NEXT: s_lshr_b32 s0, s0, 20 6336; GFX9-NEXT: s_lshr_b32 s1, s1, 20 6337; GFX9-NEXT: s_add_i32 s0, s4, s0 6338; GFX9-NEXT: s_add_i32 s1, s5, s1 6339; GFX9-NEXT: s_ashr_i32 s0, s0, 12 6340; GFX9-NEXT: s_ashr_i32 s1, s1, 12 6341; GFX9-NEXT: v_mov_b32_e32 v0, s0 6342; GFX9-NEXT: v_mov_b32_e32 v1, s1 6343; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 6344; GFX9-NEXT: s_endpgm 6345 %r = sdiv <2 x i32> %x, <i32 4096, i32 4096> 6346 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 6347 ret void 6348} 6349 6350define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 6351; CHECK-LABEL: @ssdiv_v2i32_mixed_pow2k_denom( 6352; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 6353; CHECK-NEXT: [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096 6354; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 6355; CHECK-NEXT: [[TMP4:%.*]] = 
extractelement <2 x i32> [[X]], i64 1 6356; CHECK-NEXT: [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4095 6357; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 6358; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 6359; CHECK-NEXT: ret void 6360; 6361; GFX6-LABEL: ssdiv_v2i32_mixed_pow2k_denom: 6362; GFX6: ; %bb.0: 6363; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 6364; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 6365; GFX6-NEXT: v_mov_b32_e32 v0, 0x80080081 6366; GFX6-NEXT: s_mov_b32 s7, 0xf000 6367; GFX6-NEXT: s_mov_b32 s6, -1 6368; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6369; GFX6-NEXT: v_mul_hi_i32 v0, s1, v0 6370; GFX6-NEXT: s_ashr_i32 s2, s0, 31 6371; GFX6-NEXT: s_lshr_b32 s2, s2, 20 6372; GFX6-NEXT: s_add_i32 s0, s0, s2 6373; GFX6-NEXT: v_add_i32_e32 v0, vcc, s1, v0 6374; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0 6375; GFX6-NEXT: v_ashrrev_i32_e32 v0, 11, v0 6376; GFX6-NEXT: s_ashr_i32 s0, s0, 12 6377; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v0 6378; GFX6-NEXT: v_mov_b32_e32 v0, s0 6379; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 6380; GFX6-NEXT: s_endpgm 6381; 6382; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom: 6383; GFX9: ; %bb.0: 6384; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6385; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 6386; GFX9-NEXT: v_mov_b32_e32 v2, 0 6387; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6388; GFX9-NEXT: s_ashr_i32 s0, s4, 31 6389; GFX9-NEXT: s_mul_hi_i32 s1, s5, 0x80080081 6390; GFX9-NEXT: s_lshr_b32 s0, s0, 20 6391; GFX9-NEXT: s_add_i32 s1, s1, s5 6392; GFX9-NEXT: s_add_i32 s0, s4, s0 6393; GFX9-NEXT: s_lshr_b32 s4, s1, 31 6394; GFX9-NEXT: s_ashr_i32 s1, s1, 11 6395; GFX9-NEXT: s_ashr_i32 s0, s0, 12 6396; GFX9-NEXT: s_add_i32 s1, s1, s4 6397; GFX9-NEXT: v_mov_b32_e32 v0, s0 6398; GFX9-NEXT: v_mov_b32_e32 v1, s1 6399; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 6400; GFX9-NEXT: s_endpgm 6401 %r = sdiv <2 x i32> %x, <i32 4096, i32 4095> 6402 store <2 x i32> %r, <2 x i32> 
addrspace(1)* %out 6403 ret void 6404} 6405 6406define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 6407; CHECK-LABEL: @sdiv_v2i32_pow2_shl_denom( 6408; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 6409; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 6410; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 6411; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 6412; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 6413; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 6414; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]] 6415; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]] 6416; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]] 6417; CHECK-NEXT: [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]] 6418; CHECK-NEXT: [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float 6419; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]]) 6420; CHECK-NEXT: [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000 6421; CHECK-NEXT: [[TMP13:%.*]] = fptoui float [[TMP12]] to i32 6422; CHECK-NEXT: [[TMP14:%.*]] = sub i32 0, [[TMP9]] 6423; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]] 6424; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP13]] to i64 6425; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 6426; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 6427; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 6428; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 6429; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 6430; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]] 6431; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP8]] to i64 6432; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[TMP22]] to i64 6433; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]] 6434; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32 6435; CHECK-NEXT: [[TMP27:%.*]] = lshr i64 [[TMP25]], 32 6436; 
CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 6437; CHECK-NEXT: [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]] 6438; CHECK-NEXT: [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]] 6439; CHECK-NEXT: [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]] 6440; CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP28]], 1 6441; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]] 6442; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]] 6443; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]] 6444; CHECK-NEXT: [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]] 6445; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP33]], 1 6446; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]] 6447; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]] 6448; CHECK-NEXT: [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]] 6449; CHECK-NEXT: [[TMP41:%.*]] = insertelement <2 x i32> undef, i32 [[TMP40]], i64 0 6450; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x i32> [[X]], i64 1 6451; CHECK-NEXT: [[TMP43:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 6452; CHECK-NEXT: [[TMP44:%.*]] = ashr i32 [[TMP42]], 31 6453; CHECK-NEXT: [[TMP45:%.*]] = ashr i32 [[TMP43]], 31 6454; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]] 6455; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]] 6456; CHECK-NEXT: [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]] 6457; CHECK-NEXT: [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]] 6458; CHECK-NEXT: [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]] 6459; CHECK-NEXT: [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float 6460; CHECK-NEXT: [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]]) 6461; CHECK-NEXT: [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000 6462; CHECK-NEXT: [[TMP54:%.*]] = fptoui float [[TMP53]] to i32 6463; CHECK-NEXT: [[TMP55:%.*]] = sub i32 0, [[TMP50]] 6464; CHECK-NEXT: [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]] 6465; CHECK-NEXT: [[TMP57:%.*]] = zext i32 [[TMP54]] to i64 
6466; CHECK-NEXT: [[TMP58:%.*]] = zext i32 [[TMP56]] to i64 6467; CHECK-NEXT: [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]] 6468; CHECK-NEXT: [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32 6469; CHECK-NEXT: [[TMP61:%.*]] = lshr i64 [[TMP59]], 32 6470; CHECK-NEXT: [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32 6471; CHECK-NEXT: [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]] 6472; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[TMP49]] to i64 6473; CHECK-NEXT: [[TMP65:%.*]] = zext i32 [[TMP63]] to i64 6474; CHECK-NEXT: [[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]] 6475; CHECK-NEXT: [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32 6476; CHECK-NEXT: [[TMP68:%.*]] = lshr i64 [[TMP66]], 32 6477; CHECK-NEXT: [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32 6478; CHECK-NEXT: [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]] 6479; CHECK-NEXT: [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]] 6480; CHECK-NEXT: [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]] 6481; CHECK-NEXT: [[TMP73:%.*]] = add i32 [[TMP69]], 1 6482; CHECK-NEXT: [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]] 6483; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]] 6484; CHECK-NEXT: [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]] 6485; CHECK-NEXT: [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]] 6486; CHECK-NEXT: [[TMP78:%.*]] = add i32 [[TMP74]], 1 6487; CHECK-NEXT: [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]] 6488; CHECK-NEXT: [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]] 6489; CHECK-NEXT: [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]] 6490; CHECK-NEXT: [[TMP82:%.*]] = insertelement <2 x i32> [[TMP41]], i32 [[TMP81]], i64 1 6491; CHECK-NEXT: store <2 x i32> [[TMP82]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 6492; CHECK-NEXT: ret void 6493; 6494; GFX6-LABEL: sdiv_v2i32_pow2_shl_denom: 6495; GFX6: ; %bb.0: 6496; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 6497; GFX6-NEXT: s_movk_i32 s10, 0x1000 6498; GFX6-NEXT: s_mov_b32 s12, 0x4f7ffffe 6499; GFX6-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0x9 6500; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb 6501; GFX6-NEXT: s_mov_b32 s7, 0xf000 6502; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6503; GFX6-NEXT: s_lshl_b32 s2, s10, s2 6504; GFX6-NEXT: s_ashr_i32 s11, s2, 31 6505; GFX6-NEXT: s_add_i32 s2, s2, s11 6506; GFX6-NEXT: s_xor_b32 s2, s2, s11 6507; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 6508; GFX6-NEXT: s_lshl_b32 s0, s10, s3 6509; GFX6-NEXT: s_sub_i32 s10, 0, s2 6510; GFX6-NEXT: s_ashr_i32 s3, s0, 31 6511; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 6512; GFX6-NEXT: s_add_i32 s0, s0, s3 6513; GFX6-NEXT: s_ashr_i32 s1, s8, 31 6514; GFX6-NEXT: s_mov_b32 s6, -1 6515; GFX6-NEXT: v_mul_f32_e32 v0, s12, v0 6516; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 6517; GFX6-NEXT: v_mul_lo_u32 v1, s10, v0 6518; GFX6-NEXT: s_xor_b32 s10, s0, s3 6519; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s10 6520; GFX6-NEXT: s_add_i32 s0, s8, s1 6521; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 6522; GFX6-NEXT: s_xor_b32 s0, s0, s1 6523; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 6524; GFX6-NEXT: s_xor_b32 s8, s1, s11 6525; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 6526; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 6527; GFX6-NEXT: v_mul_f32_e32 v1, s12, v2 6528; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 6529; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 6530; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 6531; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s0, v2 6532; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 6533; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] 6534; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v2 6535; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 6536; GFX6-NEXT: s_sub_i32 s0, 0, s10 6537; GFX6-NEXT: v_mul_lo_u32 v3, s0, v1 6538; GFX6-NEXT: s_ashr_i32 s0, s9, 31 6539; GFX6-NEXT: s_add_i32 s1, s9, s0 6540; GFX6-NEXT: s_xor_b32 s1, s1, s0 6541; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 6542; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v0 6543; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 6544; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 6545; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 6546; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, 
vcc 6547; GFX6-NEXT: s_xor_b32 s2, s0, s3 6548; GFX6-NEXT: v_mul_lo_u32 v2, v1, s10 6549; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 6550; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 6551; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 6552; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 6553; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2 6554; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 6555; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s10, v2 6556; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 6557; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 6558; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 6559; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 6560; GFX6-NEXT: v_xor_b32_e32 v1, s2, v1 6561; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s2, v1 6562; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 6563; GFX6-NEXT: s_endpgm 6564; 6565; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom: 6566; GFX9: ; %bb.0: 6567; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 6568; GFX9-NEXT: s_movk_i32 s8, 0x1000 6569; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 6570; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c 6571; GFX9-NEXT: s_mov_b32 s10, 0x4f7ffffe 6572; GFX9-NEXT: v_mov_b32_e32 v2, 0 6573; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6574; GFX9-NEXT: s_lshl_b32 s2, s8, s2 6575; GFX9-NEXT: s_ashr_i32 s9, s2, 31 6576; GFX9-NEXT: s_add_i32 s2, s2, s9 6577; GFX9-NEXT: s_xor_b32 s2, s2, s9 6578; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 6579; GFX9-NEXT: s_lshl_b32 s0, s8, s3 6580; GFX9-NEXT: s_ashr_i32 s1, s0, 31 6581; GFX9-NEXT: s_add_i32 s0, s0, s1 6582; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 6583; GFX9-NEXT: s_xor_b32 s0, s0, s1 6584; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s0 6585; GFX9-NEXT: s_sub_i32 s3, 0, s2 6586; GFX9-NEXT: v_mul_f32_e32 v0, s10, v0 6587; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 6588; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 6589; GFX9-NEXT: s_sub_i32 s8, 0, s0 6590; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 6591; GFX9-NEXT: v_mul_f32_e32 v1, s10, v1 6592; GFX9-NEXT: s_ashr_i32 s3, s6, 31 6593; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 6594; 
GFX9-NEXT: v_mul_hi_u32 v3, v0, v3 6595; GFX9-NEXT: s_add_i32 s6, s6, s3 6596; GFX9-NEXT: s_xor_b32 s6, s6, s3 6597; GFX9-NEXT: s_xor_b32 s3, s3, s9 6598; GFX9-NEXT: v_add_u32_e32 v0, v0, v3 6599; GFX9-NEXT: v_mul_hi_u32 v0, s6, v0 6600; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1 6601; GFX9-NEXT: s_ashr_i32 s8, s7, 31 6602; GFX9-NEXT: s_xor_b32 s1, s8, s1 6603; GFX9-NEXT: v_mul_lo_u32 v4, v0, s2 6604; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 6605; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 6606; GFX9-NEXT: v_sub_u32_e32 v4, s6, v4 6607; GFX9-NEXT: s_add_i32 s6, s7, s8 6608; GFX9-NEXT: s_xor_b32 s6, s6, s8 6609; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 6610; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v4 6611; GFX9-NEXT: v_mul_hi_u32 v1, s6, v1 6612; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc 6613; GFX9-NEXT: v_subrev_u32_e32 v5, s2, v4 6614; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc 6615; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v4 6616; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 6617; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 6618; GFX9-NEXT: v_mul_lo_u32 v3, v1, s0 6619; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 6620; GFX9-NEXT: v_xor_b32_e32 v0, s3, v0 6621; GFX9-NEXT: v_subrev_u32_e32 v0, s3, v0 6622; GFX9-NEXT: v_sub_u32_e32 v3, s6, v3 6623; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v3 6624; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 6625; GFX9-NEXT: v_subrev_u32_e32 v4, s0, v3 6626; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 6627; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 6628; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v3 6629; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 6630; GFX9-NEXT: v_xor_b32_e32 v1, s1, v1 6631; GFX9-NEXT: v_subrev_u32_e32 v1, s1, v1 6632; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 6633; GFX9-NEXT: s_endpgm 6634 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 6635 %r = sdiv <2 x i32> %x, %shl.y 6636 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 6637 ret void 6638} 6639 6640define amdgpu_kernel void @srem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 6641; CHECK-LABEL: 
@srem_i32_oddk_denom( 6642; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], 1235195 6643; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 6644; CHECK-NEXT: ret void 6645; 6646; GFX6-LABEL: srem_i32_oddk_denom: 6647; GFX6: ; %bb.0: 6648; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 6649; GFX6-NEXT: v_mov_b32_e32 v0, 0xd9528441 6650; GFX6-NEXT: s_mov_b32 s2, 0x12d8fb 6651; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 6652; GFX6-NEXT: s_mov_b32 s3, 0xf000 6653; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6654; GFX6-NEXT: v_mul_hi_i32 v0, s4, v0 6655; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0 6656; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0 6657; GFX6-NEXT: v_ashrrev_i32_e32 v0, 20, v0 6658; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 6659; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 6660; GFX6-NEXT: s_mov_b32 s2, -1 6661; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 6662; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 6663; GFX6-NEXT: s_endpgm 6664; 6665; GFX9-LABEL: srem_i32_oddk_denom: 6666; GFX9: ; %bb.0: 6667; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6668; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 6669; GFX9-NEXT: v_mov_b32_e32 v0, 0 6670; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6671; GFX9-NEXT: s_mul_hi_i32 s0, s4, 0xd9528441 6672; GFX9-NEXT: s_add_i32 s0, s0, s4 6673; GFX9-NEXT: s_lshr_b32 s1, s0, 31 6674; GFX9-NEXT: s_ashr_i32 s0, s0, 20 6675; GFX9-NEXT: s_add_i32 s0, s0, s1 6676; GFX9-NEXT: s_mul_i32 s0, s0, 0x12d8fb 6677; GFX9-NEXT: s_sub_i32 s0, s4, s0 6678; GFX9-NEXT: v_mov_b32_e32 v1, s0 6679; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 6680; GFX9-NEXT: s_endpgm 6681 %r = srem i32 %x, 1235195 6682 store i32 %r, i32 addrspace(1)* %out 6683 ret void 6684} 6685 6686define amdgpu_kernel void @srem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 6687; CHECK-LABEL: @srem_i32_pow2k_denom( 6688; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], 4096 6689; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 6690; CHECK-NEXT: ret void 6691; 6692; GFX6-LABEL: 
srem_i32_pow2k_denom: 6693; GFX6: ; %bb.0: 6694; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 6695; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 6696; GFX6-NEXT: s_mov_b32 s7, 0xf000 6697; GFX6-NEXT: s_mov_b32 s6, -1 6698; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6699; GFX6-NEXT: s_ashr_i32 s1, s0, 31 6700; GFX6-NEXT: s_lshr_b32 s1, s1, 20 6701; GFX6-NEXT: s_add_i32 s1, s0, s1 6702; GFX6-NEXT: s_and_b32 s1, s1, 0xfffff000 6703; GFX6-NEXT: s_sub_i32 s0, s0, s1 6704; GFX6-NEXT: v_mov_b32_e32 v0, s0 6705; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 6706; GFX6-NEXT: s_endpgm 6707; 6708; GFX9-LABEL: srem_i32_pow2k_denom: 6709; GFX9: ; %bb.0: 6710; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6711; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 6712; GFX9-NEXT: v_mov_b32_e32 v0, 0 6713; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6714; GFX9-NEXT: s_ashr_i32 s0, s4, 31 6715; GFX9-NEXT: s_lshr_b32 s0, s0, 20 6716; GFX9-NEXT: s_add_i32 s0, s4, s0 6717; GFX9-NEXT: s_and_b32 s0, s0, 0xfffff000 6718; GFX9-NEXT: s_sub_i32 s0, s4, s0 6719; GFX9-NEXT: v_mov_b32_e32 v1, s0 6720; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 6721; GFX9-NEXT: s_endpgm 6722 %r = srem i32 %x, 4096 6723 store i32 %r, i32 addrspace(1)* %out 6724 ret void 6725} 6726 6727define amdgpu_kernel void @srem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 6728; CHECK-LABEL: @srem_i32_pow2_shl_denom( 6729; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 6730; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], [[SHL_Y]] 6731; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 6732; CHECK-NEXT: ret void 6733; 6734; GFX6-LABEL: srem_i32_pow2_shl_denom: 6735; GFX6: ; %bb.0: 6736; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 6737; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 6738; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6739; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 6740; GFX6-NEXT: s_ashr_i32 s4, s3, 31 6741; GFX6-NEXT: s_add_i32 s3, s3, s4 6742; GFX6-NEXT: s_xor_b32 s4, s3, s4 6743; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s4 
6744; GFX6-NEXT: s_sub_i32 s3, 0, s4 6745; GFX6-NEXT: s_ashr_i32 s5, s2, 31 6746; GFX6-NEXT: s_add_i32 s2, s2, s5 6747; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 6748; GFX6-NEXT: s_xor_b32 s6, s2, s5 6749; GFX6-NEXT: s_mov_b32 s2, -1 6750; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 6751; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 6752; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 6753; GFX6-NEXT: s_mov_b32 s3, 0xf000 6754; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 6755; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 6756; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 6757; GFX6-NEXT: v_mul_lo_u32 v0, v0, s4 6758; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 6759; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0 6760; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 6761; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 6762; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0 6763; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 6764; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 6765; GFX6-NEXT: v_xor_b32_e32 v0, s5, v0 6766; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s5, v0 6767; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 6768; GFX6-NEXT: s_endpgm 6769; 6770; GFX9-LABEL: srem_i32_pow2_shl_denom: 6771; GFX9: ; %bb.0: 6772; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 6773; GFX9-NEXT: s_nop 0 6774; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 6775; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6776; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 6777; GFX9-NEXT: s_ashr_i32 s4, s3, 31 6778; GFX9-NEXT: s_add_i32 s3, s3, s4 6779; GFX9-NEXT: s_xor_b32 s3, s3, s4 6780; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 6781; GFX9-NEXT: s_sub_i32 s4, 0, s3 6782; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 6783; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 6784; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 6785; GFX9-NEXT: v_mul_lo_u32 v1, s4, v0 6786; GFX9-NEXT: s_ashr_i32 s4, s2, 31 6787; GFX9-NEXT: s_add_i32 s2, s2, s4 6788; GFX9-NEXT: s_xor_b32 s2, s2, s4 6789; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 6790; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 6791; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 6792; GFX9-NEXT: 
v_mov_b32_e32 v1, 0 6793; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 6794; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 6795; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v0 6796; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 6797; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6798; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v0 6799; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 6800; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 6801; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 6802; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 6803; GFX9-NEXT: global_store_dword v1, v0, s[0:1] 6804; GFX9-NEXT: s_endpgm 6805 %shl.y = shl i32 4096, %y 6806 %r = srem i32 %x, %shl.y 6807 store i32 %r, i32 addrspace(1)* %out 6808 ret void 6809} 6810 6811define amdgpu_kernel void @srem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 6812; CHECK-LABEL: @srem_v2i32_pow2k_denom( 6813; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 6814; CHECK-NEXT: [[TMP2:%.*]] = srem i32 [[TMP1]], 4096 6815; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 6816; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 6817; CHECK-NEXT: [[TMP5:%.*]] = srem i32 [[TMP4]], 4096 6818; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 6819; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 6820; CHECK-NEXT: ret void 6821; 6822; GFX6-LABEL: srem_v2i32_pow2k_denom: 6823; GFX6: ; %bb.0: 6824; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 6825; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 6826; GFX6-NEXT: s_movk_i32 s2, 0xf000 6827; GFX6-NEXT: s_mov_b32 s7, 0xf000 6828; GFX6-NEXT: s_mov_b32 s6, -1 6829; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6830; GFX6-NEXT: s_ashr_i32 s3, s0, 31 6831; GFX6-NEXT: s_lshr_b32 s3, s3, 20 6832; GFX6-NEXT: s_add_i32 s3, s0, s3 6833; GFX6-NEXT: s_and_b32 s3, s3, s2 6834; GFX6-NEXT: s_sub_i32 s0, s0, s3 6835; GFX6-NEXT: s_ashr_i32 s3, s1, 31 6836; GFX6-NEXT: s_lshr_b32 s3, s3, 20 6837; GFX6-NEXT: s_add_i32 s3, s1, s3 
6838; GFX6-NEXT: s_and_b32 s2, s3, s2 6839; GFX6-NEXT: s_sub_i32 s1, s1, s2 6840; GFX6-NEXT: v_mov_b32_e32 v0, s0 6841; GFX6-NEXT: v_mov_b32_e32 v1, s1 6842; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 6843; GFX6-NEXT: s_endpgm 6844; 6845; GFX9-LABEL: srem_v2i32_pow2k_denom: 6846; GFX9: ; %bb.0: 6847; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6848; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 6849; GFX9-NEXT: s_movk_i32 s6, 0xf000 6850; GFX9-NEXT: v_mov_b32_e32 v2, 0 6851; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6852; GFX9-NEXT: s_ashr_i32 s0, s4, 31 6853; GFX9-NEXT: s_ashr_i32 s1, s5, 31 6854; GFX9-NEXT: s_lshr_b32 s0, s0, 20 6855; GFX9-NEXT: s_lshr_b32 s1, s1, 20 6856; GFX9-NEXT: s_add_i32 s0, s4, s0 6857; GFX9-NEXT: s_add_i32 s1, s5, s1 6858; GFX9-NEXT: s_and_b32 s0, s0, s6 6859; GFX9-NEXT: s_and_b32 s1, s1, s6 6860; GFX9-NEXT: s_sub_i32 s0, s4, s0 6861; GFX9-NEXT: s_sub_i32 s1, s5, s1 6862; GFX9-NEXT: v_mov_b32_e32 v0, s0 6863; GFX9-NEXT: v_mov_b32_e32 v1, s1 6864; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 6865; GFX9-NEXT: s_endpgm 6866 %r = srem <2 x i32> %x, <i32 4096, i32 4096> 6867 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 6868 ret void 6869} 6870 6871define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 6872; CHECK-LABEL: @srem_v2i32_pow2_shl_denom( 6873; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 6874; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 6875; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 6876; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 6877; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 6878; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]] 6879; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]] 6880; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]] 6881; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]] 6882; CHECK-NEXT: [[TMP9:%.*]] = uitofp i32 [[TMP8]] to 
float 6883; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 6884; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000 6885; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP11]] to i32 6886; CHECK-NEXT: [[TMP13:%.*]] = sub i32 0, [[TMP8]] 6887; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]] 6888; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP12]] to i64 6889; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 6890; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]] 6891; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 6892; CHECK-NEXT: [[TMP19:%.*]] = lshr i64 [[TMP17]], 32 6893; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 6894; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]] 6895; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP7]] to i64 6896; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP21]] to i64 6897; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]] 6898; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32 6899; CHECK-NEXT: [[TMP26:%.*]] = lshr i64 [[TMP24]], 32 6900; CHECK-NEXT: [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32 6901; CHECK-NEXT: [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]] 6902; CHECK-NEXT: [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]] 6903; CHECK-NEXT: [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]] 6904; CHECK-NEXT: [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]] 6905; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP30]], i32 [[TMP31]], i32 [[TMP29]] 6906; CHECK-NEXT: [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]] 6907; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]] 6908; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]] 6909; CHECK-NEXT: [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]] 6910; CHECK-NEXT: [[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]] 6911; CHECK-NEXT: [[TMP38:%.*]] = insertelement <2 x i32> undef, i32 [[TMP37]], i64 0 6912; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i32> [[X]], i64 1 6913; CHECK-NEXT: 
[[TMP40:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 6914; CHECK-NEXT: [[TMP41:%.*]] = ashr i32 [[TMP39]], 31 6915; CHECK-NEXT: [[TMP42:%.*]] = ashr i32 [[TMP40]], 31 6916; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]] 6917; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]] 6918; CHECK-NEXT: [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]] 6919; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]] 6920; CHECK-NEXT: [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float 6921; CHECK-NEXT: [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]]) 6922; CHECK-NEXT: [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000 6923; CHECK-NEXT: [[TMP50:%.*]] = fptoui float [[TMP49]] to i32 6924; CHECK-NEXT: [[TMP51:%.*]] = sub i32 0, [[TMP46]] 6925; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]] 6926; CHECK-NEXT: [[TMP53:%.*]] = zext i32 [[TMP50]] to i64 6927; CHECK-NEXT: [[TMP54:%.*]] = zext i32 [[TMP52]] to i64 6928; CHECK-NEXT: [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]] 6929; CHECK-NEXT: [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32 6930; CHECK-NEXT: [[TMP57:%.*]] = lshr i64 [[TMP55]], 32 6931; CHECK-NEXT: [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32 6932; CHECK-NEXT: [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]] 6933; CHECK-NEXT: [[TMP60:%.*]] = zext i32 [[TMP45]] to i64 6934; CHECK-NEXT: [[TMP61:%.*]] = zext i32 [[TMP59]] to i64 6935; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]] 6936; CHECK-NEXT: [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32 6937; CHECK-NEXT: [[TMP64:%.*]] = lshr i64 [[TMP62]], 32 6938; CHECK-NEXT: [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32 6939; CHECK-NEXT: [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]] 6940; CHECK-NEXT: [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]] 6941; CHECK-NEXT: [[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]] 6942; CHECK-NEXT: [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]] 6943; CHECK-NEXT: [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]] 6944; CHECK-NEXT: [[TMP71:%.*]] = 
icmp uge i32 [[TMP70]], [[TMP46]] 6945; CHECK-NEXT: [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]] 6946; CHECK-NEXT: [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]] 6947; CHECK-NEXT: [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]] 6948; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]] 6949; CHECK-NEXT: [[TMP76:%.*]] = insertelement <2 x i32> [[TMP38]], i32 [[TMP75]], i64 1 6950; CHECK-NEXT: store <2 x i32> [[TMP76]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 6951; CHECK-NEXT: ret void 6952; 6953; GFX6-LABEL: srem_v2i32_pow2_shl_denom: 6954; GFX6: ; %bb.0: 6955; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 6956; GFX6-NEXT: s_movk_i32 s6, 0x1000 6957; GFX6-NEXT: s_mov_b32 s10, 0x4f7ffffe 6958; GFX6-NEXT: s_mov_b32 s7, 0xf000 6959; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6960; GFX6-NEXT: s_lshl_b32 s2, s6, s2 6961; GFX6-NEXT: s_ashr_i32 s4, s2, 31 6962; GFX6-NEXT: s_add_i32 s2, s2, s4 6963; GFX6-NEXT: s_xor_b32 s2, s2, s4 6964; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 6965; GFX6-NEXT: s_lshl_b32 s3, s6, s3 6966; GFX6-NEXT: s_ashr_i32 s6, s3, 31 6967; GFX6-NEXT: s_add_i32 s3, s3, s6 6968; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 6969; GFX6-NEXT: s_xor_b32 s3, s3, s6 6970; GFX6-NEXT: s_sub_i32 s9, 0, s2 6971; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s3 6972; GFX6-NEXT: v_mul_f32_e32 v0, s10, v0 6973; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 6974; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 6975; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 6976; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 6977; GFX6-NEXT: s_mov_b32 s6, -1 6978; GFX6-NEXT: v_mul_lo_u32 v1, s9, v0 6979; GFX6-NEXT: s_sub_i32 s9, 0, s3 6980; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6981; GFX6-NEXT: s_ashr_i32 s8, s0, 31 6982; GFX6-NEXT: s_add_i32 s0, s0, s8 6983; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 6984; GFX6-NEXT: s_xor_b32 s0, s0, s8 6985; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 6986; GFX6-NEXT: v_mul_f32_e32 v1, s10, v2 6987; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 6988; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 6989; GFX6-NEXT: 
v_mul_lo_u32 v2, s9, v1 6990; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 6991; GFX6-NEXT: v_mul_hi_u32 v2, v1, v2 6992; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 6993; GFX6-NEXT: s_ashr_i32 s0, s1, 31 6994; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v0 6995; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 6996; GFX6-NEXT: s_add_i32 s1, s1, s0 6997; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 6998; GFX6-NEXT: s_xor_b32 s1, s1, s0 6999; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 7000; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 7001; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v0 7002; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 7003; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 7004; GFX6-NEXT: v_mul_lo_u32 v1, v1, s3 7005; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 7006; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 7007; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 7008; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 7009; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 7010; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 7011; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 7012; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 7013; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 7014; GFX6-NEXT: v_xor_b32_e32 v1, s0, v1 7015; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s0, v1 7016; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 7017; GFX6-NEXT: s_endpgm 7018; 7019; GFX9-LABEL: srem_v2i32_pow2_shl_denom: 7020; GFX9: ; %bb.0: 7021; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7022; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 7023; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 7024; GFX9-NEXT: s_movk_i32 s8, 0x1000 7025; GFX9-NEXT: s_mov_b32 s9, 0x4f7ffffe 7026; GFX9-NEXT: v_mov_b32_e32 v2, 0 7027; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7028; GFX9-NEXT: s_lshl_b32 s0, s8, s6 7029; GFX9-NEXT: s_ashr_i32 s1, s0, 31 7030; GFX9-NEXT: s_add_i32 s0, s0, s1 7031; GFX9-NEXT: s_xor_b32 s0, s0, s1 7032; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 7033; GFX9-NEXT: s_lshl_b32 s1, s8, s7 7034; GFX9-NEXT: s_ashr_i32 s6, s1, 31 7035; GFX9-NEXT: s_add_i32 s1, s1, 
s6 7036; GFX9-NEXT: s_xor_b32 s1, s1, s6 7037; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 7038; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s1 7039; GFX9-NEXT: s_sub_i32 s7, 0, s0 7040; GFX9-NEXT: s_ashr_i32 s6, s4, 31 7041; GFX9-NEXT: v_mul_f32_e32 v0, s9, v0 7042; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 7043; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 7044; GFX9-NEXT: s_add_i32 s4, s4, s6 7045; GFX9-NEXT: s_xor_b32 s4, s4, s6 7046; GFX9-NEXT: v_mul_f32_e32 v1, s9, v1 7047; GFX9-NEXT: v_mul_lo_u32 v3, s7, v0 7048; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 7049; GFX9-NEXT: s_sub_i32 s7, 0, s1 7050; GFX9-NEXT: v_mul_hi_u32 v3, v0, v3 7051; GFX9-NEXT: v_mul_lo_u32 v4, s7, v1 7052; GFX9-NEXT: s_ashr_i32 s7, s5, 31 7053; GFX9-NEXT: s_add_i32 s5, s5, s7 7054; GFX9-NEXT: v_add_u32_e32 v0, v0, v3 7055; GFX9-NEXT: v_mul_hi_u32 v3, v1, v4 7056; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 7057; GFX9-NEXT: s_xor_b32 s5, s5, s7 7058; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 7059; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 7060; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0 7061; GFX9-NEXT: v_mul_lo_u32 v1, v1, s1 7062; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 7063; GFX9-NEXT: v_subrev_u32_e32 v3, s0, v0 7064; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v0 7065; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 7066; GFX9-NEXT: v_subrev_u32_e32 v3, s0, v0 7067; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v0 7068; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 7069; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 7070; GFX9-NEXT: v_subrev_u32_e32 v3, s1, v1 7071; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v1 7072; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 7073; GFX9-NEXT: v_subrev_u32_e32 v3, s1, v1 7074; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v1 7075; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 7076; GFX9-NEXT: v_xor_b32_e32 v0, s6, v0 7077; GFX9-NEXT: v_xor_b32_e32 v1, s7, v1 7078; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 7079; GFX9-NEXT: v_subrev_u32_e32 v1, s7, v1 7080; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 7081; GFX9-NEXT: s_endpgm 7082 %shl.y = shl <2 x i32> <i32 
4096, i32 4096>, %y 7083 %r = srem <2 x i32> %x, %shl.y 7084 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 7085 ret void 7086} 7087 7088define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 7089; CHECK-LABEL: @udiv_i64_oddk_denom( 7090; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], 1235195949943 7091; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 7092; CHECK-NEXT: ret void 7093; 7094; GFX6-LABEL: udiv_i64_oddk_denom: 7095; GFX6: ; %bb.0: 7096; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f176a73 7097; GFX6-NEXT: v_mov_b32_e32 v1, 0x4f800000 7098; GFX6-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 7099; GFX6-NEXT: v_rcp_f32_e32 v0, v0 7100; GFX6-NEXT: s_movk_i32 s2, 0xfee0 7101; GFX6-NEXT: s_mov_b32 s3, 0x68958c89 7102; GFX6-NEXT: v_mov_b32_e32 v8, 0 7103; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 7104; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 7105; GFX6-NEXT: v_trunc_f32_e32 v1, v1 7106; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 7107; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 7108; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 7109; GFX6-NEXT: v_mov_b32_e32 v7, 0 7110; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 7111; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 7112; GFX6-NEXT: v_mul_hi_u32 v3, v0, s3 7113; GFX6-NEXT: v_mul_lo_u32 v4, v1, s3 7114; GFX6-NEXT: s_mov_b32 s11, 0xf000 7115; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7116; GFX6-NEXT: s_mov_b32 s8, s4 7117; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7118; GFX6-NEXT: v_mul_lo_u32 v3, v0, s3 7119; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 7120; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 7121; GFX6-NEXT: v_mul_hi_u32 v4, v0, v2 7122; GFX6-NEXT: v_mul_hi_u32 v6, v0, v3 7123; GFX6-NEXT: v_mul_hi_u32 v9, v1, v2 7124; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 7125; GFX6-NEXT: s_mov_b32 s4, 0x976a7376 7126; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 7127; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 7128; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 7129; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc 7130; GFX6-NEXT: s_mov_b32 s10, -1 7131; 
GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 7132; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc 7133; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc 7134; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7135; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 7136; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 7137; GFX6-NEXT: v_mul_lo_u32 v4, v0, s2 7138; GFX6-NEXT: v_mul_hi_u32 v5, v0, s3 7139; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 7140; GFX6-NEXT: v_mul_lo_u32 v6, v2, s3 7141; GFX6-NEXT: s_movk_i32 s2, 0x11f 7142; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 7143; GFX6-NEXT: v_mul_lo_u32 v5, v0, s3 7144; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 7145; GFX6-NEXT: v_mul_lo_u32 v6, v0, v4 7146; GFX6-NEXT: v_mul_hi_u32 v10, v0, v4 7147; GFX6-NEXT: v_mul_hi_u32 v9, v0, v5 7148; GFX6-NEXT: v_mul_hi_u32 v11, v2, v4 7149; GFX6-NEXT: s_mov_b32 s3, 0x976a7377 7150; GFX6-NEXT: s_mov_b32 s9, s5 7151; GFX6-NEXT: v_add_i32_e32 v6, vcc, v9, v6 7152; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc 7153; GFX6-NEXT: v_mul_lo_u32 v10, v2, v5 7154; GFX6-NEXT: v_mul_hi_u32 v5, v2, v5 7155; GFX6-NEXT: v_mul_lo_u32 v2, v2, v4 7156; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v10 7157; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v5, vcc 7158; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v11, v7, vcc 7159; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 7160; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc 7161; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 7162; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] 7163; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 7164; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 7165; GFX6-NEXT: v_mul_lo_u32 v2, s6, v1 7166; GFX6-NEXT: v_mul_hi_u32 v3, s6, v0 7167; GFX6-NEXT: v_mul_hi_u32 v4, s6, v1 7168; GFX6-NEXT: v_mul_hi_u32 v5, s7, v1 7169; GFX6-NEXT: v_mul_lo_u32 v1, s7, v1 7170; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7171; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 7172; GFX6-NEXT: v_mul_lo_u32 v4, s7, v0 7173; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 7174; GFX6-NEXT: v_add_i32_e32 v2, vcc, 
v2, v4 7175; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 7176; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc 7177; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 7178; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc 7179; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 7180; GFX6-NEXT: v_mul_hi_u32 v3, v0, s3 7181; GFX6-NEXT: v_mul_lo_u32 v4, v1, s3 7182; GFX6-NEXT: v_mov_b32_e32 v5, s2 7183; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7184; GFX6-NEXT: v_mul_lo_u32 v3, v0, s3 7185; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 7186; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s7, v2 7187; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s6, v3 7188; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc 7189; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s3, v3 7190; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] 7191; GFX6-NEXT: s_movk_i32 s3, 0x11e 7192; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s3, v4 7193; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 7194; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s4, v5 7195; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 7196; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v4 7197; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] 7198; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 7199; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] 7200; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 1, v0 7201; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] 7202; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 7203; GFX6-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] 7204; GFX6-NEXT: v_mov_b32_e32 v6, s7 7205; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc 7206; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s3, v2 7207; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 7208; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s4, v3 7209; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 7210; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 7211; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc 7212; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 7213; GFX6-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] 7214; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 7215; GFX6-NEXT: v_cndmask_b32_e32 
v0, v0, v2, vcc 7216; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 7217; GFX6-NEXT: s_endpgm 7218; 7219; GFX9-LABEL: udiv_i64_oddk_denom: 7220; GFX9: ; %bb.0: 7221; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f176a73 7222; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000 7223; GFX9-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 7224; GFX9-NEXT: v_rcp_f32_e32 v0, v0 7225; GFX9-NEXT: s_movk_i32 s4, 0xfee0 7226; GFX9-NEXT: s_mov_b32 s5, 0x68958c89 7227; GFX9-NEXT: v_mov_b32_e32 v8, 0 7228; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 7229; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 7230; GFX9-NEXT: v_trunc_f32_e32 v1, v1 7231; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 7232; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 7233; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 7234; GFX9-NEXT: v_mov_b32_e32 v5, 0 7235; GFX9-NEXT: v_mul_lo_u32 v2, v0, s4 7236; GFX9-NEXT: v_mul_hi_u32 v3, v0, s5 7237; GFX9-NEXT: v_mul_lo_u32 v4, v1, s5 7238; GFX9-NEXT: v_mul_lo_u32 v6, v0, s5 7239; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 7240; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 7241; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 7242; GFX9-NEXT: v_mul_hi_u32 v4, v0, v6 7243; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 7244; GFX9-NEXT: v_mul_hi_u32 v9, v1, v2 7245; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 7246; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 7247; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v7, vcc 7248; GFX9-NEXT: v_mul_lo_u32 v7, v1, v6 7249; GFX9-NEXT: v_mul_hi_u32 v6, v1, v6 7250; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 7251; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v6, vcc 7252; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc 7253; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 7254; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 7255; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v4, vcc 7256; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3] 7257; GFX9-NEXT: v_mul_lo_u32 v4, v0, s4 7258; GFX9-NEXT: v_mul_hi_u32 v6, v0, s5 7259; GFX9-NEXT: v_mul_lo_u32 v7, v2, s5 7260; GFX9-NEXT: v_mul_lo_u32 v9, v0, s5 7261; GFX9-NEXT: s_load_dwordx4 
s[4:7], s[0:1], 0x24 7262; GFX9-NEXT: v_add_u32_e32 v4, v6, v4 7263; GFX9-NEXT: v_add_u32_e32 v4, v4, v7 7264; GFX9-NEXT: v_mul_lo_u32 v6, v0, v4 7265; GFX9-NEXT: v_mul_hi_u32 v7, v0, v9 7266; GFX9-NEXT: v_mul_hi_u32 v10, v0, v4 7267; GFX9-NEXT: v_mul_hi_u32 v11, v2, v4 7268; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 7269; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 7270; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v8, v10, vcc 7271; GFX9-NEXT: v_mul_lo_u32 v10, v2, v9 7272; GFX9-NEXT: v_mul_hi_u32 v9, v2, v9 7273; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 7274; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v10 7275; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v9, vcc 7276; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v11, v5, vcc 7277; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v6, v2 7278; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v4, vcc 7279; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3] 7280; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 7281; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 7282; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7283; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 7284; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 7285; GFX9-NEXT: v_mul_hi_u32 v4, s6, v1 7286; GFX9-NEXT: v_mul_hi_u32 v6, s7, v1 7287; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 7288; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 7289; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v4, vcc 7290; GFX9-NEXT: v_mul_lo_u32 v4, s7, v0 7291; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 7292; GFX9-NEXT: s_movk_i32 s2, 0x11f 7293; GFX9-NEXT: s_mov_b32 s3, 0x976a7377 7294; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 7295; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 7296; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v5, vcc 7297; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 7298; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v2, vcc 7299; GFX9-NEXT: v_mul_lo_u32 v2, v0, s2 7300; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 7301; GFX9-NEXT: v_mul_lo_u32 v4, v1, s3 7302; GFX9-NEXT: v_mov_b32_e32 v6, s2 7303; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 7304; GFX9-NEXT: v_mul_lo_u32 v3, v0, 
s3 7305; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 7306; GFX9-NEXT: v_sub_u32_e32 v4, s7, v2 7307; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s6, v3 7308; GFX9-NEXT: v_subb_co_u32_e64 v4, s[0:1], v4, v6, vcc 7309; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s3, v3 7310; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1] 7311; GFX9-NEXT: s_movk_i32 s3, 0x11e 7312; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s3, v4 7313; GFX9-NEXT: s_mov_b32 s6, 0x976a7376 7314; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 7315; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s6, v6 7316; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 7317; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v4 7318; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v6, s[0:1] 7319; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], 2, v0 7320; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1] 7321; GFX9-NEXT: v_add_co_u32_e64 v8, s[0:1], 1, v0 7322; GFX9-NEXT: v_addc_co_u32_e64 v9, s[0:1], 0, v1, s[0:1] 7323; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 7324; GFX9-NEXT: v_cndmask_b32_e64 v4, v9, v7, s[0:1] 7325; GFX9-NEXT: v_mov_b32_e32 v7, s7 7326; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v7, v2, vcc 7327; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s3, v2 7328; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 7329; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s6, v3 7330; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 7331; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 7332; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc 7333; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 7334; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, v6, s[0:1] 7335; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 7336; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 7337; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] 7338; GFX9-NEXT: s_endpgm 7339 %r = udiv i64 %x, 1235195949943 7340 store i64 %r, i64 addrspace(1)* %out 7341 ret void 7342} 7343 7344define amdgpu_kernel void @udiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 7345; CHECK-LABEL: @udiv_i64_pow2k_denom( 7346; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], 4096 7347; 
CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 7348; CHECK-NEXT: ret void 7349; 7350; GFX6-LABEL: udiv_i64_pow2k_denom: 7351; GFX6: ; %bb.0: 7352; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 7353; GFX6-NEXT: s_mov_b32 s7, 0xf000 7354; GFX6-NEXT: s_mov_b32 s6, -1 7355; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7356; GFX6-NEXT: s_mov_b32 s4, s0 7357; GFX6-NEXT: s_mov_b32 s5, s1 7358; GFX6-NEXT: s_lshr_b64 s[0:1], s[2:3], 12 7359; GFX6-NEXT: v_mov_b32_e32 v0, s0 7360; GFX6-NEXT: v_mov_b32_e32 v1, s1 7361; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 7362; GFX6-NEXT: s_endpgm 7363; 7364; GFX9-LABEL: udiv_i64_pow2k_denom: 7365; GFX9: ; %bb.0: 7366; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 7367; GFX9-NEXT: v_mov_b32_e32 v2, 0 7368; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7369; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 12 7370; GFX9-NEXT: v_mov_b32_e32 v0, s2 7371; GFX9-NEXT: v_mov_b32_e32 v1, s3 7372; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 7373; GFX9-NEXT: s_endpgm 7374 %r = udiv i64 %x, 4096 7375 store i64 %r, i64 addrspace(1)* %out 7376 ret void 7377} 7378 7379define amdgpu_kernel void @udiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 7380; CHECK-LABEL: @udiv_i64_pow2_shl_denom( 7381; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 7382; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], [[SHL_Y]] 7383; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 7384; CHECK-NEXT: ret void 7385; 7386; GFX6-LABEL: udiv_i64_pow2_shl_denom: 7387; GFX6: ; %bb.0: 7388; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 7389; GFX6-NEXT: s_load_dword s8, s[0:1], 0xd 7390; GFX6-NEXT: s_mov_b32 s3, 0xf000 7391; GFX6-NEXT: s_mov_b32 s2, -1 7392; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7393; GFX6-NEXT: s_mov_b32 s0, s4 7394; GFX6-NEXT: s_add_i32 s8, s8, 12 7395; GFX6-NEXT: s_mov_b32 s1, s5 7396; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], s8 7397; GFX6-NEXT: v_mov_b32_e32 v0, s4 7398; GFX6-NEXT: v_mov_b32_e32 v1, s5 7399; GFX6-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[0:3], 0 7400; GFX6-NEXT: s_endpgm 7401; 7402; GFX9-LABEL: udiv_i64_pow2_shl_denom: 7403; GFX9: ; %bb.0: 7404; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 7405; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 7406; GFX9-NEXT: v_mov_b32_e32 v2, 0 7407; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7408; GFX9-NEXT: s_add_i32 s2, s2, 12 7409; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], s2 7410; GFX9-NEXT: v_mov_b32_e32 v0, s0 7411; GFX9-NEXT: v_mov_b32_e32 v1, s1 7412; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 7413; GFX9-NEXT: s_endpgm 7414 %shl.y = shl i64 4096, %y 7415 %r = udiv i64 %x, %shl.y 7416 store i64 %r, i64 addrspace(1)* %out 7417 ret void 7418} 7419 7420define amdgpu_kernel void @udiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 7421; CHECK-LABEL: @udiv_v2i64_pow2k_denom( 7422; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 7423; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096 7424; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 7425; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 7426; CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[TMP4]], 4096 7427; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 7428; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 7429; CHECK-NEXT: ret void 7430; 7431; GFX6-LABEL: udiv_v2i64_pow2k_denom: 7432; GFX6: ; %bb.0: 7433; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 7434; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 7435; GFX6-NEXT: s_mov_b32 s7, 0xf000 7436; GFX6-NEXT: s_mov_b32 s6, -1 7437; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7438; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], 12 7439; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], 12 7440; GFX6-NEXT: v_mov_b32_e32 v0, s0 7441; GFX6-NEXT: v_mov_b32_e32 v1, s1 7442; GFX6-NEXT: v_mov_b32_e32 v2, s2 7443; GFX6-NEXT: v_mov_b32_e32 v3, s3 7444; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 7445; GFX6-NEXT: s_endpgm 
7446; 7447; GFX9-LABEL: udiv_v2i64_pow2k_denom: 7448; GFX9: ; %bb.0: 7449; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7450; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 7451; GFX9-NEXT: v_mov_b32_e32 v4, 0 7452; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7453; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 12 7454; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], 12 7455; GFX9-NEXT: v_mov_b32_e32 v0, s0 7456; GFX9-NEXT: v_mov_b32_e32 v1, s1 7457; GFX9-NEXT: v_mov_b32_e32 v2, s4 7458; GFX9-NEXT: v_mov_b32_e32 v3, s5 7459; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 7460; GFX9-NEXT: s_endpgm 7461 %r = udiv <2 x i64> %x, <i64 4096, i64 4096> 7462 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 7463 ret void 7464} 7465 7466define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 7467; CHECK-LABEL: @udiv_v2i64_mixed_pow2k_denom( 7468; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 7469; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096 7470; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 7471; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 7472; CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[TMP4]], 4095 7473; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 7474; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 7475; CHECK-NEXT: ret void 7476; 7477; GFX6-LABEL: udiv_v2i64_mixed_pow2k_denom: 7478; GFX6: ; %bb.0: 7479; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f800000 7480; GFX6-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 7481; GFX6-NEXT: v_rcp_f32_e32 v0, v0 7482; GFX6-NEXT: s_movk_i32 s2, 0xf001 7483; GFX6-NEXT: v_mov_b32_e32 v8, 0 7484; GFX6-NEXT: v_mov_b32_e32 v7, 0 7485; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 7486; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 7487; GFX6-NEXT: v_trunc_f32_e32 v1, v1 7488; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 7489; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 7490; GFX6-NEXT: 
v_cvt_u32_f32_e32 v1, v1 7491; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 7492; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 7493; GFX6-NEXT: s_mov_b32 s7, 0xf000 7494; GFX6-NEXT: v_mul_hi_u32 v2, v0, s2 7495; GFX6-NEXT: v_mul_lo_u32 v3, v1, s2 7496; GFX6-NEXT: v_mul_lo_u32 v4, v0, s2 7497; GFX6-NEXT: s_mov_b32 s6, -1 7498; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 7499; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7500; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 7501; GFX6-NEXT: v_mul_hi_u32 v6, v0, v4 7502; GFX6-NEXT: v_mul_hi_u32 v3, v0, v2 7503; GFX6-NEXT: v_mul_hi_u32 v9, v1, v2 7504; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 7505; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 7506; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 7507; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 7508; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v3, vcc 7509; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 7510; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc 7511; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc 7512; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7513; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 7514; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 7515; GFX6-NEXT: v_mul_hi_u32 v4, v0, s2 7516; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 7517; GFX6-NEXT: v_mul_lo_u32 v5, v2, s2 7518; GFX6-NEXT: v_mul_lo_u32 v6, v0, s2 7519; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, v0, v4 7520; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7521; GFX6-NEXT: s_lshr_b64 s[2:3], s[8:9], 12 7522; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 7523; GFX6-NEXT: v_mul_lo_u32 v5, v0, v4 7524; GFX6-NEXT: v_mul_hi_u32 v9, v0, v6 7525; GFX6-NEXT: v_mul_hi_u32 v10, v0, v4 7526; GFX6-NEXT: v_mul_hi_u32 v11, v2, v4 7527; GFX6-NEXT: v_add_i32_e32 v5, vcc, v9, v5 7528; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc 7529; GFX6-NEXT: v_mul_lo_u32 v10, v2, v6 7530; GFX6-NEXT: v_mul_hi_u32 v6, v2, v6 7531; GFX6-NEXT: v_mul_lo_u32 v2, v2, v4 7532; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v10 7533; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v6, vcc 7534; GFX6-NEXT: v_addc_u32_e32 
v4, vcc, v11, v7, vcc 7535; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 7536; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc 7537; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 7538; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] 7539; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 7540; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 7541; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 7542; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 7543; GFX6-NEXT: v_mul_hi_u32 v4, s10, v1 7544; GFX6-NEXT: v_mul_hi_u32 v5, s11, v1 7545; GFX6-NEXT: v_mul_lo_u32 v1, s11, v1 7546; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7547; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 7548; GFX6-NEXT: v_mul_lo_u32 v4, s11, v0 7549; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0 7550; GFX6-NEXT: s_movk_i32 s0, 0xfff 7551; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 7552; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 7553; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc 7554; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 7555; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc 7556; GFX6-NEXT: v_mul_lo_u32 v4, v1, s0 7557; GFX6-NEXT: v_mul_hi_u32 v5, v0, s0 7558; GFX6-NEXT: v_add_i32_e32 v2, vcc, 2, v0 7559; GFX6-NEXT: v_mul_lo_u32 v8, v0, s0 7560; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 7561; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v0 7562; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc 7563; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 7564; GFX6-NEXT: v_mov_b32_e32 v5, s11 7565; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s10, v8 7566; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc 7567; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s0, v8 7568; GFX6-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v4, vcc 7569; GFX6-NEXT: s_movk_i32 s0, 0xffe 7570; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s0, v5 7571; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 7572; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 7573; GFX6-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc 7574; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v8 7575; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 7576; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 
7577; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 7578; GFX6-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[0:1] 7579; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc 7580; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 7581; GFX6-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[0:1] 7582; GFX6-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc 7583; GFX6-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[0:1] 7584; GFX6-NEXT: v_mov_b32_e32 v0, s2 7585; GFX6-NEXT: v_mov_b32_e32 v1, s3 7586; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 7587; GFX6-NEXT: s_endpgm 7588; 7589; GFX9-LABEL: udiv_v2i64_mixed_pow2k_denom: 7590; GFX9: ; %bb.0: 7591; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f800000 7592; GFX9-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 7593; GFX9-NEXT: v_rcp_f32_e32 v0, v0 7594; GFX9-NEXT: s_movk_i32 s4, 0xf001 7595; GFX9-NEXT: v_mov_b32_e32 v7, 0 7596; GFX9-NEXT: v_mov_b32_e32 v5, 0 7597; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 7598; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 7599; GFX9-NEXT: v_trunc_f32_e32 v1, v1 7600; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 7601; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 7602; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 7603; GFX9-NEXT: v_mul_hi_u32 v2, v0, s4 7604; GFX9-NEXT: v_mul_lo_u32 v4, v1, s4 7605; GFX9-NEXT: v_mul_lo_u32 v3, v0, s4 7606; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 7607; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 7608; GFX9-NEXT: v_mul_hi_u32 v6, v0, v3 7609; GFX9-NEXT: v_mul_lo_u32 v4, v0, v2 7610; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 7611; GFX9-NEXT: v_mul_hi_u32 v9, v1, v2 7612; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 7613; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v6, v4 7614; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v8, vcc 7615; GFX9-NEXT: v_mul_lo_u32 v8, v1, v3 7616; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 7617; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 7618; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v3, vcc 7619; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc 7620; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 7621; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 7622; 
GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v4, vcc 7623; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3] 7624; GFX9-NEXT: v_mul_hi_u32 v4, v0, s4 7625; GFX9-NEXT: v_mul_lo_u32 v6, v2, s4 7626; GFX9-NEXT: v_mul_lo_u32 v8, v0, s4 7627; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 7628; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 7629; GFX9-NEXT: v_sub_u32_e32 v4, v4, v0 7630; GFX9-NEXT: v_add_u32_e32 v4, v4, v6 7631; GFX9-NEXT: v_mul_lo_u32 v6, v0, v4 7632; GFX9-NEXT: v_mul_hi_u32 v9, v0, v8 7633; GFX9-NEXT: v_mul_hi_u32 v10, v0, v4 7634; GFX9-NEXT: v_mul_hi_u32 v11, v2, v4 7635; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 7636; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v9, v6 7637; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v7, v10, vcc 7638; GFX9-NEXT: v_mul_lo_u32 v10, v2, v8 7639; GFX9-NEXT: v_mul_hi_u32 v8, v2, v8 7640; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 7641; GFX9-NEXT: s_movk_i32 s0, 0xfff 7642; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v10 7643; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v9, v8, vcc 7644; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v11, v5, vcc 7645; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v6, v2 7646; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc 7647; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3] 7648; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 7649; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 7650; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7651; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 7652; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 7653; GFX9-NEXT: v_mul_hi_u32 v4, s6, v1 7654; GFX9-NEXT: v_mul_hi_u32 v6, s7, v1 7655; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 7656; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 7657; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v4, vcc 7658; GFX9-NEXT: v_mul_lo_u32 v4, s7, v0 7659; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 7660; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], 12 7661; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 7662; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 7663; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v5, vcc 7664; GFX9-NEXT: 
v_add_co_u32_e32 v0, vcc, v0, v1 7665; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v2, vcc 7666; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 2, v0 7667; GFX9-NEXT: v_mul_lo_u32 v4, v1, s0 7668; GFX9-NEXT: v_mul_hi_u32 v6, v0, s0 7669; GFX9-NEXT: v_mul_lo_u32 v9, v0, s0 7670; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc 7671; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0 7672; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc 7673; GFX9-NEXT: v_add_u32_e32 v4, v6, v4 7674; GFX9-NEXT: v_mov_b32_e32 v6, s7 7675; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s6, v9 7676; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v6, v4, vcc 7677; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s0, v9 7678; GFX9-NEXT: v_subbrev_co_u32_e32 v10, vcc, 0, v4, vcc 7679; GFX9-NEXT: s_movk_i32 s0, 0xffe 7680; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s0, v6 7681; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 7682; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 7683; GFX9-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc 7684; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v9 7685; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 7686; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 7687; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 7688; GFX9-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[0:1] 7689; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc 7690; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 7691; GFX9-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[0:1] 7692; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc 7693; GFX9-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[0:1] 7694; GFX9-NEXT: v_mov_b32_e32 v0, s2 7695; GFX9-NEXT: v_mov_b32_e32 v1, s3 7696; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[8:9] 7697; GFX9-NEXT: s_endpgm 7698 %r = udiv <2 x i64> %x, <i64 4096, i64 4095> 7699 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 7700 ret void 7701} 7702 7703define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 7704; CHECK-LABEL: @udiv_v2i64_pow2_shl_denom( 7705; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 7706; 
CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 7707; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 7708; CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[TMP1]], [[TMP2]] 7709; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 7710; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 7711; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 7712; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP5]], [[TMP6]] 7713; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 7714; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 7715; CHECK-NEXT: ret void 7716; 7717; GFX6-LABEL: udiv_v2i64_pow2_shl_denom: 7718; GFX6: ; %bb.0: 7719; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 7720; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 7721; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 7722; GFX6-NEXT: s_mov_b32 s7, 0xf000 7723; GFX6-NEXT: s_mov_b32 s6, -1 7724; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7725; GFX6-NEXT: s_add_i32 s0, s0, 12 7726; GFX6-NEXT: s_add_i32 s2, s2, 12 7727; GFX6-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 7728; GFX6-NEXT: s_lshr_b64 s[2:3], s[10:11], s2 7729; GFX6-NEXT: v_mov_b32_e32 v0, s0 7730; GFX6-NEXT: v_mov_b32_e32 v1, s1 7731; GFX6-NEXT: v_mov_b32_e32 v2, s2 7732; GFX6-NEXT: v_mov_b32_e32 v3, s3 7733; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 7734; GFX6-NEXT: s_endpgm 7735; 7736; GFX9-LABEL: udiv_v2i64_pow2_shl_denom: 7737; GFX9: ; %bb.0: 7738; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7739; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 7740; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 7741; GFX9-NEXT: v_mov_b32_e32 v4, 0 7742; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7743; GFX9-NEXT: s_add_i32 s0, s8, 12 7744; GFX9-NEXT: s_add_i32 s8, s10, 12 7745; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], s0 7746; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], s8 7747; GFX9-NEXT: v_mov_b32_e32 v0, s0 7748; GFX9-NEXT: v_mov_b32_e32 
v1, s1 7749; GFX9-NEXT: v_mov_b32_e32 v2, s4 7750; GFX9-NEXT: v_mov_b32_e32 v3, s5 7751; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 7752; GFX9-NEXT: s_endpgm 7753 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 7754 %r = udiv <2 x i64> %x, %shl.y 7755 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 7756 ret void 7757} 7758 7759define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 7760; CHECK-LABEL: @urem_i64_oddk_denom( 7761; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], 1235195393993 7762; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 7763; CHECK-NEXT: ret void 7764; 7765; GFX6-LABEL: urem_i64_oddk_denom: 7766; GFX6: ; %bb.0: 7767; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f1761f8 7768; GFX6-NEXT: v_mov_b32_e32 v1, 0x4f800000 7769; GFX6-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 7770; GFX6-NEXT: v_rcp_f32_e32 v0, v0 7771; GFX6-NEXT: s_movk_i32 s2, 0xfee0 7772; GFX6-NEXT: s_mov_b32 s3, 0x689e0837 7773; GFX6-NEXT: v_mov_b32_e32 v8, 0 7774; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 7775; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 7776; GFX6-NEXT: v_trunc_f32_e32 v1, v1 7777; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 7778; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 7779; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 7780; GFX6-NEXT: v_mov_b32_e32 v7, 0 7781; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 7782; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 7783; GFX6-NEXT: v_mul_hi_u32 v3, v0, s3 7784; GFX6-NEXT: v_mul_lo_u32 v4, v1, s3 7785; GFX6-NEXT: s_mov_b32 s12, 0x9761f7c9 7786; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7787; GFX6-NEXT: s_mov_b32 s8, s4 7788; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7789; GFX6-NEXT: v_mul_lo_u32 v3, v0, s3 7790; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 7791; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 7792; GFX6-NEXT: v_mul_hi_u32 v4, v0, v2 7793; GFX6-NEXT: v_mul_hi_u32 v6, v0, v3 7794; GFX6-NEXT: v_mul_hi_u32 v9, v1, v2 7795; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 7796; GFX6-NEXT: s_movk_i32 s4, 0x11f 7797; GFX6-NEXT: 
v_add_i32_e32 v5, vcc, v6, v5 7798; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 7799; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 7800; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc 7801; GFX6-NEXT: s_mov_b32 s9, s5 7802; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 7803; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc 7804; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc 7805; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7806; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 7807; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 7808; GFX6-NEXT: v_mul_lo_u32 v4, v0, s2 7809; GFX6-NEXT: v_mul_hi_u32 v5, v0, s3 7810; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 7811; GFX6-NEXT: v_mul_lo_u32 v6, v2, s3 7812; GFX6-NEXT: s_movk_i32 s5, 0x11e 7813; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 7814; GFX6-NEXT: v_mul_lo_u32 v5, v0, s3 7815; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 7816; GFX6-NEXT: v_mul_lo_u32 v6, v0, v4 7817; GFX6-NEXT: v_mul_hi_u32 v10, v0, v4 7818; GFX6-NEXT: v_mul_hi_u32 v9, v0, v5 7819; GFX6-NEXT: v_mul_hi_u32 v11, v2, v4 7820; GFX6-NEXT: s_mov_b32 s11, 0xf000 7821; GFX6-NEXT: s_mov_b32 s10, -1 7822; GFX6-NEXT: v_add_i32_e32 v6, vcc, v9, v6 7823; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc 7824; GFX6-NEXT: v_mul_lo_u32 v10, v2, v5 7825; GFX6-NEXT: v_mul_hi_u32 v5, v2, v5 7826; GFX6-NEXT: v_mul_lo_u32 v2, v2, v4 7827; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v10 7828; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v5, vcc 7829; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v11, v7, vcc 7830; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 7831; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc 7832; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 7833; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] 7834; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 7835; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 7836; GFX6-NEXT: v_mul_lo_u32 v2, s6, v1 7837; GFX6-NEXT: v_mul_hi_u32 v3, s6, v0 7838; GFX6-NEXT: v_mul_hi_u32 v4, s6, v1 7839; GFX6-NEXT: v_mul_hi_u32 v5, s7, v1 7840; GFX6-NEXT: v_mul_lo_u32 v1, s7, v1 7841; GFX6-NEXT: 
v_add_i32_e32 v2, vcc, v3, v2 7842; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 7843; GFX6-NEXT: v_mul_lo_u32 v4, s7, v0 7844; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 7845; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 7846; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 7847; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc 7848; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 7849; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc 7850; GFX6-NEXT: v_mul_lo_u32 v2, v0, s4 7851; GFX6-NEXT: v_mul_hi_u32 v3, v0, s12 7852; GFX6-NEXT: v_mul_lo_u32 v1, v1, s12 7853; GFX6-NEXT: v_mul_lo_u32 v0, v0, s12 7854; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 7855; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 7856; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s7, v1 7857; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 7858; GFX6-NEXT: v_mov_b32_e32 v3, s4 7859; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc 7860; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s12, v0 7861; GFX6-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] 7862; GFX6-NEXT: v_cmp_lt_u32_e64 s[2:3], s5, v5 7863; GFX6-NEXT: s_mov_b32 s6, 0x9761f7c8 7864; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] 7865; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] 7866; GFX6-NEXT: v_cmp_lt_u32_e64 s[2:3], s6, v4 7867; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s12, v4 7868; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 7869; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, v5 7870; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] 7871; GFX6-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] 7872; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 7873; GFX6-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] 7874; GFX6-NEXT: v_mov_b32_e32 v5, s7 7875; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc 7876; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s5, v1 7877; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 7878; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s6, v0 7879; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 7880; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s4, v1 7881; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 7882; 
GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 7883; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 7884; GFX6-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] 7885; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 7886; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 7887; GFX6-NEXT: s_endpgm 7888; 7889; GFX9-LABEL: urem_i64_oddk_denom: 7890; GFX9: ; %bb.0: 7891; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f1761f8 7892; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000 7893; GFX9-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 7894; GFX9-NEXT: v_rcp_f32_e32 v0, v0 7895; GFX9-NEXT: s_movk_i32 s4, 0xfee0 7896; GFX9-NEXT: s_mov_b32 s5, 0x689e0837 7897; GFX9-NEXT: v_mov_b32_e32 v8, 0 7898; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 7899; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 7900; GFX9-NEXT: v_trunc_f32_e32 v1, v1 7901; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 7902; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 7903; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 7904; GFX9-NEXT: v_mov_b32_e32 v5, 0 7905; GFX9-NEXT: s_movk_i32 s8, 0x11f 7906; GFX9-NEXT: v_mul_lo_u32 v2, v0, s4 7907; GFX9-NEXT: v_mul_hi_u32 v3, v0, s5 7908; GFX9-NEXT: v_mul_lo_u32 v4, v1, s5 7909; GFX9-NEXT: v_mul_lo_u32 v6, v0, s5 7910; GFX9-NEXT: s_mov_b32 s9, 0x9761f7c9 7911; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 7912; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 7913; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 7914; GFX9-NEXT: v_mul_hi_u32 v4, v0, v6 7915; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 7916; GFX9-NEXT: v_mul_hi_u32 v9, v1, v2 7917; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 7918; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 7919; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v7, vcc 7920; GFX9-NEXT: v_mul_lo_u32 v7, v1, v6 7921; GFX9-NEXT: v_mul_hi_u32 v6, v1, v6 7922; GFX9-NEXT: s_mov_b32 s10, 0x9761f7c8 7923; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 7924; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v6, vcc 7925; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc 7926; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 7927; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 7928; 
GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v4, vcc 7929; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3] 7930; GFX9-NEXT: v_mul_lo_u32 v4, v0, s4 7931; GFX9-NEXT: v_mul_hi_u32 v6, v0, s5 7932; GFX9-NEXT: v_mul_lo_u32 v7, v2, s5 7933; GFX9-NEXT: v_mul_lo_u32 v9, v0, s5 7934; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 7935; GFX9-NEXT: v_add_u32_e32 v4, v6, v4 7936; GFX9-NEXT: v_add_u32_e32 v4, v4, v7 7937; GFX9-NEXT: v_mul_lo_u32 v6, v0, v4 7938; GFX9-NEXT: v_mul_hi_u32 v7, v0, v9 7939; GFX9-NEXT: v_mul_hi_u32 v10, v0, v4 7940; GFX9-NEXT: v_mul_hi_u32 v11, v2, v4 7941; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 7942; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 7943; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v8, v10, vcc 7944; GFX9-NEXT: v_mul_lo_u32 v10, v2, v9 7945; GFX9-NEXT: v_mul_hi_u32 v9, v2, v9 7946; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 7947; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v10 7948; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v9, vcc 7949; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v11, v5, vcc 7950; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v6, v2 7951; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v4, vcc 7952; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3] 7953; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 7954; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 7955; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7956; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 7957; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 7958; GFX9-NEXT: v_mul_hi_u32 v4, s6, v1 7959; GFX9-NEXT: v_mul_hi_u32 v6, s7, v1 7960; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 7961; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 7962; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v4, vcc 7963; GFX9-NEXT: v_mul_lo_u32 v4, s7, v0 7964; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 7965; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 7966; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 7967; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v5, vcc 7968; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 7969; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v2, vcc 7970; GFX9-NEXT: 
v_mul_lo_u32 v2, v0, s8 7971; GFX9-NEXT: v_mul_hi_u32 v3, v0, s9 7972; GFX9-NEXT: v_mul_lo_u32 v1, v1, s9 7973; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9 7974; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 7975; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 7976; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 7977; GFX9-NEXT: v_sub_u32_e32 v2, s7, v1 7978; GFX9-NEXT: v_mov_b32_e32 v3, s8 7979; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, vcc 7980; GFX9-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s9, v0 7981; GFX9-NEXT: v_subbrev_co_u32_e64 v6, s[2:3], 0, v2, s[0:1] 7982; GFX9-NEXT: s_movk_i32 s6, 0x11e 7983; GFX9-NEXT: v_cmp_lt_u32_e64 s[2:3], s6, v6 7984; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1] 7985; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 7986; GFX9-NEXT: v_cmp_lt_u32_e64 s[2:3], s10, v4 7987; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s9, v4 7988; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] 7989; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s8, v6 7990; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] 7991; GFX9-NEXT: v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1] 7992; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 7993; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[0:1] 7994; GFX9-NEXT: v_mov_b32_e32 v6, s7 7995; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v6, v1, vcc 7996; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s6, v1 7997; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 7998; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s10, v0 7999; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 8000; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s8, v1 8001; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc 8002; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 8003; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 8004; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] 8005; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 8006; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] 8007; GFX9-NEXT: s_endpgm 8008 %r = urem i64 %x, 1235195393993 8009 store i64 %r, i64 addrspace(1)* %out 8010 ret void 8011} 8012 8013define amdgpu_kernel void @urem_i64_pow2k_denom(i64 
addrspace(1)* %out, i64 %x) { 8014; CHECK-LABEL: @urem_i64_pow2k_denom( 8015; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], 4096 8016; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 8017; CHECK-NEXT: ret void 8018; 8019; GFX6-LABEL: urem_i64_pow2k_denom: 8020; GFX6: ; %bb.0: 8021; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 8022; GFX6-NEXT: s_mov_b32 s3, 0xf000 8023; GFX6-NEXT: s_mov_b32 s2, -1 8024; GFX6-NEXT: v_mov_b32_e32 v1, 0 8025; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8026; GFX6-NEXT: s_mov_b32 s0, s4 8027; GFX6-NEXT: s_and_b32 s4, s6, 0xfff 8028; GFX6-NEXT: s_mov_b32 s1, s5 8029; GFX6-NEXT: v_mov_b32_e32 v0, s4 8030; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 8031; GFX6-NEXT: s_endpgm 8032; 8033; GFX9-LABEL: urem_i64_pow2k_denom: 8034; GFX9: ; %bb.0: 8035; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 8036; GFX9-NEXT: v_mov_b32_e32 v1, 0 8037; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8038; GFX9-NEXT: s_and_b32 s2, s2, 0xfff 8039; GFX9-NEXT: v_mov_b32_e32 v0, s2 8040; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] 8041; GFX9-NEXT: s_endpgm 8042 %r = urem i64 %x, 4096 8043 store i64 %r, i64 addrspace(1)* %out 8044 ret void 8045} 8046 8047define amdgpu_kernel void @urem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 8048; CHECK-LABEL: @urem_i64_pow2_shl_denom( 8049; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 8050; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], [[SHL_Y]] 8051; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 8052; CHECK-NEXT: ret void 8053; 8054; GFX6-LABEL: urem_i64_pow2_shl_denom: 8055; GFX6: ; %bb.0: 8056; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 8057; GFX6-NEXT: s_load_dword s8, s[0:1], 0xd 8058; GFX6-NEXT: s_mov_b32 s3, 0xf000 8059; GFX6-NEXT: s_mov_b32 s2, -1 8060; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8061; GFX6-NEXT: s_mov_b32 s0, s4 8062; GFX6-NEXT: s_mov_b32 s1, s5 8063; GFX6-NEXT: s_mov_b64 s[4:5], 0x1000 8064; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s8 8065; GFX6-NEXT: 
s_add_u32 s4, s4, -1 8066; GFX6-NEXT: s_addc_u32 s5, s5, -1 8067; GFX6-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] 8068; GFX6-NEXT: v_mov_b32_e32 v0, s4 8069; GFX6-NEXT: v_mov_b32_e32 v1, s5 8070; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 8071; GFX6-NEXT: s_endpgm 8072; 8073; GFX9-LABEL: urem_i64_pow2_shl_denom: 8074; GFX9: ; %bb.0: 8075; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 8076; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 8077; GFX9-NEXT: s_mov_b64 s[0:1], 0x1000 8078; GFX9-NEXT: v_mov_b32_e32 v2, 0 8079; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8080; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 8081; GFX9-NEXT: s_add_u32 s0, s0, -1 8082; GFX9-NEXT: s_addc_u32 s1, s1, -1 8083; GFX9-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1] 8084; GFX9-NEXT: v_mov_b32_e32 v0, s0 8085; GFX9-NEXT: v_mov_b32_e32 v1, s1 8086; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 8087; GFX9-NEXT: s_endpgm 8088 %shl.y = shl i64 4096, %y 8089 %r = urem i64 %x, %shl.y 8090 store i64 %r, i64 addrspace(1)* %out 8091 ret void 8092} 8093 8094define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 8095; CHECK-LABEL: @urem_v2i64_pow2k_denom( 8096; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 8097; CHECK-NEXT: [[TMP2:%.*]] = urem i64 [[TMP1]], 4096 8098; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 8099; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 8100; CHECK-NEXT: [[TMP5:%.*]] = urem i64 [[TMP4]], 4096 8101; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 8102; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 8103; CHECK-NEXT: ret void 8104; 8105; GFX6-LABEL: urem_v2i64_pow2k_denom: 8106; GFX6: ; %bb.0: 8107; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 8108; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 8109; GFX6-NEXT: s_movk_i32 s8, 0xfff 8110; GFX6-NEXT: v_mov_b32_e32 v1, 0 8111; GFX6-NEXT: s_mov_b32 s7, 0xf000 
8112; GFX6-NEXT: s_mov_b32 s6, -1 8113; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8114; GFX6-NEXT: s_and_b32 s0, s0, s8 8115; GFX6-NEXT: s_and_b32 s1, s2, s8 8116; GFX6-NEXT: v_mov_b32_e32 v0, s0 8117; GFX6-NEXT: v_mov_b32_e32 v2, s1 8118; GFX6-NEXT: v_mov_b32_e32 v3, v1 8119; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 8120; GFX6-NEXT: s_endpgm 8121; 8122; GFX9-LABEL: urem_v2i64_pow2k_denom: 8123; GFX9: ; %bb.0: 8124; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8125; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 8126; GFX9-NEXT: s_movk_i32 s0, 0xfff 8127; GFX9-NEXT: v_mov_b32_e32 v1, 0 8128; GFX9-NEXT: v_mov_b32_e32 v3, v1 8129; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8130; GFX9-NEXT: s_and_b32 s1, s4, s0 8131; GFX9-NEXT: s_and_b32 s0, s6, s0 8132; GFX9-NEXT: v_mov_b32_e32 v0, s1 8133; GFX9-NEXT: v_mov_b32_e32 v2, s0 8134; GFX9-NEXT: global_store_dwordx4 v1, v[0:3], s[2:3] 8135; GFX9-NEXT: s_endpgm 8136 %r = urem <2 x i64> %x, <i64 4096, i64 4096> 8137 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 8138 ret void 8139} 8140 8141define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 8142; CHECK-LABEL: @urem_v2i64_pow2_shl_denom( 8143; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 8144; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 8145; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 8146; CHECK-NEXT: [[TMP3:%.*]] = urem i64 [[TMP1]], [[TMP2]] 8147; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 8148; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 8149; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 8150; CHECK-NEXT: [[TMP7:%.*]] = urem i64 [[TMP5]], [[TMP6]] 8151; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 8152; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 8153; CHECK-NEXT: ret void 8154; 
8155; GFX6-LABEL: urem_v2i64_pow2_shl_denom: 8156; GFX6: ; %bb.0: 8157; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 8158; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 8159; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 8160; GFX6-NEXT: s_mov_b64 s[12:13], 0x1000 8161; GFX6-NEXT: s_mov_b32 s7, 0xf000 8162; GFX6-NEXT: s_mov_b32 s6, -1 8163; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8164; GFX6-NEXT: s_lshl_b64 s[2:3], s[12:13], s2 8165; GFX6-NEXT: s_lshl_b64 s[0:1], s[12:13], s0 8166; GFX6-NEXT: s_add_u32 s0, s0, -1 8167; GFX6-NEXT: s_addc_u32 s1, s1, -1 8168; GFX6-NEXT: s_and_b64 s[0:1], s[8:9], s[0:1] 8169; GFX6-NEXT: s_add_u32 s2, s2, -1 8170; GFX6-NEXT: s_addc_u32 s3, s3, -1 8171; GFX6-NEXT: s_and_b64 s[2:3], s[10:11], s[2:3] 8172; GFX6-NEXT: v_mov_b32_e32 v0, s0 8173; GFX6-NEXT: v_mov_b32_e32 v1, s1 8174; GFX6-NEXT: v_mov_b32_e32 v2, s2 8175; GFX6-NEXT: v_mov_b32_e32 v3, s3 8176; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 8177; GFX6-NEXT: s_endpgm 8178; 8179; GFX9-LABEL: urem_v2i64_pow2_shl_denom: 8180; GFX9: ; %bb.0: 8181; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8182; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 8183; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 8184; GFX9-NEXT: s_mov_b64 s[0:1], 0x1000 8185; GFX9-NEXT: v_mov_b32_e32 v4, 0 8186; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8187; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], s10 8188; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 8189; GFX9-NEXT: s_add_u32 s0, s0, -1 8190; GFX9-NEXT: s_addc_u32 s1, s1, -1 8191; GFX9-NEXT: s_and_b64 s[0:1], s[4:5], s[0:1] 8192; GFX9-NEXT: s_add_u32 s4, s10, -1 8193; GFX9-NEXT: s_addc_u32 s5, s11, -1 8194; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] 8195; GFX9-NEXT: v_mov_b32_e32 v0, s0 8196; GFX9-NEXT: v_mov_b32_e32 v1, s1 8197; GFX9-NEXT: v_mov_b32_e32 v2, s4 8198; GFX9-NEXT: v_mov_b32_e32 v3, s5 8199; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 8200; GFX9-NEXT: s_endpgm 8201 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 8202 %r = urem <2 x i64> %x, %shl.y 
8203 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 8204 ret void 8205} 8206 8207define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 8208; CHECK-LABEL: @sdiv_i64_oddk_denom( 8209; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], 1235195 8210; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 8211; CHECK-NEXT: ret void 8212; 8213; GFX6-LABEL: sdiv_i64_oddk_denom: 8214; GFX6: ; %bb.0: 8215; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f800000 8216; GFX6-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 8217; GFX6-NEXT: v_rcp_f32_e32 v0, v0 8218; GFX6-NEXT: s_mov_b32 s2, 0xffed2705 8219; GFX6-NEXT: v_mov_b32_e32 v8, 0 8220; GFX6-NEXT: v_mov_b32_e32 v7, 0 8221; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 8222; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 8223; GFX6-NEXT: v_trunc_f32_e32 v1, v1 8224; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 8225; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 8226; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 8227; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 8228; GFX6-NEXT: s_mov_b32 s7, 0xf000 8229; GFX6-NEXT: v_mul_hi_u32 v3, v0, s2 8230; GFX6-NEXT: v_mul_lo_u32 v2, v1, s2 8231; GFX6-NEXT: v_mul_lo_u32 v4, v0, s2 8232; GFX6-NEXT: s_mov_b32 s6, -1 8233; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8234; GFX6-NEXT: s_mov_b32 s4, s8 8235; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8236; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 8237; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 8238; GFX6-NEXT: v_mul_hi_u32 v6, v0, v4 8239; GFX6-NEXT: v_mul_hi_u32 v3, v0, v2 8240; GFX6-NEXT: v_mul_hi_u32 v9, v1, v2 8241; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 8242; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 8243; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 8244; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 8245; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v3, vcc 8246; GFX6-NEXT: s_mov_b32 s5, s9 8247; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 8248; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc 8249; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc 8250; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8251; 
GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 8252; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 8253; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 8254; GFX6-NEXT: v_mul_lo_u32 v4, v2, s2 8255; GFX6-NEXT: v_mul_hi_u32 v5, v0, s2 8256; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 8257; GFX6-NEXT: v_mul_lo_u32 v5, v0, s2 8258; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, v0, v4 8259; GFX6-NEXT: v_mul_lo_u32 v10, v0, v4 8260; GFX6-NEXT: v_mul_hi_u32 v12, v0, v4 8261; GFX6-NEXT: v_mul_hi_u32 v11, v0, v5 8262; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5 8263; GFX6-NEXT: v_mul_lo_u32 v5, v2, v5 8264; GFX6-NEXT: v_mul_hi_u32 v6, v2, v4 8265; GFX6-NEXT: v_add_i32_e32 v10, vcc, v11, v10 8266; GFX6-NEXT: v_addc_u32_e32 v11, vcc, v8, v12, vcc 8267; GFX6-NEXT: v_mul_lo_u32 v2, v2, v4 8268; GFX6-NEXT: v_add_i32_e32 v5, vcc, v10, v5 8269; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v11, v9, vcc 8270; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v6, v7, vcc 8271; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 8272; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc 8273; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 8274; GFX6-NEXT: s_ashr_i32 s2, s11, 31 8275; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] 8276; GFX6-NEXT: s_add_u32 s0, s10, s2 8277; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 8278; GFX6-NEXT: s_mov_b32 s3, s2 8279; GFX6-NEXT: s_addc_u32 s1, s11, s2 8280; GFX6-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] 8281; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 8282; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 8283; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 8284; GFX6-NEXT: v_mul_hi_u32 v4, s0, v1 8285; GFX6-NEXT: v_mul_hi_u32 v5, s1, v1 8286; GFX6-NEXT: v_mul_lo_u32 v1, s1, v1 8287; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8288; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 8289; GFX6-NEXT: v_mul_lo_u32 v4, s1, v0 8290; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 8291; GFX6-NEXT: s_mov_b32 s3, 0x12d8fb 8292; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 8293; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 8294; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, 
v7, vcc 8295; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 8296; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc 8297; GFX6-NEXT: v_mul_lo_u32 v4, v1, s3 8298; GFX6-NEXT: v_mul_hi_u32 v5, v0, s3 8299; GFX6-NEXT: v_add_i32_e32 v2, vcc, 2, v0 8300; GFX6-NEXT: v_mul_lo_u32 v8, v0, s3 8301; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 8302; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v0 8303; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc 8304; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 8305; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s0, v8 8306; GFX6-NEXT: v_mov_b32_e32 v5, s1 8307; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc 8308; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s3, v8 8309; GFX6-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v4, vcc 8310; GFX6-NEXT: s_mov_b32 s0, 0x12d8fa 8311; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s0, v5 8312; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 8313; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 8314; GFX6-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc 8315; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v8 8316; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 8317; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 8318; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 8319; GFX6-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[0:1] 8320; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 8321; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 8322; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 8323; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc 8324; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 8325; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 8326; GFX6-NEXT: v_xor_b32_e32 v1, s2, v1 8327; GFX6-NEXT: v_mov_b32_e32 v2, s2 8328; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 8329; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 8330; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 8331; GFX6-NEXT: s_endpgm 8332; 8333; GFX9-LABEL: sdiv_i64_oddk_denom: 8334; GFX9: ; %bb.0: 8335; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f800000 8336; GFX9-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 8337; GFX9-NEXT: v_rcp_f32_e32 v0, v0 8338; GFX9-NEXT: s_mov_b32 s8, 
0xffed2705 8339; GFX9-NEXT: v_mov_b32_e32 v7, 0 8340; GFX9-NEXT: v_mov_b32_e32 v5, 0 8341; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 8342; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 8343; GFX9-NEXT: v_trunc_f32_e32 v1, v1 8344; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 8345; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 8346; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 8347; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 8348; GFX9-NEXT: v_mul_hi_u32 v3, v0, s8 8349; GFX9-NEXT: v_mul_lo_u32 v2, v1, s8 8350; GFX9-NEXT: v_mul_lo_u32 v4, v0, s8 8351; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 8352; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 8353; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 8354; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 8355; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 8356; GFX9-NEXT: v_mul_hi_u32 v9, v1, v2 8357; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 8358; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 8359; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v8, vcc 8360; GFX9-NEXT: v_mul_lo_u32 v8, v1, v4 8361; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 8362; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8 8363; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc 8364; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc 8365; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 8366; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 8367; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v4, vcc 8368; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3] 8369; GFX9-NEXT: v_mul_lo_u32 v4, v2, s8 8370; GFX9-NEXT: v_mul_hi_u32 v6, v0, s8 8371; GFX9-NEXT: v_mul_lo_u32 v8, v0, s8 8372; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 8373; GFX9-NEXT: v_add_u32_e32 v4, v6, v4 8374; GFX9-NEXT: v_sub_u32_e32 v4, v4, v0 8375; GFX9-NEXT: v_mul_lo_u32 v10, v0, v4 8376; GFX9-NEXT: v_mul_hi_u32 v11, v0, v8 8377; GFX9-NEXT: v_mul_hi_u32 v12, v0, v4 8378; GFX9-NEXT: v_mul_hi_u32 v9, v2, v8 8379; GFX9-NEXT: v_mul_lo_u32 v8, v2, v8 8380; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 8381; GFX9-NEXT: v_mul_hi_u32 v6, v2, v4 8382; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v7, v12, 
vcc 8383; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 8384; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v10, v8 8385; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v9, vcc 8386; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v5, vcc 8387; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 8388; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc 8389; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3] 8390; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8391; GFX9-NEXT: s_ashr_i32 s2, s7, 31 8392; GFX9-NEXT: s_add_u32 s0, s6, s2 8393; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 8394; GFX9-NEXT: s_mov_b32 s3, s2 8395; GFX9-NEXT: s_addc_u32 s1, s7, s2 8396; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] 8397; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 8398; GFX9-NEXT: v_mul_lo_u32 v2, s0, v1 8399; GFX9-NEXT: v_mul_hi_u32 v3, s0, v0 8400; GFX9-NEXT: v_mul_hi_u32 v4, s0, v1 8401; GFX9-NEXT: v_mul_hi_u32 v6, s1, v1 8402; GFX9-NEXT: v_mul_lo_u32 v1, s1, v1 8403; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 8404; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v4, vcc 8405; GFX9-NEXT: v_mul_lo_u32 v4, s1, v0 8406; GFX9-NEXT: v_mul_hi_u32 v0, s1, v0 8407; GFX9-NEXT: s_mov_b32 s3, 0x12d8fb 8408; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 8409; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 8410; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v5, vcc 8411; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 8412; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v2, vcc 8413; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 2, v0 8414; GFX9-NEXT: v_mul_lo_u32 v4, v1, s3 8415; GFX9-NEXT: v_mul_hi_u32 v6, v0, s3 8416; GFX9-NEXT: v_mul_lo_u32 v9, v0, s3 8417; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc 8418; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0 8419; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc 8420; GFX9-NEXT: v_add_u32_e32 v4, v6, v4 8421; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s0, v9 8422; GFX9-NEXT: v_mov_b32_e32 v6, s1 8423; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v6, v4, vcc 8424; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s3, v9 8425; 
GFX9-NEXT: v_subbrev_co_u32_e32 v10, vcc, 0, v4, vcc 8426; GFX9-NEXT: s_mov_b32 s0, 0x12d8fa 8427; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s0, v6 8428; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 8429; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 8430; GFX9-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc 8431; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v9 8432; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 8433; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 8434; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 8435; GFX9-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[0:1] 8436; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 8437; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc 8438; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 8439; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc 8440; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 8441; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 8442; GFX9-NEXT: v_xor_b32_e32 v1, s2, v1 8443; GFX9-NEXT: v_mov_b32_e32 v2, s2 8444; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s2, v0 8445; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc 8446; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] 8447; GFX9-NEXT: s_endpgm 8448 %r = sdiv i64 %x, 1235195 8449 store i64 %r, i64 addrspace(1)* %out 8450 ret void 8451} 8452 8453define amdgpu_kernel void @sdiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 8454; CHECK-LABEL: @sdiv_i64_pow2k_denom( 8455; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], 4096 8456; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 8457; CHECK-NEXT: ret void 8458; 8459; GFX6-LABEL: sdiv_i64_pow2k_denom: 8460; GFX6: ; %bb.0: 8461; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 8462; GFX6-NEXT: s_mov_b32 s7, 0xf000 8463; GFX6-NEXT: s_mov_b32 s6, -1 8464; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8465; GFX6-NEXT: s_mov_b32 s4, s0 8466; GFX6-NEXT: s_ashr_i32 s0, s3, 31 8467; GFX6-NEXT: s_lshr_b32 s0, s0, 20 8468; GFX6-NEXT: s_add_u32 s0, s2, s0 8469; GFX6-NEXT: s_mov_b32 s5, s1 8470; GFX6-NEXT: s_addc_u32 s1, s3, 0 8471; GFX6-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 8472; 
GFX6-NEXT: v_mov_b32_e32 v0, s0 8473; GFX6-NEXT: v_mov_b32_e32 v1, s1 8474; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 8475; GFX6-NEXT: s_endpgm 8476; 8477; GFX9-LABEL: sdiv_i64_pow2k_denom: 8478; GFX9: ; %bb.0: 8479; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 8480; GFX9-NEXT: v_mov_b32_e32 v2, 0 8481; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8482; GFX9-NEXT: s_ashr_i32 s4, s3, 31 8483; GFX9-NEXT: s_lshr_b32 s4, s4, 20 8484; GFX9-NEXT: s_add_u32 s2, s2, s4 8485; GFX9-NEXT: s_addc_u32 s3, s3, 0 8486; GFX9-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 8487; GFX9-NEXT: v_mov_b32_e32 v0, s2 8488; GFX9-NEXT: v_mov_b32_e32 v1, s3 8489; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 8490; GFX9-NEXT: s_endpgm 8491 %r = sdiv i64 %x, 4096 8492 store i64 %r, i64 addrspace(1)* %out 8493 ret void 8494} 8495 8496define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 8497; CHECK-LABEL: @sdiv_i64_pow2_shl_denom( 8498; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 8499; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], [[SHL_Y]] 8500; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 8501; CHECK-NEXT: ret void 8502; 8503; GFX6-LABEL: sdiv_i64_pow2_shl_denom: 8504; GFX6: ; %bb.0: 8505; GFX6-NEXT: s_load_dword s4, s[0:1], 0xd 8506; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000 8507; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 8508; GFX6-NEXT: s_mov_b32 s7, 0xf000 8509; GFX6-NEXT: s_mov_b32 s6, -1 8510; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8511; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 8512; GFX6-NEXT: s_ashr_i32 s12, s3, 31 8513; GFX6-NEXT: s_add_u32 s2, s2, s12 8514; GFX6-NEXT: s_mov_b32 s13, s12 8515; GFX6-NEXT: s_addc_u32 s3, s3, s12 8516; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[12:13] 8517; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 8518; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 8519; GFX6-NEXT: s_sub_u32 s4, 0, s2 8520; GFX6-NEXT: s_subb_u32 s5, 0, s3 8521; GFX6-NEXT: s_ashr_i32 s14, s11, 31 8522; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 8523; 
GFX6-NEXT: v_rcp_f32_e32 v0, v0 8524; GFX6-NEXT: s_mov_b32 s15, s14 8525; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 8526; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 8527; GFX6-NEXT: v_trunc_f32_e32 v1, v1 8528; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 8529; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 8530; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 8531; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 8532; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 8533; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0 8534; GFX6-NEXT: v_mul_lo_u32 v4, s4, v0 8535; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8536; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 8537; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 8538; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 8539; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 8540; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 8541; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 8542; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 8543; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 8544; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 8545; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 8546; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 8547; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc 8548; GFX6-NEXT: v_mov_b32_e32 v4, 0 8549; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc 8550; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8551; GFX6-NEXT: v_mov_b32_e32 v6, 0 8552; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 8553; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc 8554; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 8555; GFX6-NEXT: v_mul_lo_u32 v5, s4, v2 8556; GFX6-NEXT: v_mul_hi_u32 v7, s4, v0 8557; GFX6-NEXT: v_mul_lo_u32 v8, s5, v0 8558; GFX6-NEXT: s_mov_b32 s5, s9 8559; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5 8560; GFX6-NEXT: v_mul_lo_u32 v7, s4, v0 8561; GFX6-NEXT: v_add_i32_e32 v5, vcc, v8, v5 8562; GFX6-NEXT: v_mul_lo_u32 v10, v0, v5 8563; GFX6-NEXT: v_mul_hi_u32 v12, v0, v5 8564; GFX6-NEXT: v_mul_hi_u32 v11, v0, v7 8565; GFX6-NEXT: v_mul_hi_u32 v9, v2, v7 8566; GFX6-NEXT: v_mul_lo_u32 v7, v2, v7 8567; GFX6-NEXT: v_mul_hi_u32 v8, v2, v5 8568; GFX6-NEXT: v_add_i32_e32 v10, 
vcc, v11, v10 8569; GFX6-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc 8570; GFX6-NEXT: v_mul_lo_u32 v2, v2, v5 8571; GFX6-NEXT: v_add_i32_e32 v7, vcc, v10, v7 8572; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc 8573; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc 8574; GFX6-NEXT: v_add_i32_e32 v2, vcc, v7, v2 8575; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc 8576; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 8577; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[0:1] 8578; GFX6-NEXT: s_add_u32 s0, s10, s14 8579; GFX6-NEXT: s_addc_u32 s1, s11, s14 8580; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 8581; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] 8582; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 8583; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 8584; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 8585; GFX6-NEXT: v_mul_hi_u32 v5, s10, v1 8586; GFX6-NEXT: v_mul_hi_u32 v7, s11, v1 8587; GFX6-NEXT: v_mul_lo_u32 v1, s11, v1 8588; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8589; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 8590; GFX6-NEXT: v_mul_lo_u32 v5, s11, v0 8591; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0 8592; GFX6-NEXT: s_mov_b32 s4, s8 8593; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 8594; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 8595; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc 8596; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 8597; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc 8598; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 8599; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 8600; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 8601; GFX6-NEXT: v_mov_b32_e32 v5, s3 8602; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8603; GFX6-NEXT: v_mul_lo_u32 v3, s2, v0 8604; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 8605; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s11, v2 8606; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s10, v3 8607; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc 8608; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s2, v3 8609; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] 8610; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v4 8611; 
GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 8612; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v5 8613; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 8614; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v4 8615; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] 8616; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 8617; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] 8618; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 1, v0 8619; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] 8620; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 8621; GFX6-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] 8622; GFX6-NEXT: v_mov_b32_e32 v6, s11 8623; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc 8624; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 8625; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 8626; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 8627; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 8628; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s3, v2 8629; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc 8630; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 8631; GFX6-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] 8632; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 8633; GFX6-NEXT: s_xor_b64 s[0:1], s[14:15], s[12:13] 8634; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 8635; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 8636; GFX6-NEXT: v_xor_b32_e32 v1, s1, v1 8637; GFX6-NEXT: v_mov_b32_e32 v2, s1 8638; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 8639; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 8640; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 8641; GFX6-NEXT: s_endpgm 8642; 8643; GFX9-LABEL: sdiv_i64_pow2_shl_denom: 8644; GFX9: ; %bb.0: 8645; GFX9-NEXT: s_load_dword s4, s[0:1], 0x34 8646; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 8647; GFX9-NEXT: v_mov_b32_e32 v2, 0 8648; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8649; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 8650; GFX9-NEXT: s_ashr_i32 s8, s3, 31 8651; GFX9-NEXT: s_add_u32 s2, s2, s8 8652; GFX9-NEXT: s_mov_b32 s9, s8 8653; GFX9-NEXT: s_addc_u32 s3, s3, s8 8654; GFX9-NEXT: s_xor_b64 s[10:11], s[2:3], s[8:9] 
8655; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s10 8656; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s11 8657; GFX9-NEXT: s_sub_u32 s12, 0, s10 8658; GFX9-NEXT: s_subb_u32 s4, 0, s11 8659; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 8660; GFX9-NEXT: v_rcp_f32_e32 v0, v0 8661; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 8662; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 8663; GFX9-NEXT: v_trunc_f32_e32 v1, v1 8664; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 8665; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 8666; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 8667; GFX9-NEXT: v_mul_hi_u32 v4, s12, v0 8668; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1 8669; GFX9-NEXT: v_mul_lo_u32 v6, s4, v0 8670; GFX9-NEXT: v_mul_lo_u32 v5, s12, v0 8671; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 8672; GFX9-NEXT: v_add_u32_e32 v3, v3, v6 8673; GFX9-NEXT: v_mul_hi_u32 v4, v0, v5 8674; GFX9-NEXT: v_mul_lo_u32 v6, v0, v3 8675; GFX9-NEXT: v_mul_hi_u32 v8, v0, v3 8676; GFX9-NEXT: v_mul_hi_u32 v7, v1, v5 8677; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 8678; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 8679; GFX9-NEXT: v_mul_hi_u32 v9, v1, v3 8680; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc 8681; GFX9-NEXT: v_mul_lo_u32 v3, v1, v3 8682; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 8683; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v7, vcc 8684; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v2, vcc 8685; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 8686; GFX9-NEXT: v_mov_b32_e32 v6, 0 8687; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v3 8688; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v5, vcc 8689; GFX9-NEXT: v_addc_co_u32_e64 v3, vcc, v1, v4, s[2:3] 8690; GFX9-NEXT: v_mul_lo_u32 v5, s12, v3 8691; GFX9-NEXT: v_mul_hi_u32 v7, s12, v0 8692; GFX9-NEXT: v_mul_lo_u32 v8, s4, v0 8693; GFX9-NEXT: v_mul_lo_u32 v9, s12, v0 8694; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 8695; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 8696; GFX9-NEXT: v_add_u32_e32 v5, v5, v8 8697; GFX9-NEXT: v_mul_lo_u32 v10, v0, v5 8698; GFX9-NEXT: v_mul_hi_u32 v11, v0, v9 8699; GFX9-NEXT: 
v_mul_hi_u32 v12, v0, v5 8700; GFX9-NEXT: v_mul_hi_u32 v8, v3, v9 8701; GFX9-NEXT: v_mul_lo_u32 v9, v3, v9 8702; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 8703; GFX9-NEXT: v_mul_hi_u32 v7, v3, v5 8704; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v12, vcc 8705; GFX9-NEXT: v_mul_lo_u32 v3, v3, v5 8706; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 8707; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v8, vcc 8708; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc 8709; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v8, v3 8710; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v5, vcc 8711; GFX9-NEXT: v_add_u32_e32 v1, v1, v4 8712; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[2:3] 8713; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8714; GFX9-NEXT: s_ashr_i32 s2, s7, 31 8715; GFX9-NEXT: s_add_u32 s0, s6, s2 8716; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 8717; GFX9-NEXT: s_mov_b32 s3, s2 8718; GFX9-NEXT: s_addc_u32 s1, s7, s2 8719; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[2:3] 8720; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 8721; GFX9-NEXT: v_mul_lo_u32 v3, s6, v1 8722; GFX9-NEXT: v_mul_hi_u32 v4, s6, v0 8723; GFX9-NEXT: v_mul_hi_u32 v5, s6, v1 8724; GFX9-NEXT: v_mul_hi_u32 v7, s7, v1 8725; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 8726; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 8727; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc 8728; GFX9-NEXT: v_mul_lo_u32 v5, s7, v0 8729; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 8730; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 8731; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v4, v0, vcc 8732; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v2, vcc 8733; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 8734; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v3, vcc 8735; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1 8736; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0 8737; GFX9-NEXT: v_mul_lo_u32 v5, s11, v0 8738; GFX9-NEXT: v_mov_b32_e32 v6, s11 8739; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 8740; GFX9-NEXT: v_mul_lo_u32 v4, s10, v0 8741; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 8742; GFX9-NEXT: v_sub_u32_e32 v5, 
s7, v3 8743; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, s6, v4 8744; GFX9-NEXT: v_subb_co_u32_e64 v5, s[0:1], v5, v6, vcc 8745; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s10, v4 8746; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1] 8747; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v5 8748; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 8749; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v6 8750; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 8751; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v5 8752; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v6, s[0:1] 8753; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], 2, v0 8754; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1] 8755; GFX9-NEXT: v_add_co_u32_e64 v8, s[0:1], 1, v0 8756; GFX9-NEXT: v_addc_co_u32_e64 v9, s[0:1], 0, v1, s[0:1] 8757; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 8758; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v7, s[0:1] 8759; GFX9-NEXT: v_mov_b32_e32 v7, s7 8760; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc 8761; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 8762; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 8763; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v4 8764; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc 8765; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s11, v3 8766; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v4, vcc 8767; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 8768; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v6, s[0:1] 8769; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 8770; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[8:9] 8771; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 8772; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 8773; GFX9-NEXT: v_xor_b32_e32 v1, s1, v1 8774; GFX9-NEXT: v_mov_b32_e32 v3, s1 8775; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 8776; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc 8777; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 8778; GFX9-NEXT: s_endpgm 8779 %shl.y = shl i64 4096, %y 8780 %r = sdiv i64 %x, %shl.y 8781 store i64 %r, i64 addrspace(1)* %out 8782 ret void 8783} 8784 8785define amdgpu_kernel void 
@sdiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 8786; CHECK-LABEL: @sdiv_v2i64_pow2k_denom( 8787; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 8788; CHECK-NEXT: [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096 8789; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 8790; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 8791; CHECK-NEXT: [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4096 8792; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 8793; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 8794; CHECK-NEXT: ret void 8795; 8796; GFX6-LABEL: sdiv_v2i64_pow2k_denom: 8797; GFX6: ; %bb.0: 8798; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 8799; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 8800; GFX6-NEXT: s_mov_b32 s7, 0xf000 8801; GFX6-NEXT: s_mov_b32 s6, -1 8802; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8803; GFX6-NEXT: s_ashr_i32 s8, s1, 31 8804; GFX6-NEXT: s_lshr_b32 s8, s8, 20 8805; GFX6-NEXT: s_add_u32 s0, s0, s8 8806; GFX6-NEXT: s_addc_u32 s1, s1, 0 8807; GFX6-NEXT: s_ashr_i32 s8, s3, 31 8808; GFX6-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 8809; GFX6-NEXT: s_lshr_b32 s8, s8, 20 8810; GFX6-NEXT: s_add_u32 s2, s2, s8 8811; GFX6-NEXT: s_addc_u32 s3, s3, 0 8812; GFX6-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 8813; GFX6-NEXT: v_mov_b32_e32 v0, s0 8814; GFX6-NEXT: v_mov_b32_e32 v1, s1 8815; GFX6-NEXT: v_mov_b32_e32 v2, s2 8816; GFX6-NEXT: v_mov_b32_e32 v3, s3 8817; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 8818; GFX6-NEXT: s_endpgm 8819; 8820; GFX9-LABEL: sdiv_v2i64_pow2k_denom: 8821; GFX9: ; %bb.0: 8822; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8823; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 8824; GFX9-NEXT: v_mov_b32_e32 v4, 0 8825; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8826; GFX9-NEXT: s_ashr_i32 s0, s5, 31 8827; GFX9-NEXT: s_lshr_b32 s0, s0, 20 8828; GFX9-NEXT: s_add_u32 s0, s4, s0 8829; GFX9-NEXT: s_addc_u32 s1, s5, 0 8830; 
GFX9-NEXT: s_ashr_i32 s4, s7, 31 8831; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 8832; GFX9-NEXT: s_lshr_b32 s4, s4, 20 8833; GFX9-NEXT: s_add_u32 s4, s6, s4 8834; GFX9-NEXT: s_addc_u32 s5, s7, 0 8835; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 12 8836; GFX9-NEXT: v_mov_b32_e32 v0, s0 8837; GFX9-NEXT: v_mov_b32_e32 v1, s1 8838; GFX9-NEXT: v_mov_b32_e32 v2, s4 8839; GFX9-NEXT: v_mov_b32_e32 v3, s5 8840; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 8841; GFX9-NEXT: s_endpgm 8842 %r = sdiv <2 x i64> %x, <i64 4096, i64 4096> 8843 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 8844 ret void 8845} 8846 8847define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 8848; CHECK-LABEL: @ssdiv_v2i64_mixed_pow2k_denom( 8849; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 8850; CHECK-NEXT: [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096 8851; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 8852; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 8853; CHECK-NEXT: [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4095 8854; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 8855; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 8856; CHECK-NEXT: ret void 8857; 8858; GFX6-LABEL: ssdiv_v2i64_mixed_pow2k_denom: 8859; GFX6: ; %bb.0: 8860; GFX6-NEXT: v_mov_b32_e32 v0, 0x457ff000 8861; GFX6-NEXT: v_mov_b32_e32 v1, 0x4f800000 8862; GFX6-NEXT: v_mac_f32_e32 v0, 0, v1 8863; GFX6-NEXT: v_rcp_f32_e32 v0, v0 8864; GFX6-NEXT: s_movk_i32 s6, 0xf001 8865; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 8866; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 8867; GFX6-NEXT: s_mov_b32 s7, 0xf000 8868; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 8869; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 8870; GFX6-NEXT: v_trunc_f32_e32 v1, v1 8871; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 8872; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 8873; GFX6-NEXT: 
v_cvt_u32_f32_e32 v1, v1 8874; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8875; GFX6-NEXT: s_ashr_i32 s0, s9, 31 8876; GFX6-NEXT: s_lshr_b32 s0, s0, 20 8877; GFX6-NEXT: v_mul_hi_u32 v2, v0, s6 8878; GFX6-NEXT: v_mul_lo_u32 v3, v1, s6 8879; GFX6-NEXT: s_add_u32 s2, s8, s0 8880; GFX6-NEXT: s_addc_u32 s3, s9, 0 8881; GFX6-NEXT: s_ashr_i32 s8, s11, 31 8882; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 8883; GFX6-NEXT: v_mul_lo_u32 v3, v0, s6 8884; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 8885; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 8886; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 8887; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 8888; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 8889; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 8890; GFX6-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 8891; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 8892; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 8893; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 8894; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 8895; GFX6-NEXT: s_mov_b32 s9, s8 8896; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 8897; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc 8898; GFX6-NEXT: v_mov_b32_e32 v4, 0 8899; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc 8900; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8901; GFX6-NEXT: v_mov_b32_e32 v6, 0 8902; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 8903; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc 8904; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 8905; GFX6-NEXT: v_mul_lo_u32 v5, v2, s6 8906; GFX6-NEXT: v_mul_hi_u32 v7, v0, s6 8907; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5 8908; GFX6-NEXT: v_mul_lo_u32 v7, v0, s6 8909; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, v0, v5 8910; GFX6-NEXT: v_mul_lo_u32 v10, v0, v5 8911; GFX6-NEXT: v_mul_hi_u32 v12, v0, v5 8912; GFX6-NEXT: v_mul_hi_u32 v11, v0, v7 8913; GFX6-NEXT: v_mul_hi_u32 v9, v2, v7 8914; GFX6-NEXT: v_mul_lo_u32 v7, v2, v7 8915; GFX6-NEXT: v_mul_hi_u32 v8, v2, v5 8916; GFX6-NEXT: v_add_i32_e32 v10, vcc, v11, v10 8917; GFX6-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc 8918; GFX6-NEXT: v_mul_lo_u32 v2, v2, v5 8919; 
GFX6-NEXT: v_add_i32_e32 v7, vcc, v10, v7 8920; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc 8921; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc 8922; GFX6-NEXT: v_add_i32_e32 v2, vcc, v7, v2 8923; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc 8924; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 8925; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[0:1] 8926; GFX6-NEXT: s_add_u32 s0, s10, s8 8927; GFX6-NEXT: s_addc_u32 s1, s11, s8 8928; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 8929; GFX6-NEXT: s_xor_b64 s[0:1], s[0:1], s[8:9] 8930; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 8931; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 8932; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 8933; GFX6-NEXT: v_mul_hi_u32 v5, s0, v1 8934; GFX6-NEXT: v_mul_hi_u32 v7, s1, v1 8935; GFX6-NEXT: v_mul_lo_u32 v1, s1, v1 8936; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 8937; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 8938; GFX6-NEXT: v_mul_lo_u32 v5, s1, v0 8939; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 8940; GFX6-NEXT: s_movk_i32 s9, 0xfff 8941; GFX6-NEXT: s_mov_b32 s6, -1 8942; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 8943; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 8944; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc 8945; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 8946; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc 8947; GFX6-NEXT: v_mul_lo_u32 v4, v1, s9 8948; GFX6-NEXT: v_mul_hi_u32 v5, v0, s9 8949; GFX6-NEXT: v_add_i32_e32 v2, vcc, 2, v0 8950; GFX6-NEXT: v_mul_lo_u32 v8, v0, s9 8951; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 8952; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v0 8953; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc 8954; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 8955; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s0, v8 8956; GFX6-NEXT: v_mov_b32_e32 v5, s1 8957; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc 8958; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s9, v8 8959; GFX6-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v4, vcc 8960; GFX6-NEXT: s_movk_i32 s0, 0xffe 8961; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s0, v5 8962; 
GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 8963; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 8964; GFX6-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc 8965; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v8 8966; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 8967; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 8968; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 8969; GFX6-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[0:1] 8970; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 8971; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 8972; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 8973; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc 8974; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 8975; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 8976; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s8, v0 8977; GFX6-NEXT: v_xor_b32_e32 v1, s8, v1 8978; GFX6-NEXT: v_mov_b32_e32 v3, s8 8979; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc 8980; GFX6-NEXT: v_mov_b32_e32 v0, s2 8981; GFX6-NEXT: v_mov_b32_e32 v1, s3 8982; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 8983; GFX6-NEXT: s_endpgm 8984; 8985; GFX9-LABEL: ssdiv_v2i64_mixed_pow2k_denom: 8986; GFX9: ; %bb.0: 8987; GFX9-NEXT: v_mov_b32_e32 v0, 0x457ff000 8988; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000 8989; GFX9-NEXT: v_mac_f32_e32 v0, 0, v1 8990; GFX9-NEXT: v_rcp_f32_e32 v0, v0 8991; GFX9-NEXT: s_movk_i32 s8, 0xf001 8992; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 8993; GFX9-NEXT: v_mov_b32_e32 v4, 0 8994; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 8995; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 8996; GFX9-NEXT: v_trunc_f32_e32 v1, v1 8997; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 8998; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 8999; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 9000; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9001; GFX9-NEXT: s_ashr_i32 s2, s5, 31 9002; GFX9-NEXT: s_lshr_b32 s2, s2, 20 9003; GFX9-NEXT: v_mul_hi_u32 v2, v0, s8 9004; GFX9-NEXT: v_mul_lo_u32 v3, v1, s8 9005; GFX9-NEXT: v_mul_lo_u32 v5, v0, s8 9006; GFX9-NEXT: s_add_u32 s4, s4, s2 9007; GFX9-NEXT: s_addc_u32 s5, s5, 0 
9008; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 9009; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 9010; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 9011; GFX9-NEXT: v_mul_hi_u32 v6, v0, v5 9012; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 9013; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 9014; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 9015; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v6, v3 9016; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc 9017; GFX9-NEXT: v_mul_lo_u32 v7, v1, v5 9018; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5 9019; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 12 9020; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 9021; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc 9022; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v4, vcc 9023; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 9024; GFX9-NEXT: v_mov_b32_e32 v6, 0 9025; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 9026; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc 9027; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3] 9028; GFX9-NEXT: v_mul_lo_u32 v5, v2, s8 9029; GFX9-NEXT: v_mul_hi_u32 v7, v0, s8 9030; GFX9-NEXT: v_mul_lo_u32 v8, v0, s8 9031; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 9032; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 9033; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 9034; GFX9-NEXT: v_sub_u32_e32 v5, v5, v0 9035; GFX9-NEXT: v_mul_lo_u32 v10, v0, v5 9036; GFX9-NEXT: v_mul_hi_u32 v11, v0, v8 9037; GFX9-NEXT: v_mul_hi_u32 v12, v0, v5 9038; GFX9-NEXT: v_mul_hi_u32 v9, v2, v8 9039; GFX9-NEXT: v_mul_lo_u32 v8, v2, v8 9040; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 9041; GFX9-NEXT: v_mul_hi_u32 v7, v2, v5 9042; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v12, vcc 9043; GFX9-NEXT: v_mul_lo_u32 v2, v2, v5 9044; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v10, v8 9045; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v9, vcc 9046; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v4, vcc 9047; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 9048; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v5, vcc 9049; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[2:3] 9050; GFX9-NEXT: s_ashr_i32 
s2, s7, 31 9051; GFX9-NEXT: s_add_u32 s6, s6, s2 9052; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 9053; GFX9-NEXT: s_mov_b32 s3, s2 9054; GFX9-NEXT: s_addc_u32 s7, s7, s2 9055; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] 9056; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 9057; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 9058; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 9059; GFX9-NEXT: v_mul_hi_u32 v5, s6, v1 9060; GFX9-NEXT: v_mul_hi_u32 v7, s7, v1 9061; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 9062; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 9063; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc 9064; GFX9-NEXT: v_mul_lo_u32 v5, s7, v0 9065; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 9066; GFX9-NEXT: s_movk_i32 s0, 0xfff 9067; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 9068; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 9069; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v7, v4, vcc 9070; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 9071; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v2, vcc 9072; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 2, v0 9073; GFX9-NEXT: v_mul_lo_u32 v5, v1, s0 9074; GFX9-NEXT: v_mul_hi_u32 v6, v0, s0 9075; GFX9-NEXT: v_mul_lo_u32 v9, v0, s0 9076; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc 9077; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0 9078; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v1, vcc 9079; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 9080; GFX9-NEXT: v_mov_b32_e32 v6, s7 9081; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s6, v9 9082; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc 9083; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s0, v9 9084; GFX9-NEXT: v_subbrev_co_u32_e32 v10, vcc, 0, v5, vcc 9085; GFX9-NEXT: s_movk_i32 s0, 0xffe 9086; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s0, v6 9087; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 9088; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 9089; GFX9-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc 9090; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v9 9091; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 9092; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 9093; GFX9-NEXT: 
v_cmp_eq_u32_e64 s[0:1], 0, v5 9094; GFX9-NEXT: v_cndmask_b32_e64 v5, -1, v6, s[0:1] 9095; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 9096; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc 9097; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 9098; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc 9099; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 9100; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 9101; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s2, v0 9102; GFX9-NEXT: v_xor_b32_e32 v1, s2, v1 9103; GFX9-NEXT: v_mov_b32_e32 v3, s2 9104; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc 9105; GFX9-NEXT: v_mov_b32_e32 v0, s4 9106; GFX9-NEXT: v_mov_b32_e32 v1, s5 9107; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9108; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] 9109; GFX9-NEXT: s_endpgm 9110 %r = sdiv <2 x i64> %x, <i64 4096, i64 4095> 9111 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 9112 ret void 9113} 9114 9115define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 9116; CHECK-LABEL: @sdiv_v2i64_pow2_shl_denom( 9117; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 9118; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 9119; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 9120; CHECK-NEXT: [[TMP3:%.*]] = sdiv i64 [[TMP1]], [[TMP2]] 9121; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 9122; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 9123; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 9124; CHECK-NEXT: [[TMP7:%.*]] = sdiv i64 [[TMP5]], [[TMP6]] 9125; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 9126; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 9127; CHECK-NEXT: ret void 9128; 9129; GFX6-LABEL: sdiv_v2i64_pow2_shl_denom: 9130; GFX6: ; %bb.0: 9131; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11 9132; GFX6-NEXT: 
s_mov_b64 s[2:3], 0x1000 9133; GFX6-NEXT: s_mov_b32 s18, 0x4f800000 9134; GFX6-NEXT: s_mov_b32 s19, 0x5f7ffffc 9135; GFX6-NEXT: s_mov_b32 s20, 0x2f800000 9136; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9137; GFX6-NEXT: s_lshl_b64 s[12:13], s[2:3], s6 9138; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 9139; GFX6-NEXT: s_ashr_i32 s16, s3, 31 9140; GFX6-NEXT: s_add_u32 s2, s2, s16 9141; GFX6-NEXT: s_mov_b32 s17, s16 9142; GFX6-NEXT: s_addc_u32 s3, s3, s16 9143; GFX6-NEXT: s_xor_b64 s[14:15], s[2:3], s[16:17] 9144; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s14 9145; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s15 9146; GFX6-NEXT: s_mov_b32 s21, 0xcf800000 9147; GFX6-NEXT: s_sub_u32 s6, 0, s14 9148; GFX6-NEXT: s_subb_u32 s7, 0, s15 9149; GFX6-NEXT: v_mac_f32_e32 v0, s18, v1 9150; GFX6-NEXT: v_rcp_f32_e32 v0, v0 9151; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 9152; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 9153; GFX6-NEXT: v_mul_f32_e32 v0, s19, v0 9154; GFX6-NEXT: v_mul_f32_e32 v1, s20, v0 9155; GFX6-NEXT: v_trunc_f32_e32 v1, v1 9156; GFX6-NEXT: v_mac_f32_e32 v0, s21, v1 9157; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 9158; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 9159; GFX6-NEXT: v_mul_hi_u32 v3, s6, v0 9160; GFX6-NEXT: v_mul_lo_u32 v2, s6, v1 9161; GFX6-NEXT: v_mul_lo_u32 v4, s7, v0 9162; GFX6-NEXT: v_mul_lo_u32 v5, s6, v0 9163; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9164; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 9165; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 9166; GFX6-NEXT: v_mul_hi_u32 v4, v0, v5 9167; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 9168; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 9169; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 9170; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 9171; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc 9172; GFX6-NEXT: v_mul_lo_u32 v6, v1, v5 9173; GFX6-NEXT: v_mul_hi_u32 v5, v1, v5 9174; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 9175; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc 9176; GFX6-NEXT: v_mov_b32_e32 v4, 0 9177; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc 9178; GFX6-NEXT: 
v_add_i32_e32 v2, vcc, v3, v2 9179; GFX6-NEXT: v_mov_b32_e32 v6, 0 9180; GFX6-NEXT: v_add_i32_e64 v0, s[2:3], v0, v2 9181; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc 9182; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[2:3] 9183; GFX6-NEXT: v_mul_lo_u32 v5, s6, v2 9184; GFX6-NEXT: v_mul_hi_u32 v7, s6, v0 9185; GFX6-NEXT: v_mul_lo_u32 v8, s7, v0 9186; GFX6-NEXT: s_mov_b32 s7, 0xf000 9187; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5 9188; GFX6-NEXT: v_mul_lo_u32 v7, s6, v0 9189; GFX6-NEXT: v_add_i32_e32 v5, vcc, v8, v5 9190; GFX6-NEXT: v_mul_lo_u32 v10, v0, v5 9191; GFX6-NEXT: v_mul_hi_u32 v12, v0, v5 9192; GFX6-NEXT: v_mul_hi_u32 v11, v0, v7 9193; GFX6-NEXT: v_mul_hi_u32 v9, v2, v7 9194; GFX6-NEXT: v_mul_lo_u32 v7, v2, v7 9195; GFX6-NEXT: v_mul_hi_u32 v8, v2, v5 9196; GFX6-NEXT: v_add_i32_e32 v10, vcc, v11, v10 9197; GFX6-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc 9198; GFX6-NEXT: v_mul_lo_u32 v2, v2, v5 9199; GFX6-NEXT: v_add_i32_e32 v7, vcc, v10, v7 9200; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc 9201; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc 9202; GFX6-NEXT: v_add_i32_e32 v2, vcc, v7, v2 9203; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc 9204; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 9205; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[2:3] 9206; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9207; GFX6-NEXT: s_ashr_i32 s2, s9, 31 9208; GFX6-NEXT: s_add_u32 s0, s8, s2 9209; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 9210; GFX6-NEXT: s_mov_b32 s3, s2 9211; GFX6-NEXT: s_addc_u32 s1, s9, s2 9212; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[2:3] 9213; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 9214; GFX6-NEXT: v_mul_lo_u32 v2, s8, v1 9215; GFX6-NEXT: v_mul_hi_u32 v3, s8, v0 9216; GFX6-NEXT: v_mul_hi_u32 v5, s8, v1 9217; GFX6-NEXT: v_mul_hi_u32 v7, s9, v1 9218; GFX6-NEXT: v_mul_lo_u32 v1, s9, v1 9219; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9220; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 9221; GFX6-NEXT: v_mul_lo_u32 v5, s9, v0 9222; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0 
9223; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[16:17] 9224; GFX6-NEXT: s_mov_b32 s6, -1 9225; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 9226; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 9227; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc 9228; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 9229; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc 9230; GFX6-NEXT: v_mul_lo_u32 v2, s14, v1 9231; GFX6-NEXT: v_mul_hi_u32 v3, s14, v0 9232; GFX6-NEXT: v_mul_lo_u32 v5, s15, v0 9233; GFX6-NEXT: v_mov_b32_e32 v7, s15 9234; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9235; GFX6-NEXT: v_mul_lo_u32 v3, s14, v0 9236; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 9237; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s9, v2 9238; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s8, v3 9239; GFX6-NEXT: v_subb_u32_e64 v5, s[0:1], v5, v7, vcc 9240; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s14, v3 9241; GFX6-NEXT: v_subbrev_u32_e64 v5, s[0:1], 0, v5, s[0:1] 9242; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v5 9243; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] 9244; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v7 9245; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 9246; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v5 9247; GFX6-NEXT: v_cndmask_b32_e64 v5, v8, v7, s[0:1] 9248; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 2, v0 9249; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] 9250; GFX6-NEXT: v_add_i32_e64 v9, s[0:1], 1, v0 9251; GFX6-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v1, s[0:1] 9252; GFX6-NEXT: s_ashr_i32 s8, s13, 31 9253; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 9254; GFX6-NEXT: s_add_u32 s12, s12, s8 9255; GFX6-NEXT: v_cndmask_b32_e64 v5, v10, v8, s[0:1] 9256; GFX6-NEXT: v_mov_b32_e32 v8, s9 9257; GFX6-NEXT: s_mov_b32 s9, s8 9258; GFX6-NEXT: s_addc_u32 s13, s13, s8 9259; GFX6-NEXT: s_xor_b64 s[12:13], s[12:13], s[8:9] 9260; GFX6-NEXT: v_cvt_f32_u32_e32 v10, s12 9261; GFX6-NEXT: v_cvt_f32_u32_e32 v11, s13 9262; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v8, v2, vcc 9263; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s15, v2 9264; GFX6-NEXT: 
v_cndmask_b32_e64 v8, 0, -1, vcc 9265; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s14, v3 9266; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 9267; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s15, v2 9268; GFX6-NEXT: v_mac_f32_e32 v10, s18, v11 9269; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc 9270; GFX6-NEXT: v_rcp_f32_e32 v3, v10 9271; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 9272; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 9273; GFX6-NEXT: s_sub_u32 s14, 0, s12 9274; GFX6-NEXT: v_mul_f32_e32 v3, s19, v3 9275; GFX6-NEXT: v_mul_f32_e32 v5, s20, v3 9276; GFX6-NEXT: v_trunc_f32_e32 v5, v5 9277; GFX6-NEXT: v_mac_f32_e32 v3, s21, v5 9278; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 9279; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v5 9280; GFX6-NEXT: v_cndmask_b32_e64 v2, v9, v7, s[0:1] 9281; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 9282; GFX6-NEXT: v_mul_hi_u32 v2, s14, v3 9283; GFX6-NEXT: v_mul_lo_u32 v7, s14, v5 9284; GFX6-NEXT: s_subb_u32 s15, 0, s13 9285; GFX6-NEXT: v_mul_lo_u32 v8, s15, v3 9286; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 9287; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v7 9288; GFX6-NEXT: v_mul_lo_u32 v7, s14, v3 9289; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v8 9290; GFX6-NEXT: v_mul_lo_u32 v8, v3, v2 9291; GFX6-NEXT: v_mul_hi_u32 v10, v3, v2 9292; GFX6-NEXT: v_mul_hi_u32 v9, v3, v7 9293; GFX6-NEXT: v_mul_hi_u32 v11, v5, v2 9294; GFX6-NEXT: v_mul_lo_u32 v2, v5, v2 9295; GFX6-NEXT: v_xor_b32_e32 v1, s3, v1 9296; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 9297; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc 9298; GFX6-NEXT: v_mul_lo_u32 v10, v5, v7 9299; GFX6-NEXT: v_mul_hi_u32 v7, v5, v7 9300; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v10 9301; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc 9302; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v11, v4, vcc 9303; GFX6-NEXT: v_add_i32_e32 v2, vcc, v7, v2 9304; GFX6-NEXT: v_add_i32_e64 v2, s[0:1], v3, v2 9305; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v6, v8, vcc 9306; GFX6-NEXT: v_addc_u32_e64 v3, vcc, v5, v7, s[0:1] 9307; GFX6-NEXT: v_mul_lo_u32 v8, s14, v3 9308; 
GFX6-NEXT: v_mul_hi_u32 v9, s14, v2 9309; GFX6-NEXT: v_mul_lo_u32 v10, s15, v2 9310; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 9311; GFX6-NEXT: v_mul_lo_u32 v9, s14, v2 9312; GFX6-NEXT: v_add_i32_e32 v8, vcc, v10, v8 9313; GFX6-NEXT: v_mul_lo_u32 v12, v2, v8 9314; GFX6-NEXT: v_mul_hi_u32 v14, v2, v8 9315; GFX6-NEXT: v_mul_hi_u32 v13, v2, v9 9316; GFX6-NEXT: v_mul_hi_u32 v11, v3, v9 9317; GFX6-NEXT: v_mul_lo_u32 v9, v3, v9 9318; GFX6-NEXT: v_mul_hi_u32 v10, v3, v8 9319; GFX6-NEXT: v_add_i32_e32 v12, vcc, v13, v12 9320; GFX6-NEXT: v_addc_u32_e32 v13, vcc, 0, v14, vcc 9321; GFX6-NEXT: v_mul_lo_u32 v3, v3, v8 9322; GFX6-NEXT: v_add_i32_e32 v9, vcc, v12, v9 9323; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v13, v11, vcc 9324; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v10, v4, vcc 9325; GFX6-NEXT: v_add_i32_e32 v3, vcc, v9, v3 9326; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v6, v8, vcc 9327; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v7 9328; GFX6-NEXT: s_ashr_i32 s14, s11, 31 9329; GFX6-NEXT: v_addc_u32_e64 v5, vcc, v5, v8, s[0:1] 9330; GFX6-NEXT: s_add_u32 s0, s10, s14 9331; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 9332; GFX6-NEXT: s_mov_b32 s15, s14 9333; GFX6-NEXT: s_addc_u32 s1, s11, s14 9334; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] 9335; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 9336; GFX6-NEXT: v_mul_lo_u32 v5, s10, v3 9337; GFX6-NEXT: v_mul_hi_u32 v7, s10, v2 9338; GFX6-NEXT: v_mul_hi_u32 v9, s10, v3 9339; GFX6-NEXT: v_mul_hi_u32 v10, s11, v3 9340; GFX6-NEXT: v_mul_lo_u32 v3, s11, v3 9341; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5 9342; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc 9343; GFX6-NEXT: v_mul_lo_u32 v9, s11, v2 9344; GFX6-NEXT: v_mul_hi_u32 v2, s11, v2 9345; GFX6-NEXT: v_mov_b32_e32 v8, s3 9346; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v9 9347; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v7, v2, vcc 9348; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v10, v4, vcc 9349; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 9350; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v4, vcc 9351; GFX6-NEXT: v_mul_lo_u32 
v4, s12, v3 9352; GFX6-NEXT: v_mul_hi_u32 v5, s12, v2 9353; GFX6-NEXT: v_mul_lo_u32 v6, s13, v2 9354; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 9355; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc 9356; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 9357; GFX6-NEXT: v_mul_lo_u32 v5, s12, v2 9358; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 9359; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s11, v4 9360; GFX6-NEXT: v_mov_b32_e32 v7, s13 9361; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s10, v5 9362; GFX6-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v7, vcc 9363; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s12, v5 9364; GFX6-NEXT: v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1] 9365; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v6 9366; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] 9367; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v7 9368; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 9369; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v6 9370; GFX6-NEXT: v_cndmask_b32_e64 v6, v8, v7, s[0:1] 9371; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 2, v2 9372; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1] 9373; GFX6-NEXT: v_add_i32_e64 v9, s[0:1], 1, v2 9374; GFX6-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] 9375; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 9376; GFX6-NEXT: v_cndmask_b32_e64 v6, v10, v8, s[0:1] 9377; GFX6-NEXT: v_mov_b32_e32 v8, s11 9378; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v8, v4, vcc 9379; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v4 9380; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 9381; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v5 9382; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 9383; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s13, v4 9384; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc 9385; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 9386; GFX6-NEXT: v_cndmask_b32_e64 v4, v9, v7, s[0:1] 9387; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 9388; GFX6-NEXT: s_xor_b64 s[0:1], s[14:15], s[8:9] 9389; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 9390; GFX6-NEXT: v_xor_b32_e32 v2, s0, v2 9391; GFX6-NEXT: v_xor_b32_e32 v3, s1, v3 9392; 
GFX6-NEXT: v_mov_b32_e32 v4, s1 9393; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s0, v2 9394; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc 9395; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 9396; GFX6-NEXT: s_endpgm 9397; 9398; GFX9-LABEL: sdiv_v2i64_pow2_shl_denom: 9399; GFX9: ; %bb.0: 9400; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 9401; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 9402; GFX9-NEXT: s_mov_b32 s16, 0x4f800000 9403; GFX9-NEXT: s_mov_b32 s17, 0x5f7ffffc 9404; GFX9-NEXT: s_mov_b32 s18, 0x2f800000 9405; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9406; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s6 9407; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 9408; GFX9-NEXT: s_ashr_i32 s12, s3, 31 9409; GFX9-NEXT: s_add_u32 s2, s2, s12 9410; GFX9-NEXT: s_mov_b32 s13, s12 9411; GFX9-NEXT: s_addc_u32 s3, s3, s12 9412; GFX9-NEXT: s_xor_b64 s[10:11], s[2:3], s[12:13] 9413; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s10 9414; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s11 9415; GFX9-NEXT: s_mov_b32 s19, 0xcf800000 9416; GFX9-NEXT: s_sub_u32 s14, 0, s10 9417; GFX9-NEXT: s_subb_u32 s4, 0, s11 9418; GFX9-NEXT: v_mac_f32_e32 v0, s16, v1 9419; GFX9-NEXT: v_rcp_f32_e32 v0, v0 9420; GFX9-NEXT: v_mov_b32_e32 v6, 0 9421; GFX9-NEXT: v_mul_f32_e32 v0, s17, v0 9422; GFX9-NEXT: v_mul_f32_e32 v1, s18, v0 9423; GFX9-NEXT: v_trunc_f32_e32 v1, v1 9424; GFX9-NEXT: v_mac_f32_e32 v0, s19, v1 9425; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 9426; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 9427; GFX9-NEXT: v_mul_hi_u32 v3, s14, v0 9428; GFX9-NEXT: v_mul_lo_u32 v2, s14, v1 9429; GFX9-NEXT: v_mul_lo_u32 v5, s4, v0 9430; GFX9-NEXT: v_mul_lo_u32 v4, s14, v0 9431; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 9432; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 9433; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 9434; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 9435; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 9436; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 9437; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 9438; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 9439; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc 
9440; GFX9-NEXT: v_mul_lo_u32 v7, v1, v4 9441; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 9442; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 9443; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc 9444; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v6, vcc 9445; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 9446; GFX9-NEXT: v_mov_b32_e32 v5, 0 9447; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 9448; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc 9449; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3] 9450; GFX9-NEXT: v_mul_lo_u32 v4, s14, v2 9451; GFX9-NEXT: v_mul_hi_u32 v7, s14, v0 9452; GFX9-NEXT: v_mul_lo_u32 v8, s4, v0 9453; GFX9-NEXT: v_mul_lo_u32 v9, s14, v0 9454; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 9455; GFX9-NEXT: v_add_u32_e32 v4, v7, v4 9456; GFX9-NEXT: v_add_u32_e32 v4, v4, v8 9457; GFX9-NEXT: v_mul_lo_u32 v10, v0, v4 9458; GFX9-NEXT: v_mul_hi_u32 v11, v0, v9 9459; GFX9-NEXT: v_mul_hi_u32 v12, v0, v4 9460; GFX9-NEXT: v_mul_hi_u32 v8, v2, v9 9461; GFX9-NEXT: v_mul_lo_u32 v9, v2, v9 9462; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 9463; GFX9-NEXT: v_mul_hi_u32 v7, v2, v4 9464; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v12, vcc 9465; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 9466; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 9467; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v8, vcc 9468; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v6, vcc 9469; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 9470; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9471; GFX9-NEXT: s_ashr_i32 s14, s5, 31 9472; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v5, v4, vcc 9473; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 9474; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3] 9475; GFX9-NEXT: s_add_u32 s2, s4, s14 9476; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 9477; GFX9-NEXT: s_mov_b32 s15, s14 9478; GFX9-NEXT: s_addc_u32 s3, s5, s14 9479; GFX9-NEXT: s_xor_b64 s[4:5], s[2:3], s[14:15] 9480; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 9481; GFX9-NEXT: v_mul_lo_u32 v2, s4, v1 9482; GFX9-NEXT: v_mul_hi_u32 v3, 
s4, v0 9483; GFX9-NEXT: v_mul_hi_u32 v4, s4, v1 9484; GFX9-NEXT: v_mul_hi_u32 v7, s5, v1 9485; GFX9-NEXT: v_mul_lo_u32 v1, s5, v1 9486; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 9487; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 9488; GFX9-NEXT: v_mul_lo_u32 v4, s5, v0 9489; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0 9490; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 9491; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 9492; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 9493; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v7, v6, vcc 9494; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 9495; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v2, vcc 9496; GFX9-NEXT: v_mul_lo_u32 v2, s10, v1 9497; GFX9-NEXT: v_mul_hi_u32 v3, s10, v0 9498; GFX9-NEXT: v_mul_lo_u32 v4, s11, v0 9499; GFX9-NEXT: v_mov_b32_e32 v7, s11 9500; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 9501; GFX9-NEXT: v_mul_lo_u32 v3, s10, v0 9502; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 9503; GFX9-NEXT: v_sub_u32_e32 v4, s5, v2 9504; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s4, v3 9505; GFX9-NEXT: v_subb_co_u32_e64 v4, s[0:1], v4, v7, vcc 9506; GFX9-NEXT: v_subrev_co_u32_e64 v7, s[0:1], s10, v3 9507; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1] 9508; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v4 9509; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] 9510; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v7 9511; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 9512; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v4 9513; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, v7, s[0:1] 9514; GFX9-NEXT: v_add_co_u32_e64 v7, s[0:1], 2, v0 9515; GFX9-NEXT: v_addc_co_u32_e64 v8, s[0:1], 0, v1, s[0:1] 9516; GFX9-NEXT: v_add_co_u32_e64 v9, s[0:1], 1, v0 9517; GFX9-NEXT: v_addc_co_u32_e64 v10, s[0:1], 0, v1, s[0:1] 9518; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 9519; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, v8, s[0:1] 9520; GFX9-NEXT: v_mov_b32_e32 v8, s5 9521; GFX9-NEXT: s_xor_b64 s[4:5], s[14:15], s[12:13] 9522; GFX9-NEXT: s_ashr_i32 s12, s9, 31 9523; GFX9-NEXT: s_add_u32 
s8, s8, s12 9524; GFX9-NEXT: s_mov_b32 s13, s12 9525; GFX9-NEXT: s_addc_u32 s9, s9, s12 9526; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] 9527; GFX9-NEXT: v_cvt_f32_u32_e32 v10, s8 9528; GFX9-NEXT: v_cvt_f32_u32_e32 v11, s9 9529; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v8, v2, vcc 9530; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v2 9531; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 9532; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 9533; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 9534; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s11, v2 9535; GFX9-NEXT: v_mac_f32_e32 v10, s16, v11 9536; GFX9-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc 9537; GFX9-NEXT: v_rcp_f32_e32 v3, v10 9538; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 9539; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 9540; GFX9-NEXT: s_sub_u32 s10, 0, s8 9541; GFX9-NEXT: v_mul_f32_e32 v3, s17, v3 9542; GFX9-NEXT: v_mul_f32_e32 v4, s18, v3 9543; GFX9-NEXT: v_trunc_f32_e32 v4, v4 9544; GFX9-NEXT: v_mac_f32_e32 v3, s19, v4 9545; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 9546; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 9547; GFX9-NEXT: v_cndmask_b32_e64 v2, v9, v7, s[0:1] 9548; GFX9-NEXT: s_subb_u32 s11, 0, s9 9549; GFX9-NEXT: v_mul_lo_u32 v8, s10, v4 9550; GFX9-NEXT: v_mul_hi_u32 v7, s10, v3 9551; GFX9-NEXT: v_mul_lo_u32 v9, s11, v3 9552; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 9553; GFX9-NEXT: v_mul_lo_u32 v2, s10, v3 9554; GFX9-NEXT: v_add_u32_e32 v7, v7, v8 9555; GFX9-NEXT: v_add_u32_e32 v7, v7, v9 9556; GFX9-NEXT: v_mul_lo_u32 v8, v3, v7 9557; GFX9-NEXT: v_mul_hi_u32 v9, v3, v2 9558; GFX9-NEXT: v_mul_hi_u32 v10, v3, v7 9559; GFX9-NEXT: v_mul_hi_u32 v11, v4, v7 9560; GFX9-NEXT: v_mul_lo_u32 v7, v4, v7 9561; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 9562; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v10, vcc 9563; GFX9-NEXT: v_mul_lo_u32 v10, v4, v2 9564; GFX9-NEXT: v_mul_hi_u32 v2, v4, v2 9565; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 9566; GFX9-NEXT: v_xor_b32_e32 v1, s5, v1 9567; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10 9568; GFX9-NEXT: 
v_addc_co_u32_e32 v2, vcc, v9, v2, vcc 9569; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v6, vcc 9570; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 9571; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], v3, v2 9572; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v5, v8, vcc 9573; GFX9-NEXT: v_addc_co_u32_e64 v3, vcc, v4, v7, s[0:1] 9574; GFX9-NEXT: v_mul_lo_u32 v8, s10, v3 9575; GFX9-NEXT: v_mul_hi_u32 v9, s10, v2 9576; GFX9-NEXT: v_mul_lo_u32 v10, s11, v2 9577; GFX9-NEXT: v_mul_lo_u32 v11, s10, v2 9578; GFX9-NEXT: s_ashr_i32 s10, s7, 31 9579; GFX9-NEXT: v_add_u32_e32 v8, v9, v8 9580; GFX9-NEXT: v_add_u32_e32 v8, v8, v10 9581; GFX9-NEXT: v_mul_lo_u32 v12, v2, v8 9582; GFX9-NEXT: v_mul_hi_u32 v13, v2, v11 9583; GFX9-NEXT: v_mul_hi_u32 v14, v2, v8 9584; GFX9-NEXT: v_mul_hi_u32 v10, v3, v11 9585; GFX9-NEXT: v_mul_lo_u32 v11, v3, v11 9586; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v13, v12 9587; GFX9-NEXT: v_mul_hi_u32 v9, v3, v8 9588; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v14, vcc 9589; GFX9-NEXT: v_mul_lo_u32 v3, v3, v8 9590; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v12, v11 9591; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v13, v10, vcc 9592; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v9, v6, vcc 9593; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v10, v3 9594; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v5, v8, vcc 9595; GFX9-NEXT: v_add_u32_e32 v4, v4, v7 9596; GFX9-NEXT: v_addc_co_u32_e64 v4, vcc, v4, v8, s[0:1] 9597; GFX9-NEXT: s_add_u32 s0, s6, s10 9598; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 9599; GFX9-NEXT: s_mov_b32 s11, s10 9600; GFX9-NEXT: s_addc_u32 s1, s7, s10 9601; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] 9602; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 9603; GFX9-NEXT: v_mul_lo_u32 v4, s6, v3 9604; GFX9-NEXT: v_mul_hi_u32 v7, s6, v2 9605; GFX9-NEXT: v_mul_hi_u32 v9, s6, v3 9606; GFX9-NEXT: v_mul_hi_u32 v10, s7, v3 9607; GFX9-NEXT: v_mul_lo_u32 v3, s7, v3 9608; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v7, v4 9609; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v9, vcc 9610; GFX9-NEXT: 
v_mul_lo_u32 v9, s7, v2 9611; GFX9-NEXT: v_mul_hi_u32 v2, s7, v2 9612; GFX9-NEXT: v_mov_b32_e32 v8, s5 9613; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9 9614; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v7, v2, vcc 9615; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v10, v6, vcc 9616; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 9617; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc 9618; GFX9-NEXT: v_mul_lo_u32 v4, s8, v3 9619; GFX9-NEXT: v_mul_hi_u32 v5, s8, v2 9620; GFX9-NEXT: v_mul_lo_u32 v7, s9, v2 9621; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s4, v0 9622; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v8, vcc 9623; GFX9-NEXT: v_add_u32_e32 v4, v5, v4 9624; GFX9-NEXT: v_mul_lo_u32 v5, s8, v2 9625; GFX9-NEXT: v_add_u32_e32 v4, v4, v7 9626; GFX9-NEXT: v_sub_u32_e32 v7, s7, v4 9627; GFX9-NEXT: v_mov_b32_e32 v8, s9 9628; GFX9-NEXT: v_sub_co_u32_e32 v5, vcc, s6, v5 9629; GFX9-NEXT: v_subb_co_u32_e64 v7, s[0:1], v7, v8, vcc 9630; GFX9-NEXT: v_subrev_co_u32_e64 v8, s[0:1], s8, v5 9631; GFX9-NEXT: v_subbrev_co_u32_e64 v7, s[0:1], 0, v7, s[0:1] 9632; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v7 9633; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] 9634; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8 9635; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] 9636; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v7 9637; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v8, s[0:1] 9638; GFX9-NEXT: v_add_co_u32_e64 v8, s[0:1], 2, v2 9639; GFX9-NEXT: v_addc_co_u32_e64 v9, s[0:1], 0, v3, s[0:1] 9640; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v2 9641; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1] 9642; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 9643; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v9, s[0:1] 9644; GFX9-NEXT: v_mov_b32_e32 v9, s7 9645; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v9, v4, vcc 9646; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v4 9647; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc 9648; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v5 9649; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 9650; GFX9-NEXT: v_cmp_eq_u32_e32 
vcc, s9, v4 9651; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc 9652; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 9653; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, v8, s[0:1] 9654; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 9655; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], s[12:13] 9656; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc 9657; GFX9-NEXT: v_xor_b32_e32 v2, s0, v2 9658; GFX9-NEXT: v_xor_b32_e32 v3, s1, v3 9659; GFX9-NEXT: v_mov_b32_e32 v4, s1 9660; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v2 9661; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc 9662; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9663; GFX9-NEXT: global_store_dwordx4 v6, v[0:3], s[2:3] 9664; GFX9-NEXT: s_endpgm 9665 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 9666 %r = sdiv <2 x i64> %x, %shl.y 9667 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 9668 ret void 9669} 9670 9671define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 9672; CHECK-LABEL: @srem_i64_oddk_denom( 9673; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], 1235195 9674; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 9675; CHECK-NEXT: ret void 9676; 9677; GFX6-LABEL: srem_i64_oddk_denom: 9678; GFX6: ; %bb.0: 9679; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f800000 9680; GFX6-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 9681; GFX6-NEXT: v_rcp_f32_e32 v0, v0 9682; GFX6-NEXT: s_mov_b32 s2, 0xffed2705 9683; GFX6-NEXT: v_mov_b32_e32 v8, 0 9684; GFX6-NEXT: v_mov_b32_e32 v7, 0 9685; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 9686; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 9687; GFX6-NEXT: v_trunc_f32_e32 v1, v1 9688; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 9689; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 9690; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 9691; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 9692; GFX6-NEXT: s_mov_b32 s7, 0xf000 9693; GFX6-NEXT: v_mul_hi_u32 v3, v0, s2 9694; GFX6-NEXT: v_mul_lo_u32 v2, v1, s2 9695; GFX6-NEXT: v_mul_lo_u32 v4, v0, s2 9696; GFX6-NEXT: s_mov_b32 s6, -1 9697; GFX6-NEXT: s_waitcnt 
lgkmcnt(0) 9698; GFX6-NEXT: s_mov_b32 s4, s8 9699; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9700; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 9701; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 9702; GFX6-NEXT: v_mul_hi_u32 v6, v0, v4 9703; GFX6-NEXT: v_mul_hi_u32 v3, v0, v2 9704; GFX6-NEXT: v_mul_hi_u32 v9, v1, v2 9705; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 9706; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 9707; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 9708; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 9709; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v3, vcc 9710; GFX6-NEXT: s_mov_b32 s5, s9 9711; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 9712; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc 9713; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc 9714; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9715; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 9716; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 9717; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 9718; GFX6-NEXT: v_mul_lo_u32 v4, v2, s2 9719; GFX6-NEXT: v_mul_hi_u32 v5, v0, s2 9720; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 9721; GFX6-NEXT: v_mul_lo_u32 v5, v0, s2 9722; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, v0, v4 9723; GFX6-NEXT: v_mul_lo_u32 v10, v0, v4 9724; GFX6-NEXT: v_mul_hi_u32 v12, v0, v4 9725; GFX6-NEXT: v_mul_hi_u32 v11, v0, v5 9726; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5 9727; GFX6-NEXT: v_mul_lo_u32 v5, v2, v5 9728; GFX6-NEXT: v_mul_hi_u32 v6, v2, v4 9729; GFX6-NEXT: v_add_i32_e32 v10, vcc, v11, v10 9730; GFX6-NEXT: v_addc_u32_e32 v11, vcc, v8, v12, vcc 9731; GFX6-NEXT: v_mul_lo_u32 v2, v2, v4 9732; GFX6-NEXT: v_add_i32_e32 v5, vcc, v10, v5 9733; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v11, v9, vcc 9734; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v6, v7, vcc 9735; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 9736; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc 9737; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 9738; GFX6-NEXT: s_ashr_i32 s2, s11, 31 9739; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] 9740; GFX6-NEXT: s_add_u32 s0, s10, s2 9741; GFX6-NEXT: 
v_add_i32_e32 v0, vcc, v0, v2 9742; GFX6-NEXT: s_mov_b32 s3, s2 9743; GFX6-NEXT: s_addc_u32 s1, s11, s2 9744; GFX6-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] 9745; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 9746; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 9747; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 9748; GFX6-NEXT: v_mul_hi_u32 v4, s0, v1 9749; GFX6-NEXT: v_mul_hi_u32 v5, s1, v1 9750; GFX6-NEXT: v_mul_lo_u32 v1, s1, v1 9751; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9752; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 9753; GFX6-NEXT: v_mul_lo_u32 v4, s1, v0 9754; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 9755; GFX6-NEXT: s_mov_b32 s3, 0x12d8fb 9756; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 9757; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 9758; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc 9759; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 9760; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc 9761; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3 9762; GFX6-NEXT: v_mul_lo_u32 v1, v1, s3 9763; GFX6-NEXT: v_mul_lo_u32 v0, v0, s3 9764; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 9765; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 9766; GFX6-NEXT: v_mov_b32_e32 v2, s1 9767; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 9768; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v0 9769; GFX6-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v1, vcc 9770; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s3, v2 9771; GFX6-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v3, vcc 9772; GFX6-NEXT: s_mov_b32 s0, 0x12d8fa 9773; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s0, v2 9774; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 9775; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 9776; GFX6-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc 9777; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 9778; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v0 9779; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 9780; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 9781; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 9782; GFX6-NEXT: v_cndmask_b32_e64 v5, -1, v5, s[0:1] 9783; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 9784; 
GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 9785; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 9786; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 9787; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 9788; GFX6-NEXT: v_xor_b32_e32 v1, s2, v1 9789; GFX6-NEXT: v_mov_b32_e32 v2, s2 9790; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 9791; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 9792; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 9793; GFX6-NEXT: s_endpgm 9794; 9795; GFX9-LABEL: srem_i64_oddk_denom: 9796; GFX9: ; %bb.0: 9797; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f800000 9798; GFX9-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 9799; GFX9-NEXT: v_rcp_f32_e32 v0, v0 9800; GFX9-NEXT: s_mov_b32 s8, 0xffed2705 9801; GFX9-NEXT: v_mov_b32_e32 v7, 0 9802; GFX9-NEXT: v_mov_b32_e32 v5, 0 9803; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 9804; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 9805; GFX9-NEXT: v_trunc_f32_e32 v1, v1 9806; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 9807; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 9808; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 9809; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 9810; GFX9-NEXT: v_mul_hi_u32 v3, v0, s8 9811; GFX9-NEXT: v_mul_lo_u32 v2, v1, s8 9812; GFX9-NEXT: v_mul_lo_u32 v4, v0, s8 9813; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 9814; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 9815; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 9816; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 9817; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 9818; GFX9-NEXT: v_mul_hi_u32 v9, v1, v2 9819; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 9820; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 9821; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v8, vcc 9822; GFX9-NEXT: v_mul_lo_u32 v8, v1, v4 9823; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 9824; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8 9825; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc 9826; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc 9827; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 9828; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 9829; GFX9-NEXT: 
v_addc_co_u32_e32 v3, vcc, v7, v4, vcc 9830; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3] 9831; GFX9-NEXT: v_mul_lo_u32 v4, v2, s8 9832; GFX9-NEXT: v_mul_hi_u32 v6, v0, s8 9833; GFX9-NEXT: v_mul_lo_u32 v8, v0, s8 9834; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 9835; GFX9-NEXT: v_add_u32_e32 v4, v6, v4 9836; GFX9-NEXT: v_sub_u32_e32 v4, v4, v0 9837; GFX9-NEXT: v_mul_lo_u32 v10, v0, v4 9838; GFX9-NEXT: v_mul_hi_u32 v11, v0, v8 9839; GFX9-NEXT: v_mul_hi_u32 v12, v0, v4 9840; GFX9-NEXT: v_mul_hi_u32 v9, v2, v8 9841; GFX9-NEXT: v_mul_lo_u32 v8, v2, v8 9842; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 9843; GFX9-NEXT: v_mul_hi_u32 v6, v2, v4 9844; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v7, v12, vcc 9845; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 9846; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v10, v8 9847; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v9, vcc 9848; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v5, vcc 9849; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 9850; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc 9851; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3] 9852; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9853; GFX9-NEXT: s_ashr_i32 s2, s7, 31 9854; GFX9-NEXT: s_add_u32 s0, s6, s2 9855; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 9856; GFX9-NEXT: s_mov_b32 s3, s2 9857; GFX9-NEXT: s_addc_u32 s1, s7, s2 9858; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] 9859; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 9860; GFX9-NEXT: v_mul_lo_u32 v2, s0, v1 9861; GFX9-NEXT: v_mul_hi_u32 v3, s0, v0 9862; GFX9-NEXT: v_mul_hi_u32 v4, s0, v1 9863; GFX9-NEXT: v_mul_hi_u32 v6, s1, v1 9864; GFX9-NEXT: v_mul_lo_u32 v1, s1, v1 9865; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 9866; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v4, vcc 9867; GFX9-NEXT: v_mul_lo_u32 v4, s1, v0 9868; GFX9-NEXT: v_mul_hi_u32 v0, s1, v0 9869; GFX9-NEXT: s_mov_b32 s3, 0x12d8fb 9870; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 9871; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 9872; GFX9-NEXT: v_addc_co_u32_e32 v2, 
vcc, v6, v5, vcc 9873; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 9874; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v2, vcc 9875; GFX9-NEXT: v_mul_hi_u32 v2, v0, s3 9876; GFX9-NEXT: v_mul_lo_u32 v1, v1, s3 9877; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 9878; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 9879; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 9880; GFX9-NEXT: v_mov_b32_e32 v2, s1 9881; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 9882; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s3, v0 9883; GFX9-NEXT: v_subbrev_co_u32_e32 v3, vcc, 0, v1, vcc 9884; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s3, v2 9885; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v3, vcc 9886; GFX9-NEXT: s_mov_b32 s0, 0x12d8fa 9887; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s0, v2 9888; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 9889; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 9890; GFX9-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc 9891; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 9892; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v0 9893; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 9894; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 9895; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 9896; GFX9-NEXT: v_cndmask_b32_e64 v6, -1, v6, s[0:1] 9897; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 9898; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 9899; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 9900; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 9901; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 9902; GFX9-NEXT: v_xor_b32_e32 v1, s2, v1 9903; GFX9-NEXT: v_mov_b32_e32 v2, s2 9904; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s2, v0 9905; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc 9906; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] 9907; GFX9-NEXT: s_endpgm 9908 %r = srem i64 %x, 1235195 9909 store i64 %r, i64 addrspace(1)* %out 9910 ret void 9911} 9912 9913define amdgpu_kernel void @srem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 9914; CHECK-LABEL: @srem_i64_pow2k_denom( 9915; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], 4096 9916; 
CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 9917; CHECK-NEXT: ret void 9918; 9919; GFX6-LABEL: srem_i64_pow2k_denom: 9920; GFX6: ; %bb.0: 9921; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 9922; GFX6-NEXT: s_mov_b32 s3, 0xf000 9923; GFX6-NEXT: s_mov_b32 s2, -1 9924; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9925; GFX6-NEXT: s_mov_b32 s0, s4 9926; GFX6-NEXT: s_ashr_i32 s4, s7, 31 9927; GFX6-NEXT: s_lshr_b32 s4, s4, 20 9928; GFX6-NEXT: s_add_u32 s4, s6, s4 9929; GFX6-NEXT: s_mov_b32 s1, s5 9930; GFX6-NEXT: s_addc_u32 s5, s7, 0 9931; GFX6-NEXT: s_and_b32 s4, s4, 0xfffff000 9932; GFX6-NEXT: s_sub_u32 s4, s6, s4 9933; GFX6-NEXT: s_subb_u32 s5, s7, s5 9934; GFX6-NEXT: v_mov_b32_e32 v0, s4 9935; GFX6-NEXT: v_mov_b32_e32 v1, s5 9936; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 9937; GFX6-NEXT: s_endpgm 9938; 9939; GFX9-LABEL: srem_i64_pow2k_denom: 9940; GFX9: ; %bb.0: 9941; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 9942; GFX9-NEXT: v_mov_b32_e32 v2, 0 9943; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9944; GFX9-NEXT: s_ashr_i32 s4, s3, 31 9945; GFX9-NEXT: s_lshr_b32 s4, s4, 20 9946; GFX9-NEXT: s_add_u32 s4, s2, s4 9947; GFX9-NEXT: s_addc_u32 s5, s3, 0 9948; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000 9949; GFX9-NEXT: s_sub_u32 s2, s2, s4 9950; GFX9-NEXT: s_subb_u32 s3, s3, s5 9951; GFX9-NEXT: v_mov_b32_e32 v0, s2 9952; GFX9-NEXT: v_mov_b32_e32 v1, s3 9953; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 9954; GFX9-NEXT: s_endpgm 9955 %r = srem i64 %x, 4096 9956 store i64 %r, i64 addrspace(1)* %out 9957 ret void 9958} 9959 9960define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 9961; CHECK-LABEL: @srem_i64_pow2_shl_denom( 9962; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 9963; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], [[SHL_Y]] 9964; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 9965; CHECK-NEXT: ret void 9966; 9967; GFX6-LABEL: srem_i64_pow2_shl_denom: 9968; GFX6: ; %bb.0: 9969; 
GFX6-NEXT: s_load_dword s4, s[0:1], 0xd 9970; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000 9971; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 9972; GFX6-NEXT: s_mov_b32 s7, 0xf000 9973; GFX6-NEXT: s_mov_b32 s6, -1 9974; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9975; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 9976; GFX6-NEXT: s_ashr_i32 s4, s3, 31 9977; GFX6-NEXT: s_add_u32 s2, s2, s4 9978; GFX6-NEXT: s_mov_b32 s5, s4 9979; GFX6-NEXT: s_addc_u32 s3, s3, s4 9980; GFX6-NEXT: s_xor_b64 s[12:13], s[2:3], s[4:5] 9981; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s12 9982; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s13 9983; GFX6-NEXT: s_sub_u32 s2, 0, s12 9984; GFX6-NEXT: s_subb_u32 s3, 0, s13 9985; GFX6-NEXT: s_ashr_i32 s14, s11, 31 9986; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 9987; GFX6-NEXT: v_rcp_f32_e32 v0, v0 9988; GFX6-NEXT: s_mov_b32 s15, s14 9989; GFX6-NEXT: s_mov_b32 s4, s8 9990; GFX6-NEXT: s_mov_b32 s5, s9 9991; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 9992; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 9993; GFX6-NEXT: v_trunc_f32_e32 v1, v1 9994; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 9995; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 9996; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 9997; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 9998; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 9999; GFX6-NEXT: v_mul_lo_u32 v5, s3, v0 10000; GFX6-NEXT: v_mul_lo_u32 v4, s2, v0 10001; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10002; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 10003; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 10004; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 10005; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 10006; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 10007; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 10008; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 10009; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 10010; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 10011; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 10012; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 10013; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc 10014; GFX6-NEXT: v_mov_b32_e32 v4, 0 10015; GFX6-NEXT: v_addc_u32_e32 v5, 
vcc, v7, v4, vcc 10016; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10017; GFX6-NEXT: v_mov_b32_e32 v6, 0 10018; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 10019; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc 10020; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 10021; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 10022; GFX6-NEXT: v_mul_hi_u32 v7, s2, v0 10023; GFX6-NEXT: v_mul_lo_u32 v8, s3, v0 10024; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5 10025; GFX6-NEXT: v_mul_lo_u32 v7, s2, v0 10026; GFX6-NEXT: v_add_i32_e32 v5, vcc, v8, v5 10027; GFX6-NEXT: v_mul_lo_u32 v10, v0, v5 10028; GFX6-NEXT: v_mul_hi_u32 v12, v0, v5 10029; GFX6-NEXT: v_mul_hi_u32 v11, v0, v7 10030; GFX6-NEXT: v_mul_hi_u32 v9, v2, v7 10031; GFX6-NEXT: v_mul_lo_u32 v7, v2, v7 10032; GFX6-NEXT: v_mul_hi_u32 v8, v2, v5 10033; GFX6-NEXT: v_add_i32_e32 v10, vcc, v11, v10 10034; GFX6-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc 10035; GFX6-NEXT: v_mul_lo_u32 v2, v2, v5 10036; GFX6-NEXT: v_add_i32_e32 v7, vcc, v10, v7 10037; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc 10038; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc 10039; GFX6-NEXT: v_add_i32_e32 v2, vcc, v7, v2 10040; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc 10041; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 10042; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[0:1] 10043; GFX6-NEXT: s_add_u32 s0, s10, s14 10044; GFX6-NEXT: s_addc_u32 s1, s11, s14 10045; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 10046; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] 10047; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 10048; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 10049; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 10050; GFX6-NEXT: v_mul_hi_u32 v5, s10, v1 10051; GFX6-NEXT: v_mul_hi_u32 v7, s11, v1 10052; GFX6-NEXT: v_mul_lo_u32 v1, s11, v1 10053; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10054; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 10055; GFX6-NEXT: v_mul_lo_u32 v5, s11, v0 10056; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0 10057; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 10058; 
GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 10059; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc 10060; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 10061; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc 10062; GFX6-NEXT: v_mul_lo_u32 v1, s12, v1 10063; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0 10064; GFX6-NEXT: v_mul_lo_u32 v3, s13, v0 10065; GFX6-NEXT: v_mul_lo_u32 v0, s12, v0 10066; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 10067; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 10068; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s11, v1 10069; GFX6-NEXT: v_mov_b32_e32 v3, s13 10070; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s10, v0 10071; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc 10072; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s12, v0 10073; GFX6-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] 10074; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s13, v5 10075; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] 10076; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] 10077; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v4 10078; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s12, v4 10079; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 10080; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s13, v5 10081; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] 10082; GFX6-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] 10083; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 10084; GFX6-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] 10085; GFX6-NEXT: v_mov_b32_e32 v5, s11 10086; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc 10087; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v1 10088; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 10089; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 10090; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 10091; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s13, v1 10092; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 10093; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 10094; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 10095; GFX6-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] 10096; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 10097; GFX6-NEXT: 
v_xor_b32_e32 v0, s14, v0 10098; GFX6-NEXT: v_xor_b32_e32 v1, s14, v1 10099; GFX6-NEXT: v_mov_b32_e32 v2, s14 10100; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s14, v0 10101; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 10102; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 10103; GFX6-NEXT: s_endpgm 10104; 10105; GFX9-LABEL: srem_i64_pow2_shl_denom: 10106; GFX9: ; %bb.0: 10107; GFX9-NEXT: s_load_dword s4, s[0:1], 0x34 10108; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 10109; GFX9-NEXT: v_mov_b32_e32 v2, 0 10110; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10111; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 10112; GFX9-NEXT: s_ashr_i32 s4, s3, 31 10113; GFX9-NEXT: s_add_u32 s2, s2, s4 10114; GFX9-NEXT: s_mov_b32 s5, s4 10115; GFX9-NEXT: s_addc_u32 s3, s3, s4 10116; GFX9-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5] 10117; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 10118; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 10119; GFX9-NEXT: s_sub_u32 s10, 0, s8 10120; GFX9-NEXT: s_subb_u32 s4, 0, s9 10121; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 10122; GFX9-NEXT: v_rcp_f32_e32 v0, v0 10123; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 10124; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 10125; GFX9-NEXT: v_trunc_f32_e32 v1, v1 10126; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 10127; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 10128; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 10129; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0 10130; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1 10131; GFX9-NEXT: v_mul_lo_u32 v6, s4, v0 10132; GFX9-NEXT: v_mul_lo_u32 v5, s10, v0 10133; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 10134; GFX9-NEXT: v_add_u32_e32 v3, v3, v6 10135; GFX9-NEXT: v_mul_hi_u32 v4, v0, v5 10136; GFX9-NEXT: v_mul_lo_u32 v6, v0, v3 10137; GFX9-NEXT: v_mul_hi_u32 v8, v0, v3 10138; GFX9-NEXT: v_mul_hi_u32 v7, v1, v5 10139; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 10140; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 10141; GFX9-NEXT: v_mul_hi_u32 v9, v1, v3 10142; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc 10143; GFX9-NEXT: v_mul_lo_u32 v3, v1, v3 10144; 
GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 10145; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v7, vcc 10146; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v2, vcc 10147; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 10148; GFX9-NEXT: v_mov_b32_e32 v6, 0 10149; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v3 10150; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v5, vcc 10151; GFX9-NEXT: v_addc_co_u32_e64 v3, vcc, v1, v4, s[2:3] 10152; GFX9-NEXT: v_mul_lo_u32 v5, s10, v3 10153; GFX9-NEXT: v_mul_hi_u32 v7, s10, v0 10154; GFX9-NEXT: v_mul_lo_u32 v8, s4, v0 10155; GFX9-NEXT: v_mul_lo_u32 v9, s10, v0 10156; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 10157; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 10158; GFX9-NEXT: v_add_u32_e32 v5, v5, v8 10159; GFX9-NEXT: v_mul_lo_u32 v10, v0, v5 10160; GFX9-NEXT: v_mul_hi_u32 v11, v0, v9 10161; GFX9-NEXT: v_mul_hi_u32 v12, v0, v5 10162; GFX9-NEXT: v_mul_hi_u32 v8, v3, v9 10163; GFX9-NEXT: v_mul_lo_u32 v9, v3, v9 10164; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 10165; GFX9-NEXT: v_mul_hi_u32 v7, v3, v5 10166; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v12, vcc 10167; GFX9-NEXT: v_mul_lo_u32 v3, v3, v5 10168; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 10169; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v8, vcc 10170; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc 10171; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v8, v3 10172; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10173; GFX9-NEXT: s_ashr_i32 s10, s7, 31 10174; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v5, vcc 10175; GFX9-NEXT: v_add_u32_e32 v1, v1, v4 10176; GFX9-NEXT: s_add_u32 s0, s6, s10 10177; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[2:3] 10178; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 10179; GFX9-NEXT: s_mov_b32 s11, s10 10180; GFX9-NEXT: s_addc_u32 s1, s7, s10 10181; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] 10182; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 10183; GFX9-NEXT: v_mul_lo_u32 v3, s6, v1 10184; GFX9-NEXT: v_mul_hi_u32 v4, s6, v0 10185; GFX9-NEXT: v_mul_hi_u32 v5, s6, v1 
10186; GFX9-NEXT: v_mul_hi_u32 v7, s7, v1 10187; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 10188; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 10189; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc 10190; GFX9-NEXT: v_mul_lo_u32 v5, s7, v0 10191; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 10192; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 10193; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v4, v0, vcc 10194; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v2, vcc 10195; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 10196; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v3, vcc 10197; GFX9-NEXT: v_mul_lo_u32 v1, s8, v1 10198; GFX9-NEXT: v_mul_hi_u32 v3, s8, v0 10199; GFX9-NEXT: v_mul_lo_u32 v4, s9, v0 10200; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 10201; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 10202; GFX9-NEXT: v_add_u32_e32 v1, v1, v4 10203; GFX9-NEXT: v_sub_u32_e32 v3, s7, v1 10204; GFX9-NEXT: v_mov_b32_e32 v4, s9 10205; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 10206; GFX9-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, vcc 10207; GFX9-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s8, v0 10208; GFX9-NEXT: v_subbrev_co_u32_e64 v6, s[2:3], 0, v3, s[0:1] 10209; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v6 10210; GFX9-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1] 10211; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 10212; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v5 10213; GFX9-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s8, v5 10214; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] 10215; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v6 10216; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] 10217; GFX9-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] 10218; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 10219; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[0:1] 10220; GFX9-NEXT: v_mov_b32_e32 v6, s7 10221; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v6, v1, vcc 10222; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 10223; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 10224; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 10225; GFX9-NEXT: v_cndmask_b32_e64 v7, 
0, -1, vcc 10226; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s9, v1 10227; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc 10228; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 10229; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 10230; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[0:1] 10231; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 10232; GFX9-NEXT: v_xor_b32_e32 v0, s10, v0 10233; GFX9-NEXT: v_xor_b32_e32 v1, s10, v1 10234; GFX9-NEXT: v_mov_b32_e32 v3, s10 10235; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s10, v0 10236; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc 10237; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 10238; GFX9-NEXT: s_endpgm 10239 %shl.y = shl i64 4096, %y 10240 %r = srem i64 %x, %shl.y 10241 store i64 %r, i64 addrspace(1)* %out 10242 ret void 10243} 10244 10245define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 10246; CHECK-LABEL: @srem_v2i64_pow2k_denom( 10247; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 10248; CHECK-NEXT: [[TMP2:%.*]] = srem i64 [[TMP1]], 4096 10249; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 10250; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 10251; CHECK-NEXT: [[TMP5:%.*]] = srem i64 [[TMP4]], 4096 10252; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 10253; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 10254; CHECK-NEXT: ret void 10255; 10256; GFX6-LABEL: srem_v2i64_pow2k_denom: 10257; GFX6: ; %bb.0: 10258; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 10259; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 10260; GFX6-NEXT: s_movk_i32 s8, 0xf000 10261; GFX6-NEXT: s_mov_b32 s7, 0xf000 10262; GFX6-NEXT: s_mov_b32 s6, -1 10263; GFX6-NEXT: s_waitcnt lgkmcnt(0) 10264; GFX6-NEXT: s_ashr_i32 s9, s1, 31 10265; GFX6-NEXT: s_lshr_b32 s9, s9, 20 10266; GFX6-NEXT: s_add_u32 s9, s0, s9 10267; GFX6-NEXT: s_addc_u32 s10, s1, 0 10268; GFX6-NEXT: s_and_b32 s9, 
s9, s8 10269; GFX6-NEXT: s_sub_u32 s0, s0, s9 10270; GFX6-NEXT: s_subb_u32 s1, s1, s10 10271; GFX6-NEXT: s_ashr_i32 s9, s3, 31 10272; GFX6-NEXT: s_lshr_b32 s9, s9, 20 10273; GFX6-NEXT: s_add_u32 s9, s2, s9 10274; GFX6-NEXT: s_addc_u32 s10, s3, 0 10275; GFX6-NEXT: s_and_b32 s8, s9, s8 10276; GFX6-NEXT: s_sub_u32 s2, s2, s8 10277; GFX6-NEXT: s_subb_u32 s3, s3, s10 10278; GFX6-NEXT: v_mov_b32_e32 v0, s0 10279; GFX6-NEXT: v_mov_b32_e32 v1, s1 10280; GFX6-NEXT: v_mov_b32_e32 v2, s2 10281; GFX6-NEXT: v_mov_b32_e32 v3, s3 10282; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 10283; GFX6-NEXT: s_endpgm 10284; 10285; GFX9-LABEL: srem_v2i64_pow2k_denom: 10286; GFX9: ; %bb.0: 10287; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 10288; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 10289; GFX9-NEXT: s_movk_i32 s8, 0xf000 10290; GFX9-NEXT: v_mov_b32_e32 v4, 0 10291; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10292; GFX9-NEXT: s_ashr_i32 s0, s5, 31 10293; GFX9-NEXT: s_lshr_b32 s0, s0, 20 10294; GFX9-NEXT: s_add_u32 s0, s4, s0 10295; GFX9-NEXT: s_addc_u32 s1, s5, 0 10296; GFX9-NEXT: s_and_b32 s0, s0, s8 10297; GFX9-NEXT: s_sub_u32 s0, s4, s0 10298; GFX9-NEXT: s_subb_u32 s1, s5, s1 10299; GFX9-NEXT: s_ashr_i32 s4, s7, 31 10300; GFX9-NEXT: s_lshr_b32 s4, s4, 20 10301; GFX9-NEXT: s_add_u32 s4, s6, s4 10302; GFX9-NEXT: s_addc_u32 s5, s7, 0 10303; GFX9-NEXT: s_and_b32 s4, s4, s8 10304; GFX9-NEXT: s_sub_u32 s4, s6, s4 10305; GFX9-NEXT: s_subb_u32 s5, s7, s5 10306; GFX9-NEXT: v_mov_b32_e32 v0, s0 10307; GFX9-NEXT: v_mov_b32_e32 v1, s1 10308; GFX9-NEXT: v_mov_b32_e32 v2, s4 10309; GFX9-NEXT: v_mov_b32_e32 v3, s5 10310; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 10311; GFX9-NEXT: s_endpgm 10312 %r = srem <2 x i64> %x, <i64 4096, i64 4096> 10313 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 10314 ret void 10315} 10316 10317define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 10318; CHECK-LABEL: @srem_v2i64_pow2_shl_denom( 
10319; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 10320; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 10321; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 10322; CHECK-NEXT: [[TMP3:%.*]] = srem i64 [[TMP1]], [[TMP2]] 10323; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 10324; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 10325; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 10326; CHECK-NEXT: [[TMP7:%.*]] = srem i64 [[TMP5]], [[TMP6]] 10327; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 10328; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 10329; CHECK-NEXT: ret void 10330; 10331; GFX6-LABEL: srem_v2i64_pow2_shl_denom: 10332; GFX6: ; %bb.0: 10333; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11 10334; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000 10335; GFX6-NEXT: s_mov_b32 s18, 0x4f800000 10336; GFX6-NEXT: s_mov_b32 s19, 0x5f7ffffc 10337; GFX6-NEXT: s_mov_b32 s20, 0x2f800000 10338; GFX6-NEXT: s_waitcnt lgkmcnt(0) 10339; GFX6-NEXT: s_lshl_b64 s[14:15], s[2:3], s6 10340; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 10341; GFX6-NEXT: s_ashr_i32 s4, s3, 31 10342; GFX6-NEXT: s_add_u32 s2, s2, s4 10343; GFX6-NEXT: s_mov_b32 s5, s4 10344; GFX6-NEXT: s_addc_u32 s3, s3, s4 10345; GFX6-NEXT: s_xor_b64 s[16:17], s[2:3], s[4:5] 10346; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s16 10347; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s17 10348; GFX6-NEXT: s_mov_b32 s21, 0xcf800000 10349; GFX6-NEXT: s_sub_u32 s6, 0, s16 10350; GFX6-NEXT: s_subb_u32 s7, 0, s17 10351; GFX6-NEXT: v_mac_f32_e32 v0, s18, v1 10352; GFX6-NEXT: v_rcp_f32_e32 v0, v0 10353; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 10354; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 10355; GFX6-NEXT: v_mul_f32_e32 v0, s19, v0 10356; GFX6-NEXT: v_mul_f32_e32 v1, s20, v0 10357; GFX6-NEXT: v_trunc_f32_e32 v1, v1 10358; GFX6-NEXT: 
v_mac_f32_e32 v0, s21, v1 10359; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 10360; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 10361; GFX6-NEXT: s_waitcnt lgkmcnt(0) 10362; GFX6-NEXT: s_ashr_i32 s12, s9, 31 10363; GFX6-NEXT: s_add_u32 s0, s8, s12 10364; GFX6-NEXT: v_mul_hi_u32 v3, s6, v0 10365; GFX6-NEXT: v_mul_lo_u32 v2, s6, v1 10366; GFX6-NEXT: v_mul_lo_u32 v4, s7, v0 10367; GFX6-NEXT: v_mul_lo_u32 v5, s6, v0 10368; GFX6-NEXT: s_mov_b32 s13, s12 10369; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10370; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 10371; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 10372; GFX6-NEXT: v_mul_hi_u32 v4, v0, v5 10373; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 10374; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 10375; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 10376; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 10377; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc 10378; GFX6-NEXT: v_mul_lo_u32 v6, v1, v5 10379; GFX6-NEXT: v_mul_hi_u32 v5, v1, v5 10380; GFX6-NEXT: s_addc_u32 s1, s9, s12 10381; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[12:13] 10382; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 10383; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v5, vcc 10384; GFX6-NEXT: v_mov_b32_e32 v4, 0 10385; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc 10386; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10387; GFX6-NEXT: v_mov_b32_e32 v6, 0 10388; GFX6-NEXT: v_add_i32_e64 v0, s[2:3], v0, v2 10389; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc 10390; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[2:3] 10391; GFX6-NEXT: v_mul_lo_u32 v5, s6, v2 10392; GFX6-NEXT: v_mul_hi_u32 v7, s6, v0 10393; GFX6-NEXT: v_mul_lo_u32 v8, s7, v0 10394; GFX6-NEXT: s_mov_b32 s7, 0xf000 10395; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5 10396; GFX6-NEXT: v_mul_lo_u32 v7, s6, v0 10397; GFX6-NEXT: v_add_i32_e32 v5, vcc, v8, v5 10398; GFX6-NEXT: v_mul_lo_u32 v10, v0, v5 10399; GFX6-NEXT: v_mul_hi_u32 v12, v0, v5 10400; GFX6-NEXT: v_mul_hi_u32 v11, v0, v7 10401; GFX6-NEXT: v_mul_hi_u32 v9, v2, v7 10402; GFX6-NEXT: v_mul_lo_u32 v7, v2, v7 10403; GFX6-NEXT: 
v_mul_hi_u32 v8, v2, v5 10404; GFX6-NEXT: v_add_i32_e32 v10, vcc, v11, v10 10405; GFX6-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc 10406; GFX6-NEXT: v_mul_lo_u32 v2, v2, v5 10407; GFX6-NEXT: v_add_i32_e32 v7, vcc, v10, v7 10408; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc 10409; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc 10410; GFX6-NEXT: v_add_i32_e32 v2, vcc, v7, v2 10411; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc 10412; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 10413; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[2:3] 10414; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 10415; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 10416; GFX6-NEXT: v_mul_lo_u32 v2, s8, v1 10417; GFX6-NEXT: v_mul_hi_u32 v3, s8, v0 10418; GFX6-NEXT: v_mul_hi_u32 v5, s8, v1 10419; GFX6-NEXT: v_mul_hi_u32 v7, s9, v1 10420; GFX6-NEXT: v_mul_lo_u32 v1, s9, v1 10421; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10422; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 10423; GFX6-NEXT: v_mul_lo_u32 v5, s9, v0 10424; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0 10425; GFX6-NEXT: s_mov_b32 s6, -1 10426; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 10427; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 10428; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc 10429; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 10430; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc 10431; GFX6-NEXT: v_mul_lo_u32 v1, s16, v1 10432; GFX6-NEXT: v_mul_hi_u32 v2, s16, v0 10433; GFX6-NEXT: v_mul_lo_u32 v3, s17, v0 10434; GFX6-NEXT: v_mul_lo_u32 v0, s16, v0 10435; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 10436; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 10437; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s9, v1 10438; GFX6-NEXT: v_mov_b32_e32 v3, s17 10439; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 10440; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc 10441; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s16, v0 10442; GFX6-NEXT: v_subbrev_u32_e64 v7, s[2:3], 0, v2, s[0:1] 10443; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s17, v7 10444; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], 
v2, v3, s[0:1] 10445; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] 10446; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s16, v5 10447; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s16, v5 10448; GFX6-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] 10449; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s17, v7 10450; GFX6-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[2:3] 10451; GFX6-NEXT: s_ashr_i32 s2, s15, 31 10452; GFX6-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] 10453; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 10454; GFX6-NEXT: s_add_u32 s8, s14, s2 10455; GFX6-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[0:1] 10456; GFX6-NEXT: v_mov_b32_e32 v7, s9 10457; GFX6-NEXT: s_mov_b32 s3, s2 10458; GFX6-NEXT: s_addc_u32 s9, s15, s2 10459; GFX6-NEXT: s_xor_b64 s[8:9], s[8:9], s[2:3] 10460; GFX6-NEXT: v_cvt_f32_u32_e32 v8, s8 10461; GFX6-NEXT: v_cvt_f32_u32_e32 v9, s9 10462; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v7, v1, vcc 10463; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s17, v1 10464; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 10465; GFX6-NEXT: v_mac_f32_e32 v8, s18, v9 10466; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s16, v0 10467; GFX6-NEXT: v_rcp_f32_e32 v8, v8 10468; GFX6-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc 10469; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s17, v1 10470; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc 10471; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 10472; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 10473; GFX6-NEXT: v_cndmask_b32_e64 v2, v5, v3, s[0:1] 10474; GFX6-NEXT: v_mul_f32_e32 v3, s19, v8 10475; GFX6-NEXT: v_mul_f32_e32 v5, s20, v3 10476; GFX6-NEXT: v_trunc_f32_e32 v5, v5 10477; GFX6-NEXT: v_mac_f32_e32 v3, s21, v5 10478; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 10479; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v5 10480; GFX6-NEXT: s_sub_u32 s2, 0, s8 10481; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 10482; GFX6-NEXT: v_mul_hi_u32 v2, s2, v3 10483; GFX6-NEXT: v_mul_lo_u32 v7, s2, v5 10484; GFX6-NEXT: s_subb_u32 s3, 0, s9 10485; GFX6-NEXT: v_mul_lo_u32 v8, s3, v3 10486; GFX6-NEXT: s_ashr_i32 s14, s11, 31 10487; GFX6-NEXT: 
v_add_i32_e32 v2, vcc, v2, v7 10488; GFX6-NEXT: v_mul_lo_u32 v7, s2, v3 10489; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v8 10490; GFX6-NEXT: v_mul_lo_u32 v8, v3, v2 10491; GFX6-NEXT: v_mul_hi_u32 v10, v3, v2 10492; GFX6-NEXT: v_mul_hi_u32 v9, v3, v7 10493; GFX6-NEXT: v_mul_hi_u32 v11, v5, v2 10494; GFX6-NEXT: v_mul_lo_u32 v2, v5, v2 10495; GFX6-NEXT: s_mov_b32 s15, s14 10496; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 10497; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc 10498; GFX6-NEXT: v_mul_lo_u32 v10, v5, v7 10499; GFX6-NEXT: v_mul_hi_u32 v7, v5, v7 10500; GFX6-NEXT: v_xor_b32_e32 v0, s12, v0 10501; GFX6-NEXT: v_xor_b32_e32 v1, s12, v1 10502; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v10 10503; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc 10504; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v11, v4, vcc 10505; GFX6-NEXT: v_add_i32_e32 v2, vcc, v7, v2 10506; GFX6-NEXT: v_add_i32_e64 v2, s[0:1], v3, v2 10507; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v6, v8, vcc 10508; GFX6-NEXT: v_addc_u32_e64 v3, vcc, v5, v7, s[0:1] 10509; GFX6-NEXT: v_mul_lo_u32 v8, s2, v3 10510; GFX6-NEXT: v_mul_hi_u32 v9, s2, v2 10511; GFX6-NEXT: v_mul_lo_u32 v10, s3, v2 10512; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 10513; GFX6-NEXT: v_mul_lo_u32 v9, s2, v2 10514; GFX6-NEXT: v_add_i32_e32 v8, vcc, v10, v8 10515; GFX6-NEXT: v_mul_lo_u32 v12, v2, v8 10516; GFX6-NEXT: v_mul_hi_u32 v14, v2, v8 10517; GFX6-NEXT: v_mul_hi_u32 v13, v2, v9 10518; GFX6-NEXT: v_mul_hi_u32 v11, v3, v9 10519; GFX6-NEXT: v_mul_lo_u32 v9, v3, v9 10520; GFX6-NEXT: v_mul_hi_u32 v10, v3, v8 10521; GFX6-NEXT: v_add_i32_e32 v12, vcc, v13, v12 10522; GFX6-NEXT: v_addc_u32_e32 v13, vcc, 0, v14, vcc 10523; GFX6-NEXT: v_mul_lo_u32 v3, v3, v8 10524; GFX6-NEXT: v_add_i32_e32 v9, vcc, v12, v9 10525; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v13, v11, vcc 10526; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v10, v4, vcc 10527; GFX6-NEXT: v_add_i32_e32 v3, vcc, v9, v3 10528; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v6, v8, vcc 10529; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v7 10530; 
GFX6-NEXT: v_addc_u32_e64 v5, vcc, v5, v8, s[0:1] 10531; GFX6-NEXT: s_add_u32 s0, s10, s14 10532; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 10533; GFX6-NEXT: s_addc_u32 s1, s11, s14 10534; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] 10535; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 10536; GFX6-NEXT: v_mul_lo_u32 v5, s10, v3 10537; GFX6-NEXT: v_mul_hi_u32 v7, s10, v2 10538; GFX6-NEXT: v_mul_hi_u32 v9, s10, v3 10539; GFX6-NEXT: v_mul_hi_u32 v10, s11, v3 10540; GFX6-NEXT: v_mul_lo_u32 v3, s11, v3 10541; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5 10542; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc 10543; GFX6-NEXT: v_mul_lo_u32 v9, s11, v2 10544; GFX6-NEXT: v_mul_hi_u32 v2, s11, v2 10545; GFX6-NEXT: v_mov_b32_e32 v8, s12 10546; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v9 10547; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v7, v2, vcc 10548; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v10, v4, vcc 10549; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 10550; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v4, vcc 10551; GFX6-NEXT: v_mul_lo_u32 v3, s8, v3 10552; GFX6-NEXT: v_mul_hi_u32 v4, s8, v2 10553; GFX6-NEXT: v_mul_lo_u32 v5, s9, v2 10554; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 10555; GFX6-NEXT: v_mul_lo_u32 v2, s8, v2 10556; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc 10557; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 10558; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 10559; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s11, v3 10560; GFX6-NEXT: v_mov_b32_e32 v5, s9 10561; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s10, v2 10562; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc 10563; GFX6-NEXT: v_subrev_i32_e64 v6, s[0:1], s8, v2 10564; GFX6-NEXT: v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1] 10565; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v7 10566; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, s[0:1] 10567; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] 10568; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v6 10569; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s8, v6 10570; GFX6-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] 10571; 
GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v7 10572; GFX6-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[2:3] 10573; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] 10574; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 10575; GFX6-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[0:1] 10576; GFX6-NEXT: v_mov_b32_e32 v7, s11 10577; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc 10578; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 10579; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 10580; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 10581; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 10582; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s9, v3 10583; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc 10584; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 10585; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 10586; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] 10587; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 10588; GFX6-NEXT: v_xor_b32_e32 v2, s14, v2 10589; GFX6-NEXT: v_xor_b32_e32 v3, s14, v3 10590; GFX6-NEXT: v_mov_b32_e32 v4, s14 10591; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s14, v2 10592; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc 10593; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 10594; GFX6-NEXT: s_endpgm 10595; 10596; GFX9-LABEL: srem_v2i64_pow2_shl_denom: 10597; GFX9: ; %bb.0: 10598; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 10599; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 10600; GFX9-NEXT: s_mov_b32 s16, 0x4f800000 10601; GFX9-NEXT: s_mov_b32 s17, 0x5f7ffffc 10602; GFX9-NEXT: s_mov_b32 s18, 0x2f800000 10603; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10604; GFX9-NEXT: s_lshl_b64 s[10:11], s[2:3], s6 10605; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 10606; GFX9-NEXT: s_ashr_i32 s4, s3, 31 10607; GFX9-NEXT: s_add_u32 s2, s2, s4 10608; GFX9-NEXT: s_mov_b32 s5, s4 10609; GFX9-NEXT: s_addc_u32 s3, s3, s4 10610; GFX9-NEXT: s_xor_b64 s[12:13], s[2:3], s[4:5] 10611; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12 10612; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 10613; GFX9-NEXT: s_mov_b32 s19, 0xcf800000 10614; GFX9-NEXT: s_sub_u32 s8, 0, s12 
10615; GFX9-NEXT: s_subb_u32 s4, 0, s13 10616; GFX9-NEXT: v_mac_f32_e32 v0, s16, v1 10617; GFX9-NEXT: v_rcp_f32_e32 v0, v0 10618; GFX9-NEXT: v_mov_b32_e32 v6, 0 10619; GFX9-NEXT: v_mul_f32_e32 v0, s17, v0 10620; GFX9-NEXT: v_mul_f32_e32 v1, s18, v0 10621; GFX9-NEXT: v_trunc_f32_e32 v1, v1 10622; GFX9-NEXT: v_mac_f32_e32 v0, s19, v1 10623; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 10624; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 10625; GFX9-NEXT: v_mul_hi_u32 v3, s8, v0 10626; GFX9-NEXT: v_mul_lo_u32 v2, s8, v1 10627; GFX9-NEXT: v_mul_lo_u32 v5, s4, v0 10628; GFX9-NEXT: v_mul_lo_u32 v4, s8, v0 10629; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 10630; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 10631; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 10632; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 10633; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 10634; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 10635; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 10636; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 10637; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc 10638; GFX9-NEXT: v_mul_lo_u32 v7, v1, v4 10639; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 10640; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 10641; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc 10642; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v6, vcc 10643; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 10644; GFX9-NEXT: v_mov_b32_e32 v5, 0 10645; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 10646; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc 10647; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3] 10648; GFX9-NEXT: v_mul_lo_u32 v4, s8, v2 10649; GFX9-NEXT: v_mul_hi_u32 v7, s8, v0 10650; GFX9-NEXT: v_mul_lo_u32 v8, s4, v0 10651; GFX9-NEXT: v_mul_lo_u32 v9, s8, v0 10652; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 10653; GFX9-NEXT: v_add_u32_e32 v4, v7, v4 10654; GFX9-NEXT: v_add_u32_e32 v4, v4, v8 10655; GFX9-NEXT: v_mul_lo_u32 v10, v0, v4 10656; GFX9-NEXT: v_mul_hi_u32 v11, v0, v9 10657; GFX9-NEXT: v_mul_hi_u32 v12, v0, v4 10658; GFX9-NEXT: v_mul_hi_u32 v8, v2, v9 10659; GFX9-NEXT: 
v_mul_lo_u32 v9, v2, v9 10660; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 10661; GFX9-NEXT: v_mul_hi_u32 v7, v2, v4 10662; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v12, vcc 10663; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 10664; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 10665; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v8, vcc 10666; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v6, vcc 10667; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 10668; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10669; GFX9-NEXT: s_ashr_i32 s8, s5, 31 10670; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v5, v4, vcc 10671; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 10672; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3] 10673; GFX9-NEXT: s_add_u32 s2, s4, s8 10674; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 10675; GFX9-NEXT: s_addc_u32 s3, s5, s8 10676; GFX9-NEXT: s_mov_b32 s9, s8 10677; GFX9-NEXT: s_xor_b64 s[14:15], s[2:3], s[8:9] 10678; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 10679; GFX9-NEXT: v_mul_lo_u32 v2, s14, v1 10680; GFX9-NEXT: v_mul_hi_u32 v3, s14, v0 10681; GFX9-NEXT: v_mul_hi_u32 v4, s14, v1 10682; GFX9-NEXT: v_mul_hi_u32 v7, s15, v1 10683; GFX9-NEXT: v_mul_lo_u32 v1, s15, v1 10684; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 10685; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 10686; GFX9-NEXT: v_mul_lo_u32 v4, s15, v0 10687; GFX9-NEXT: v_mul_hi_u32 v0, s15, v0 10688; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 10689; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 10690; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 10691; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v7, v6, vcc 10692; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 10693; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v2, vcc 10694; GFX9-NEXT: v_mul_lo_u32 v1, s12, v1 10695; GFX9-NEXT: v_mul_hi_u32 v2, s12, v0 10696; GFX9-NEXT: v_mul_lo_u32 v3, s13, v0 10697; GFX9-NEXT: v_mul_lo_u32 v0, s12, v0 10698; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 10699; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 10700; GFX9-NEXT: v_sub_u32_e32 v2, s15, v1 10701; 
GFX9-NEXT: v_mov_b32_e32 v3, s13 10702; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s14, v0 10703; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, vcc 10704; GFX9-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s12, v0 10705; GFX9-NEXT: v_subbrev_co_u32_e64 v7, s[2:3], 0, v2, s[0:1] 10706; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s13, v7 10707; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] 10708; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v4 10709; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] 10710; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s13, v7 10711; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[2:3] 10712; GFX9-NEXT: s_ashr_i32 s2, s11, 31 10713; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1] 10714; GFX9-NEXT: s_add_u32 s10, s10, s2 10715; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s12, v4 10716; GFX9-NEXT: s_mov_b32 s3, s2 10717; GFX9-NEXT: s_addc_u32 s11, s11, s2 10718; GFX9-NEXT: s_xor_b64 s[10:11], s[10:11], s[2:3] 10719; GFX9-NEXT: v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1] 10720; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 10721; GFX9-NEXT: v_cvt_f32_u32_e32 v8, s10 10722; GFX9-NEXT: v_cvt_f32_u32_e32 v9, s11 10723; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[0:1] 10724; GFX9-NEXT: v_mov_b32_e32 v7, s15 10725; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v7, v1, vcc 10726; GFX9-NEXT: v_mac_f32_e32 v8, s16, v9 10727; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s13, v1 10728; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 10729; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 10730; GFX9-NEXT: v_rcp_f32_e32 v8, v8 10731; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc 10732; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s13, v1 10733; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc 10734; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 10735; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 10736; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] 10737; GFX9-NEXT: v_mul_f32_e32 v3, s17, v8 10738; GFX9-NEXT: v_mul_f32_e32 v4, s18, v3 10739; GFX9-NEXT: v_trunc_f32_e32 v4, v4 10740; GFX9-NEXT: v_mac_f32_e32 v3, s19, v4 10741; GFX9-NEXT: 
v_cvt_u32_f32_e32 v3, v3 10742; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 10743; GFX9-NEXT: s_sub_u32 s2, 0, s10 10744; GFX9-NEXT: s_subb_u32 s3, 0, s11 10745; GFX9-NEXT: v_mul_hi_u32 v7, s2, v3 10746; GFX9-NEXT: v_mul_lo_u32 v8, s2, v4 10747; GFX9-NEXT: v_mul_lo_u32 v9, s3, v3 10748; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 10749; GFX9-NEXT: v_mul_lo_u32 v2, s2, v3 10750; GFX9-NEXT: v_add_u32_e32 v7, v7, v8 10751; GFX9-NEXT: v_add_u32_e32 v7, v7, v9 10752; GFX9-NEXT: v_mul_lo_u32 v8, v3, v7 10753; GFX9-NEXT: v_mul_hi_u32 v9, v3, v2 10754; GFX9-NEXT: v_mul_hi_u32 v10, v3, v7 10755; GFX9-NEXT: v_mul_hi_u32 v11, v4, v7 10756; GFX9-NEXT: v_mul_lo_u32 v7, v4, v7 10757; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 10758; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v10, vcc 10759; GFX9-NEXT: v_mul_lo_u32 v10, v4, v2 10760; GFX9-NEXT: v_mul_hi_u32 v2, v4, v2 10761; GFX9-NEXT: s_ashr_i32 s12, s7, 31 10762; GFX9-NEXT: s_mov_b32 s13, s12 10763; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10 10764; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v9, v2, vcc 10765; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v6, vcc 10766; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 10767; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], v3, v2 10768; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v5, v8, vcc 10769; GFX9-NEXT: v_addc_co_u32_e64 v3, vcc, v4, v7, s[0:1] 10770; GFX9-NEXT: v_mul_lo_u32 v8, s2, v3 10771; GFX9-NEXT: v_mul_hi_u32 v9, s2, v2 10772; GFX9-NEXT: v_mul_lo_u32 v10, s3, v2 10773; GFX9-NEXT: v_mul_lo_u32 v11, s2, v2 10774; GFX9-NEXT: v_add_u32_e32 v4, v4, v7 10775; GFX9-NEXT: v_add_u32_e32 v8, v9, v8 10776; GFX9-NEXT: v_add_u32_e32 v8, v8, v10 10777; GFX9-NEXT: v_mul_lo_u32 v12, v2, v8 10778; GFX9-NEXT: v_mul_hi_u32 v13, v2, v11 10779; GFX9-NEXT: v_mul_hi_u32 v14, v2, v8 10780; GFX9-NEXT: v_mul_hi_u32 v10, v3, v11 10781; GFX9-NEXT: v_mul_lo_u32 v11, v3, v11 10782; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v13, v12 10783; GFX9-NEXT: v_mul_hi_u32 v9, v3, v8 10784; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v14, vcc 
10785; GFX9-NEXT: v_mul_lo_u32 v3, v3, v8 10786; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v12, v11 10787; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v13, v10, vcc 10788; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v9, v6, vcc 10789; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v10, v3 10790; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v5, v8, vcc 10791; GFX9-NEXT: v_addc_co_u32_e64 v4, vcc, v4, v8, s[0:1] 10792; GFX9-NEXT: s_add_u32 s0, s6, s12 10793; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 10794; GFX9-NEXT: s_addc_u32 s1, s7, s12 10795; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13] 10796; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 10797; GFX9-NEXT: v_mul_lo_u32 v4, s6, v3 10798; GFX9-NEXT: v_mul_hi_u32 v7, s6, v2 10799; GFX9-NEXT: v_mul_hi_u32 v9, s6, v3 10800; GFX9-NEXT: v_mul_hi_u32 v10, s7, v3 10801; GFX9-NEXT: v_mul_lo_u32 v3, s7, v3 10802; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v7, v4 10803; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v9, vcc 10804; GFX9-NEXT: v_mul_lo_u32 v9, s7, v2 10805; GFX9-NEXT: v_mul_hi_u32 v2, s7, v2 10806; GFX9-NEXT: v_xor_b32_e32 v0, s8, v0 10807; GFX9-NEXT: v_xor_b32_e32 v1, s8, v1 10808; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v9 10809; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v7, v2, vcc 10810; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v10, v6, vcc 10811; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 10812; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc 10813; GFX9-NEXT: v_mul_lo_u32 v3, s10, v3 10814; GFX9-NEXT: v_mul_hi_u32 v4, s10, v2 10815; GFX9-NEXT: v_mul_lo_u32 v5, s11, v2 10816; GFX9-NEXT: v_mul_lo_u32 v2, s10, v2 10817; GFX9-NEXT: v_mov_b32_e32 v8, s8 10818; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s8, v0 10819; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 10820; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v8, vcc 10821; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 10822; GFX9-NEXT: v_sub_u32_e32 v4, s7, v3 10823; GFX9-NEXT: v_mov_b32_e32 v5, s11 10824; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s6, v2 10825; GFX9-NEXT: v_subb_co_u32_e64 v4, s[0:1], v4, v5, vcc 10826; 
GFX9-NEXT: v_subrev_co_u32_e64 v7, s[0:1], s10, v2 10827; GFX9-NEXT: v_subbrev_co_u32_e64 v8, s[2:3], 0, v4, s[0:1] 10828; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s11, v8 10829; GFX9-NEXT: v_subb_co_u32_e64 v4, s[0:1], v4, v5, s[0:1] 10830; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] 10831; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s10, v7 10832; GFX9-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s10, v7 10833; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[2:3] 10834; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s11, v8 10835; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[2:3] 10836; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1] 10837; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 10838; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[0:1] 10839; GFX9-NEXT: v_mov_b32_e32 v8, s7 10840; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v8, v3, vcc 10841; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 10842; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 10843; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 10844; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc 10845; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s11, v3 10846; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc 10847; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 10848; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 10849; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v5, s[0:1] 10850; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 10851; GFX9-NEXT: v_xor_b32_e32 v2, s12, v2 10852; GFX9-NEXT: v_xor_b32_e32 v3, s12, v3 10853; GFX9-NEXT: v_mov_b32_e32 v4, s12 10854; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s12, v2 10855; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v4, vcc 10856; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10857; GFX9-NEXT: global_store_dwordx4 v6, v[0:3], s[4:5] 10858; GFX9-NEXT: s_endpgm 10859 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 10860 %r = srem <2 x i64> %x, %shl.y 10861 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 10862 ret void 10863} 10864