1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py 2; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update 3; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare -amdgpu-bypass-slow-div=0 %s | FileCheck %s 4; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX6 %s 5; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX9 %s 6; RUN: llc -mtriple=amdgcn-- -mcpu=gfx90a -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX90A %s 7 8define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) { 9; CHECK-LABEL: @udiv_i32( 10; CHECK-NEXT: [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float 11; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]]) 12; CHECK-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000 13; CHECK-NEXT: [[TMP4:%.*]] = fptoui float [[TMP3]] to i32 14; CHECK-NEXT: [[TMP5:%.*]] = sub i32 0, [[Y]] 15; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]] 16; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP4]] to i64 17; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP6]] to i64 18; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]] 19; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 20; CHECK-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP9]], 32 21; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 22; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]] 23; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64 24; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP13]] to i64 25; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]] 26; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32 27; CHECK-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP16]], 32 28; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 29; CHECK-NEXT: [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]] 30; CHECK-NEXT: [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]] 31; CHECK-NEXT: 
[[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]] 32; CHECK-NEXT: [[TMP23:%.*]] = add i32 [[TMP19]], 1 33; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP19]] 34; CHECK-NEXT: [[TMP25:%.*]] = sub i32 [[TMP21]], [[Y]] 35; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP22]], i32 [[TMP25]], i32 [[TMP21]] 36; CHECK-NEXT: [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[Y]] 37; CHECK-NEXT: [[TMP28:%.*]] = add i32 [[TMP24]], 1 38; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP24]] 39; CHECK-NEXT: store i32 [[TMP29]], i32 addrspace(1)* [[OUT:%.*]], align 4 40; CHECK-NEXT: ret void 41; 42; GFX6-LABEL: udiv_i32: 43; GFX6: ; %bb.0: 44; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 45; GFX6-NEXT: s_mov_b32 s7, 0xf000 46; GFX6-NEXT: s_mov_b32 s6, -1 47; GFX6-NEXT: s_waitcnt lgkmcnt(0) 48; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 49; GFX6-NEXT: s_sub_i32 s4, 0, s3 50; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 51; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 52; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 53; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 54; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 55; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 56; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 57; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 58; GFX6-NEXT: v_mul_lo_u32 v1, v0, s3 59; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 60; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s2, v1 61; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1 62; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 63; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 64; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] 65; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 66; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 67; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 68; GFX6-NEXT: s_waitcnt lgkmcnt(0) 69; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 70; GFX6-NEXT: s_endpgm 71; 72; GFX9-LABEL: udiv_i32: 73; GFX9: ; %bb.0: 74; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 75; GFX9-NEXT: v_mov_b32_e32 v2, 0 76; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 
0x24 77; GFX9-NEXT: s_waitcnt lgkmcnt(0) 78; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 79; GFX9-NEXT: s_sub_i32 s4, 0, s3 80; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 81; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 82; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 83; GFX9-NEXT: v_mul_lo_u32 v1, s4, v0 84; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 85; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 86; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 87; GFX9-NEXT: v_mul_lo_u32 v1, v0, s3 88; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 89; GFX9-NEXT: v_sub_u32_e32 v1, s2, v1 90; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 91; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 92; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v1 93; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 94; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 95; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 96; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 97; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 98; GFX9-NEXT: s_endpgm 99; 100; GFX90A-LABEL: udiv_i32: 101; GFX90A: ; %bb.0: 102; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 103; GFX90A-NEXT: v_mov_b32_e32 v1, 0 104; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 105; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 106; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 107; GFX90A-NEXT: s_sub_i32 s4, 0, s3 108; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 109; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 110; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 111; GFX90A-NEXT: v_mul_lo_u32 v2, s4, v0 112; GFX90A-NEXT: v_mul_hi_u32 v2, v0, v2 113; GFX90A-NEXT: v_add_u32_e32 v0, v0, v2 114; GFX90A-NEXT: v_mul_hi_u32 v0, s2, v0 115; GFX90A-NEXT: v_mul_lo_u32 v2, v0, s3 116; GFX90A-NEXT: v_sub_u32_e32 v2, s2, v2 117; GFX90A-NEXT: v_add_u32_e32 v3, 1, v0 118; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 119; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 120; GFX90A-NEXT: v_subrev_u32_e32 v3, s3, v2 121; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc 122; GFX90A-NEXT: v_add_u32_e32 v3, 1, v0 123; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 124; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, 
v3, vcc 125; GFX90A-NEXT: global_store_dword v1, v0, s[0:1] 126; GFX90A-NEXT: s_endpgm 127 %r = udiv i32 %x, %y 128 store i32 %r, i32 addrspace(1)* %out 129 ret void 130} 131 132define amdgpu_kernel void @urem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) { 133; CHECK-LABEL: @urem_i32( 134; CHECK-NEXT: [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float 135; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]]) 136; CHECK-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000 137; CHECK-NEXT: [[TMP4:%.*]] = fptoui float [[TMP3]] to i32 138; CHECK-NEXT: [[TMP5:%.*]] = sub i32 0, [[Y]] 139; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]] 140; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP4]] to i64 141; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP6]] to i64 142; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]] 143; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 144; CHECK-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP9]], 32 145; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 146; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]] 147; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64 148; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP13]] to i64 149; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]] 150; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32 151; CHECK-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP16]], 32 152; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 153; CHECK-NEXT: [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]] 154; CHECK-NEXT: [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]] 155; CHECK-NEXT: [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]] 156; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP21]], [[Y]] 157; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP21]] 158; CHECK-NEXT: [[TMP25:%.*]] = icmp uge i32 [[TMP24]], [[Y]] 159; CHECK-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], [[Y]] 160; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP25]], i32 [[TMP26]], i32 [[TMP24]] 161; CHECK-NEXT: store i32 
[[TMP27]], i32 addrspace(1)* [[OUT:%.*]], align 4 162; CHECK-NEXT: ret void 163; 164; GFX6-LABEL: urem_i32: 165; GFX6: ; %bb.0: 166; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 167; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 168; GFX6-NEXT: s_mov_b32 s3, 0xf000 169; GFX6-NEXT: s_waitcnt lgkmcnt(0) 170; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 171; GFX6-NEXT: s_sub_i32 s2, 0, s5 172; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 173; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 174; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 175; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 176; GFX6-NEXT: s_mov_b32 s2, -1 177; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 178; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 179; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 180; GFX6-NEXT: v_mul_lo_u32 v0, v0, s5 181; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 182; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s5, v0 183; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v0 184; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 185; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s5, v0 186; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v0 187; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 188; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 189; GFX6-NEXT: s_endpgm 190; 191; GFX9-LABEL: urem_i32: 192; GFX9: ; %bb.0: 193; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 194; GFX9-NEXT: s_waitcnt lgkmcnt(0) 195; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 196; GFX9-NEXT: s_sub_i32 s4, 0, s3 197; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 198; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 199; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 200; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 201; GFX9-NEXT: v_mul_lo_u32 v1, s4, v0 202; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 203; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 204; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 205; GFX9-NEXT: v_mov_b32_e32 v1, 0 206; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 207; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 208; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v0 209; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 210; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 211; 
GFX9-NEXT: v_subrev_u32_e32 v2, s3, v0 212; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 213; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 214; GFX9-NEXT: s_waitcnt lgkmcnt(0) 215; GFX9-NEXT: global_store_dword v1, v0, s[0:1] 216; GFX9-NEXT: s_endpgm 217; 218; GFX90A-LABEL: urem_i32: 219; GFX90A: ; %bb.0: 220; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 221; GFX90A-NEXT: v_mov_b32_e32 v1, 0 222; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 223; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 224; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 225; GFX90A-NEXT: s_sub_i32 s4, 0, s3 226; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 227; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 228; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 229; GFX90A-NEXT: v_mul_lo_u32 v2, s4, v0 230; GFX90A-NEXT: v_mul_hi_u32 v2, v0, v2 231; GFX90A-NEXT: v_add_u32_e32 v0, v0, v2 232; GFX90A-NEXT: v_mul_hi_u32 v0, s2, v0 233; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s3 234; GFX90A-NEXT: v_sub_u32_e32 v0, s2, v0 235; GFX90A-NEXT: v_subrev_u32_e32 v2, s3, v0 236; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 237; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 238; GFX90A-NEXT: v_subrev_u32_e32 v2, s3, v0 239; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 240; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 241; GFX90A-NEXT: global_store_dword v1, v0, s[0:1] 242; GFX90A-NEXT: s_endpgm 243 %r = urem i32 %x, %y 244 store i32 %r, i32 addrspace(1)* %out 245 ret void 246} 247 248define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) { 249; CHECK-LABEL: @sdiv_i32( 250; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31 251; CHECK-NEXT: [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31 252; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 253; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[X]], [[TMP1]] 254; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[Y]], [[TMP2]] 255; CHECK-NEXT: [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP1]] 256; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP2]] 257; CHECK-NEXT: [[TMP8:%.*]] = uitofp i32 [[TMP7]] to float 258; 
CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP8]]) 259; CHECK-NEXT: [[TMP10:%.*]] = fmul fast float [[TMP9]], 0x41EFFFFFC0000000 260; CHECK-NEXT: [[TMP11:%.*]] = fptoui float [[TMP10]] to i32 261; CHECK-NEXT: [[TMP12:%.*]] = sub i32 0, [[TMP7]] 262; CHECK-NEXT: [[TMP13:%.*]] = mul i32 [[TMP12]], [[TMP11]] 263; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP11]] to i64 264; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP13]] to i64 265; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]] 266; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32 267; CHECK-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP16]], 32 268; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 269; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP11]], [[TMP19]] 270; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP6]] to i64 271; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP20]] to i64 272; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP21]], [[TMP22]] 273; CHECK-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32 274; CHECK-NEXT: [[TMP25:%.*]] = lshr i64 [[TMP23]], 32 275; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32 276; CHECK-NEXT: [[TMP27:%.*]] = mul i32 [[TMP26]], [[TMP7]] 277; CHECK-NEXT: [[TMP28:%.*]] = sub i32 [[TMP6]], [[TMP27]] 278; CHECK-NEXT: [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP7]] 279; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP26]], 1 280; CHECK-NEXT: [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]] 281; CHECK-NEXT: [[TMP32:%.*]] = sub i32 [[TMP28]], [[TMP7]] 282; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP29]], i32 [[TMP32]], i32 [[TMP28]] 283; CHECK-NEXT: [[TMP34:%.*]] = icmp uge i32 [[TMP33]], [[TMP7]] 284; CHECK-NEXT: [[TMP35:%.*]] = add i32 [[TMP31]], 1 285; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP34]], i32 [[TMP35]], i32 [[TMP31]] 286; CHECK-NEXT: [[TMP37:%.*]] = xor i32 [[TMP36]], [[TMP3]] 287; CHECK-NEXT: [[TMP38:%.*]] = sub i32 [[TMP37]], [[TMP3]] 288; CHECK-NEXT: store i32 [[TMP38]], i32 addrspace(1)* [[OUT:%.*]], align 4 289; CHECK-NEXT: 
ret void 290; 291; GFX6-LABEL: sdiv_i32: 292; GFX6: ; %bb.0: 293; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 294; GFX6-NEXT: s_mov_b32 s7, 0xf000 295; GFX6-NEXT: s_mov_b32 s6, -1 296; GFX6-NEXT: s_waitcnt lgkmcnt(0) 297; GFX6-NEXT: s_ashr_i32 s8, s3, 31 298; GFX6-NEXT: s_add_i32 s3, s3, s8 299; GFX6-NEXT: s_xor_b32 s3, s3, s8 300; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 301; GFX6-NEXT: s_sub_i32 s4, 0, s3 302; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 303; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 304; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 305; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 306; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 307; GFX6-NEXT: s_ashr_i32 s0, s2, 31 308; GFX6-NEXT: s_add_i32 s1, s2, s0 309; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 310; GFX6-NEXT: s_xor_b32 s1, s1, s0 311; GFX6-NEXT: s_xor_b32 s2, s0, s8 312; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 313; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 314; GFX6-NEXT: v_mul_lo_u32 v1, v0, s3 315; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 316; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 317; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1 318; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 319; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 320; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] 321; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 322; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 323; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 324; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 325; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 326; GFX6-NEXT: s_waitcnt lgkmcnt(0) 327; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 328; GFX6-NEXT: s_endpgm 329; 330; GFX9-LABEL: sdiv_i32: 331; GFX9: ; %bb.0: 332; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 333; GFX9-NEXT: v_mov_b32_e32 v2, 0 334; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 335; GFX9-NEXT: s_waitcnt lgkmcnt(0) 336; GFX9-NEXT: s_ashr_i32 s4, s3, 31 337; GFX9-NEXT: s_add_i32 s3, s3, s4 338; GFX9-NEXT: s_xor_b32 s3, s3, s4 339; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 340; GFX9-NEXT: s_sub_i32 s5, 
0, s3 341; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 342; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 343; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 344; GFX9-NEXT: v_mul_lo_u32 v1, s5, v0 345; GFX9-NEXT: s_ashr_i32 s5, s2, 31 346; GFX9-NEXT: s_add_i32 s2, s2, s5 347; GFX9-NEXT: s_xor_b32 s2, s2, s5 348; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 349; GFX9-NEXT: s_xor_b32 s4, s5, s4 350; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 351; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 352; GFX9-NEXT: v_mul_lo_u32 v1, v0, s3 353; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 354; GFX9-NEXT: v_sub_u32_e32 v1, s2, v1 355; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 356; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 357; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v1 358; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 359; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 360; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 361; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 362; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 363; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 364; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 365; GFX9-NEXT: s_endpgm 366; 367; GFX90A-LABEL: sdiv_i32: 368; GFX90A: ; %bb.0: 369; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 370; GFX90A-NEXT: v_mov_b32_e32 v1, 0 371; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 372; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 373; GFX90A-NEXT: s_ashr_i32 s4, s3, 31 374; GFX90A-NEXT: s_add_i32 s3, s3, s4 375; GFX90A-NEXT: s_xor_b32 s3, s3, s4 376; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 377; GFX90A-NEXT: s_ashr_i32 s5, s2, 31 378; GFX90A-NEXT: s_add_i32 s2, s2, s5 379; GFX90A-NEXT: s_xor_b32 s4, s5, s4 380; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 381; GFX90A-NEXT: s_xor_b32 s2, s2, s5 382; GFX90A-NEXT: s_sub_i32 s5, 0, s3 383; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 384; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 385; GFX90A-NEXT: v_mul_lo_u32 v2, s5, v0 386; GFX90A-NEXT: v_mul_hi_u32 v2, v0, v2 387; GFX90A-NEXT: v_add_u32_e32 v0, v0, v2 388; GFX90A-NEXT: v_mul_hi_u32 v0, s2, v0 389; GFX90A-NEXT: v_mul_lo_u32 v2, v0, 
s3 390; GFX90A-NEXT: v_sub_u32_e32 v2, s2, v2 391; GFX90A-NEXT: v_add_u32_e32 v3, 1, v0 392; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 393; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 394; GFX90A-NEXT: v_subrev_u32_e32 v3, s3, v2 395; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc 396; GFX90A-NEXT: v_add_u32_e32 v3, 1, v0 397; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 398; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 399; GFX90A-NEXT: v_xor_b32_e32 v0, s4, v0 400; GFX90A-NEXT: v_subrev_u32_e32 v0, s4, v0 401; GFX90A-NEXT: global_store_dword v1, v0, s[0:1] 402; GFX90A-NEXT: s_endpgm 403 %r = sdiv i32 %x, %y 404 store i32 %r, i32 addrspace(1)* %out 405 ret void 406} 407 408define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) { 409; CHECK-LABEL: @srem_i32( 410; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31 411; CHECK-NEXT: [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31 412; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[X]], [[TMP1]] 413; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[Y]], [[TMP2]] 414; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP1]] 415; CHECK-NEXT: [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP2]] 416; CHECK-NEXT: [[TMP7:%.*]] = uitofp i32 [[TMP6]] to float 417; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 418; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP8]], 0x41EFFFFFC0000000 419; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP9]] to i32 420; CHECK-NEXT: [[TMP11:%.*]] = sub i32 0, [[TMP6]] 421; CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], [[TMP10]] 422; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP10]] to i64 423; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP12]] to i64 424; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP13]], [[TMP14]] 425; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32 426; CHECK-NEXT: [[TMP17:%.*]] = lshr i64 [[TMP15]], 32 427; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 428; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP10]], [[TMP18]] 429; CHECK-NEXT: [[TMP20:%.*]] = zext i32 
[[TMP5]] to i64 430; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP19]] to i64 431; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP20]], [[TMP21]] 432; CHECK-NEXT: [[TMP23:%.*]] = trunc i64 [[TMP22]] to i32 433; CHECK-NEXT: [[TMP24:%.*]] = lshr i64 [[TMP22]], 32 434; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32 435; CHECK-NEXT: [[TMP26:%.*]] = mul i32 [[TMP25]], [[TMP6]] 436; CHECK-NEXT: [[TMP27:%.*]] = sub i32 [[TMP5]], [[TMP26]] 437; CHECK-NEXT: [[TMP28:%.*]] = icmp uge i32 [[TMP27]], [[TMP6]] 438; CHECK-NEXT: [[TMP29:%.*]] = sub i32 [[TMP27]], [[TMP6]] 439; CHECK-NEXT: [[TMP30:%.*]] = select i1 [[TMP28]], i32 [[TMP29]], i32 [[TMP27]] 440; CHECK-NEXT: [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP6]] 441; CHECK-NEXT: [[TMP32:%.*]] = sub i32 [[TMP30]], [[TMP6]] 442; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP30]] 443; CHECK-NEXT: [[TMP34:%.*]] = xor i32 [[TMP33]], [[TMP1]] 444; CHECK-NEXT: [[TMP35:%.*]] = sub i32 [[TMP34]], [[TMP1]] 445; CHECK-NEXT: store i32 [[TMP35]], i32 addrspace(1)* [[OUT:%.*]], align 4 446; CHECK-NEXT: ret void 447; 448; GFX6-LABEL: srem_i32: 449; GFX6: ; %bb.0: 450; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 451; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 452; GFX6-NEXT: s_waitcnt lgkmcnt(0) 453; GFX6-NEXT: s_ashr_i32 s4, s3, 31 454; GFX6-NEXT: s_add_i32 s3, s3, s4 455; GFX6-NEXT: s_xor_b32 s4, s3, s4 456; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s4 457; GFX6-NEXT: s_sub_i32 s3, 0, s4 458; GFX6-NEXT: s_ashr_i32 s5, s2, 31 459; GFX6-NEXT: s_add_i32 s2, s2, s5 460; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 461; GFX6-NEXT: s_xor_b32 s6, s2, s5 462; GFX6-NEXT: s_mov_b32 s2, -1 463; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 464; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 465; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 466; GFX6-NEXT: s_mov_b32 s3, 0xf000 467; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 468; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 469; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 470; GFX6-NEXT: v_mul_lo_u32 v0, v0, s4 471; GFX6-NEXT: 
v_sub_i32_e32 v0, vcc, s6, v0 472; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0 473; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 474; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 475; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0 476; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 477; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 478; GFX6-NEXT: v_xor_b32_e32 v0, s5, v0 479; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s5, v0 480; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 481; GFX6-NEXT: s_endpgm 482; 483; GFX9-LABEL: srem_i32: 484; GFX9: ; %bb.0: 485; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 486; GFX9-NEXT: s_waitcnt lgkmcnt(0) 487; GFX9-NEXT: s_ashr_i32 s4, s3, 31 488; GFX9-NEXT: s_add_i32 s3, s3, s4 489; GFX9-NEXT: s_xor_b32 s3, s3, s4 490; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 491; GFX9-NEXT: s_sub_i32 s4, 0, s3 492; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 493; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 494; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 495; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 496; GFX9-NEXT: v_mul_lo_u32 v1, s4, v0 497; GFX9-NEXT: s_ashr_i32 s4, s2, 31 498; GFX9-NEXT: s_add_i32 s2, s2, s4 499; GFX9-NEXT: s_xor_b32 s2, s2, s4 500; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 501; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 502; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 503; GFX9-NEXT: v_mov_b32_e32 v1, 0 504; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 505; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 506; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v0 507; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 508; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 509; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v0 510; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 511; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 512; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 513; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 514; GFX9-NEXT: s_waitcnt lgkmcnt(0) 515; GFX9-NEXT: global_store_dword v1, v0, s[0:1] 516; GFX9-NEXT: s_endpgm 517; 518; GFX90A-LABEL: srem_i32: 519; GFX90A: ; %bb.0: 520; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 521; GFX90A-NEXT: 
v_mov_b32_e32 v1, 0 522; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 523; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 524; GFX90A-NEXT: s_ashr_i32 s4, s3, 31 525; GFX90A-NEXT: s_add_i32 s3, s3, s4 526; GFX90A-NEXT: s_xor_b32 s3, s3, s4 527; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 528; GFX90A-NEXT: s_sub_i32 s5, 0, s3 529; GFX90A-NEXT: s_ashr_i32 s4, s2, 31 530; GFX90A-NEXT: s_add_i32 s2, s2, s4 531; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 532; GFX90A-NEXT: s_xor_b32 s2, s2, s4 533; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 534; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 535; GFX90A-NEXT: v_mul_lo_u32 v2, s5, v0 536; GFX90A-NEXT: v_mul_hi_u32 v2, v0, v2 537; GFX90A-NEXT: v_add_u32_e32 v0, v0, v2 538; GFX90A-NEXT: v_mul_hi_u32 v0, s2, v0 539; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s3 540; GFX90A-NEXT: v_sub_u32_e32 v0, s2, v0 541; GFX90A-NEXT: v_subrev_u32_e32 v2, s3, v0 542; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 543; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 544; GFX90A-NEXT: v_subrev_u32_e32 v2, s3, v0 545; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 546; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 547; GFX90A-NEXT: v_xor_b32_e32 v0, s4, v0 548; GFX90A-NEXT: v_subrev_u32_e32 v0, s4, v0 549; GFX90A-NEXT: global_store_dword v1, v0, s[0:1] 550; GFX90A-NEXT: s_endpgm 551 %r = srem i32 %x, %y 552 store i32 %r, i32 addrspace(1)* %out 553 ret void 554} 555 556define amdgpu_kernel void @udiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) { 557; CHECK-LABEL: @udiv_i16( 558; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32 559; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32 560; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 561; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 562; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 563; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 564; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 565; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float 
[[TMP7]] 566; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 567; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 568; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 569; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 570; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 571; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 572; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 573; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 65535 574; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16 575; CHECK-NEXT: store i16 [[TMP17]], i16 addrspace(1)* [[OUT:%.*]], align 2 576; CHECK-NEXT: ret void 577; 578; GFX6-LABEL: udiv_i16: 579; GFX6: ; %bb.0: 580; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb 581; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 582; GFX6-NEXT: s_waitcnt lgkmcnt(0) 583; GFX6-NEXT: s_lshr_b32 s3, s2, 16 584; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 585; GFX6-NEXT: s_and_b32 s2, s2, 0xffff 586; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s2 587; GFX6-NEXT: s_mov_b32 s3, 0xf000 588; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 589; GFX6-NEXT: s_mov_b32 s2, -1 590; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 591; GFX6-NEXT: v_trunc_f32_e32 v2, v2 592; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 593; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 594; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 595; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 596; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 597; GFX6-NEXT: s_endpgm 598; 599; GFX9-LABEL: udiv_i16: 600; GFX9: ; %bb.0: 601; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 602; GFX9-NEXT: v_mov_b32_e32 v3, 0 603; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 604; GFX9-NEXT: s_waitcnt lgkmcnt(0) 605; GFX9-NEXT: s_lshr_b32 s3, s2, 16 606; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 607; GFX9-NEXT: s_and_b32 s2, s2, 0xffff 608; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 609; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, 
v0 610; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 611; GFX9-NEXT: v_trunc_f32_e32 v2, v2 612; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 613; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 614; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 615; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc 616; GFX9-NEXT: global_store_short v3, v0, s[0:1] 617; GFX9-NEXT: s_endpgm 618; 619; GFX90A-LABEL: udiv_i16: 620; GFX90A: ; %bb.0: 621; GFX90A-NEXT: s_load_dword s2, s[0:1], 0x2c 622; GFX90A-NEXT: v_mov_b32_e32 v3, 0 623; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 624; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 625; GFX90A-NEXT: s_lshr_b32 s3, s2, 16 626; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 627; GFX90A-NEXT: s_and_b32 s2, s2, 0xffff 628; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s2 629; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v0 630; GFX90A-NEXT: v_mul_f32_e32 v2, v1, v2 631; GFX90A-NEXT: v_trunc_f32_e32 v2, v2 632; GFX90A-NEXT: v_cvt_u32_f32_e32 v4, v2 633; GFX90A-NEXT: v_mad_f32 v1, -v2, v0, v1 634; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 635; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc 636; GFX90A-NEXT: global_store_short v3, v0, s[0:1] 637; GFX90A-NEXT: s_endpgm 638 %r = udiv i16 %x, %y 639 store i16 %r, i16 addrspace(1)* %out 640 ret void 641} 642 643define amdgpu_kernel void @urem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) { 644; CHECK-LABEL: @urem_i16( 645; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32 646; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32 647; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 648; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 649; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 650; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 651; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 652; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 653; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 654; 
CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 655; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 656; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 657; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 658; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 659; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 660; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]] 661; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]] 662; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 65535 663; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16 664; CHECK-NEXT: store i16 [[TMP19]], i16 addrspace(1)* [[OUT:%.*]], align 2 665; CHECK-NEXT: ret void 666; 667; GFX6-LABEL: urem_i16: 668; GFX6: ; %bb.0: 669; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 670; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 671; GFX6-NEXT: s_waitcnt lgkmcnt(0) 672; GFX6-NEXT: s_lshr_b32 s2, s4, 16 673; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 674; GFX6-NEXT: s_and_b32 s3, s4, 0xffff 675; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 676; GFX6-NEXT: s_mov_b32 s3, 0xf000 677; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 678; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 679; GFX6-NEXT: v_trunc_f32_e32 v2, v2 680; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 681; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 682; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 683; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 684; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 685; GFX6-NEXT: s_mov_b32 s2, -1 686; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 687; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 688; GFX6-NEXT: s_endpgm 689; 690; GFX9-LABEL: urem_i16: 691; GFX9: ; %bb.0: 692; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 693; GFX9-NEXT: s_waitcnt lgkmcnt(0) 694; GFX9-NEXT: s_lshr_b32 s3, s2, 16 695; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 696; GFX9-NEXT: s_and_b32 s4, s2, 0xffff 697; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s4 698; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 
699; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 700; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 701; GFX9-NEXT: v_trunc_f32_e32 v2, v2 702; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v2 703; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 704; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 705; GFX9-NEXT: v_mov_b32_e32 v1, 0 706; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 707; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 708; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 709; GFX9-NEXT: s_waitcnt lgkmcnt(0) 710; GFX9-NEXT: global_store_short v1, v0, s[0:1] 711; GFX9-NEXT: s_endpgm 712; 713; GFX90A-LABEL: urem_i16: 714; GFX90A: ; %bb.0: 715; GFX90A-NEXT: s_load_dword s2, s[0:1], 0x2c 716; GFX90A-NEXT: v_mov_b32_e32 v3, 0 717; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 718; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 719; GFX90A-NEXT: s_lshr_b32 s3, s2, 16 720; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 721; GFX90A-NEXT: s_and_b32 s4, s2, 0xffff 722; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s4 723; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v0 724; GFX90A-NEXT: v_mul_f32_e32 v2, v1, v2 725; GFX90A-NEXT: v_trunc_f32_e32 v2, v2 726; GFX90A-NEXT: v_cvt_u32_f32_e32 v4, v2 727; GFX90A-NEXT: v_mad_f32 v1, -v2, v0, v1 728; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 729; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc 730; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s3 731; GFX90A-NEXT: v_sub_u32_e32 v0, s2, v0 732; GFX90A-NEXT: global_store_short v3, v0, s[0:1] 733; GFX90A-NEXT: s_endpgm 734 %r = urem i16 %x, %y 735 store i16 %r, i16 addrspace(1)* %out 736 ret void 737} 738 739define amdgpu_kernel void @sdiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) { 740; CHECK-LABEL: @sdiv_i16( 741; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32 742; CHECK-NEXT: [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32 743; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 744; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 745; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 746; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 747; CHECK-NEXT: [[TMP7:%.*]] = 
sitofp i32 [[TMP2]] to float 748; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 749; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 750; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 751; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 752; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 753; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 754; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 755; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 756; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 757; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 758; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 759; CHECK-NEXT: [[TMP19:%.*]] = shl i32 [[TMP18]], 16 760; CHECK-NEXT: [[TMP20:%.*]] = ashr i32 [[TMP19]], 16 761; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16 762; CHECK-NEXT: store i16 [[TMP21]], i16 addrspace(1)* [[OUT:%.*]], align 2 763; CHECK-NEXT: ret void 764; 765; GFX6-LABEL: sdiv_i16: 766; GFX6: ; %bb.0: 767; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 768; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 769; GFX6-NEXT: s_mov_b32 s7, 0xf000 770; GFX6-NEXT: s_mov_b32 s6, -1 771; GFX6-NEXT: s_waitcnt lgkmcnt(0) 772; GFX6-NEXT: s_ashr_i32 s1, s0, 16 773; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s1 774; GFX6-NEXT: s_sext_i32_i16 s0, s0 775; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s0 776; GFX6-NEXT: s_xor_b32 s0, s0, s1 777; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 778; GFX6-NEXT: s_ashr_i32 s0, s0, 30 779; GFX6-NEXT: s_or_b32 s0, s0, 1 780; GFX6-NEXT: v_mov_b32_e32 v3, s0 781; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 782; GFX6-NEXT: v_trunc_f32_e32 v2, v2 783; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 784; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 785; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 786; GFX6-NEXT: 
v_cndmask_b32_e32 v0, 0, v3, vcc 787; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 788; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 789; GFX6-NEXT: s_endpgm 790; 791; GFX9-LABEL: sdiv_i16: 792; GFX9: ; %bb.0: 793; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 794; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 795; GFX9-NEXT: v_mov_b32_e32 v1, 0 796; GFX9-NEXT: s_waitcnt lgkmcnt(0) 797; GFX9-NEXT: s_ashr_i32 s0, s4, 16 798; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 799; GFX9-NEXT: s_sext_i32_i16 s1, s4 800; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 801; GFX9-NEXT: s_xor_b32 s0, s1, s0 802; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 803; GFX9-NEXT: s_ashr_i32 s0, s0, 30 804; GFX9-NEXT: s_or_b32 s4, s0, 1 805; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 806; GFX9-NEXT: v_trunc_f32_e32 v3, v3 807; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 808; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 809; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 810; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 811; GFX9-NEXT: s_cselect_b32 s0, s4, 0 812; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 813; GFX9-NEXT: global_store_short v1, v0, s[2:3] 814; GFX9-NEXT: s_endpgm 815; 816; GFX90A-LABEL: sdiv_i16: 817; GFX90A: ; %bb.0: 818; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 819; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 820; GFX90A-NEXT: v_mov_b32_e32 v1, 0 821; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 822; GFX90A-NEXT: s_ashr_i32 s0, s4, 16 823; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s0 824; GFX90A-NEXT: s_sext_i32_i16 s1, s4 825; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s1 826; GFX90A-NEXT: s_xor_b32 s0, s1, s0 827; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 828; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 829; GFX90A-NEXT: s_or_b32 s4, s0, 1 830; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 831; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 832; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 833; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 834; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 835; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 836; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 837; 
GFX90A-NEXT: v_add_u32_e32 v0, s0, v3 838; GFX90A-NEXT: global_store_short v1, v0, s[2:3] 839; GFX90A-NEXT: s_endpgm 840 %r = sdiv i16 %x, %y 841 store i16 %r, i16 addrspace(1)* %out 842 ret void 843} 844 845define amdgpu_kernel void @srem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) { 846; CHECK-LABEL: @srem_i16( 847; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32 848; CHECK-NEXT: [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32 849; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 850; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 851; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 852; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 853; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 854; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 855; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 856; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 857; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 858; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 859; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 860; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 861; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 862; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 863; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 864; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 865; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]] 866; CHECK-NEXT: [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]] 867; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 16 868; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 16 869; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16 870; CHECK-NEXT: store i16 [[TMP23]], i16 addrspace(1)* [[OUT:%.*]], align 2 871; CHECK-NEXT: ret void 872; 873; GFX6-LABEL: srem_i16: 874; 
GFX6: ; %bb.0: 875; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 876; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 877; GFX6-NEXT: s_waitcnt lgkmcnt(0) 878; GFX6-NEXT: s_ashr_i32 s2, s4, 16 879; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s2 880; GFX6-NEXT: s_sext_i32_i16 s3, s4 881; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s3 882; GFX6-NEXT: s_xor_b32 s3, s3, s2 883; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 884; GFX6-NEXT: s_ashr_i32 s3, s3, 30 885; GFX6-NEXT: s_or_b32 s3, s3, 1 886; GFX6-NEXT: v_mov_b32_e32 v3, s3 887; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 888; GFX6-NEXT: v_trunc_f32_e32 v2, v2 889; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 890; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 891; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 892; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 893; GFX6-NEXT: s_mov_b32 s3, 0xf000 894; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 895; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 896; GFX6-NEXT: s_mov_b32 s2, -1 897; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 898; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 899; GFX6-NEXT: s_endpgm 900; 901; GFX9-LABEL: srem_i16: 902; GFX9: ; %bb.0: 903; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 904; GFX9-NEXT: s_waitcnt lgkmcnt(0) 905; GFX9-NEXT: s_ashr_i32 s5, s4, 16 906; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s5 907; GFX9-NEXT: s_sext_i32_i16 s2, s4 908; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s2 909; GFX9-NEXT: s_xor_b32 s2, s2, s5 910; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 911; GFX9-NEXT: s_ashr_i32 s2, s2, 30 912; GFX9-NEXT: s_or_b32 s6, s2, 1 913; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 914; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 915; GFX9-NEXT: v_trunc_f32_e32 v2, v2 916; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 917; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 918; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| 919; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec 920; GFX9-NEXT: s_cselect_b32 s2, s6, 0 921; GFX9-NEXT: v_add_u32_e32 v0, s2, v2 922; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 923; GFX9-NEXT: v_mov_b32_e32 v1, 0 924; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 
925; GFX9-NEXT: s_waitcnt lgkmcnt(0) 926; GFX9-NEXT: global_store_short v1, v0, s[0:1] 927; GFX9-NEXT: s_endpgm 928; 929; GFX90A-LABEL: srem_i16: 930; GFX90A: ; %bb.0: 931; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 932; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 933; GFX90A-NEXT: v_mov_b32_e32 v1, 0 934; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 935; GFX90A-NEXT: s_ashr_i32 s5, s4, 16 936; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s5 937; GFX90A-NEXT: s_sext_i32_i16 s0, s4 938; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s0 939; GFX90A-NEXT: s_xor_b32 s0, s0, s5 940; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 941; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 942; GFX90A-NEXT: s_or_b32 s6, s0, 1 943; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 944; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 945; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 946; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 947; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 948; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 949; GFX90A-NEXT: s_cselect_b32 s0, s6, 0 950; GFX90A-NEXT: v_add_u32_e32 v0, s0, v3 951; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s5 952; GFX90A-NEXT: v_sub_u32_e32 v0, s4, v0 953; GFX90A-NEXT: global_store_short v1, v0, s[2:3] 954; GFX90A-NEXT: s_endpgm 955 %r = srem i16 %x, %y 956 store i16 %r, i16 addrspace(1)* %out 957 ret void 958} 959 960define amdgpu_kernel void @udiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) { 961; CHECK-LABEL: @udiv_i8( 962; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32 963; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32 964; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 965; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 966; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 967; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 968; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 969; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 970; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float 
[[TMP8]], float [[TMP4]], float [[TMP3]]) 971; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 972; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 973; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 974; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 975; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 976; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 977; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 255 978; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 979; CHECK-NEXT: store i8 [[TMP17]], i8 addrspace(1)* [[OUT:%.*]], align 1 980; CHECK-NEXT: ret void 981; 982; GFX6-LABEL: udiv_i8: 983; GFX6: ; %bb.0: 984; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 985; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 986; GFX6-NEXT: s_mov_b32 s7, 0xf000 987; GFX6-NEXT: s_mov_b32 s6, -1 988; GFX6-NEXT: s_waitcnt lgkmcnt(0) 989; GFX6-NEXT: v_cvt_f32_ubyte1_e32 v0, s0 990; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 991; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 992; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 993; GFX6-NEXT: v_trunc_f32_e32 v1, v1 994; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 995; GFX6-NEXT: v_mad_f32 v1, -v1, v0, v2 996; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 997; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 998; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 999; GFX6-NEXT: s_endpgm 1000; 1001; GFX9-LABEL: udiv_i8: 1002; GFX9: ; %bb.0: 1003; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 1004; GFX9-NEXT: v_mov_b32_e32 v2, 0 1005; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1006; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1007; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s2 1008; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 1009; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 1010; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 1011; GFX9-NEXT: v_trunc_f32_e32 v1, v1 1012; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v1 1013; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v3 1014; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 1015; 
GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc 1016; GFX9-NEXT: global_store_byte v2, v0, s[0:1] 1017; GFX9-NEXT: s_endpgm 1018; 1019; GFX90A-LABEL: udiv_i8: 1020; GFX90A: ; %bb.0: 1021; GFX90A-NEXT: s_load_dword s2, s[0:1], 0x2c 1022; GFX90A-NEXT: v_mov_b32_e32 v2, 0 1023; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1024; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 1025; GFX90A-NEXT: v_cvt_f32_ubyte1_e32 v0, s2 1026; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v0 1027; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 1028; GFX90A-NEXT: v_mul_f32_e32 v1, v3, v1 1029; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 1030; GFX90A-NEXT: v_cvt_u32_f32_e32 v4, v1 1031; GFX90A-NEXT: v_mad_f32 v1, -v1, v0, v3 1032; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 1033; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc 1034; GFX90A-NEXT: global_store_byte v2, v0, s[0:1] 1035; GFX90A-NEXT: s_endpgm 1036 %r = udiv i8 %x, %y 1037 store i8 %r, i8 addrspace(1)* %out 1038 ret void 1039} 1040 1041define amdgpu_kernel void @urem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) { 1042; CHECK-LABEL: @urem_i8( 1043; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32 1044; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32 1045; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 1046; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 1047; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 1048; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 1049; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 1050; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 1051; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 1052; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 1053; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 1054; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 1055; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float 
[[TMP11]], [[TMP12]] 1056; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 1057; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 1058; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]] 1059; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]] 1060; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 255 1061; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i8 1062; CHECK-NEXT: store i8 [[TMP19]], i8 addrspace(1)* [[OUT:%.*]], align 1 1063; CHECK-NEXT: ret void 1064; 1065; GFX6-LABEL: urem_i8: 1066; GFX6: ; %bb.0: 1067; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 1068; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1069; GFX6-NEXT: s_mov_b32 s3, 0xf000 1070; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1071; GFX6-NEXT: v_cvt_f32_ubyte1_e32 v0, s4 1072; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 1073; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 1074; GFX6-NEXT: s_lshr_b32 s2, s4, 8 1075; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 1076; GFX6-NEXT: v_trunc_f32_e32 v1, v1 1077; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 1078; GFX6-NEXT: v_mad_f32 v1, -v1, v0, v2 1079; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 1080; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 1081; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 1082; GFX6-NEXT: s_mov_b32 s2, -1 1083; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 1084; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 1085; GFX6-NEXT: s_endpgm 1086; 1087; GFX9-LABEL: urem_i8: 1088; GFX9: ; %bb.0: 1089; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 1090; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1091; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s2 1092; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 1093; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 1094; GFX9-NEXT: s_lshr_b32 s3, s2, 8 1095; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1096; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1 1097; GFX9-NEXT: v_trunc_f32_e32 v1, v1 1098; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v1 1099; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v2 1100; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 1101; GFX9-NEXT: v_mov_b32_e32 v1, 0 1102; GFX9-NEXT: 
v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 1103; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 1104; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 1105; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1106; GFX9-NEXT: global_store_byte v1, v0, s[0:1] 1107; GFX9-NEXT: s_endpgm 1108; 1109; GFX90A-LABEL: urem_i8: 1110; GFX90A: ; %bb.0: 1111; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1112; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 1113; GFX90A-NEXT: v_mov_b32_e32 v2, 0 1114; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 1115; GFX90A-NEXT: v_cvt_f32_ubyte1_e32 v0, s4 1116; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v0 1117; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v3, s4 1118; GFX90A-NEXT: s_lshr_b32 s0, s4, 8 1119; GFX90A-NEXT: v_mul_f32_e32 v1, v3, v1 1120; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 1121; GFX90A-NEXT: v_cvt_u32_f32_e32 v4, v1 1122; GFX90A-NEXT: v_mad_f32 v1, -v1, v0, v3 1123; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 1124; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc 1125; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s0 1126; GFX90A-NEXT: v_sub_u32_e32 v0, s4, v0 1127; GFX90A-NEXT: global_store_byte v2, v0, s[2:3] 1128; GFX90A-NEXT: s_endpgm 1129 %r = urem i8 %x, %y 1130 store i8 %r, i8 addrspace(1)* %out 1131 ret void 1132} 1133 1134define amdgpu_kernel void @sdiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) { 1135; CHECK-LABEL: @sdiv_i8( 1136; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32 1137; CHECK-NEXT: [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32 1138; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 1139; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 1140; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 1141; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 1142; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 1143; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 1144; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 1145; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 1146; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float 
[[TMP10]] 1147; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 1148; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 1149; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 1150; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 1151; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 1152; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 1153; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 1154; CHECK-NEXT: [[TMP19:%.*]] = shl i32 [[TMP18]], 24 1155; CHECK-NEXT: [[TMP20:%.*]] = ashr i32 [[TMP19]], 24 1156; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i8 1157; CHECK-NEXT: store i8 [[TMP21]], i8 addrspace(1)* [[OUT:%.*]], align 1 1158; CHECK-NEXT: ret void 1159; 1160; GFX6-LABEL: sdiv_i8: 1161; GFX6: ; %bb.0: 1162; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1163; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 1164; GFX6-NEXT: s_mov_b32 s7, 0xf000 1165; GFX6-NEXT: s_mov_b32 s6, -1 1166; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1167; GFX6-NEXT: s_bfe_i32 s1, s0, 0x80008 1168; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s1 1169; GFX6-NEXT: s_sext_i32_i8 s0, s0 1170; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s0 1171; GFX6-NEXT: s_xor_b32 s0, s0, s1 1172; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 1173; GFX6-NEXT: s_ashr_i32 s0, s0, 30 1174; GFX6-NEXT: s_or_b32 s0, s0, 1 1175; GFX6-NEXT: v_mov_b32_e32 v3, s0 1176; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 1177; GFX6-NEXT: v_trunc_f32_e32 v2, v2 1178; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 1179; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 1180; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 1181; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 1182; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 1183; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 1184; GFX6-NEXT: s_endpgm 1185; 1186; GFX9-LABEL: sdiv_i8: 1187; GFX9: ; %bb.0: 1188; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1189; GFX9-NEXT: 
s_load_dword s4, s[0:1], 0x2c 1190; GFX9-NEXT: v_mov_b32_e32 v1, 0 1191; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1192; GFX9-NEXT: s_bfe_i32 s0, s4, 0x80008 1193; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 1194; GFX9-NEXT: s_sext_i32_i8 s1, s4 1195; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 1196; GFX9-NEXT: s_xor_b32 s0, s1, s0 1197; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 1198; GFX9-NEXT: s_ashr_i32 s0, s0, 30 1199; GFX9-NEXT: s_or_b32 s4, s0, 1 1200; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 1201; GFX9-NEXT: v_trunc_f32_e32 v3, v3 1202; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 1203; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 1204; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 1205; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 1206; GFX9-NEXT: s_cselect_b32 s0, s4, 0 1207; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 1208; GFX9-NEXT: global_store_byte v1, v0, s[2:3] 1209; GFX9-NEXT: s_endpgm 1210; 1211; GFX90A-LABEL: sdiv_i8: 1212; GFX90A: ; %bb.0: 1213; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1214; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 1215; GFX90A-NEXT: v_mov_b32_e32 v1, 0 1216; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 1217; GFX90A-NEXT: s_bfe_i32 s0, s4, 0x80008 1218; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s0 1219; GFX90A-NEXT: s_sext_i32_i8 s1, s4 1220; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s1 1221; GFX90A-NEXT: s_xor_b32 s0, s1, s0 1222; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 1223; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 1224; GFX90A-NEXT: s_or_b32 s4, s0, 1 1225; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 1226; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 1227; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 1228; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 1229; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 1230; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 1231; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 1232; GFX90A-NEXT: v_add_u32_e32 v0, s0, v3 1233; GFX90A-NEXT: global_store_byte v1, v0, s[2:3] 1234; GFX90A-NEXT: s_endpgm 1235 %r = sdiv i8 %x, %y 1236 store i8 %r, i8 addrspace(1)* %out 1237 ret void 1238} 1239 1240define amdgpu_kernel void 
@srem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) { 1241; CHECK-LABEL: @srem_i8( 1242; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32 1243; CHECK-NEXT: [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32 1244; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 1245; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 1246; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 1247; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 1248; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 1249; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 1250; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 1251; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 1252; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 1253; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 1254; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 1255; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 1256; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 1257; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 1258; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 1259; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 1260; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]] 1261; CHECK-NEXT: [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]] 1262; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 24 1263; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 24 1264; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i8 1265; CHECK-NEXT: store i8 [[TMP23]], i8 addrspace(1)* [[OUT:%.*]], align 1 1266; CHECK-NEXT: ret void 1267; 1268; GFX6-LABEL: srem_i8: 1269; GFX6: ; %bb.0: 1270; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1271; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 1272; GFX6-NEXT: s_mov_b32 s7, 0xf000 1273; GFX6-NEXT: s_mov_b32 s6, -1 1274; GFX6-NEXT: s_waitcnt 
lgkmcnt(0) 1275; GFX6-NEXT: s_bfe_i32 s1, s0, 0x80008 1276; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s1 1277; GFX6-NEXT: s_sext_i32_i8 s3, s0 1278; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s3 1279; GFX6-NEXT: s_xor_b32 s1, s3, s1 1280; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 1281; GFX6-NEXT: s_ashr_i32 s1, s1, 30 1282; GFX6-NEXT: s_or_b32 s1, s1, 1 1283; GFX6-NEXT: v_mov_b32_e32 v3, s1 1284; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 1285; GFX6-NEXT: v_trunc_f32_e32 v2, v2 1286; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 1287; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 1288; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 1289; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 1290; GFX6-NEXT: s_lshr_b32 s2, s0, 8 1291; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 1292; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 1293; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 1294; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 1295; GFX6-NEXT: s_endpgm 1296; 1297; GFX9-LABEL: srem_i8: 1298; GFX9: ; %bb.0: 1299; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 1300; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1301; GFX9-NEXT: s_bfe_i32 s2, s4, 0x80008 1302; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 1303; GFX9-NEXT: s_sext_i32_i8 s3, s4 1304; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s3 1305; GFX9-NEXT: s_xor_b32 s2, s3, s2 1306; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 1307; GFX9-NEXT: s_ashr_i32 s2, s2, 30 1308; GFX9-NEXT: s_lshr_b32 s5, s4, 8 1309; GFX9-NEXT: s_or_b32 s6, s2, 1 1310; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 1311; GFX9-NEXT: v_trunc_f32_e32 v2, v2 1312; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 1313; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 1314; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| 1315; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec 1316; GFX9-NEXT: s_cselect_b32 s2, s6, 0 1317; GFX9-NEXT: v_add_u32_e32 v0, s2, v2 1318; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 1319; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1320; GFX9-NEXT: v_mov_b32_e32 v1, 0 1321; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 1322; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1323; GFX9-NEXT: global_store_byte v1, v0, 
s[0:1] 1324; GFX9-NEXT: s_endpgm 1325; 1326; GFX90A-LABEL: srem_i8: 1327; GFX90A: ; %bb.0: 1328; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1329; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 1330; GFX90A-NEXT: v_mov_b32_e32 v0, 0 1331; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 1332; GFX90A-NEXT: s_bfe_i32 s0, s4, 0x80008 1333; GFX90A-NEXT: v_cvt_f32_i32_e32 v1, s0 1334; GFX90A-NEXT: s_sext_i32_i8 s1, s4 1335; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s1 1336; GFX90A-NEXT: s_xor_b32 s0, s1, s0 1337; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v1 1338; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 1339; GFX90A-NEXT: s_lshr_b32 s5, s4, 8 1340; GFX90A-NEXT: s_or_b32 s6, s0, 1 1341; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 1342; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 1343; GFX90A-NEXT: v_mad_f32 v2, -v3, v1, v2 1344; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 1345; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v1| 1346; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 1347; GFX90A-NEXT: s_cselect_b32 s0, s6, 0 1348; GFX90A-NEXT: v_add_u32_e32 v1, s0, v3 1349; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s5 1350; GFX90A-NEXT: v_sub_u32_e32 v1, s4, v1 1351; GFX90A-NEXT: global_store_byte v0, v1, s[2:3] 1352; GFX90A-NEXT: s_endpgm 1353 %r = srem i8 %x, %y 1354 store i8 %r, i8 addrspace(1)* %out 1355 ret void 1356} 1357 1358define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 1359; CHECK-LABEL: @udiv_v4i32( 1360; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 1361; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 1362; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 1363; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 1364; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 1365; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 1366; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 1367; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 1368; CHECK-NEXT: [[TMP9:%.*]] = zext 
i32 [[TMP6]] to i64 1369; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 1370; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 1371; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 1372; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 1373; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 1374; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 1375; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 1376; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 1377; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 1378; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 1379; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 1380; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 1381; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 1382; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 1383; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 1384; CHECK-NEXT: [[TMP25:%.*]] = add i32 [[TMP21]], 1 1385; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]] 1386; CHECK-NEXT: [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]] 1387; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]] 1388; CHECK-NEXT: [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]] 1389; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP26]], 1 1390; CHECK-NEXT: [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]] 1391; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> undef, i32 [[TMP31]], i64 0 1392; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i32> [[X]], i64 1 1393; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[Y]], i64 1 1394; CHECK-NEXT: [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float 1395; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 1396; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000 1397; CHECK-NEXT: [[TMP38:%.*]] = fptoui float [[TMP37]] to i32 1398; CHECK-NEXT: [[TMP39:%.*]] = sub i32 0, 
[[TMP34]] 1399; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]] 1400; CHECK-NEXT: [[TMP41:%.*]] = zext i32 [[TMP38]] to i64 1401; CHECK-NEXT: [[TMP42:%.*]] = zext i32 [[TMP40]] to i64 1402; CHECK-NEXT: [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]] 1403; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 1404; CHECK-NEXT: [[TMP45:%.*]] = lshr i64 [[TMP43]], 32 1405; CHECK-NEXT: [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32 1406; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]] 1407; CHECK-NEXT: [[TMP48:%.*]] = zext i32 [[TMP33]] to i64 1408; CHECK-NEXT: [[TMP49:%.*]] = zext i32 [[TMP47]] to i64 1409; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]] 1410; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 1411; CHECK-NEXT: [[TMP52:%.*]] = lshr i64 [[TMP50]], 32 1412; CHECK-NEXT: [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32 1413; CHECK-NEXT: [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]] 1414; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]] 1415; CHECK-NEXT: [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]] 1416; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP53]], 1 1417; CHECK-NEXT: [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]] 1418; CHECK-NEXT: [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]] 1419; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]] 1420; CHECK-NEXT: [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]] 1421; CHECK-NEXT: [[TMP62:%.*]] = add i32 [[TMP58]], 1 1422; CHECK-NEXT: [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]] 1423; CHECK-NEXT: [[TMP64:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP63]], i64 1 1424; CHECK-NEXT: [[TMP65:%.*]] = extractelement <4 x i32> [[X]], i64 2 1425; CHECK-NEXT: [[TMP66:%.*]] = extractelement <4 x i32> [[Y]], i64 2 1426; CHECK-NEXT: [[TMP67:%.*]] = uitofp i32 [[TMP66]] to float 1427; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP67]]) 1428; CHECK-NEXT: [[TMP69:%.*]] = fmul fast float [[TMP68]], 
0x41EFFFFFC0000000 1429; CHECK-NEXT: [[TMP70:%.*]] = fptoui float [[TMP69]] to i32 1430; CHECK-NEXT: [[TMP71:%.*]] = sub i32 0, [[TMP66]] 1431; CHECK-NEXT: [[TMP72:%.*]] = mul i32 [[TMP71]], [[TMP70]] 1432; CHECK-NEXT: [[TMP73:%.*]] = zext i32 [[TMP70]] to i64 1433; CHECK-NEXT: [[TMP74:%.*]] = zext i32 [[TMP72]] to i64 1434; CHECK-NEXT: [[TMP75:%.*]] = mul i64 [[TMP73]], [[TMP74]] 1435; CHECK-NEXT: [[TMP76:%.*]] = trunc i64 [[TMP75]] to i32 1436; CHECK-NEXT: [[TMP77:%.*]] = lshr i64 [[TMP75]], 32 1437; CHECK-NEXT: [[TMP78:%.*]] = trunc i64 [[TMP77]] to i32 1438; CHECK-NEXT: [[TMP79:%.*]] = add i32 [[TMP70]], [[TMP78]] 1439; CHECK-NEXT: [[TMP80:%.*]] = zext i32 [[TMP65]] to i64 1440; CHECK-NEXT: [[TMP81:%.*]] = zext i32 [[TMP79]] to i64 1441; CHECK-NEXT: [[TMP82:%.*]] = mul i64 [[TMP80]], [[TMP81]] 1442; CHECK-NEXT: [[TMP83:%.*]] = trunc i64 [[TMP82]] to i32 1443; CHECK-NEXT: [[TMP84:%.*]] = lshr i64 [[TMP82]], 32 1444; CHECK-NEXT: [[TMP85:%.*]] = trunc i64 [[TMP84]] to i32 1445; CHECK-NEXT: [[TMP86:%.*]] = mul i32 [[TMP85]], [[TMP66]] 1446; CHECK-NEXT: [[TMP87:%.*]] = sub i32 [[TMP65]], [[TMP86]] 1447; CHECK-NEXT: [[TMP88:%.*]] = icmp uge i32 [[TMP87]], [[TMP66]] 1448; CHECK-NEXT: [[TMP89:%.*]] = add i32 [[TMP85]], 1 1449; CHECK-NEXT: [[TMP90:%.*]] = select i1 [[TMP88]], i32 [[TMP89]], i32 [[TMP85]] 1450; CHECK-NEXT: [[TMP91:%.*]] = sub i32 [[TMP87]], [[TMP66]] 1451; CHECK-NEXT: [[TMP92:%.*]] = select i1 [[TMP88]], i32 [[TMP91]], i32 [[TMP87]] 1452; CHECK-NEXT: [[TMP93:%.*]] = icmp uge i32 [[TMP92]], [[TMP66]] 1453; CHECK-NEXT: [[TMP94:%.*]] = add i32 [[TMP90]], 1 1454; CHECK-NEXT: [[TMP95:%.*]] = select i1 [[TMP93]], i32 [[TMP94]], i32 [[TMP90]] 1455; CHECK-NEXT: [[TMP96:%.*]] = insertelement <4 x i32> [[TMP64]], i32 [[TMP95]], i64 2 1456; CHECK-NEXT: [[TMP97:%.*]] = extractelement <4 x i32> [[X]], i64 3 1457; CHECK-NEXT: [[TMP98:%.*]] = extractelement <4 x i32> [[Y]], i64 3 1458; CHECK-NEXT: [[TMP99:%.*]] = uitofp i32 [[TMP98]] to float 1459; CHECK-NEXT: 
[[TMP100:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP99]]) 1460; CHECK-NEXT: [[TMP101:%.*]] = fmul fast float [[TMP100]], 0x41EFFFFFC0000000 1461; CHECK-NEXT: [[TMP102:%.*]] = fptoui float [[TMP101]] to i32 1462; CHECK-NEXT: [[TMP103:%.*]] = sub i32 0, [[TMP98]] 1463; CHECK-NEXT: [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP102]] 1464; CHECK-NEXT: [[TMP105:%.*]] = zext i32 [[TMP102]] to i64 1465; CHECK-NEXT: [[TMP106:%.*]] = zext i32 [[TMP104]] to i64 1466; CHECK-NEXT: [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]] 1467; CHECK-NEXT: [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32 1468; CHECK-NEXT: [[TMP109:%.*]] = lshr i64 [[TMP107]], 32 1469; CHECK-NEXT: [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32 1470; CHECK-NEXT: [[TMP111:%.*]] = add i32 [[TMP102]], [[TMP110]] 1471; CHECK-NEXT: [[TMP112:%.*]] = zext i32 [[TMP97]] to i64 1472; CHECK-NEXT: [[TMP113:%.*]] = zext i32 [[TMP111]] to i64 1473; CHECK-NEXT: [[TMP114:%.*]] = mul i64 [[TMP112]], [[TMP113]] 1474; CHECK-NEXT: [[TMP115:%.*]] = trunc i64 [[TMP114]] to i32 1475; CHECK-NEXT: [[TMP116:%.*]] = lshr i64 [[TMP114]], 32 1476; CHECK-NEXT: [[TMP117:%.*]] = trunc i64 [[TMP116]] to i32 1477; CHECK-NEXT: [[TMP118:%.*]] = mul i32 [[TMP117]], [[TMP98]] 1478; CHECK-NEXT: [[TMP119:%.*]] = sub i32 [[TMP97]], [[TMP118]] 1479; CHECK-NEXT: [[TMP120:%.*]] = icmp uge i32 [[TMP119]], [[TMP98]] 1480; CHECK-NEXT: [[TMP121:%.*]] = add i32 [[TMP117]], 1 1481; CHECK-NEXT: [[TMP122:%.*]] = select i1 [[TMP120]], i32 [[TMP121]], i32 [[TMP117]] 1482; CHECK-NEXT: [[TMP123:%.*]] = sub i32 [[TMP119]], [[TMP98]] 1483; CHECK-NEXT: [[TMP124:%.*]] = select i1 [[TMP120]], i32 [[TMP123]], i32 [[TMP119]] 1484; CHECK-NEXT: [[TMP125:%.*]] = icmp uge i32 [[TMP124]], [[TMP98]] 1485; CHECK-NEXT: [[TMP126:%.*]] = add i32 [[TMP122]], 1 1486; CHECK-NEXT: [[TMP127:%.*]] = select i1 [[TMP125]], i32 [[TMP126]], i32 [[TMP122]] 1487; CHECK-NEXT: [[TMP128:%.*]] = insertelement <4 x i32> [[TMP96]], i32 [[TMP127]], i64 3 1488; CHECK-NEXT: store <4 x i32> 
[[TMP128]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 1489; CHECK-NEXT: ret void 1490; 1491; GFX6-LABEL: udiv_v4i32: 1492; GFX6: ; %bb.0: 1493; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 1494; GFX6-NEXT: s_mov_b32 s3, 0x4f7ffffe 1495; GFX6-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 1496; GFX6-NEXT: s_mov_b32 s15, 0xf000 1497; GFX6-NEXT: s_mov_b32 s14, -1 1498; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1499; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 1500; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 1501; GFX6-NEXT: s_sub_i32 s2, 0, s8 1502; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s10 1503; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 1504; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 1505; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s11 1506; GFX6-NEXT: v_mul_f32_e32 v0, s3, v0 1507; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 1508; GFX6-NEXT: v_mul_f32_e32 v1, s3, v1 1509; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 1510; GFX6-NEXT: v_mul_lo_u32 v2, s2, v0 1511; GFX6-NEXT: s_sub_i32 s2, 0, s9 1512; GFX6-NEXT: v_mul_lo_u32 v3, s2, v1 1513; GFX6-NEXT: s_sub_i32 s2, 0, s10 1514; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 1515; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 1516; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 1517; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 1518; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 1519; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 1520; GFX6-NEXT: v_mul_lo_u32 v2, v0, s8 1521; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 1522; GFX6-NEXT: v_mul_lo_u32 v5, v1, s9 1523; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s4, v2 1524; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v2 1525; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] 1526; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v2 1527; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 1528; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 1529; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 1530; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v4 1531; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1532; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s5, v5 1533; GFX6-NEXT: v_mul_f32_e32 v2, s3, v2 1534; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 1535; GFX6-NEXT: 
v_add_i32_e32 v4, vcc, 1, v1 1536; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v3 1537; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] 1538; GFX6-NEXT: v_mul_lo_u32 v4, s2, v2 1539; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s9, v3 1540; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 1541; GFX6-NEXT: v_mul_hi_u32 v4, v2, v4 1542; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v1 1543; GFX6-NEXT: s_sub_i32 s0, 0, s11 1544; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 1545; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v6 1546; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 1547; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 1548; GFX6-NEXT: v_mul_hi_u32 v2, s6, v2 1549; GFX6-NEXT: v_mul_f32_e32 v4, s3, v4 1550; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 1551; GFX6-NEXT: v_mul_lo_u32 v3, v2, s10 1552; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v2 1553; GFX6-NEXT: v_mul_lo_u32 v5, s0, v4 1554; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s6, v3 1555; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v3 1556; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5 1557; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] 1558; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, s10, v3 1559; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 1560; GFX6-NEXT: v_mul_hi_u32 v4, s7, v4 1561; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] 1562; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v2 1563; GFX6-NEXT: v_mul_lo_u32 v6, v4, s11 1564; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 1565; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 1566; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 1567; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s7, v6 1568; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v3 1569; GFX6-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] 1570; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s11, v3 1571; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 1572; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 1573; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1574; GFX6-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 1575; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 1576; GFX6-NEXT: s_endpgm 1577; 1578; GFX9-LABEL: udiv_v4i32: 1579; GFX9: ; 
%bb.0: 1580; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 1581; GFX9-NEXT: s_mov_b32 s12, 0x4f7ffffe 1582; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1583; GFX9-NEXT: v_mov_b32_e32 v4, 0 1584; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1585; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 1586; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 1587; GFX9-NEXT: s_sub_i32 s2, 0, s8 1588; GFX9-NEXT: s_sub_i32 s3, 0, s9 1589; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 1590; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 1591; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s10 1592; GFX9-NEXT: v_mul_f32_e32 v0, s12, v0 1593; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 1594; GFX9-NEXT: v_mul_f32_e32 v1, s12, v1 1595; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 1596; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v5 1597; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 1598; GFX9-NEXT: s_sub_i32 s2, 0, s10 1599; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1 1600; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 1601; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 1602; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 1603; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 1604; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 1605; GFX9-NEXT: v_mul_f32_e32 v3, s12, v5 1606; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 1607; GFX9-NEXT: v_mul_lo_u32 v5, v0, s8 1608; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s11 1609; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 1610; GFX9-NEXT: v_add_u32_e32 v7, 1, v0 1611; GFX9-NEXT: v_sub_u32_e32 v5, s4, v5 1612; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v5 1613; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc 1614; GFX9-NEXT: v_subrev_u32_e32 v7, s8, v5 1615; GFX9-NEXT: v_mul_lo_u32 v6, v1, s9 1616; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc 1617; GFX9-NEXT: v_add_u32_e32 v7, 1, v0 1618; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v5 1619; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 1620; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc 1621; GFX9-NEXT: v_mul_lo_u32 v7, s2, v3 1622; GFX9-NEXT: v_sub_u32_e32 v6, s5, v6 1623; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 1624; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v6 1625; GFX9-NEXT: v_mul_f32_e32 v2, s12, v2 1626; 
GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 1627; GFX9-NEXT: v_mul_hi_u32 v5, v3, v7 1628; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 1629; GFX9-NEXT: s_sub_i32 s2, 0, s11 1630; GFX9-NEXT: v_subrev_u32_e32 v7, s9, v6 1631; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 1632; GFX9-NEXT: v_mul_lo_u32 v5, s2, v2 1633; GFX9-NEXT: v_mul_hi_u32 v3, s6, v3 1634; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc 1635; GFX9-NEXT: v_add_u32_e32 v7, 1, v1 1636; GFX9-NEXT: v_mul_hi_u32 v5, v2, v5 1637; GFX9-NEXT: v_mul_lo_u32 v8, v3, s10 1638; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v6 1639; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc 1640; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 1641; GFX9-NEXT: v_mul_hi_u32 v5, s7, v2 1642; GFX9-NEXT: v_sub_u32_e32 v6, s6, v8 1643; GFX9-NEXT: v_add_u32_e32 v7, 1, v3 1644; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v6 1645; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc 1646; GFX9-NEXT: v_subrev_u32_e32 v3, s10, v6 1647; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc 1648; GFX9-NEXT: v_mul_lo_u32 v6, v5, s11 1649; GFX9-NEXT: v_add_u32_e32 v7, 1, v2 1650; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 1651; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc 1652; GFX9-NEXT: v_sub_u32_e32 v3, s7, v6 1653; GFX9-NEXT: v_add_u32_e32 v6, 1, v5 1654; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1655; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 1656; GFX9-NEXT: v_subrev_u32_e32 v6, s11, v3 1657; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 1658; GFX9-NEXT: v_add_u32_e32 v6, 1, v5 1659; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1660; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc 1661; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 1662; GFX9-NEXT: s_endpgm 1663; 1664; GFX90A-LABEL: udiv_v4i32: 1665; GFX90A: ; %bb.0: 1666; GFX90A-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 1667; GFX90A-NEXT: s_mov_b32 s3, 0x4f7ffffe 1668; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1669; GFX90A-NEXT: v_mov_b32_e32 v4, 0 1670; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 1671; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s8 1672; 
GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s9 1673; GFX90A-NEXT: s_sub_i32 s2, 0, s8 1674; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 1675; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v1 1676; GFX90A-NEXT: v_mul_f32_e32 v0, s3, v0 1677; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 1678; GFX90A-NEXT: v_mul_f32_e32 v1, s3, v1 1679; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 1680; GFX90A-NEXT: v_mul_lo_u32 v2, s2, v0 1681; GFX90A-NEXT: v_mul_hi_u32 v2, v0, v2 1682; GFX90A-NEXT: v_add_u32_e32 v0, v0, v2 1683; GFX90A-NEXT: v_mul_hi_u32 v0, s4, v0 1684; GFX90A-NEXT: v_mul_lo_u32 v2, v0, s8 1685; GFX90A-NEXT: v_sub_u32_e32 v2, s4, v2 1686; GFX90A-NEXT: v_add_u32_e32 v3, 1, v0 1687; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 1688; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1689; GFX90A-NEXT: v_subrev_u32_e32 v3, s8, v2 1690; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc 1691; GFX90A-NEXT: s_sub_i32 s2, 0, s9 1692; GFX90A-NEXT: v_add_u32_e32 v3, 1, v0 1693; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 1694; GFX90A-NEXT: v_mul_lo_u32 v2, s2, v1 1695; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1696; GFX90A-NEXT: v_mul_hi_u32 v2, v1, v2 1697; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, s10 1698; GFX90A-NEXT: v_add_u32_e32 v1, v1, v2 1699; GFX90A-NEXT: v_mul_hi_u32 v1, s5, v1 1700; GFX90A-NEXT: v_mul_lo_u32 v2, v1, s9 1701; GFX90A-NEXT: v_sub_u32_e32 v2, s5, v2 1702; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v3 1703; GFX90A-NEXT: v_add_u32_e32 v5, 1, v1 1704; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 1705; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 1706; GFX90A-NEXT: v_subrev_u32_e32 v5, s9, v2 1707; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 1708; GFX90A-NEXT: v_add_u32_e32 v5, 1, v1 1709; GFX90A-NEXT: v_mul_f32_e32 v3, s3, v3 1710; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 1711; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 1712; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 1713; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s11 1714; GFX90A-NEXT: s_sub_i32 s2, 0, s10 1715; GFX90A-NEXT: v_mul_lo_u32 v2, s2, v3 1716; 
GFX90A-NEXT: v_mul_hi_u32 v2, v3, v2 1717; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v5 1718; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 1719; GFX90A-NEXT: v_mul_hi_u32 v2, s6, v2 1720; GFX90A-NEXT: v_mul_lo_u32 v3, v2, s10 1721; GFX90A-NEXT: v_mul_f32_e32 v5, s3, v5 1722; GFX90A-NEXT: v_sub_u32_e32 v3, s6, v3 1723; GFX90A-NEXT: v_cvt_u32_f32_e32 v5, v5 1724; GFX90A-NEXT: v_add_u32_e32 v6, 1, v2 1725; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 1726; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 1727; GFX90A-NEXT: v_subrev_u32_e32 v6, s10, v3 1728; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 1729; GFX90A-NEXT: s_sub_i32 s2, 0, s11 1730; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 1731; GFX90A-NEXT: v_mul_lo_u32 v3, s2, v5 1732; GFX90A-NEXT: v_mul_hi_u32 v3, v5, v3 1733; GFX90A-NEXT: v_add_u32_e32 v3, v5, v3 1734; GFX90A-NEXT: v_mul_hi_u32 v3, s7, v3 1735; GFX90A-NEXT: v_mul_lo_u32 v5, v3, s11 1736; GFX90A-NEXT: v_add_u32_e32 v6, 1, v2 1737; GFX90A-NEXT: v_sub_u32_e32 v5, s7, v5 1738; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 1739; GFX90A-NEXT: v_add_u32_e32 v6, 1, v3 1740; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s11, v5 1741; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 1742; GFX90A-NEXT: v_subrev_u32_e32 v6, s11, v5 1743; GFX90A-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 1744; GFX90A-NEXT: v_add_u32_e32 v6, 1, v3 1745; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s11, v5 1746; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 1747; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 1748; GFX90A-NEXT: s_endpgm 1749 %r = udiv <4 x i32> %x, %y 1750 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 1751 ret void 1752} 1753 1754define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 1755; CHECK-LABEL: @urem_v4i32( 1756; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 1757; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 1758; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 1759; CHECK-NEXT: 
[[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 1760; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 1761; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 1762; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 1763; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 1764; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 1765; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 1766; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 1767; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 1768; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 1769; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 1770; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 1771; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 1772; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 1773; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 1774; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 1775; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 1776; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 1777; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 1778; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 1779; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 1780; CHECK-NEXT: [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]] 1781; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]] 1782; CHECK-NEXT: [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]] 1783; CHECK-NEXT: [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]] 1784; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]] 1785; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x i32> undef, i32 [[TMP29]], i64 0 1786; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[X]], i64 1 1787; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[Y]], i64 1 1788; CHECK-NEXT: [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float 1789; CHECK-NEXT: [[TMP34:%.*]] = call fast float 
@llvm.amdgcn.rcp.f32(float [[TMP33]]) 1790; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000 1791; CHECK-NEXT: [[TMP36:%.*]] = fptoui float [[TMP35]] to i32 1792; CHECK-NEXT: [[TMP37:%.*]] = sub i32 0, [[TMP32]] 1793; CHECK-NEXT: [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]] 1794; CHECK-NEXT: [[TMP39:%.*]] = zext i32 [[TMP36]] to i64 1795; CHECK-NEXT: [[TMP40:%.*]] = zext i32 [[TMP38]] to i64 1796; CHECK-NEXT: [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]] 1797; CHECK-NEXT: [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32 1798; CHECK-NEXT: [[TMP43:%.*]] = lshr i64 [[TMP41]], 32 1799; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 1800; CHECK-NEXT: [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]] 1801; CHECK-NEXT: [[TMP46:%.*]] = zext i32 [[TMP31]] to i64 1802; CHECK-NEXT: [[TMP47:%.*]] = zext i32 [[TMP45]] to i64 1803; CHECK-NEXT: [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]] 1804; CHECK-NEXT: [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32 1805; CHECK-NEXT: [[TMP50:%.*]] = lshr i64 [[TMP48]], 32 1806; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 1807; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]] 1808; CHECK-NEXT: [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]] 1809; CHECK-NEXT: [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]] 1810; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]] 1811; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]] 1812; CHECK-NEXT: [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]] 1813; CHECK-NEXT: [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]] 1814; CHECK-NEXT: [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]] 1815; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP59]], i64 1 1816; CHECK-NEXT: [[TMP61:%.*]] = extractelement <4 x i32> [[X]], i64 2 1817; CHECK-NEXT: [[TMP62:%.*]] = extractelement <4 x i32> [[Y]], i64 2 1818; CHECK-NEXT: [[TMP63:%.*]] = uitofp i32 [[TMP62]] to float 1819; CHECK-NEXT: [[TMP64:%.*]] = call fast float 
@llvm.amdgcn.rcp.f32(float [[TMP63]]) 1820; CHECK-NEXT: [[TMP65:%.*]] = fmul fast float [[TMP64]], 0x41EFFFFFC0000000 1821; CHECK-NEXT: [[TMP66:%.*]] = fptoui float [[TMP65]] to i32 1822; CHECK-NEXT: [[TMP67:%.*]] = sub i32 0, [[TMP62]] 1823; CHECK-NEXT: [[TMP68:%.*]] = mul i32 [[TMP67]], [[TMP66]] 1824; CHECK-NEXT: [[TMP69:%.*]] = zext i32 [[TMP66]] to i64 1825; CHECK-NEXT: [[TMP70:%.*]] = zext i32 [[TMP68]] to i64 1826; CHECK-NEXT: [[TMP71:%.*]] = mul i64 [[TMP69]], [[TMP70]] 1827; CHECK-NEXT: [[TMP72:%.*]] = trunc i64 [[TMP71]] to i32 1828; CHECK-NEXT: [[TMP73:%.*]] = lshr i64 [[TMP71]], 32 1829; CHECK-NEXT: [[TMP74:%.*]] = trunc i64 [[TMP73]] to i32 1830; CHECK-NEXT: [[TMP75:%.*]] = add i32 [[TMP66]], [[TMP74]] 1831; CHECK-NEXT: [[TMP76:%.*]] = zext i32 [[TMP61]] to i64 1832; CHECK-NEXT: [[TMP77:%.*]] = zext i32 [[TMP75]] to i64 1833; CHECK-NEXT: [[TMP78:%.*]] = mul i64 [[TMP76]], [[TMP77]] 1834; CHECK-NEXT: [[TMP79:%.*]] = trunc i64 [[TMP78]] to i32 1835; CHECK-NEXT: [[TMP80:%.*]] = lshr i64 [[TMP78]], 32 1836; CHECK-NEXT: [[TMP81:%.*]] = trunc i64 [[TMP80]] to i32 1837; CHECK-NEXT: [[TMP82:%.*]] = mul i32 [[TMP81]], [[TMP62]] 1838; CHECK-NEXT: [[TMP83:%.*]] = sub i32 [[TMP61]], [[TMP82]] 1839; CHECK-NEXT: [[TMP84:%.*]] = icmp uge i32 [[TMP83]], [[TMP62]] 1840; CHECK-NEXT: [[TMP85:%.*]] = sub i32 [[TMP83]], [[TMP62]] 1841; CHECK-NEXT: [[TMP86:%.*]] = select i1 [[TMP84]], i32 [[TMP85]], i32 [[TMP83]] 1842; CHECK-NEXT: [[TMP87:%.*]] = icmp uge i32 [[TMP86]], [[TMP62]] 1843; CHECK-NEXT: [[TMP88:%.*]] = sub i32 [[TMP86]], [[TMP62]] 1844; CHECK-NEXT: [[TMP89:%.*]] = select i1 [[TMP87]], i32 [[TMP88]], i32 [[TMP86]] 1845; CHECK-NEXT: [[TMP90:%.*]] = insertelement <4 x i32> [[TMP60]], i32 [[TMP89]], i64 2 1846; CHECK-NEXT: [[TMP91:%.*]] = extractelement <4 x i32> [[X]], i64 3 1847; CHECK-NEXT: [[TMP92:%.*]] = extractelement <4 x i32> [[Y]], i64 3 1848; CHECK-NEXT: [[TMP93:%.*]] = uitofp i32 [[TMP92]] to float 1849; CHECK-NEXT: [[TMP94:%.*]] = call fast float 
@llvm.amdgcn.rcp.f32(float [[TMP93]]) 1850; CHECK-NEXT: [[TMP95:%.*]] = fmul fast float [[TMP94]], 0x41EFFFFFC0000000 1851; CHECK-NEXT: [[TMP96:%.*]] = fptoui float [[TMP95]] to i32 1852; CHECK-NEXT: [[TMP97:%.*]] = sub i32 0, [[TMP92]] 1853; CHECK-NEXT: [[TMP98:%.*]] = mul i32 [[TMP97]], [[TMP96]] 1854; CHECK-NEXT: [[TMP99:%.*]] = zext i32 [[TMP96]] to i64 1855; CHECK-NEXT: [[TMP100:%.*]] = zext i32 [[TMP98]] to i64 1856; CHECK-NEXT: [[TMP101:%.*]] = mul i64 [[TMP99]], [[TMP100]] 1857; CHECK-NEXT: [[TMP102:%.*]] = trunc i64 [[TMP101]] to i32 1858; CHECK-NEXT: [[TMP103:%.*]] = lshr i64 [[TMP101]], 32 1859; CHECK-NEXT: [[TMP104:%.*]] = trunc i64 [[TMP103]] to i32 1860; CHECK-NEXT: [[TMP105:%.*]] = add i32 [[TMP96]], [[TMP104]] 1861; CHECK-NEXT: [[TMP106:%.*]] = zext i32 [[TMP91]] to i64 1862; CHECK-NEXT: [[TMP107:%.*]] = zext i32 [[TMP105]] to i64 1863; CHECK-NEXT: [[TMP108:%.*]] = mul i64 [[TMP106]], [[TMP107]] 1864; CHECK-NEXT: [[TMP109:%.*]] = trunc i64 [[TMP108]] to i32 1865; CHECK-NEXT: [[TMP110:%.*]] = lshr i64 [[TMP108]], 32 1866; CHECK-NEXT: [[TMP111:%.*]] = trunc i64 [[TMP110]] to i32 1867; CHECK-NEXT: [[TMP112:%.*]] = mul i32 [[TMP111]], [[TMP92]] 1868; CHECK-NEXT: [[TMP113:%.*]] = sub i32 [[TMP91]], [[TMP112]] 1869; CHECK-NEXT: [[TMP114:%.*]] = icmp uge i32 [[TMP113]], [[TMP92]] 1870; CHECK-NEXT: [[TMP115:%.*]] = sub i32 [[TMP113]], [[TMP92]] 1871; CHECK-NEXT: [[TMP116:%.*]] = select i1 [[TMP114]], i32 [[TMP115]], i32 [[TMP113]] 1872; CHECK-NEXT: [[TMP117:%.*]] = icmp uge i32 [[TMP116]], [[TMP92]] 1873; CHECK-NEXT: [[TMP118:%.*]] = sub i32 [[TMP116]], [[TMP92]] 1874; CHECK-NEXT: [[TMP119:%.*]] = select i1 [[TMP117]], i32 [[TMP118]], i32 [[TMP116]] 1875; CHECK-NEXT: [[TMP120:%.*]] = insertelement <4 x i32> [[TMP90]], i32 [[TMP119]], i64 3 1876; CHECK-NEXT: store <4 x i32> [[TMP120]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 1877; CHECK-NEXT: ret void 1878; 1879; GFX6-LABEL: urem_v4i32: 1880; GFX6: ; %bb.0: 1881; GFX6-NEXT: s_load_dwordx8 s[4:11], 
s[0:1], 0xd 1882; GFX6-NEXT: s_mov_b32 s13, 0x4f7ffffe 1883; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1884; GFX6-NEXT: s_mov_b32 s3, 0xf000 1885; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1886; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 1887; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 1888; GFX6-NEXT: s_sub_i32 s2, 0, s8 1889; GFX6-NEXT: s_sub_i32 s12, 0, s9 1890; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 1891; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 1892; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s10 1893; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s11 1894; GFX6-NEXT: v_mul_f32_e32 v0, s13, v0 1895; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 1896; GFX6-NEXT: v_mul_f32_e32 v1, s13, v1 1897; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 1898; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 1899; GFX6-NEXT: v_mul_lo_u32 v2, s2, v0 1900; GFX6-NEXT: s_mov_b32 s2, -1 1901; GFX6-NEXT: v_mul_lo_u32 v4, s12, v1 1902; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 1903; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 1904; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 1905; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 1906; GFX6-NEXT: v_add_i32_e32 v1, vcc, v4, v1 1907; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 1908; GFX6-NEXT: v_mul_lo_u32 v0, v0, s8 1909; GFX6-NEXT: v_mul_f32_e32 v2, s13, v3 1910; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 1911; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9 1912; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 1913; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 1914; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 1915; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1916; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 1917; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 1918; GFX6-NEXT: s_sub_i32 s4, 0, s10 1919; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1920; GFX6-NEXT: v_mul_lo_u32 v3, s4, v2 1921; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 1922; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s9, v1 1923; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 1924; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 1925; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 1926; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 1927; GFX6-NEXT: s_sub_i32 
s4, 0, s11 1928; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 1929; GFX6-NEXT: v_mul_f32_e32 v3, s13, v4 1930; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 1931; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s9, v1 1932; GFX6-NEXT: v_mul_hi_u32 v2, s6, v2 1933; GFX6-NEXT: v_mul_lo_u32 v5, s4, v3 1934; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 1935; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 1936; GFX6-NEXT: v_mul_lo_u32 v2, v2, s10 1937; GFX6-NEXT: v_mul_hi_u32 v4, v3, v5 1938; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 1939; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 1940; GFX6-NEXT: v_mul_hi_u32 v3, s7, v3 1941; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s10, v2 1942; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 1943; GFX6-NEXT: v_mul_lo_u32 v3, v3, s11 1944; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 1945; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s10, v2 1946; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 1947; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 1948; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s7, v3 1949; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s11, v3 1950; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1951; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 1952; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s11, v3 1953; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1954; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 1955; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1956; GFX6-NEXT: s_endpgm 1957; 1958; GFX9-LABEL: urem_v4i32: 1959; GFX9: ; %bb.0: 1960; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 1961; GFX9-NEXT: s_mov_b32 s12, 0x4f7ffffe 1962; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1963; GFX9-NEXT: v_mov_b32_e32 v4, 0 1964; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1965; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 1966; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 1967; GFX9-NEXT: s_sub_i32 s2, 0, s8 1968; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s10 1969; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 1970; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 1971; GFX9-NEXT: s_sub_i32 s3, 0, s9 1972; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v5 1973; GFX9-NEXT: 
v_mul_f32_e32 v0, s12, v0 1974; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 1975; GFX9-NEXT: v_mul_f32_e32 v1, s12, v1 1976; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 1977; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s11 1978; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 1979; GFX9-NEXT: s_sub_i32 s2, 0, s10 1980; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1 1981; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 1982; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 1983; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 1984; GFX9-NEXT: v_mul_f32_e32 v2, s12, v5 1985; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 1986; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 1987; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v6 1988; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 1989; GFX9-NEXT: v_mul_lo_u32 v5, s2, v2 1990; GFX9-NEXT: s_sub_i32 s2, 0, s11 1991; GFX9-NEXT: v_mul_f32_e32 v3, s12, v3 1992; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 1993; GFX9-NEXT: v_mul_hi_u32 v5, v2, v5 1994; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 1995; GFX9-NEXT: v_mul_lo_u32 v0, v0, s8 1996; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 1997; GFX9-NEXT: v_mul_lo_u32 v5, s2, v3 1998; GFX9-NEXT: v_mul_hi_u32 v2, s6, v2 1999; GFX9-NEXT: v_mul_lo_u32 v1, v1, s9 2000; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 2001; GFX9-NEXT: v_mul_hi_u32 v5, v3, v5 2002; GFX9-NEXT: v_subrev_u32_e32 v6, s8, v0 2003; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 2004; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc 2005; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 2006; GFX9-NEXT: v_mul_hi_u32 v3, s7, v3 2007; GFX9-NEXT: v_mul_lo_u32 v2, v2, s10 2008; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 2009; GFX9-NEXT: v_subrev_u32_e32 v6, s8, v0 2010; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 2011; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc 2012; GFX9-NEXT: v_subrev_u32_e32 v6, s9, v1 2013; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 2014; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 2015; GFX9-NEXT: v_mul_lo_u32 v3, v3, s11 2016; GFX9-NEXT: v_subrev_u32_e32 v6, s9, v1 2017; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 2018; GFX9-NEXT: v_sub_u32_e32 v2, s6, v2 2019; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, 
v6, vcc 2020; GFX9-NEXT: v_subrev_u32_e32 v5, s10, v2 2021; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 2022; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 2023; GFX9-NEXT: v_subrev_u32_e32 v5, s10, v2 2024; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 2025; GFX9-NEXT: v_sub_u32_e32 v3, s7, v3 2026; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 2027; GFX9-NEXT: v_subrev_u32_e32 v5, s11, v3 2028; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 2029; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 2030; GFX9-NEXT: v_subrev_u32_e32 v5, s11, v3 2031; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 2032; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 2033; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 2034; GFX9-NEXT: s_endpgm 2035; 2036; GFX90A-LABEL: urem_v4i32: 2037; GFX90A: ; %bb.0: 2038; GFX90A-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 2039; GFX90A-NEXT: s_mov_b32 s12, 0x4f7ffffe 2040; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2041; GFX90A-NEXT: v_mov_b32_e32 v4, 0 2042; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 2043; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s8 2044; GFX90A-NEXT: s_sub_i32 s2, 0, s8 2045; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s9 2046; GFX90A-NEXT: s_sub_i32 s3, 0, s9 2047; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 2048; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v1 2049; GFX90A-NEXT: v_mul_f32_e32 v0, s12, v0 2050; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 2051; GFX90A-NEXT: v_mul_f32_e32 v1, s12, v1 2052; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 2053; GFX90A-NEXT: v_mul_lo_u32 v2, s2, v0 2054; GFX90A-NEXT: v_mul_hi_u32 v2, v0, v2 2055; GFX90A-NEXT: v_add_u32_e32 v0, v0, v2 2056; GFX90A-NEXT: v_mul_hi_u32 v0, s4, v0 2057; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s8 2058; GFX90A-NEXT: v_sub_u32_e32 v0, s4, v0 2059; GFX90A-NEXT: v_subrev_u32_e32 v2, s8, v0 2060; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 2061; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 2062; GFX90A-NEXT: v_subrev_u32_e32 v2, s8, v0 2063; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 2064; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 
2065; GFX90A-NEXT: v_cvt_f32_u32_e32 v2, s10 2066; GFX90A-NEXT: v_mul_lo_u32 v3, s3, v1 2067; GFX90A-NEXT: v_mul_hi_u32 v3, v1, v3 2068; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 2069; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v2 2070; GFX90A-NEXT: v_mul_hi_u32 v1, s5, v1 2071; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s9 2072; GFX90A-NEXT: v_sub_u32_e32 v1, s5, v1 2073; GFX90A-NEXT: v_mul_f32_e32 v2, s12, v2 2074; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 2075; GFX90A-NEXT: v_subrev_u32_e32 v3, s9, v1 2076; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 2077; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 2078; GFX90A-NEXT: v_subrev_u32_e32 v3, s9, v1 2079; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 2080; GFX90A-NEXT: s_sub_i32 s2, 0, s10 2081; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 2082; GFX90A-NEXT: v_mul_lo_u32 v3, s2, v2 2083; GFX90A-NEXT: v_mul_hi_u32 v3, v2, v3 2084; GFX90A-NEXT: v_add_u32_e32 v2, v2, v3 2085; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, s11 2086; GFX90A-NEXT: v_mul_hi_u32 v2, s6, v2 2087; GFX90A-NEXT: v_mul_lo_u32 v2, v2, s10 2088; GFX90A-NEXT: v_sub_u32_e32 v2, s6, v2 2089; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v3 2090; GFX90A-NEXT: v_subrev_u32_e32 v5, s10, v2 2091; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 2092; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 2093; GFX90A-NEXT: v_mul_f32_e32 v3, s12, v3 2094; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 2095; GFX90A-NEXT: v_subrev_u32_e32 v5, s10, v2 2096; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 2097; GFX90A-NEXT: s_sub_i32 s2, 0, s11 2098; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 2099; GFX90A-NEXT: v_mul_lo_u32 v5, s2, v3 2100; GFX90A-NEXT: v_mul_hi_u32 v5, v3, v5 2101; GFX90A-NEXT: v_add_u32_e32 v3, v3, v5 2102; GFX90A-NEXT: v_mul_hi_u32 v3, s7, v3 2103; GFX90A-NEXT: v_mul_lo_u32 v3, v3, s11 2104; GFX90A-NEXT: v_sub_u32_e32 v3, s7, v3 2105; GFX90A-NEXT: v_subrev_u32_e32 v5, s11, v3 2106; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 2107; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 2108; GFX90A-NEXT: 
v_subrev_u32_e32 v5, s11, v3 2109; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 2110; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 2111; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 2112; GFX90A-NEXT: s_endpgm 2113 %r = urem <4 x i32> %x, %y 2114 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 2115 ret void 2116} 2117 2118define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 2119; CHECK-LABEL: @sdiv_v4i32( 2120; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 2121; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 2122; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 2123; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 2124; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 2125; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]] 2126; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]] 2127; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]] 2128; CHECK-NEXT: [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]] 2129; CHECK-NEXT: [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float 2130; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]]) 2131; CHECK-NEXT: [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000 2132; CHECK-NEXT: [[TMP13:%.*]] = fptoui float [[TMP12]] to i32 2133; CHECK-NEXT: [[TMP14:%.*]] = sub i32 0, [[TMP9]] 2134; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]] 2135; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP13]] to i64 2136; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 2137; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 2138; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 2139; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 2140; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 2141; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]] 2142; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP8]] to i64 2143; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[TMP22]] to i64 2144; CHECK-NEXT: 
[[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]] 2145; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32 2146; CHECK-NEXT: [[TMP27:%.*]] = lshr i64 [[TMP25]], 32 2147; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 2148; CHECK-NEXT: [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]] 2149; CHECK-NEXT: [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]] 2150; CHECK-NEXT: [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]] 2151; CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP28]], 1 2152; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]] 2153; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]] 2154; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]] 2155; CHECK-NEXT: [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]] 2156; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP33]], 1 2157; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]] 2158; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]] 2159; CHECK-NEXT: [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]] 2160; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x i32> undef, i32 [[TMP40]], i64 0 2161; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i32> [[X]], i64 1 2162; CHECK-NEXT: [[TMP43:%.*]] = extractelement <4 x i32> [[Y]], i64 1 2163; CHECK-NEXT: [[TMP44:%.*]] = ashr i32 [[TMP42]], 31 2164; CHECK-NEXT: [[TMP45:%.*]] = ashr i32 [[TMP43]], 31 2165; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]] 2166; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]] 2167; CHECK-NEXT: [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]] 2168; CHECK-NEXT: [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]] 2169; CHECK-NEXT: [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]] 2170; CHECK-NEXT: [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float 2171; CHECK-NEXT: [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]]) 2172; CHECK-NEXT: [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000 2173; CHECK-NEXT: [[TMP54:%.*]] = fptoui float [[TMP53]] to i32 2174; 
CHECK-NEXT: [[TMP55:%.*]] = sub i32 0, [[TMP50]] 2175; CHECK-NEXT: [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]] 2176; CHECK-NEXT: [[TMP57:%.*]] = zext i32 [[TMP54]] to i64 2177; CHECK-NEXT: [[TMP58:%.*]] = zext i32 [[TMP56]] to i64 2178; CHECK-NEXT: [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]] 2179; CHECK-NEXT: [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32 2180; CHECK-NEXT: [[TMP61:%.*]] = lshr i64 [[TMP59]], 32 2181; CHECK-NEXT: [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32 2182; CHECK-NEXT: [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]] 2183; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[TMP49]] to i64 2184; CHECK-NEXT: [[TMP65:%.*]] = zext i32 [[TMP63]] to i64 2185; CHECK-NEXT: [[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]] 2186; CHECK-NEXT: [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32 2187; CHECK-NEXT: [[TMP68:%.*]] = lshr i64 [[TMP66]], 32 2188; CHECK-NEXT: [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32 2189; CHECK-NEXT: [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]] 2190; CHECK-NEXT: [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]] 2191; CHECK-NEXT: [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]] 2192; CHECK-NEXT: [[TMP73:%.*]] = add i32 [[TMP69]], 1 2193; CHECK-NEXT: [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]] 2194; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]] 2195; CHECK-NEXT: [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]] 2196; CHECK-NEXT: [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]] 2197; CHECK-NEXT: [[TMP78:%.*]] = add i32 [[TMP74]], 1 2198; CHECK-NEXT: [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]] 2199; CHECK-NEXT: [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]] 2200; CHECK-NEXT: [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]] 2201; CHECK-NEXT: [[TMP82:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[TMP81]], i64 1 2202; CHECK-NEXT: [[TMP83:%.*]] = extractelement <4 x i32> [[X]], i64 2 2203; CHECK-NEXT: [[TMP84:%.*]] = extractelement <4 x i32> [[Y]], i64 2 2204; CHECK-NEXT: [[TMP85:%.*]] = ashr i32 [[TMP83]], 
31 2205; CHECK-NEXT: [[TMP86:%.*]] = ashr i32 [[TMP84]], 31 2206; CHECK-NEXT: [[TMP87:%.*]] = xor i32 [[TMP85]], [[TMP86]] 2207; CHECK-NEXT: [[TMP88:%.*]] = add i32 [[TMP83]], [[TMP85]] 2208; CHECK-NEXT: [[TMP89:%.*]] = add i32 [[TMP84]], [[TMP86]] 2209; CHECK-NEXT: [[TMP90:%.*]] = xor i32 [[TMP88]], [[TMP85]] 2210; CHECK-NEXT: [[TMP91:%.*]] = xor i32 [[TMP89]], [[TMP86]] 2211; CHECK-NEXT: [[TMP92:%.*]] = uitofp i32 [[TMP91]] to float 2212; CHECK-NEXT: [[TMP93:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP92]]) 2213; CHECK-NEXT: [[TMP94:%.*]] = fmul fast float [[TMP93]], 0x41EFFFFFC0000000 2214; CHECK-NEXT: [[TMP95:%.*]] = fptoui float [[TMP94]] to i32 2215; CHECK-NEXT: [[TMP96:%.*]] = sub i32 0, [[TMP91]] 2216; CHECK-NEXT: [[TMP97:%.*]] = mul i32 [[TMP96]], [[TMP95]] 2217; CHECK-NEXT: [[TMP98:%.*]] = zext i32 [[TMP95]] to i64 2218; CHECK-NEXT: [[TMP99:%.*]] = zext i32 [[TMP97]] to i64 2219; CHECK-NEXT: [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]] 2220; CHECK-NEXT: [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32 2221; CHECK-NEXT: [[TMP102:%.*]] = lshr i64 [[TMP100]], 32 2222; CHECK-NEXT: [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32 2223; CHECK-NEXT: [[TMP104:%.*]] = add i32 [[TMP95]], [[TMP103]] 2224; CHECK-NEXT: [[TMP105:%.*]] = zext i32 [[TMP90]] to i64 2225; CHECK-NEXT: [[TMP106:%.*]] = zext i32 [[TMP104]] to i64 2226; CHECK-NEXT: [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]] 2227; CHECK-NEXT: [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32 2228; CHECK-NEXT: [[TMP109:%.*]] = lshr i64 [[TMP107]], 32 2229; CHECK-NEXT: [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32 2230; CHECK-NEXT: [[TMP111:%.*]] = mul i32 [[TMP110]], [[TMP91]] 2231; CHECK-NEXT: [[TMP112:%.*]] = sub i32 [[TMP90]], [[TMP111]] 2232; CHECK-NEXT: [[TMP113:%.*]] = icmp uge i32 [[TMP112]], [[TMP91]] 2233; CHECK-NEXT: [[TMP114:%.*]] = add i32 [[TMP110]], 1 2234; CHECK-NEXT: [[TMP115:%.*]] = select i1 [[TMP113]], i32 [[TMP114]], i32 [[TMP110]] 2235; CHECK-NEXT: [[TMP116:%.*]] = sub i32 [[TMP112]], 
[[TMP91]] 2236; CHECK-NEXT: [[TMP117:%.*]] = select i1 [[TMP113]], i32 [[TMP116]], i32 [[TMP112]] 2237; CHECK-NEXT: [[TMP118:%.*]] = icmp uge i32 [[TMP117]], [[TMP91]] 2238; CHECK-NEXT: [[TMP119:%.*]] = add i32 [[TMP115]], 1 2239; CHECK-NEXT: [[TMP120:%.*]] = select i1 [[TMP118]], i32 [[TMP119]], i32 [[TMP115]] 2240; CHECK-NEXT: [[TMP121:%.*]] = xor i32 [[TMP120]], [[TMP87]] 2241; CHECK-NEXT: [[TMP122:%.*]] = sub i32 [[TMP121]], [[TMP87]] 2242; CHECK-NEXT: [[TMP123:%.*]] = insertelement <4 x i32> [[TMP82]], i32 [[TMP122]], i64 2 2243; CHECK-NEXT: [[TMP124:%.*]] = extractelement <4 x i32> [[X]], i64 3 2244; CHECK-NEXT: [[TMP125:%.*]] = extractelement <4 x i32> [[Y]], i64 3 2245; CHECK-NEXT: [[TMP126:%.*]] = ashr i32 [[TMP124]], 31 2246; CHECK-NEXT: [[TMP127:%.*]] = ashr i32 [[TMP125]], 31 2247; CHECK-NEXT: [[TMP128:%.*]] = xor i32 [[TMP126]], [[TMP127]] 2248; CHECK-NEXT: [[TMP129:%.*]] = add i32 [[TMP124]], [[TMP126]] 2249; CHECK-NEXT: [[TMP130:%.*]] = add i32 [[TMP125]], [[TMP127]] 2250; CHECK-NEXT: [[TMP131:%.*]] = xor i32 [[TMP129]], [[TMP126]] 2251; CHECK-NEXT: [[TMP132:%.*]] = xor i32 [[TMP130]], [[TMP127]] 2252; CHECK-NEXT: [[TMP133:%.*]] = uitofp i32 [[TMP132]] to float 2253; CHECK-NEXT: [[TMP134:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP133]]) 2254; CHECK-NEXT: [[TMP135:%.*]] = fmul fast float [[TMP134]], 0x41EFFFFFC0000000 2255; CHECK-NEXT: [[TMP136:%.*]] = fptoui float [[TMP135]] to i32 2256; CHECK-NEXT: [[TMP137:%.*]] = sub i32 0, [[TMP132]] 2257; CHECK-NEXT: [[TMP138:%.*]] = mul i32 [[TMP137]], [[TMP136]] 2258; CHECK-NEXT: [[TMP139:%.*]] = zext i32 [[TMP136]] to i64 2259; CHECK-NEXT: [[TMP140:%.*]] = zext i32 [[TMP138]] to i64 2260; CHECK-NEXT: [[TMP141:%.*]] = mul i64 [[TMP139]], [[TMP140]] 2261; CHECK-NEXT: [[TMP142:%.*]] = trunc i64 [[TMP141]] to i32 2262; CHECK-NEXT: [[TMP143:%.*]] = lshr i64 [[TMP141]], 32 2263; CHECK-NEXT: [[TMP144:%.*]] = trunc i64 [[TMP143]] to i32 2264; CHECK-NEXT: [[TMP145:%.*]] = add i32 [[TMP136]], [[TMP144]] 
2265; CHECK-NEXT: [[TMP146:%.*]] = zext i32 [[TMP131]] to i64 2266; CHECK-NEXT: [[TMP147:%.*]] = zext i32 [[TMP145]] to i64 2267; CHECK-NEXT: [[TMP148:%.*]] = mul i64 [[TMP146]], [[TMP147]] 2268; CHECK-NEXT: [[TMP149:%.*]] = trunc i64 [[TMP148]] to i32 2269; CHECK-NEXT: [[TMP150:%.*]] = lshr i64 [[TMP148]], 32 2270; CHECK-NEXT: [[TMP151:%.*]] = trunc i64 [[TMP150]] to i32 2271; CHECK-NEXT: [[TMP152:%.*]] = mul i32 [[TMP151]], [[TMP132]] 2272; CHECK-NEXT: [[TMP153:%.*]] = sub i32 [[TMP131]], [[TMP152]] 2273; CHECK-NEXT: [[TMP154:%.*]] = icmp uge i32 [[TMP153]], [[TMP132]] 2274; CHECK-NEXT: [[TMP155:%.*]] = add i32 [[TMP151]], 1 2275; CHECK-NEXT: [[TMP156:%.*]] = select i1 [[TMP154]], i32 [[TMP155]], i32 [[TMP151]] 2276; CHECK-NEXT: [[TMP157:%.*]] = sub i32 [[TMP153]], [[TMP132]] 2277; CHECK-NEXT: [[TMP158:%.*]] = select i1 [[TMP154]], i32 [[TMP157]], i32 [[TMP153]] 2278; CHECK-NEXT: [[TMP159:%.*]] = icmp uge i32 [[TMP158]], [[TMP132]] 2279; CHECK-NEXT: [[TMP160:%.*]] = add i32 [[TMP156]], 1 2280; CHECK-NEXT: [[TMP161:%.*]] = select i1 [[TMP159]], i32 [[TMP160]], i32 [[TMP156]] 2281; CHECK-NEXT: [[TMP162:%.*]] = xor i32 [[TMP161]], [[TMP128]] 2282; CHECK-NEXT: [[TMP163:%.*]] = sub i32 [[TMP162]], [[TMP128]] 2283; CHECK-NEXT: [[TMP164:%.*]] = insertelement <4 x i32> [[TMP123]], i32 [[TMP163]], i64 3 2284; CHECK-NEXT: store <4 x i32> [[TMP164]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 2285; CHECK-NEXT: ret void 2286; 2287; GFX6-LABEL: sdiv_v4i32: 2288; GFX6: ; %bb.0: 2289; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 2290; GFX6-NEXT: s_mov_b32 s16, 0x4f7ffffe 2291; GFX6-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 2292; GFX6-NEXT: s_mov_b32 s15, 0xf000 2293; GFX6-NEXT: s_mov_b32 s14, -1 2294; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2295; GFX6-NEXT: s_ashr_i32 s2, s8, 31 2296; GFX6-NEXT: s_add_i32 s3, s8, s2 2297; GFX6-NEXT: s_xor_b32 s3, s3, s2 2298; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 2299; GFX6-NEXT: s_ashr_i32 s8, s9, 31 2300; GFX6-NEXT: s_add_i32 s0, s9, s8 2301; 
GFX6-NEXT: s_xor_b32 s9, s0, s8 2302; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 2303; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 2304; GFX6-NEXT: s_sub_i32 s1, 0, s3 2305; GFX6-NEXT: s_ashr_i32 s0, s4, 31 2306; GFX6-NEXT: v_mul_f32_e32 v0, s16, v0 2307; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 2308; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 2309; GFX6-NEXT: s_xor_b32 s2, s0, s2 2310; GFX6-NEXT: v_mul_lo_u32 v2, s1, v0 2311; GFX6-NEXT: s_add_i32 s1, s4, s0 2312; GFX6-NEXT: v_mul_f32_e32 v1, s16, v1 2313; GFX6-NEXT: s_xor_b32 s1, s1, s0 2314; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 2315; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 2316; GFX6-NEXT: s_sub_i32 s0, 0, s9 2317; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 2318; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 2319; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 2320; GFX6-NEXT: v_mul_lo_u32 v3, v0, s3 2321; GFX6-NEXT: v_mul_hi_u32 v2, v1, v2 2322; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v0 2323; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s1, v3 2324; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v3 2325; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] 2326; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s3, v3 2327; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] 2328; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v0 2329; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 2330; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 2331; GFX6-NEXT: s_ashr_i32 s0, s5, 31 2332; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 2333; GFX6-NEXT: s_add_i32 s1, s5, s0 2334; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 2335; GFX6-NEXT: s_ashr_i32 s3, s10, 31 2336; GFX6-NEXT: s_xor_b32 s1, s1, s0 2337; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 2338; GFX6-NEXT: s_xor_b32 s2, s0, s8 2339; GFX6-NEXT: s_add_i32 s0, s10, s3 2340; GFX6-NEXT: s_xor_b32 s4, s0, s3 2341; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s4 2342; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 2343; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 2344; GFX6-NEXT: v_mul_lo_u32 v2, v1, s9 2345; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v1 2346; GFX6-NEXT: v_mul_f32_e32 v3, s16, v3 2347; GFX6-NEXT: v_sub_i32_e32 v2, 
vcc, s1, v2 2348; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 2349; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2 2350; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] 2351; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s9, v2 2352; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] 2353; GFX6-NEXT: s_sub_i32 s0, 0, s4 2354; GFX6-NEXT: v_mul_lo_u32 v5, s0, v3 2355; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v1 2356; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 2357; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 2358; GFX6-NEXT: v_mul_hi_u32 v2, v3, v5 2359; GFX6-NEXT: v_xor_b32_e32 v1, s2, v1 2360; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s2, v1 2361; GFX6-NEXT: s_ashr_i32 s2, s11, 31 2362; GFX6-NEXT: s_ashr_i32 s0, s6, 31 2363; GFX6-NEXT: s_add_i32 s5, s11, s2 2364; GFX6-NEXT: s_add_i32 s1, s6, s0 2365; GFX6-NEXT: s_xor_b32 s5, s5, s2 2366; GFX6-NEXT: s_xor_b32 s1, s1, s0 2367; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 2368; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s5 2369; GFX6-NEXT: v_mul_hi_u32 v2, s1, v2 2370; GFX6-NEXT: s_xor_b32 s3, s0, s3 2371; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v4 2372; GFX6-NEXT: v_mul_lo_u32 v3, v2, s4 2373; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v2 2374; GFX6-NEXT: v_mul_f32_e32 v4, s16, v4 2375; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s1, v3 2376; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 2377; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v3 2378; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] 2379; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s4, v3 2380; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 2381; GFX6-NEXT: s_sub_i32 s0, 0, s5 2382; GFX6-NEXT: v_mul_lo_u32 v5, s0, v4 2383; GFX6-NEXT: s_ashr_i32 s0, s7, 31 2384; GFX6-NEXT: s_add_i32 s1, s7, s0 2385; GFX6-NEXT: s_xor_b32 s1, s1, s0 2386; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5 2387; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v2 2388; GFX6-NEXT: s_xor_b32 s2, s0, s2 2389; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 2390; GFX6-NEXT: v_mul_hi_u32 v4, s1, v4 2391; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 2392; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 2393; 
GFX6-NEXT: v_xor_b32_e32 v2, s3, v2 2394; GFX6-NEXT: v_mul_lo_u32 v3, v4, s5 2395; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 2396; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v2 2397; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s1, v3 2398; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v3 2399; GFX6-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] 2400; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v3 2401; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 2402; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 2403; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 2404; GFX6-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 2405; GFX6-NEXT: v_xor_b32_e32 v3, s2, v3 2406; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v3 2407; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 2408; GFX6-NEXT: s_endpgm 2409; 2410; GFX9-LABEL: sdiv_v4i32: 2411; GFX9: ; %bb.0: 2412; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 2413; GFX9-NEXT: s_mov_b32 s15, 0x4f7ffffe 2414; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2415; GFX9-NEXT: v_mov_b32_e32 v4, 0 2416; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2417; GFX9-NEXT: s_ashr_i32 s2, s8, 31 2418; GFX9-NEXT: s_add_i32 s3, s8, s2 2419; GFX9-NEXT: s_xor_b32 s3, s3, s2 2420; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 2421; GFX9-NEXT: s_ashr_i32 s12, s9, 31 2422; GFX9-NEXT: s_add_i32 s9, s9, s12 2423; GFX9-NEXT: s_xor_b32 s9, s9, s12 2424; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 2425; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 2426; GFX9-NEXT: s_sub_i32 s14, 0, s3 2427; GFX9-NEXT: s_ashr_i32 s8, s4, 31 2428; GFX9-NEXT: v_mul_f32_e32 v0, s15, v0 2429; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 2430; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 2431; GFX9-NEXT: s_add_i32 s4, s4, s8 2432; GFX9-NEXT: s_xor_b32 s4, s4, s8 2433; GFX9-NEXT: v_mul_lo_u32 v2, s14, v0 2434; GFX9-NEXT: v_mul_f32_e32 v1, s15, v1 2435; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 2436; GFX9-NEXT: s_sub_i32 s14, 0, s9 2437; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 2438; GFX9-NEXT: s_ashr_i32 s13, s5, 31 2439; GFX9-NEXT: v_mul_lo_u32 v3, s14, v1 2440; GFX9-NEXT: s_add_i32 s5, s5, 
s13 2441; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 2442; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 2443; GFX9-NEXT: v_mul_hi_u32 v2, v1, v3 2444; GFX9-NEXT: s_xor_b32 s5, s5, s13 2445; GFX9-NEXT: s_xor_b32 s2, s8, s2 2446; GFX9-NEXT: v_mul_lo_u32 v3, v0, s3 2447; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 2448; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 2449; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 2450; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 2451; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 2452; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 2453; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v3 2454; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc 2455; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 2456; GFX9-NEXT: s_ashr_i32 s3, s10, 31 2457; GFX9-NEXT: s_add_i32 s4, s10, s3 2458; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 2459; GFX9-NEXT: s_xor_b32 s4, s4, s3 2460; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 2461; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 2462; GFX9-NEXT: v_mul_lo_u32 v2, v1, s9 2463; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 2464; GFX9-NEXT: s_ashr_i32 s8, s11, 31 2465; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 2466; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 2467; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 2468; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 2469; GFX9-NEXT: v_mul_f32_e32 v3, s15, v3 2470; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 2471; GFX9-NEXT: v_subrev_u32_e32 v5, s9, v2 2472; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 2473; GFX9-NEXT: s_sub_i32 s5, 0, s4 2474; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 2475; GFX9-NEXT: v_mul_lo_u32 v2, s5, v3 2476; GFX9-NEXT: s_add_i32 s9, s11, s8 2477; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 2478; GFX9-NEXT: s_xor_b32 s9, s9, s8 2479; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 2480; GFX9-NEXT: v_mul_hi_u32 v2, v3, v2 2481; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s9 2482; GFX9-NEXT: s_ashr_i32 s5, s6, 31 2483; GFX9-NEXT: s_add_i32 s6, s6, s5 2484; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 2485; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v5 2486; GFX9-NEXT: s_xor_b32 s6, s6, s5 2487; GFX9-NEXT: v_mul_hi_u32 v2, 
s6, v2 2488; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 2489; GFX9-NEXT: v_mul_f32_e32 v3, s15, v3 2490; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 2491; GFX9-NEXT: v_subrev_u32_e32 v0, s2, v0 2492; GFX9-NEXT: s_xor_b32 s2, s13, s12 2493; GFX9-NEXT: v_mul_lo_u32 v5, v2, s4 2494; GFX9-NEXT: v_xor_b32_e32 v1, s2, v1 2495; GFX9-NEXT: v_subrev_u32_e32 v1, s2, v1 2496; GFX9-NEXT: s_xor_b32 s2, s5, s3 2497; GFX9-NEXT: s_sub_i32 s3, 0, s9 2498; GFX9-NEXT: v_mul_lo_u32 v7, s3, v3 2499; GFX9-NEXT: v_sub_u32_e32 v5, s6, v5 2500; GFX9-NEXT: v_add_u32_e32 v6, 1, v2 2501; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v5 2502; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 2503; GFX9-NEXT: v_subrev_u32_e32 v6, s4, v5 2504; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 2505; GFX9-NEXT: v_mul_hi_u32 v6, v3, v7 2506; GFX9-NEXT: s_ashr_i32 s3, s7, 31 2507; GFX9-NEXT: s_add_i32 s5, s7, s3 2508; GFX9-NEXT: s_xor_b32 s5, s5, s3 2509; GFX9-NEXT: v_add_u32_e32 v3, v3, v6 2510; GFX9-NEXT: v_mul_hi_u32 v3, s5, v3 2511; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v5 2512; GFX9-NEXT: v_add_u32_e32 v6, 1, v2 2513; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 2514; GFX9-NEXT: v_mul_lo_u32 v5, v3, s9 2515; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 2516; GFX9-NEXT: v_xor_b32_e32 v2, s2, v2 2517; GFX9-NEXT: v_subrev_u32_e32 v2, s2, v2 2518; GFX9-NEXT: v_sub_u32_e32 v5, s5, v5 2519; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v5 2520; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 2521; GFX9-NEXT: v_subrev_u32_e32 v6, s9, v5 2522; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 2523; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 2524; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v5 2525; GFX9-NEXT: s_xor_b32 s2, s3, s8 2526; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 2527; GFX9-NEXT: v_xor_b32_e32 v3, s2, v3 2528; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v3 2529; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 2530; GFX9-NEXT: s_endpgm 2531; 2532; GFX90A-LABEL: sdiv_v4i32: 2533; GFX90A: ; %bb.0: 2534; GFX90A-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 2535; 
GFX90A-NEXT: s_mov_b32 s13, 0x4f7ffffe 2536; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2537; GFX90A-NEXT: v_mov_b32_e32 v4, 0 2538; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 2539; GFX90A-NEXT: s_ashr_i32 s2, s8, 31 2540; GFX90A-NEXT: s_add_i32 s3, s8, s2 2541; GFX90A-NEXT: s_xor_b32 s3, s3, s2 2542; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 2543; GFX90A-NEXT: s_ashr_i32 s8, s4, 31 2544; GFX90A-NEXT: s_add_i32 s4, s4, s8 2545; GFX90A-NEXT: s_xor_b32 s2, s8, s2 2546; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 2547; GFX90A-NEXT: s_xor_b32 s4, s4, s8 2548; GFX90A-NEXT: s_sub_i32 s8, 0, s3 2549; GFX90A-NEXT: s_ashr_i32 s12, s9, 31 2550; GFX90A-NEXT: v_mul_f32_e32 v0, s13, v0 2551; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 2552; GFX90A-NEXT: v_mul_lo_u32 v1, s8, v0 2553; GFX90A-NEXT: v_mul_hi_u32 v1, v0, v1 2554; GFX90A-NEXT: v_add_u32_e32 v0, v0, v1 2555; GFX90A-NEXT: v_mul_hi_u32 v0, s4, v0 2556; GFX90A-NEXT: v_mul_lo_u32 v1, v0, s3 2557; GFX90A-NEXT: v_sub_u32_e32 v1, s4, v1 2558; GFX90A-NEXT: s_add_i32 s4, s9, s12 2559; GFX90A-NEXT: s_xor_b32 s4, s4, s12 2560; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, s4 2561; GFX90A-NEXT: v_add_u32_e32 v2, 1, v0 2562; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 2563; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 2564; GFX90A-NEXT: v_subrev_u32_e32 v2, s3, v1 2565; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 2566; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 2567; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v3 2568; GFX90A-NEXT: v_add_u32_e32 v2, 1, v0 2569; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 2570; GFX90A-NEXT: v_xor_b32_e32 v0, s2, v0 2571; GFX90A-NEXT: v_mul_f32_e32 v1, s13, v1 2572; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 2573; GFX90A-NEXT: v_subrev_u32_e32 v0, s2, v0 2574; GFX90A-NEXT: s_ashr_i32 s2, s5, 31 2575; GFX90A-NEXT: s_add_i32 s5, s5, s2 2576; GFX90A-NEXT: s_xor_b32 s3, s2, s12 2577; GFX90A-NEXT: s_xor_b32 s2, s5, s2 2578; GFX90A-NEXT: s_sub_i32 s5, 0, s4 2579; GFX90A-NEXT: v_mul_lo_u32 v2, s5, v1 2580; GFX90A-NEXT: v_mul_hi_u32 v2, v1, 
v2 2581; GFX90A-NEXT: v_add_u32_e32 v1, v1, v2 2582; GFX90A-NEXT: v_mul_hi_u32 v1, s2, v1 2583; GFX90A-NEXT: v_mul_lo_u32 v2, v1, s4 2584; GFX90A-NEXT: v_sub_u32_e32 v2, s2, v2 2585; GFX90A-NEXT: s_ashr_i32 s2, s10, 31 2586; GFX90A-NEXT: s_add_i32 s5, s10, s2 2587; GFX90A-NEXT: s_xor_b32 s5, s5, s2 2588; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s5 2589; GFX90A-NEXT: v_add_u32_e32 v3, 1, v1 2590; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 2591; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 2592; GFX90A-NEXT: v_subrev_u32_e32 v3, s4, v2 2593; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc 2594; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 2595; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v5 2596; GFX90A-NEXT: v_add_u32_e32 v3, 1, v1 2597; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 2598; GFX90A-NEXT: v_xor_b32_e32 v1, s3, v1 2599; GFX90A-NEXT: v_mul_f32_e32 v2, s13, v2 2600; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 2601; GFX90A-NEXT: v_subrev_u32_e32 v1, s3, v1 2602; GFX90A-NEXT: s_ashr_i32 s3, s6, 31 2603; GFX90A-NEXT: s_add_i32 s4, s6, s3 2604; GFX90A-NEXT: s_xor_b32 s2, s3, s2 2605; GFX90A-NEXT: s_xor_b32 s3, s4, s3 2606; GFX90A-NEXT: s_sub_i32 s4, 0, s5 2607; GFX90A-NEXT: v_mul_lo_u32 v3, s4, v2 2608; GFX90A-NEXT: v_mul_hi_u32 v3, v2, v3 2609; GFX90A-NEXT: v_add_u32_e32 v2, v2, v3 2610; GFX90A-NEXT: v_mul_hi_u32 v2, s3, v2 2611; GFX90A-NEXT: v_mul_lo_u32 v3, v2, s5 2612; GFX90A-NEXT: v_sub_u32_e32 v3, s3, v3 2613; GFX90A-NEXT: s_ashr_i32 s3, s11, 31 2614; GFX90A-NEXT: s_add_i32 s4, s11, s3 2615; GFX90A-NEXT: s_xor_b32 s4, s4, s3 2616; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s4 2617; GFX90A-NEXT: v_add_u32_e32 v5, 1, v2 2618; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 2619; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 2620; GFX90A-NEXT: v_subrev_u32_e32 v5, s5, v3 2621; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 2622; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 2623; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v6 2624; GFX90A-NEXT: v_add_u32_e32 v5, 1, v2 2625; GFX90A-NEXT: 
v_cndmask_b32_e32 v2, v2, v5, vcc 2626; GFX90A-NEXT: v_xor_b32_e32 v2, s2, v2 2627; GFX90A-NEXT: v_mul_f32_e32 v3, s13, v3 2628; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 2629; GFX90A-NEXT: v_subrev_u32_e32 v2, s2, v2 2630; GFX90A-NEXT: s_ashr_i32 s2, s7, 31 2631; GFX90A-NEXT: s_add_i32 s5, s7, s2 2632; GFX90A-NEXT: s_xor_b32 s3, s2, s3 2633; GFX90A-NEXT: s_xor_b32 s2, s5, s2 2634; GFX90A-NEXT: s_sub_i32 s5, 0, s4 2635; GFX90A-NEXT: v_mul_lo_u32 v5, s5, v3 2636; GFX90A-NEXT: v_mul_hi_u32 v5, v3, v5 2637; GFX90A-NEXT: v_add_u32_e32 v3, v3, v5 2638; GFX90A-NEXT: v_mul_hi_u32 v3, s2, v3 2639; GFX90A-NEXT: v_mul_lo_u32 v5, v3, s4 2640; GFX90A-NEXT: v_sub_u32_e32 v5, s2, v5 2641; GFX90A-NEXT: v_add_u32_e32 v6, 1, v3 2642; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s4, v5 2643; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 2644; GFX90A-NEXT: v_subrev_u32_e32 v6, s4, v5 2645; GFX90A-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 2646; GFX90A-NEXT: v_add_u32_e32 v6, 1, v3 2647; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s4, v5 2648; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 2649; GFX90A-NEXT: v_xor_b32_e32 v3, s3, v3 2650; GFX90A-NEXT: v_subrev_u32_e32 v3, s3, v3 2651; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 2652; GFX90A-NEXT: s_endpgm 2653 %r = sdiv <4 x i32> %x, %y 2654 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 2655 ret void 2656} 2657 2658define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 2659; CHECK-LABEL: @srem_v4i32( 2660; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 2661; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 2662; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 2663; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 2664; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]] 2665; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]] 2666; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]] 2667; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]] 2668; 
CHECK-NEXT: [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float 2669; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 2670; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000 2671; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP11]] to i32 2672; CHECK-NEXT: [[TMP13:%.*]] = sub i32 0, [[TMP8]] 2673; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]] 2674; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP12]] to i64 2675; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 2676; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]] 2677; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 2678; CHECK-NEXT: [[TMP19:%.*]] = lshr i64 [[TMP17]], 32 2679; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 2680; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]] 2681; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP7]] to i64 2682; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP21]] to i64 2683; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]] 2684; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32 2685; CHECK-NEXT: [[TMP26:%.*]] = lshr i64 [[TMP24]], 32 2686; CHECK-NEXT: [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32 2687; CHECK-NEXT: [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]] 2688; CHECK-NEXT: [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]] 2689; CHECK-NEXT: [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]] 2690; CHECK-NEXT: [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]] 2691; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP30]], i32 [[TMP31]], i32 [[TMP29]] 2692; CHECK-NEXT: [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]] 2693; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]] 2694; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]] 2695; CHECK-NEXT: [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]] 2696; CHECK-NEXT: [[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]] 2697; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i32> undef, i32 [[TMP37]], i64 0 2698; CHECK-NEXT: [[TMP39:%.*]] = extractelement 
<4 x i32> [[X]], i64 1 2699; CHECK-NEXT: [[TMP40:%.*]] = extractelement <4 x i32> [[Y]], i64 1 2700; CHECK-NEXT: [[TMP41:%.*]] = ashr i32 [[TMP39]], 31 2701; CHECK-NEXT: [[TMP42:%.*]] = ashr i32 [[TMP40]], 31 2702; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]] 2703; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]] 2704; CHECK-NEXT: [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]] 2705; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]] 2706; CHECK-NEXT: [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float 2707; CHECK-NEXT: [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]]) 2708; CHECK-NEXT: [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000 2709; CHECK-NEXT: [[TMP50:%.*]] = fptoui float [[TMP49]] to i32 2710; CHECK-NEXT: [[TMP51:%.*]] = sub i32 0, [[TMP46]] 2711; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]] 2712; CHECK-NEXT: [[TMP53:%.*]] = zext i32 [[TMP50]] to i64 2713; CHECK-NEXT: [[TMP54:%.*]] = zext i32 [[TMP52]] to i64 2714; CHECK-NEXT: [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]] 2715; CHECK-NEXT: [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32 2716; CHECK-NEXT: [[TMP57:%.*]] = lshr i64 [[TMP55]], 32 2717; CHECK-NEXT: [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32 2718; CHECK-NEXT: [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]] 2719; CHECK-NEXT: [[TMP60:%.*]] = zext i32 [[TMP45]] to i64 2720; CHECK-NEXT: [[TMP61:%.*]] = zext i32 [[TMP59]] to i64 2721; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]] 2722; CHECK-NEXT: [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32 2723; CHECK-NEXT: [[TMP64:%.*]] = lshr i64 [[TMP62]], 32 2724; CHECK-NEXT: [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32 2725; CHECK-NEXT: [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]] 2726; CHECK-NEXT: [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]] 2727; CHECK-NEXT: [[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]] 2728; CHECK-NEXT: [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]] 2729; CHECK-NEXT: [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 
[[TMP67]] 2730; CHECK-NEXT: [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]] 2731; CHECK-NEXT: [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]] 2732; CHECK-NEXT: [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]] 2733; CHECK-NEXT: [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]] 2734; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]] 2735; CHECK-NEXT: [[TMP76:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP75]], i64 1 2736; CHECK-NEXT: [[TMP77:%.*]] = extractelement <4 x i32> [[X]], i64 2 2737; CHECK-NEXT: [[TMP78:%.*]] = extractelement <4 x i32> [[Y]], i64 2 2738; CHECK-NEXT: [[TMP79:%.*]] = ashr i32 [[TMP77]], 31 2739; CHECK-NEXT: [[TMP80:%.*]] = ashr i32 [[TMP78]], 31 2740; CHECK-NEXT: [[TMP81:%.*]] = add i32 [[TMP77]], [[TMP79]] 2741; CHECK-NEXT: [[TMP82:%.*]] = add i32 [[TMP78]], [[TMP80]] 2742; CHECK-NEXT: [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP79]] 2743; CHECK-NEXT: [[TMP84:%.*]] = xor i32 [[TMP82]], [[TMP80]] 2744; CHECK-NEXT: [[TMP85:%.*]] = uitofp i32 [[TMP84]] to float 2745; CHECK-NEXT: [[TMP86:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP85]]) 2746; CHECK-NEXT: [[TMP87:%.*]] = fmul fast float [[TMP86]], 0x41EFFFFFC0000000 2747; CHECK-NEXT: [[TMP88:%.*]] = fptoui float [[TMP87]] to i32 2748; CHECK-NEXT: [[TMP89:%.*]] = sub i32 0, [[TMP84]] 2749; CHECK-NEXT: [[TMP90:%.*]] = mul i32 [[TMP89]], [[TMP88]] 2750; CHECK-NEXT: [[TMP91:%.*]] = zext i32 [[TMP88]] to i64 2751; CHECK-NEXT: [[TMP92:%.*]] = zext i32 [[TMP90]] to i64 2752; CHECK-NEXT: [[TMP93:%.*]] = mul i64 [[TMP91]], [[TMP92]] 2753; CHECK-NEXT: [[TMP94:%.*]] = trunc i64 [[TMP93]] to i32 2754; CHECK-NEXT: [[TMP95:%.*]] = lshr i64 [[TMP93]], 32 2755; CHECK-NEXT: [[TMP96:%.*]] = trunc i64 [[TMP95]] to i32 2756; CHECK-NEXT: [[TMP97:%.*]] = add i32 [[TMP88]], [[TMP96]] 2757; CHECK-NEXT: [[TMP98:%.*]] = zext i32 [[TMP83]] to i64 2758; CHECK-NEXT: [[TMP99:%.*]] = zext i32 [[TMP97]] to i64 2759; CHECK-NEXT: [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]] 2760; CHECK-NEXT: 
[[TMP101:%.*]] = trunc i64 [[TMP100]] to i32 2761; CHECK-NEXT: [[TMP102:%.*]] = lshr i64 [[TMP100]], 32 2762; CHECK-NEXT: [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32 2763; CHECK-NEXT: [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP84]] 2764; CHECK-NEXT: [[TMP105:%.*]] = sub i32 [[TMP83]], [[TMP104]] 2765; CHECK-NEXT: [[TMP106:%.*]] = icmp uge i32 [[TMP105]], [[TMP84]] 2766; CHECK-NEXT: [[TMP107:%.*]] = sub i32 [[TMP105]], [[TMP84]] 2767; CHECK-NEXT: [[TMP108:%.*]] = select i1 [[TMP106]], i32 [[TMP107]], i32 [[TMP105]] 2768; CHECK-NEXT: [[TMP109:%.*]] = icmp uge i32 [[TMP108]], [[TMP84]] 2769; CHECK-NEXT: [[TMP110:%.*]] = sub i32 [[TMP108]], [[TMP84]] 2770; CHECK-NEXT: [[TMP111:%.*]] = select i1 [[TMP109]], i32 [[TMP110]], i32 [[TMP108]] 2771; CHECK-NEXT: [[TMP112:%.*]] = xor i32 [[TMP111]], [[TMP79]] 2772; CHECK-NEXT: [[TMP113:%.*]] = sub i32 [[TMP112]], [[TMP79]] 2773; CHECK-NEXT: [[TMP114:%.*]] = insertelement <4 x i32> [[TMP76]], i32 [[TMP113]], i64 2 2774; CHECK-NEXT: [[TMP115:%.*]] = extractelement <4 x i32> [[X]], i64 3 2775; CHECK-NEXT: [[TMP116:%.*]] = extractelement <4 x i32> [[Y]], i64 3 2776; CHECK-NEXT: [[TMP117:%.*]] = ashr i32 [[TMP115]], 31 2777; CHECK-NEXT: [[TMP118:%.*]] = ashr i32 [[TMP116]], 31 2778; CHECK-NEXT: [[TMP119:%.*]] = add i32 [[TMP115]], [[TMP117]] 2779; CHECK-NEXT: [[TMP120:%.*]] = add i32 [[TMP116]], [[TMP118]] 2780; CHECK-NEXT: [[TMP121:%.*]] = xor i32 [[TMP119]], [[TMP117]] 2781; CHECK-NEXT: [[TMP122:%.*]] = xor i32 [[TMP120]], [[TMP118]] 2782; CHECK-NEXT: [[TMP123:%.*]] = uitofp i32 [[TMP122]] to float 2783; CHECK-NEXT: [[TMP124:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP123]]) 2784; CHECK-NEXT: [[TMP125:%.*]] = fmul fast float [[TMP124]], 0x41EFFFFFC0000000 2785; CHECK-NEXT: [[TMP126:%.*]] = fptoui float [[TMP125]] to i32 2786; CHECK-NEXT: [[TMP127:%.*]] = sub i32 0, [[TMP122]] 2787; CHECK-NEXT: [[TMP128:%.*]] = mul i32 [[TMP127]], [[TMP126]] 2788; CHECK-NEXT: [[TMP129:%.*]] = zext i32 [[TMP126]] to i64 2789; CHECK-NEXT: 
[[TMP130:%.*]] = zext i32 [[TMP128]] to i64 2790; CHECK-NEXT: [[TMP131:%.*]] = mul i64 [[TMP129]], [[TMP130]] 2791; CHECK-NEXT: [[TMP132:%.*]] = trunc i64 [[TMP131]] to i32 2792; CHECK-NEXT: [[TMP133:%.*]] = lshr i64 [[TMP131]], 32 2793; CHECK-NEXT: [[TMP134:%.*]] = trunc i64 [[TMP133]] to i32 2794; CHECK-NEXT: [[TMP135:%.*]] = add i32 [[TMP126]], [[TMP134]] 2795; CHECK-NEXT: [[TMP136:%.*]] = zext i32 [[TMP121]] to i64 2796; CHECK-NEXT: [[TMP137:%.*]] = zext i32 [[TMP135]] to i64 2797; CHECK-NEXT: [[TMP138:%.*]] = mul i64 [[TMP136]], [[TMP137]] 2798; CHECK-NEXT: [[TMP139:%.*]] = trunc i64 [[TMP138]] to i32 2799; CHECK-NEXT: [[TMP140:%.*]] = lshr i64 [[TMP138]], 32 2800; CHECK-NEXT: [[TMP141:%.*]] = trunc i64 [[TMP140]] to i32 2801; CHECK-NEXT: [[TMP142:%.*]] = mul i32 [[TMP141]], [[TMP122]] 2802; CHECK-NEXT: [[TMP143:%.*]] = sub i32 [[TMP121]], [[TMP142]] 2803; CHECK-NEXT: [[TMP144:%.*]] = icmp uge i32 [[TMP143]], [[TMP122]] 2804; CHECK-NEXT: [[TMP145:%.*]] = sub i32 [[TMP143]], [[TMP122]] 2805; CHECK-NEXT: [[TMP146:%.*]] = select i1 [[TMP144]], i32 [[TMP145]], i32 [[TMP143]] 2806; CHECK-NEXT: [[TMP147:%.*]] = icmp uge i32 [[TMP146]], [[TMP122]] 2807; CHECK-NEXT: [[TMP148:%.*]] = sub i32 [[TMP146]], [[TMP122]] 2808; CHECK-NEXT: [[TMP149:%.*]] = select i1 [[TMP147]], i32 [[TMP148]], i32 [[TMP146]] 2809; CHECK-NEXT: [[TMP150:%.*]] = xor i32 [[TMP149]], [[TMP117]] 2810; CHECK-NEXT: [[TMP151:%.*]] = sub i32 [[TMP150]], [[TMP117]] 2811; CHECK-NEXT: [[TMP152:%.*]] = insertelement <4 x i32> [[TMP114]], i32 [[TMP151]], i64 3 2812; CHECK-NEXT: store <4 x i32> [[TMP152]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 2813; CHECK-NEXT: ret void 2814; 2815; GFX6-LABEL: srem_v4i32: 2816; GFX6: ; %bb.0: 2817; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 2818; GFX6-NEXT: s_mov_b32 s14, 0x4f7ffffe 2819; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2820; GFX6-NEXT: s_mov_b32 s3, 0xf000 2821; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2822; GFX6-NEXT: s_ashr_i32 s2, s8, 31 2823; GFX6-NEXT: 
s_add_i32 s8, s8, s2 2824; GFX6-NEXT: s_xor_b32 s8, s8, s2 2825; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 2826; GFX6-NEXT: s_ashr_i32 s12, s9, 31 2827; GFX6-NEXT: s_add_i32 s9, s9, s12 2828; GFX6-NEXT: s_xor_b32 s9, s9, s12 2829; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 2830; GFX6-NEXT: s_sub_i32 s13, 0, s8 2831; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 2832; GFX6-NEXT: s_ashr_i32 s12, s4, 31 2833; GFX6-NEXT: v_mul_f32_e32 v0, s14, v0 2834; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 2835; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 2836; GFX6-NEXT: s_add_i32 s4, s4, s12 2837; GFX6-NEXT: s_xor_b32 s4, s4, s12 2838; GFX6-NEXT: v_mul_lo_u32 v2, s13, v0 2839; GFX6-NEXT: v_mul_f32_e32 v1, s14, v1 2840; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 2841; GFX6-NEXT: s_sub_i32 s13, 0, s9 2842; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 2843; GFX6-NEXT: s_mov_b32 s2, -1 2844; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 2845; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 2846; GFX6-NEXT: v_mul_lo_u32 v2, s13, v1 2847; GFX6-NEXT: s_ashr_i32 s13, s5, 31 2848; GFX6-NEXT: s_add_i32 s5, s5, s13 2849; GFX6-NEXT: v_mul_lo_u32 v0, v0, s8 2850; GFX6-NEXT: v_mul_hi_u32 v2, v1, v2 2851; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 2852; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 2853; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 2854; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 2855; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 2856; GFX6-NEXT: s_xor_b32 s4, s5, s13 2857; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 2858; GFX6-NEXT: s_ashr_i32 s5, s10, 31 2859; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 2860; GFX6-NEXT: s_add_i32 s8, s10, s5 2861; GFX6-NEXT: s_xor_b32 s5, s8, s5 2862; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 2863; GFX6-NEXT: v_mul_hi_u32 v1, s4, v1 2864; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 2865; GFX6-NEXT: v_xor_b32_e32 v0, s12, v0 2866; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 2867; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9 2868; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 2869; GFX6-NEXT: v_mul_f32_e32 v2, s14, v2 2870; GFX6-NEXT: 
v_cvt_u32_f32_e32 v2, v2 2871; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v1 2872; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s9, v1 2873; GFX6-NEXT: s_sub_i32 s4, 0, s5 2874; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 2875; GFX6-NEXT: v_mul_lo_u32 v4, s4, v2 2876; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 2877; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s9, v1 2878; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 2879; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 2880; GFX6-NEXT: v_mul_hi_u32 v3, v2, v4 2881; GFX6-NEXT: s_ashr_i32 s8, s11, 31 2882; GFX6-NEXT: s_add_i32 s9, s11, s8 2883; GFX6-NEXT: s_ashr_i32 s4, s6, 31 2884; GFX6-NEXT: s_xor_b32 s8, s9, s8 2885; GFX6-NEXT: s_add_i32 s6, s6, s4 2886; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 2887; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s8 2888; GFX6-NEXT: s_xor_b32 s6, s6, s4 2889; GFX6-NEXT: v_mul_hi_u32 v2, s6, v2 2890; GFX6-NEXT: v_xor_b32_e32 v1, s13, v1 2891; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 2892; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s13, v1 2893; GFX6-NEXT: v_mul_lo_u32 v2, v2, s5 2894; GFX6-NEXT: v_mul_f32_e32 v3, s14, v3 2895; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 2896; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 2897; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s5, v2 2898; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 2899; GFX6-NEXT: s_sub_i32 s6, 0, s8 2900; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 2901; GFX6-NEXT: v_mul_lo_u32 v4, s6, v3 2902; GFX6-NEXT: s_ashr_i32 s6, s7, 31 2903; GFX6-NEXT: s_add_i32 s7, s7, s6 2904; GFX6-NEXT: s_xor_b32 s7, s7, s6 2905; GFX6-NEXT: v_mul_hi_u32 v4, v3, v4 2906; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v2 2907; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 2908; GFX6-NEXT: v_mul_hi_u32 v3, s7, v3 2909; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 2910; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 2911; GFX6-NEXT: v_xor_b32_e32 v2, s4, v2 2912; GFX6-NEXT: v_mul_lo_u32 v3, v3, s8 2913; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s4, v2 2914; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s7, v3 2915; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 
s8, v3 2916; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 2917; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 2918; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s8, v3 2919; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 2920; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 2921; GFX6-NEXT: v_xor_b32_e32 v3, s6, v3 2922; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s6, v3 2923; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 2924; GFX6-NEXT: s_endpgm 2925; 2926; GFX9-LABEL: srem_v4i32: 2927; GFX9: ; %bb.0: 2928; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 2929; GFX9-NEXT: s_mov_b32 s13, 0x4f7ffffe 2930; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2931; GFX9-NEXT: v_mov_b32_e32 v4, 0 2932; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2933; GFX9-NEXT: s_ashr_i32 s2, s8, 31 2934; GFX9-NEXT: s_add_i32 s8, s8, s2 2935; GFX9-NEXT: s_xor_b32 s2, s8, s2 2936; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 2937; GFX9-NEXT: s_ashr_i32 s3, s9, 31 2938; GFX9-NEXT: s_sub_i32 s12, 0, s2 2939; GFX9-NEXT: s_add_i32 s8, s9, s3 2940; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 2941; GFX9-NEXT: s_xor_b32 s3, s8, s3 2942; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 2943; GFX9-NEXT: s_ashr_i32 s8, s4, 31 2944; GFX9-NEXT: v_mul_f32_e32 v0, s13, v0 2945; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 2946; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 2947; GFX9-NEXT: s_add_i32 s4, s4, s8 2948; GFX9-NEXT: s_xor_b32 s4, s4, s8 2949; GFX9-NEXT: v_mul_lo_u32 v2, s12, v0 2950; GFX9-NEXT: v_mul_f32_e32 v1, s13, v1 2951; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 2952; GFX9-NEXT: s_sub_i32 s12, 0, s3 2953; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 2954; GFX9-NEXT: s_ashr_i32 s9, s5, 31 2955; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1 2956; GFX9-NEXT: s_add_i32 s5, s5, s9 2957; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 2958; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 2959; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 2960; GFX9-NEXT: s_xor_b32 s5, s5, s9 2961; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2 2962; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 2963; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 2964; GFX9-NEXT: v_sub_u32_e32 v0, 
s4, v0 2965; GFX9-NEXT: v_subrev_u32_e32 v2, s2, v0 2966; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 2967; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 2968; GFX9-NEXT: v_subrev_u32_e32 v2, s2, v0 2969; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 2970; GFX9-NEXT: s_ashr_i32 s2, s10, 31 2971; GFX9-NEXT: s_add_i32 s4, s10, s2 2972; GFX9-NEXT: s_xor_b32 s2, s4, s2 2973; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 2974; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 2975; GFX9-NEXT: v_mul_lo_u32 v1, v1, s3 2976; GFX9-NEXT: v_xor_b32_e32 v0, s8, v0 2977; GFX9-NEXT: v_subrev_u32_e32 v0, s8, v0 2978; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 2979; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 2980; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v1 2981; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 2982; GFX9-NEXT: v_mul_f32_e32 v2, s13, v2 2983; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 2984; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 2985; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v1 2986; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 2987; GFX9-NEXT: s_sub_i32 s3, 0, s2 2988; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 2989; GFX9-NEXT: v_mul_lo_u32 v3, s3, v2 2990; GFX9-NEXT: s_ashr_i32 s3, s11, 31 2991; GFX9-NEXT: s_add_i32 s4, s11, s3 2992; GFX9-NEXT: s_xor_b32 s3, s4, s3 2993; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s3 2994; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 2995; GFX9-NEXT: s_ashr_i32 s4, s6, 31 2996; GFX9-NEXT: s_add_i32 s5, s6, s4 2997; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v5 2998; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 2999; GFX9-NEXT: s_xor_b32 s5, s5, s4 3000; GFX9-NEXT: v_mul_hi_u32 v2, s5, v2 3001; GFX9-NEXT: v_mul_f32_e32 v3, s13, v5 3002; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 3003; GFX9-NEXT: s_sub_i32 s6, 0, s3 3004; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 3005; GFX9-NEXT: v_xor_b32_e32 v1, s9, v1 3006; GFX9-NEXT: v_mul_lo_u32 v5, s6, v3 3007; GFX9-NEXT: v_subrev_u32_e32 v1, s9, v1 3008; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 3009; GFX9-NEXT: s_ashr_i32 s5, s7, 31 3010; GFX9-NEXT: v_mul_hi_u32 v5, v3, v5 3011; GFX9-NEXT: s_add_i32 s6, 
s7, s5 3012; GFX9-NEXT: s_xor_b32 s6, s6, s5 3013; GFX9-NEXT: v_subrev_u32_e32 v6, s2, v2 3014; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 3015; GFX9-NEXT: v_mul_hi_u32 v3, s6, v3 3016; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 3017; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 3018; GFX9-NEXT: v_subrev_u32_e32 v5, s2, v2 3019; GFX9-NEXT: v_mul_lo_u32 v3, v3, s3 3020; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 3021; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 3022; GFX9-NEXT: v_xor_b32_e32 v2, s4, v2 3023; GFX9-NEXT: v_sub_u32_e32 v3, s6, v3 3024; GFX9-NEXT: v_subrev_u32_e32 v5, s3, v3 3025; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 3026; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 3027; GFX9-NEXT: v_subrev_u32_e32 v5, s3, v3 3028; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 3029; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 3030; GFX9-NEXT: v_xor_b32_e32 v3, s5, v3 3031; GFX9-NEXT: v_subrev_u32_e32 v2, s4, v2 3032; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v3 3033; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 3034; GFX9-NEXT: s_endpgm 3035; 3036; GFX90A-LABEL: srem_v4i32: 3037; GFX90A: ; %bb.0: 3038; GFX90A-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 3039; GFX90A-NEXT: s_mov_b32 s12, 0x4f7ffffe 3040; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3041; GFX90A-NEXT: v_mov_b32_e32 v4, 0 3042; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 3043; GFX90A-NEXT: s_ashr_i32 s2, s8, 31 3044; GFX90A-NEXT: s_add_i32 s3, s8, s2 3045; GFX90A-NEXT: s_xor_b32 s2, s3, s2 3046; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s2 3047; GFX90A-NEXT: s_ashr_i32 s8, s9, 31 3048; GFX90A-NEXT: s_add_i32 s9, s9, s8 3049; GFX90A-NEXT: s_xor_b32 s8, s9, s8 3050; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 3051; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s8 3052; GFX90A-NEXT: s_sub_i32 s9, 0, s2 3053; GFX90A-NEXT: s_ashr_i32 s3, s4, 31 3054; GFX90A-NEXT: v_mul_f32_e32 v0, s12, v0 3055; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 3056; GFX90A-NEXT: s_add_i32 s4, s4, s3 3057; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v1 3058; GFX90A-NEXT: s_xor_b32 s4, s4, 
s3 3059; GFX90A-NEXT: v_mul_lo_u32 v2, s9, v0 3060; GFX90A-NEXT: v_mul_hi_u32 v2, v0, v2 3061; GFX90A-NEXT: v_add_u32_e32 v0, v0, v2 3062; GFX90A-NEXT: v_mul_hi_u32 v0, s4, v0 3063; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s2 3064; GFX90A-NEXT: v_sub_u32_e32 v0, s4, v0 3065; GFX90A-NEXT: v_mul_f32_e32 v1, s12, v1 3066; GFX90A-NEXT: v_subrev_u32_e32 v2, s2, v0 3067; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 3068; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 3069; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3070; GFX90A-NEXT: v_subrev_u32_e32 v2, s2, v0 3071; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 3072; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3073; GFX90A-NEXT: s_sub_i32 s4, 0, s8 3074; GFX90A-NEXT: v_xor_b32_e32 v0, s3, v0 3075; GFX90A-NEXT: s_ashr_i32 s2, s5, 31 3076; GFX90A-NEXT: v_mul_lo_u32 v2, s4, v1 3077; GFX90A-NEXT: v_subrev_u32_e32 v0, s3, v0 3078; GFX90A-NEXT: s_add_i32 s3, s5, s2 3079; GFX90A-NEXT: v_mul_hi_u32 v2, v1, v2 3080; GFX90A-NEXT: s_xor_b32 s3, s3, s2 3081; GFX90A-NEXT: v_add_u32_e32 v1, v1, v2 3082; GFX90A-NEXT: v_mul_hi_u32 v1, s3, v1 3083; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s8 3084; GFX90A-NEXT: v_sub_u32_e32 v1, s3, v1 3085; GFX90A-NEXT: s_ashr_i32 s3, s10, 31 3086; GFX90A-NEXT: s_add_i32 s4, s10, s3 3087; GFX90A-NEXT: v_subrev_u32_e32 v2, s8, v1 3088; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s8, v1 3089; GFX90A-NEXT: s_xor_b32 s3, s4, s3 3090; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3091; GFX90A-NEXT: v_cvt_f32_u32_e32 v2, s3 3092; GFX90A-NEXT: v_subrev_u32_e32 v3, s8, v1 3093; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s8, v1 3094; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 3095; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v2 3096; GFX90A-NEXT: v_xor_b32_e32 v1, s2, v1 3097; GFX90A-NEXT: s_sub_i32 s5, 0, s3 3098; GFX90A-NEXT: v_subrev_u32_e32 v1, s2, v1 3099; GFX90A-NEXT: v_mul_f32_e32 v2, s12, v2 3100; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 3101; GFX90A-NEXT: s_ashr_i32 s2, s6, 31 3102; GFX90A-NEXT: s_add_i32 s4, s6, s2 3103; GFX90A-NEXT: s_xor_b32 
s4, s4, s2 3104; GFX90A-NEXT: v_mul_lo_u32 v3, s5, v2 3105; GFX90A-NEXT: v_mul_hi_u32 v3, v2, v3 3106; GFX90A-NEXT: v_add_u32_e32 v2, v2, v3 3107; GFX90A-NEXT: v_mul_hi_u32 v2, s4, v2 3108; GFX90A-NEXT: v_mul_lo_u32 v2, v2, s3 3109; GFX90A-NEXT: v_sub_u32_e32 v2, s4, v2 3110; GFX90A-NEXT: s_ashr_i32 s4, s11, 31 3111; GFX90A-NEXT: s_add_i32 s5, s11, s4 3112; GFX90A-NEXT: v_subrev_u32_e32 v3, s3, v2 3113; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 3114; GFX90A-NEXT: s_xor_b32 s4, s5, s4 3115; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc 3116; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, s4 3117; GFX90A-NEXT: v_subrev_u32_e32 v5, s3, v2 3118; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 3119; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 3120; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v3 3121; GFX90A-NEXT: v_xor_b32_e32 v2, s2, v2 3122; GFX90A-NEXT: s_sub_i32 s5, 0, s4 3123; GFX90A-NEXT: v_subrev_u32_e32 v2, s2, v2 3124; GFX90A-NEXT: v_mul_f32_e32 v3, s12, v3 3125; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 3126; GFX90A-NEXT: s_ashr_i32 s2, s7, 31 3127; GFX90A-NEXT: s_add_i32 s3, s7, s2 3128; GFX90A-NEXT: s_xor_b32 s3, s3, s2 3129; GFX90A-NEXT: v_mul_lo_u32 v5, s5, v3 3130; GFX90A-NEXT: v_mul_hi_u32 v5, v3, v5 3131; GFX90A-NEXT: v_add_u32_e32 v3, v3, v5 3132; GFX90A-NEXT: v_mul_hi_u32 v3, s3, v3 3133; GFX90A-NEXT: v_mul_lo_u32 v3, v3, s4 3134; GFX90A-NEXT: v_sub_u32_e32 v3, s3, v3 3135; GFX90A-NEXT: v_subrev_u32_e32 v5, s4, v3 3136; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 3137; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 3138; GFX90A-NEXT: v_subrev_u32_e32 v5, s4, v3 3139; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 3140; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 3141; GFX90A-NEXT: v_xor_b32_e32 v3, s2, v3 3142; GFX90A-NEXT: v_subrev_u32_e32 v3, s2, v3 3143; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 3144; GFX90A-NEXT: s_endpgm 3145 %r = srem <4 x i32> %x, %y 3146 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 3147 ret void 3148} 3149 3150define amdgpu_kernel void 
@udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 3151; CHECK-LABEL: @udiv_v4i16( 3152; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 3153; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 3154; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 3155; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 3156; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 3157; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 3158; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 3159; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 3160; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 3161; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 3162; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 3163; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 3164; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 3165; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 3166; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 3167; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 3168; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 3169; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 65535 3170; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16 3171; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i16> undef, i16 [[TMP19]], i64 0 3172; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i16> [[X]], i64 1 3173; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i16> [[Y]], i64 1 3174; CHECK-NEXT: [[TMP23:%.*]] = zext i16 [[TMP21]] to i32 3175; CHECK-NEXT: [[TMP24:%.*]] = zext i16 [[TMP22]] to i32 3176; CHECK-NEXT: [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float 3177; CHECK-NEXT: [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float 3178; CHECK-NEXT: [[TMP27:%.*]] = call fast float 
@llvm.amdgcn.rcp.f32(float [[TMP26]]) 3179; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]] 3180; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]]) 3181; CHECK-NEXT: [[TMP30:%.*]] = fneg fast float [[TMP29]] 3182; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]]) 3183; CHECK-NEXT: [[TMP32:%.*]] = fptoui float [[TMP29]] to i32 3184; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]]) 3185; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]]) 3186; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]] 3187; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0 3188; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]] 3189; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 65535 3190; CHECK-NEXT: [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16 3191; CHECK-NEXT: [[TMP40:%.*]] = insertelement <4 x i16> [[TMP20]], i16 [[TMP39]], i64 1 3192; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i16> [[X]], i64 2 3193; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i16> [[Y]], i64 2 3194; CHECK-NEXT: [[TMP43:%.*]] = zext i16 [[TMP41]] to i32 3195; CHECK-NEXT: [[TMP44:%.*]] = zext i16 [[TMP42]] to i32 3196; CHECK-NEXT: [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float 3197; CHECK-NEXT: [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float 3198; CHECK-NEXT: [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]]) 3199; CHECK-NEXT: [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]] 3200; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]]) 3201; CHECK-NEXT: [[TMP50:%.*]] = fneg fast float [[TMP49]] 3202; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]]) 3203; CHECK-NEXT: [[TMP52:%.*]] = fptoui float [[TMP49]] to i32 3204; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]]) 3205; 
CHECK-NEXT: [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]]) 3206; CHECK-NEXT: [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]] 3207; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0 3208; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]] 3209; CHECK-NEXT: [[TMP58:%.*]] = and i32 [[TMP57]], 65535 3210; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16 3211; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i16> [[TMP40]], i16 [[TMP59]], i64 2 3212; CHECK-NEXT: [[TMP61:%.*]] = extractelement <4 x i16> [[X]], i64 3 3213; CHECK-NEXT: [[TMP62:%.*]] = extractelement <4 x i16> [[Y]], i64 3 3214; CHECK-NEXT: [[TMP63:%.*]] = zext i16 [[TMP61]] to i32 3215; CHECK-NEXT: [[TMP64:%.*]] = zext i16 [[TMP62]] to i32 3216; CHECK-NEXT: [[TMP65:%.*]] = uitofp i32 [[TMP63]] to float 3217; CHECK-NEXT: [[TMP66:%.*]] = uitofp i32 [[TMP64]] to float 3218; CHECK-NEXT: [[TMP67:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP66]]) 3219; CHECK-NEXT: [[TMP68:%.*]] = fmul fast float [[TMP65]], [[TMP67]] 3220; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.trunc.f32(float [[TMP68]]) 3221; CHECK-NEXT: [[TMP70:%.*]] = fneg fast float [[TMP69]] 3222; CHECK-NEXT: [[TMP71:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP70]], float [[TMP66]], float [[TMP65]]) 3223; CHECK-NEXT: [[TMP72:%.*]] = fptoui float [[TMP69]] to i32 3224; CHECK-NEXT: [[TMP73:%.*]] = call fast float @llvm.fabs.f32(float [[TMP71]]) 3225; CHECK-NEXT: [[TMP74:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 3226; CHECK-NEXT: [[TMP75:%.*]] = fcmp fast oge float [[TMP73]], [[TMP74]] 3227; CHECK-NEXT: [[TMP76:%.*]] = select i1 [[TMP75]], i32 1, i32 0 3228; CHECK-NEXT: [[TMP77:%.*]] = add i32 [[TMP72]], [[TMP76]] 3229; CHECK-NEXT: [[TMP78:%.*]] = and i32 [[TMP77]], 65535 3230; CHECK-NEXT: [[TMP79:%.*]] = trunc i32 [[TMP78]] to i16 3231; CHECK-NEXT: [[TMP80:%.*]] = insertelement <4 x i16> [[TMP60]], i16 [[TMP79]], i64 3 3232; CHECK-NEXT: store <4 x i16> 
[[TMP80]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 3233; CHECK-NEXT: ret void 3234; 3235; GFX6-LABEL: udiv_v4i16: 3236; GFX6: ; %bb.0: 3237; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3238; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 3239; GFX6-NEXT: s_mov_b32 s8, 0xffff 3240; GFX6-NEXT: s_mov_b32 s7, 0xf000 3241; GFX6-NEXT: s_mov_b32 s6, -1 3242; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3243; GFX6-NEXT: s_and_b32 s9, s2, s8 3244; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 3245; GFX6-NEXT: s_lshr_b32 s9, s0, 16 3246; GFX6-NEXT: s_and_b32 s0, s0, s8 3247; GFX6-NEXT: s_lshr_b32 s2, s2, 16 3248; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s0 3249; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 3250; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s2 3251; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s9 3252; GFX6-NEXT: s_and_b32 s2, s3, s8 3253; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 3254; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 3255; GFX6-NEXT: v_trunc_f32_e32 v2, v2 3256; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 3257; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 3258; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3259; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 3260; GFX6-NEXT: v_trunc_f32_e32 v1, v1 3261; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc 3262; GFX6-NEXT: v_mad_f32 v2, -v1, v3, v4 3263; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s2 3264; GFX6-NEXT: s_lshr_b32 s0, s1, 16 3265; GFX6-NEXT: s_lshr_b32 s10, s3, 16 3266; GFX6-NEXT: s_and_b32 s1, s1, s8 3267; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 3268; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s1 3269; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 3270; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 3271; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s10 3272; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v1, vcc 3273; GFX6-NEXT: v_mul_f32_e32 v1, v5, v6 3274; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s0 3275; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v3 3276; GFX6-NEXT: v_trunc_f32_e32 v1, v1 3277; GFX6-NEXT: v_mad_f32 v5, -v1, v4, v5 3278; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 3279; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 3280; GFX6-NEXT: v_mul_f32_e32 v4, 
v6, v7 3281; GFX6-NEXT: v_trunc_f32_e32 v4, v4 3282; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v4 3283; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3284; GFX6-NEXT: v_mad_f32 v4, -v4, v3, v6 3285; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v3 3286; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 3287; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 3288; GFX6-NEXT: v_and_b32_e32 v1, s8, v1 3289; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 3290; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 3291; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 3292; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 3293; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 3294; GFX6-NEXT: s_endpgm 3295; 3296; GFX9-LABEL: udiv_v4i16: 3297; GFX9: ; %bb.0: 3298; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 3299; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 3300; GFX9-NEXT: s_mov_b32 s8, 0xffff 3301; GFX9-NEXT: v_mov_b32_e32 v2, 0 3302; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3303; GFX9-NEXT: s_and_b32 s1, s6, s8 3304; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 3305; GFX9-NEXT: s_lshr_b32 s0, s4, 16 3306; GFX9-NEXT: s_and_b32 s4, s4, s8 3307; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s4 3308; GFX9-NEXT: s_lshr_b32 s4, s6, 16 3309; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 3310; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s4 3311; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 3312; GFX9-NEXT: s_and_b32 s0, s7, s8 3313; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 3314; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 3315; GFX9-NEXT: v_trunc_f32_e32 v3, v3 3316; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 3317; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 3318; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3319; GFX9-NEXT: v_mul_f32_e32 v1, v5, v6 3320; GFX9-NEXT: v_trunc_f32_e32 v1, v1 3321; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 3322; GFX9-NEXT: v_mad_f32 v3, -v1, v4, v5 3323; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 3324; GFX9-NEXT: s_lshr_b32 s6, s7, 16 3325; GFX9-NEXT: s_and_b32 s0, s5, s8 3326; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 3327; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 3328; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 
3329; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 3330; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s6 3331; GFX9-NEXT: s_lshr_b32 s1, s5, 16 3332; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc 3333; GFX9-NEXT: v_mul_f32_e32 v1, v6, v7 3334; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 3335; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v4 3336; GFX9-NEXT: v_trunc_f32_e32 v1, v1 3337; GFX9-NEXT: v_mad_f32 v6, -v1, v5, v6 3338; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 3339; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 3340; GFX9-NEXT: v_mul_f32_e32 v5, v7, v8 3341; GFX9-NEXT: v_trunc_f32_e32 v5, v5 3342; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v5 3343; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 3344; GFX9-NEXT: v_mad_f32 v5, -v5, v4, v7 3345; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 3346; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff 3347; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc 3348; GFX9-NEXT: v_and_b32_e32 v1, v5, v1 3349; GFX9-NEXT: v_and_b32_e32 v0, v5, v0 3350; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 3351; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 3352; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 3353; GFX9-NEXT: s_endpgm 3354; 3355; GFX90A-LABEL: udiv_v4i16: 3356; GFX90A: ; %bb.0: 3357; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 3358; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 3359; GFX90A-NEXT: s_mov_b32 s8, 0xffff 3360; GFX90A-NEXT: v_mov_b32_e32 v2, 0 3361; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 3362; GFX90A-NEXT: s_and_b32 s1, s6, s8 3363; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s1 3364; GFX90A-NEXT: s_lshr_b32 s0, s4, 16 3365; GFX90A-NEXT: s_and_b32 s4, s4, s8 3366; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s4 3367; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 3368; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 3369; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s4 3370; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s0 3371; GFX90A-NEXT: s_and_b32 s0, s7, s8 3372; GFX90A-NEXT: v_mul_f32_e32 v3, v1, v3 3373; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v4 3374; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 3375; GFX90A-NEXT: v_mad_f32 v1, -v3, v0, 
v1 3376; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 3377; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3378; GFX90A-NEXT: v_mul_f32_e32 v1, v5, v6 3379; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 3380; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 3381; GFX90A-NEXT: v_mad_f32 v3, -v1, v4, v5 3382; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s0 3383; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 3384; GFX90A-NEXT: s_and_b32 s0, s5, s8 3385; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 3386; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s0 3387; GFX90A-NEXT: v_rcp_iflag_f32_e32 v7, v5 3388; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 3389; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s6 3390; GFX90A-NEXT: s_lshr_b32 s1, s5, 16 3391; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc 3392; GFX90A-NEXT: v_mul_f32_e32 v1, v6, v7 3393; GFX90A-NEXT: v_cvt_f32_u32_e32 v7, s1 3394; GFX90A-NEXT: v_rcp_iflag_f32_e32 v8, v4 3395; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 3396; GFX90A-NEXT: v_mad_f32 v6, -v1, v5, v6 3397; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 3398; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 3399; GFX90A-NEXT: v_mul_f32_e32 v5, v7, v8 3400; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 3401; GFX90A-NEXT: v_cvt_u32_f32_e32 v6, v5 3402; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 3403; GFX90A-NEXT: v_mad_f32 v5, -v5, v4, v7 3404; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 3405; GFX90A-NEXT: v_mov_b32_e32 v5, 0xffff 3406; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc 3407; GFX90A-NEXT: v_and_b32_e32 v1, v5, v1 3408; GFX90A-NEXT: v_and_b32_e32 v0, v5, v0 3409; GFX90A-NEXT: v_lshl_or_b32 v1, v4, 16, v1 3410; GFX90A-NEXT: v_lshl_or_b32 v0, v3, 16, v0 3411; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 3412; GFX90A-NEXT: s_endpgm 3413 %r = udiv <4 x i16> %x, %y 3414 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 3415 ret void 3416} 3417 3418define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 3419; CHECK-LABEL: @urem_v4i16( 3420; CHECK-NEXT: [[TMP1:%.*]] = extractelement 
<4 x i16> [[X:%.*]], i64 0 3421; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 3422; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 3423; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 3424; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 3425; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 3426; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 3427; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 3428; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 3429; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 3430; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 3431; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 3432; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 3433; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 3434; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 3435; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 3436; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 3437; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]] 3438; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]] 3439; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 65535 3440; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16 3441; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i16> undef, i16 [[TMP21]], i64 0 3442; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i16> [[X]], i64 1 3443; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i16> [[Y]], i64 1 3444; CHECK-NEXT: [[TMP25:%.*]] = zext i16 [[TMP23]] to i32 3445; CHECK-NEXT: [[TMP26:%.*]] = zext i16 [[TMP24]] to i32 3446; CHECK-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float 3447; CHECK-NEXT: [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float 3448; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float 
[[TMP28]]) 3449; CHECK-NEXT: [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]] 3450; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]]) 3451; CHECK-NEXT: [[TMP32:%.*]] = fneg fast float [[TMP31]] 3452; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]]) 3453; CHECK-NEXT: [[TMP34:%.*]] = fptoui float [[TMP31]] to i32 3454; CHECK-NEXT: [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 3455; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]]) 3456; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]] 3457; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0 3458; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]] 3459; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]] 3460; CHECK-NEXT: [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]] 3461; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], 65535 3462; CHECK-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16 3463; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i16> [[TMP22]], i16 [[TMP43]], i64 1 3464; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i16> [[X]], i64 2 3465; CHECK-NEXT: [[TMP46:%.*]] = extractelement <4 x i16> [[Y]], i64 2 3466; CHECK-NEXT: [[TMP47:%.*]] = zext i16 [[TMP45]] to i32 3467; CHECK-NEXT: [[TMP48:%.*]] = zext i16 [[TMP46]] to i32 3468; CHECK-NEXT: [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float 3469; CHECK-NEXT: [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float 3470; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]]) 3471; CHECK-NEXT: [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]] 3472; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]]) 3473; CHECK-NEXT: [[TMP54:%.*]] = fneg fast float [[TMP53]] 3474; CHECK-NEXT: [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]]) 3475; CHECK-NEXT: [[TMP56:%.*]] = fptoui float [[TMP53]] 
to i32 3476; CHECK-NEXT: [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]]) 3477; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]]) 3478; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]] 3479; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0 3480; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]] 3481; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]] 3482; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]] 3483; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 65535 3484; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16 3485; CHECK-NEXT: [[TMP66:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[TMP65]], i64 2 3486; CHECK-NEXT: [[TMP67:%.*]] = extractelement <4 x i16> [[X]], i64 3 3487; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x i16> [[Y]], i64 3 3488; CHECK-NEXT: [[TMP69:%.*]] = zext i16 [[TMP67]] to i32 3489; CHECK-NEXT: [[TMP70:%.*]] = zext i16 [[TMP68]] to i32 3490; CHECK-NEXT: [[TMP71:%.*]] = uitofp i32 [[TMP69]] to float 3491; CHECK-NEXT: [[TMP72:%.*]] = uitofp i32 [[TMP70]] to float 3492; CHECK-NEXT: [[TMP73:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP72]]) 3493; CHECK-NEXT: [[TMP74:%.*]] = fmul fast float [[TMP71]], [[TMP73]] 3494; CHECK-NEXT: [[TMP75:%.*]] = call fast float @llvm.trunc.f32(float [[TMP74]]) 3495; CHECK-NEXT: [[TMP76:%.*]] = fneg fast float [[TMP75]] 3496; CHECK-NEXT: [[TMP77:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP76]], float [[TMP72]], float [[TMP71]]) 3497; CHECK-NEXT: [[TMP78:%.*]] = fptoui float [[TMP75]] to i32 3498; CHECK-NEXT: [[TMP79:%.*]] = call fast float @llvm.fabs.f32(float [[TMP77]]) 3499; CHECK-NEXT: [[TMP80:%.*]] = call fast float @llvm.fabs.f32(float [[TMP72]]) 3500; CHECK-NEXT: [[TMP81:%.*]] = fcmp fast oge float [[TMP79]], [[TMP80]] 3501; CHECK-NEXT: [[TMP82:%.*]] = select i1 [[TMP81]], i32 1, i32 0 3502; CHECK-NEXT: [[TMP83:%.*]] = add i32 [[TMP78]], [[TMP82]] 3503; CHECK-NEXT: [[TMP84:%.*]] 
= mul i32 [[TMP83]], [[TMP70]] 3504; CHECK-NEXT: [[TMP85:%.*]] = sub i32 [[TMP69]], [[TMP84]] 3505; CHECK-NEXT: [[TMP86:%.*]] = and i32 [[TMP85]], 65535 3506; CHECK-NEXT: [[TMP87:%.*]] = trunc i32 [[TMP86]] to i16 3507; CHECK-NEXT: [[TMP88:%.*]] = insertelement <4 x i16> [[TMP66]], i16 [[TMP87]], i64 3 3508; CHECK-NEXT: store <4 x i16> [[TMP88]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 3509; CHECK-NEXT: ret void 3510; 3511; GFX6-LABEL: urem_v4i16: 3512; GFX6: ; %bb.0: 3513; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3514; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 3515; GFX6-NEXT: s_mov_b32 s8, 0xffff 3516; GFX6-NEXT: s_mov_b32 s7, 0xf000 3517; GFX6-NEXT: s_mov_b32 s6, -1 3518; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3519; GFX6-NEXT: s_and_b32 s9, s2, s8 3520; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 3521; GFX6-NEXT: s_and_b32 s10, s0, s8 3522; GFX6-NEXT: s_lshr_b32 s11, s2, 16 3523; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s10 3524; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 3525; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s11 3526; GFX6-NEXT: s_lshr_b32 s9, s0, 16 3527; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s9 3528; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 3529; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 3530; GFX6-NEXT: v_trunc_f32_e32 v2, v2 3531; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 3532; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 3533; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3534; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 3535; GFX6-NEXT: v_trunc_f32_e32 v1, v1 3536; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc 3537; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v1 3538; GFX6-NEXT: v_mad_f32 v1, -v1, v3, v4 3539; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 3540; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v3 3541; GFX6-NEXT: s_and_b32 s2, s3, s8 3542; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 3543; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s2 3544; GFX6-NEXT: s_and_b32 s2, s1, s8 3545; GFX6-NEXT: v_mul_lo_u32 v1, v1, s11 3546; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s2 3547; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 3548; GFX6-NEXT: s_lshr_b32 s12, 
s3, 16 3549; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s9, v1 3550; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 3551; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s12 3552; GFX6-NEXT: s_lshr_b32 s10, s1, 16 3553; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s10 3554; GFX6-NEXT: v_trunc_f32_e32 v1, v1 3555; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v4 3556; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 3557; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v3 3558; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 3559; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 3560; GFX6-NEXT: v_mul_f32_e32 v2, v6, v7 3561; GFX6-NEXT: v_trunc_f32_e32 v2, v2 3562; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 3563; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3564; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v6 3565; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 3566; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc 3567; GFX6-NEXT: v_mul_lo_u32 v1, v1, s3 3568; GFX6-NEXT: v_mul_lo_u32 v2, v2, s12 3569; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 3570; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 3571; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s10, v2 3572; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 3573; GFX6-NEXT: v_and_b32_e32 v1, s8, v1 3574; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 3575; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 3576; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 3577; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 3578; GFX6-NEXT: s_endpgm 3579; 3580; GFX9-LABEL: urem_v4i16: 3581; GFX9: ; %bb.0: 3582; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 3583; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 3584; GFX9-NEXT: s_mov_b32 s8, 0xffff 3585; GFX9-NEXT: v_mov_b32_e32 v2, 0 3586; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3587; GFX9-NEXT: s_and_b32 s1, s6, s8 3588; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 3589; GFX9-NEXT: s_and_b32 s9, s4, s8 3590; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 3591; GFX9-NEXT: s_lshr_b32 s9, s6, 16 3592; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 3593; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s9 3594; GFX9-NEXT: s_lshr_b32 s0, s4, 16 3595; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 3596; GFX9-NEXT: 
v_mul_f32_e32 v3, v1, v3 3597; GFX9-NEXT: v_trunc_f32_e32 v3, v3 3598; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 3599; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 3600; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 3601; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3602; GFX9-NEXT: s_lshr_b32 s10, s7, 16 3603; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 3604; GFX9-NEXT: v_mul_f32_e32 v1, v5, v6 3605; GFX9-NEXT: v_mul_lo_u32 v0, v0, s6 3606; GFX9-NEXT: v_trunc_f32_e32 v1, v1 3607; GFX9-NEXT: s_and_b32 s6, s7, s8 3608; GFX9-NEXT: v_mad_f32 v3, -v1, v4, v5 3609; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s6 3610; GFX9-NEXT: s_and_b32 s6, s5, s8 3611; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s6 3612; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 3613; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 3614; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s10 3615; GFX9-NEXT: s_lshr_b32 s1, s5, 16 3616; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 3617; GFX9-NEXT: v_mul_f32_e32 v3, v6, v7 3618; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 3619; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v4 3620; GFX9-NEXT: v_trunc_f32_e32 v3, v3 3621; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 3622; GFX9-NEXT: v_mad_f32 v6, -v3, v5, v6 3623; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 3624; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 3625; GFX9-NEXT: v_mul_f32_e32 v5, v7, v8 3626; GFX9-NEXT: v_trunc_f32_e32 v5, v5 3627; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v5 3628; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 3629; GFX9-NEXT: v_mad_f32 v5, -v5, v4, v7 3630; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 3631; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc 3632; GFX9-NEXT: v_mul_lo_u32 v1, v1, s9 3633; GFX9-NEXT: v_mul_lo_u32 v3, v3, s7 3634; GFX9-NEXT: v_mul_lo_u32 v4, v4, s10 3635; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 3636; GFX9-NEXT: v_sub_u32_e32 v5, s0, v1 3637; GFX9-NEXT: v_sub_u32_e32 v1, s5, v3 3638; GFX9-NEXT: v_sub_u32_e32 v3, s1, v4 3639; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff 3640; GFX9-NEXT: v_and_b32_e32 v1, v4, v1 3641; GFX9-NEXT: v_and_b32_e32 v0, v4, v0 3642; GFX9-NEXT: 
v_lshl_or_b32 v1, v3, 16, v1 3643; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 3644; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 3645; GFX9-NEXT: s_endpgm 3646; 3647; GFX90A-LABEL: urem_v4i16: 3648; GFX90A: ; %bb.0: 3649; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 3650; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 3651; GFX90A-NEXT: s_mov_b32 s8, 0xffff 3652; GFX90A-NEXT: v_mov_b32_e32 v2, 0 3653; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 3654; GFX90A-NEXT: s_and_b32 s1, s6, s8 3655; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s1 3656; GFX90A-NEXT: s_and_b32 s9, s4, s8 3657; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s9 3658; GFX90A-NEXT: s_lshr_b32 s9, s6, 16 3659; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 3660; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s9 3661; GFX90A-NEXT: s_lshr_b32 s0, s4, 16 3662; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s0 3663; GFX90A-NEXT: v_mul_f32_e32 v3, v1, v3 3664; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 3665; GFX90A-NEXT: v_mad_f32 v1, -v3, v0, v1 3666; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 3667; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v4 3668; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3669; GFX90A-NEXT: s_lshr_b32 s10, s7, 16 3670; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 3671; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s6 3672; GFX90A-NEXT: v_mul_f32_e32 v1, v5, v6 3673; GFX90A-NEXT: v_sub_u32_e32 v0, s4, v0 3674; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 3675; GFX90A-NEXT: s_and_b32 s4, s7, s8 3676; GFX90A-NEXT: v_mad_f32 v3, -v1, v4, v5 3677; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s4 3678; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 3679; GFX90A-NEXT: s_and_b32 s4, s5, s8 3680; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s4 3681; GFX90A-NEXT: v_rcp_iflag_f32_e32 v7, v5 3682; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 3683; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s10 3684; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 3685; GFX90A-NEXT: s_lshr_b32 s1, s5, 16 3686; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s9 3687; GFX90A-NEXT: v_sub_u32_e32 v3, s0, v1 3688; GFX90A-NEXT: v_mul_f32_e32 v1, v6, 
v7 3689; GFX90A-NEXT: v_cvt_f32_u32_e32 v7, s1 3690; GFX90A-NEXT: v_rcp_iflag_f32_e32 v8, v4 3691; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 3692; GFX90A-NEXT: v_mad_f32 v6, -v1, v5, v6 3693; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 3694; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 3695; GFX90A-NEXT: v_mul_f32_e32 v5, v7, v8 3696; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 3697; GFX90A-NEXT: v_cvt_u32_f32_e32 v6, v5 3698; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 3699; GFX90A-NEXT: v_mad_f32 v5, -v5, v4, v7 3700; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 3701; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s7 3702; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc 3703; GFX90A-NEXT: v_sub_u32_e32 v1, s5, v1 3704; GFX90A-NEXT: v_mul_lo_u32 v4, v4, s10 3705; GFX90A-NEXT: v_mov_b32_e32 v5, 0xffff 3706; GFX90A-NEXT: v_sub_u32_e32 v4, s1, v4 3707; GFX90A-NEXT: v_and_b32_e32 v1, v5, v1 3708; GFX90A-NEXT: v_and_b32_e32 v0, v5, v0 3709; GFX90A-NEXT: v_lshl_or_b32 v1, v4, 16, v1 3710; GFX90A-NEXT: v_lshl_or_b32 v0, v3, 16, v0 3711; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 3712; GFX90A-NEXT: s_endpgm 3713 %r = urem <4 x i16> %x, %y 3714 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 3715 ret void 3716} 3717 3718define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 3719; CHECK-LABEL: @sdiv_v4i16( 3720; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 3721; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 3722; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 3723; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 3724; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 3725; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 3726; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 3727; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 3728; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 3729; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 3730; 
CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 3731; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 3732; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 3733; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 3734; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 3735; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 3736; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 3737; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 3738; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 3739; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 3740; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 16 3741; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 16 3742; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16 3743; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x i16> undef, i16 [[TMP23]], i64 0 3744; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[X]], i64 1 3745; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[Y]], i64 1 3746; CHECK-NEXT: [[TMP27:%.*]] = sext i16 [[TMP25]] to i32 3747; CHECK-NEXT: [[TMP28:%.*]] = sext i16 [[TMP26]] to i32 3748; CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]] 3749; CHECK-NEXT: [[TMP30:%.*]] = ashr i32 [[TMP29]], 30 3750; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 1 3751; CHECK-NEXT: [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float 3752; CHECK-NEXT: [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float 3753; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 3754; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]] 3755; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]]) 3756; CHECK-NEXT: [[TMP37:%.*]] = fneg fast float [[TMP36]] 3757; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float 
[[TMP33]], float [[TMP32]]) 3758; CHECK-NEXT: [[TMP39:%.*]] = fptosi float [[TMP36]] to i32 3759; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]]) 3760; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 3761; CHECK-NEXT: [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]] 3762; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0 3763; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]] 3764; CHECK-NEXT: [[TMP45:%.*]] = shl i32 [[TMP44]], 16 3765; CHECK-NEXT: [[TMP46:%.*]] = ashr i32 [[TMP45]], 16 3766; CHECK-NEXT: [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16 3767; CHECK-NEXT: [[TMP48:%.*]] = insertelement <4 x i16> [[TMP24]], i16 [[TMP47]], i64 1 3768; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i16> [[X]], i64 2 3769; CHECK-NEXT: [[TMP50:%.*]] = extractelement <4 x i16> [[Y]], i64 2 3770; CHECK-NEXT: [[TMP51:%.*]] = sext i16 [[TMP49]] to i32 3771; CHECK-NEXT: [[TMP52:%.*]] = sext i16 [[TMP50]] to i32 3772; CHECK-NEXT: [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]] 3773; CHECK-NEXT: [[TMP54:%.*]] = ashr i32 [[TMP53]], 30 3774; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 1 3775; CHECK-NEXT: [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float 3776; CHECK-NEXT: [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float 3777; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]]) 3778; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]] 3779; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]]) 3780; CHECK-NEXT: [[TMP61:%.*]] = fneg fast float [[TMP60]] 3781; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]]) 3782; CHECK-NEXT: [[TMP63:%.*]] = fptosi float [[TMP60]] to i32 3783; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]]) 3784; CHECK-NEXT: [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]]) 3785; CHECK-NEXT: [[TMP66:%.*]] = fcmp 
fast oge float [[TMP64]], [[TMP65]] 3786; CHECK-NEXT: [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0 3787; CHECK-NEXT: [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]] 3788; CHECK-NEXT: [[TMP69:%.*]] = shl i32 [[TMP68]], 16 3789; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 16 3790; CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16 3791; CHECK-NEXT: [[TMP72:%.*]] = insertelement <4 x i16> [[TMP48]], i16 [[TMP71]], i64 2 3792; CHECK-NEXT: [[TMP73:%.*]] = extractelement <4 x i16> [[X]], i64 3 3793; CHECK-NEXT: [[TMP74:%.*]] = extractelement <4 x i16> [[Y]], i64 3 3794; CHECK-NEXT: [[TMP75:%.*]] = sext i16 [[TMP73]] to i32 3795; CHECK-NEXT: [[TMP76:%.*]] = sext i16 [[TMP74]] to i32 3796; CHECK-NEXT: [[TMP77:%.*]] = xor i32 [[TMP75]], [[TMP76]] 3797; CHECK-NEXT: [[TMP78:%.*]] = ashr i32 [[TMP77]], 30 3798; CHECK-NEXT: [[TMP79:%.*]] = or i32 [[TMP78]], 1 3799; CHECK-NEXT: [[TMP80:%.*]] = sitofp i32 [[TMP75]] to float 3800; CHECK-NEXT: [[TMP81:%.*]] = sitofp i32 [[TMP76]] to float 3801; CHECK-NEXT: [[TMP82:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP81]]) 3802; CHECK-NEXT: [[TMP83:%.*]] = fmul fast float [[TMP80]], [[TMP82]] 3803; CHECK-NEXT: [[TMP84:%.*]] = call fast float @llvm.trunc.f32(float [[TMP83]]) 3804; CHECK-NEXT: [[TMP85:%.*]] = fneg fast float [[TMP84]] 3805; CHECK-NEXT: [[TMP86:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP85]], float [[TMP81]], float [[TMP80]]) 3806; CHECK-NEXT: [[TMP87:%.*]] = fptosi float [[TMP84]] to i32 3807; CHECK-NEXT: [[TMP88:%.*]] = call fast float @llvm.fabs.f32(float [[TMP86]]) 3808; CHECK-NEXT: [[TMP89:%.*]] = call fast float @llvm.fabs.f32(float [[TMP81]]) 3809; CHECK-NEXT: [[TMP90:%.*]] = fcmp fast oge float [[TMP88]], [[TMP89]] 3810; CHECK-NEXT: [[TMP91:%.*]] = select i1 [[TMP90]], i32 [[TMP79]], i32 0 3811; CHECK-NEXT: [[TMP92:%.*]] = add i32 [[TMP87]], [[TMP91]] 3812; CHECK-NEXT: [[TMP93:%.*]] = shl i32 [[TMP92]], 16 3813; CHECK-NEXT: [[TMP94:%.*]] = ashr i32 [[TMP93]], 16 3814; 
CHECK-NEXT: [[TMP95:%.*]] = trunc i32 [[TMP94]] to i16 3815; CHECK-NEXT: [[TMP96:%.*]] = insertelement <4 x i16> [[TMP72]], i16 [[TMP95]], i64 3 3816; CHECK-NEXT: store <4 x i16> [[TMP96]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 3817; CHECK-NEXT: ret void 3818; 3819; GFX6-LABEL: sdiv_v4i16: 3820; GFX6: ; %bb.0: 3821; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3822; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 3823; GFX6-NEXT: s_mov_b32 s7, 0xf000 3824; GFX6-NEXT: s_mov_b32 s6, -1 3825; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3826; GFX6-NEXT: s_sext_i32_i16 s8, s2 3827; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s8 3828; GFX6-NEXT: s_sext_i32_i16 s9, s0 3829; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9 3830; GFX6-NEXT: s_xor_b32 s8, s9, s8 3831; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 3832; GFX6-NEXT: s_ashr_i32 s2, s2, 16 3833; GFX6-NEXT: s_ashr_i32 s8, s8, 30 3834; GFX6-NEXT: s_or_b32 s8, s8, 1 3835; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 3836; GFX6-NEXT: v_trunc_f32_e32 v2, v2 3837; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 3838; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 3839; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 3840; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s2 3841; GFX6-NEXT: v_mov_b32_e32 v3, s8 3842; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 3843; GFX6-NEXT: s_ashr_i32 s0, s0, 16 3844; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 3845; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 3846; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 3847; GFX6-NEXT: s_xor_b32 s0, s0, s2 3848; GFX6-NEXT: s_ashr_i32 s0, s0, 30 3849; GFX6-NEXT: s_or_b32 s0, s0, 1 3850; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 3851; GFX6-NEXT: v_trunc_f32_e32 v3, v3 3852; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 3853; GFX6-NEXT: v_mov_b32_e32 v4, s0 3854; GFX6-NEXT: s_sext_i32_i16 s0, s3 3855; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 3856; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| 3857; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 3858; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc 3859; GFX6-NEXT: s_sext_i32_i16 s2, s1 3860; GFX6-NEXT: v_add_i32_e32 v3, vcc, v1, v3 
3861; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s2 3862; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 3863; GFX6-NEXT: s_xor_b32 s0, s2, s0 3864; GFX6-NEXT: s_ashr_i32 s0, s0, 30 3865; GFX6-NEXT: s_or_b32 s0, s0, 1 3866; GFX6-NEXT: v_mul_f32_e32 v4, v1, v4 3867; GFX6-NEXT: v_trunc_f32_e32 v4, v4 3868; GFX6-NEXT: v_mad_f32 v1, -v4, v2, v1 3869; GFX6-NEXT: v_mov_b32_e32 v5, s0 3870; GFX6-NEXT: s_ashr_i32 s0, s3, 16 3871; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 3872; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2| 3873; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 3874; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc 3875; GFX6-NEXT: s_ashr_i32 s1, s1, 16 3876; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 3877; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s1 3878; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 3879; GFX6-NEXT: s_xor_b32 s0, s1, s0 3880; GFX6-NEXT: s_ashr_i32 s0, s0, 30 3881; GFX6-NEXT: s_or_b32 s0, s0, 1 3882; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 3883; GFX6-NEXT: v_trunc_f32_e32 v5, v5 3884; GFX6-NEXT: v_mad_f32 v4, -v5, v2, v4 3885; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 3886; GFX6-NEXT: v_mov_b32_e32 v6, s0 3887; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| 3888; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc 3889; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 3890; GFX6-NEXT: s_mov_b32 s0, 0xffff 3891; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 3892; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 3893; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 3894; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 3895; GFX6-NEXT: v_and_b32_e32 v0, s0, v0 3896; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 3897; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 3898; GFX6-NEXT: s_endpgm 3899; 3900; GFX9-LABEL: sdiv_v4i16: 3901; GFX9: ; %bb.0: 3902; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 3903; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 3904; GFX9-NEXT: v_mov_b32_e32 v2, 0 3905; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3906; GFX9-NEXT: s_sext_i32_i16 s0, s6 3907; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 3908; GFX9-NEXT: s_sext_i32_i16 s1, s4 3909; GFX9-NEXT: v_cvt_f32_i32_e32 
v1, s1 3910; GFX9-NEXT: s_xor_b32 s0, s1, s0 3911; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 3912; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3913; GFX9-NEXT: s_or_b32 s8, s0, 1 3914; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 3915; GFX9-NEXT: v_trunc_f32_e32 v3, v3 3916; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 3917; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 3918; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 3919; GFX9-NEXT: s_cselect_b32 s0, s8, 0 3920; GFX9-NEXT: s_ashr_i32 s1, s6, 16 3921; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 3922; GFX9-NEXT: s_ashr_i32 s4, s4, 16 3923; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s4 3924; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 3925; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 3926; GFX9-NEXT: v_add_u32_e32 v3, s0, v3 3927; GFX9-NEXT: v_mul_f32_e32 v4, v1, v4 3928; GFX9-NEXT: s_xor_b32 s0, s4, s1 3929; GFX9-NEXT: v_trunc_f32_e32 v4, v4 3930; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3931; GFX9-NEXT: v_mad_f32 v1, -v4, v0, v1 3932; GFX9-NEXT: s_or_b32 s4, s0, 1 3933; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 3934; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 3935; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 3936; GFX9-NEXT: s_sext_i32_i16 s1, s7 3937; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 3938; GFX9-NEXT: s_cselect_b32 s0, s4, 0 3939; GFX9-NEXT: v_add_u32_e32 v4, s0, v4 3940; GFX9-NEXT: s_sext_i32_i16 s0, s5 3941; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s0 3942; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 3943; GFX9-NEXT: s_xor_b32 s0, s0, s1 3944; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3945; GFX9-NEXT: s_or_b32 s4, s0, 1 3946; GFX9-NEXT: v_mul_f32_e32 v5, v1, v5 3947; GFX9-NEXT: v_trunc_f32_e32 v5, v5 3948; GFX9-NEXT: v_mad_f32 v1, -v5, v0, v1 3949; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 3950; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 3951; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 3952; GFX9-NEXT: s_cselect_b32 s0, s4, 0 3953; GFX9-NEXT: s_ashr_i32 s1, s7, 16 3954; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 3955; GFX9-NEXT: v_add_u32_e32 v1, s0, v5 3956; GFX9-NEXT: s_ashr_i32 s0, s5, 16 3957; GFX9-NEXT: 
v_cvt_f32_i32_e32 v5, s0 3958; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v0 3959; GFX9-NEXT: s_xor_b32 s0, s0, s1 3960; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3961; GFX9-NEXT: s_or_b32 s4, s0, 1 3962; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 3963; GFX9-NEXT: v_trunc_f32_e32 v6, v6 3964; GFX9-NEXT: v_mad_f32 v5, -v6, v0, v5 3965; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 3966; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v0| 3967; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 3968; GFX9-NEXT: s_cselect_b32 s0, s4, 0 3969; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff 3970; GFX9-NEXT: v_add_u32_e32 v0, s0, v6 3971; GFX9-NEXT: v_and_b32_e32 v1, v5, v1 3972; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1 3973; GFX9-NEXT: v_and_b32_e32 v0, v5, v3 3974; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 3975; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 3976; GFX9-NEXT: s_endpgm 3977; 3978; GFX90A-LABEL: sdiv_v4i16: 3979; GFX90A: ; %bb.0: 3980; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 3981; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 3982; GFX90A-NEXT: v_mov_b32_e32 v2, 0 3983; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 3984; GFX90A-NEXT: s_sext_i32_i16 s0, s6 3985; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s0 3986; GFX90A-NEXT: s_sext_i32_i16 s1, s4 3987; GFX90A-NEXT: v_cvt_f32_i32_e32 v1, s1 3988; GFX90A-NEXT: s_xor_b32 s0, s1, s0 3989; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 3990; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 3991; GFX90A-NEXT: s_or_b32 s8, s0, 1 3992; GFX90A-NEXT: v_mul_f32_e32 v3, v1, v3 3993; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 3994; GFX90A-NEXT: v_mad_f32 v1, -v3, v0, v1 3995; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 3996; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 3997; GFX90A-NEXT: s_cselect_b32 s0, s8, 0 3998; GFX90A-NEXT: s_ashr_i32 s1, s6, 16 3999; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s1 4000; GFX90A-NEXT: s_ashr_i32 s4, s4, 16 4001; GFX90A-NEXT: v_cvt_f32_i32_e32 v1, s4 4002; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 4003; GFX90A-NEXT: v_rcp_iflag_f32_e32 v4, v0 4004; GFX90A-NEXT: v_add_u32_e32 v3, s0, 
v3 4005; GFX90A-NEXT: v_mul_f32_e32 v4, v1, v4 4006; GFX90A-NEXT: s_xor_b32 s0, s4, s1 4007; GFX90A-NEXT: v_trunc_f32_e32 v4, v4 4008; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 4009; GFX90A-NEXT: v_mad_f32 v1, -v4, v0, v1 4010; GFX90A-NEXT: s_or_b32 s4, s0, 1 4011; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 4012; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 4013; GFX90A-NEXT: v_cvt_i32_f32_e32 v4, v4 4014; GFX90A-NEXT: s_sext_i32_i16 s1, s7 4015; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s1 4016; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 4017; GFX90A-NEXT: v_add_u32_e32 v4, s0, v4 4018; GFX90A-NEXT: s_sext_i32_i16 s0, s5 4019; GFX90A-NEXT: v_cvt_f32_i32_e32 v1, s0 4020; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v0 4021; GFX90A-NEXT: s_xor_b32 s0, s0, s1 4022; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 4023; GFX90A-NEXT: s_or_b32 s4, s0, 1 4024; GFX90A-NEXT: v_mul_f32_e32 v5, v1, v5 4025; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 4026; GFX90A-NEXT: v_mad_f32 v1, -v5, v0, v1 4027; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 4028; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 4029; GFX90A-NEXT: v_cvt_i32_f32_e32 v5, v5 4030; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 4031; GFX90A-NEXT: s_ashr_i32 s1, s7, 16 4032; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s1 4033; GFX90A-NEXT: v_add_u32_e32 v1, s0, v5 4034; GFX90A-NEXT: s_ashr_i32 s0, s5, 16 4035; GFX90A-NEXT: v_cvt_f32_i32_e32 v5, s0 4036; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v0 4037; GFX90A-NEXT: s_xor_b32 s0, s0, s1 4038; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 4039; GFX90A-NEXT: s_or_b32 s4, s0, 1 4040; GFX90A-NEXT: v_mul_f32_e32 v6, v5, v6 4041; GFX90A-NEXT: v_trunc_f32_e32 v6, v6 4042; GFX90A-NEXT: v_mad_f32 v5, -v6, v0, v5 4043; GFX90A-NEXT: v_cvt_i32_f32_e32 v6, v6 4044; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v0| 4045; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 4046; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 4047; GFX90A-NEXT: v_mov_b32_e32 v5, 0xffff 4048; GFX90A-NEXT: v_add_u32_e32 v0, s0, v6 4049; GFX90A-NEXT: v_and_b32_e32 v1, v5, v1 4050; GFX90A-NEXT: 
v_lshl_or_b32 v1, v0, 16, v1 4051; GFX90A-NEXT: v_and_b32_e32 v0, v5, v3 4052; GFX90A-NEXT: v_lshl_or_b32 v0, v4, 16, v0 4053; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 4054; GFX90A-NEXT: s_endpgm 4055 %r = sdiv <4 x i16> %x, %y 4056 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 4057 ret void 4058} 4059 4060define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 4061; CHECK-LABEL: @srem_v4i16( 4062; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 4063; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 4064; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 4065; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 4066; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 4067; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 4068; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 4069; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 4070; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 4071; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 4072; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 4073; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 4074; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 4075; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 4076; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 4077; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 4078; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 4079; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 4080; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 4081; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 4082; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]] 4083; CHECK-NEXT: [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]] 
4084; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 16 4085; CHECK-NEXT: [[TMP24:%.*]] = ashr i32 [[TMP23]], 16 4086; CHECK-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16 4087; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i16> undef, i16 [[TMP25]], i64 0 4088; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[X]], i64 1 4089; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i16> [[Y]], i64 1 4090; CHECK-NEXT: [[TMP29:%.*]] = sext i16 [[TMP27]] to i32 4091; CHECK-NEXT: [[TMP30:%.*]] = sext i16 [[TMP28]] to i32 4092; CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]] 4093; CHECK-NEXT: [[TMP32:%.*]] = ashr i32 [[TMP31]], 30 4094; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP32]], 1 4095; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float 4096; CHECK-NEXT: [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float 4097; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 4098; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]] 4099; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]]) 4100; CHECK-NEXT: [[TMP39:%.*]] = fneg fast float [[TMP38]] 4101; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]]) 4102; CHECK-NEXT: [[TMP41:%.*]] = fptosi float [[TMP38]] to i32 4103; CHECK-NEXT: [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]]) 4104; CHECK-NEXT: [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]]) 4105; CHECK-NEXT: [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]] 4106; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0 4107; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]] 4108; CHECK-NEXT: [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]] 4109; CHECK-NEXT: [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]] 4110; CHECK-NEXT: [[TMP49:%.*]] = shl i32 [[TMP48]], 16 4111; CHECK-NEXT: [[TMP50:%.*]] = ashr i32 [[TMP49]], 16 4112; CHECK-NEXT: [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16 
4113; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i16> [[TMP26]], i16 [[TMP51]], i64 1 4114; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i16> [[X]], i64 2 4115; CHECK-NEXT: [[TMP54:%.*]] = extractelement <4 x i16> [[Y]], i64 2 4116; CHECK-NEXT: [[TMP55:%.*]] = sext i16 [[TMP53]] to i32 4117; CHECK-NEXT: [[TMP56:%.*]] = sext i16 [[TMP54]] to i32 4118; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]] 4119; CHECK-NEXT: [[TMP58:%.*]] = ashr i32 [[TMP57]], 30 4120; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 1 4121; CHECK-NEXT: [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float 4122; CHECK-NEXT: [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float 4123; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]]) 4124; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]] 4125; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]]) 4126; CHECK-NEXT: [[TMP65:%.*]] = fneg fast float [[TMP64]] 4127; CHECK-NEXT: [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]]) 4128; CHECK-NEXT: [[TMP67:%.*]] = fptosi float [[TMP64]] to i32 4129; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 4130; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]]) 4131; CHECK-NEXT: [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]] 4132; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0 4133; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]] 4134; CHECK-NEXT: [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]] 4135; CHECK-NEXT: [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]] 4136; CHECK-NEXT: [[TMP75:%.*]] = shl i32 [[TMP74]], 16 4137; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 [[TMP75]], 16 4138; CHECK-NEXT: [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16 4139; CHECK-NEXT: [[TMP78:%.*]] = insertelement <4 x i16> [[TMP52]], i16 [[TMP77]], i64 2 4140; CHECK-NEXT: [[TMP79:%.*]] = extractelement <4 x i16> [[X]], i64 3 4141; 
CHECK-NEXT: [[TMP80:%.*]] = extractelement <4 x i16> [[Y]], i64 3 4142; CHECK-NEXT: [[TMP81:%.*]] = sext i16 [[TMP79]] to i32 4143; CHECK-NEXT: [[TMP82:%.*]] = sext i16 [[TMP80]] to i32 4144; CHECK-NEXT: [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP82]] 4145; CHECK-NEXT: [[TMP84:%.*]] = ashr i32 [[TMP83]], 30 4146; CHECK-NEXT: [[TMP85:%.*]] = or i32 [[TMP84]], 1 4147; CHECK-NEXT: [[TMP86:%.*]] = sitofp i32 [[TMP81]] to float 4148; CHECK-NEXT: [[TMP87:%.*]] = sitofp i32 [[TMP82]] to float 4149; CHECK-NEXT: [[TMP88:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP87]]) 4150; CHECK-NEXT: [[TMP89:%.*]] = fmul fast float [[TMP86]], [[TMP88]] 4151; CHECK-NEXT: [[TMP90:%.*]] = call fast float @llvm.trunc.f32(float [[TMP89]]) 4152; CHECK-NEXT: [[TMP91:%.*]] = fneg fast float [[TMP90]] 4153; CHECK-NEXT: [[TMP92:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP91]], float [[TMP87]], float [[TMP86]]) 4154; CHECK-NEXT: [[TMP93:%.*]] = fptosi float [[TMP90]] to i32 4155; CHECK-NEXT: [[TMP94:%.*]] = call fast float @llvm.fabs.f32(float [[TMP92]]) 4156; CHECK-NEXT: [[TMP95:%.*]] = call fast float @llvm.fabs.f32(float [[TMP87]]) 4157; CHECK-NEXT: [[TMP96:%.*]] = fcmp fast oge float [[TMP94]], [[TMP95]] 4158; CHECK-NEXT: [[TMP97:%.*]] = select i1 [[TMP96]], i32 [[TMP85]], i32 0 4159; CHECK-NEXT: [[TMP98:%.*]] = add i32 [[TMP93]], [[TMP97]] 4160; CHECK-NEXT: [[TMP99:%.*]] = mul i32 [[TMP98]], [[TMP82]] 4161; CHECK-NEXT: [[TMP100:%.*]] = sub i32 [[TMP81]], [[TMP99]] 4162; CHECK-NEXT: [[TMP101:%.*]] = shl i32 [[TMP100]], 16 4163; CHECK-NEXT: [[TMP102:%.*]] = ashr i32 [[TMP101]], 16 4164; CHECK-NEXT: [[TMP103:%.*]] = trunc i32 [[TMP102]] to i16 4165; CHECK-NEXT: [[TMP104:%.*]] = insertelement <4 x i16> [[TMP78]], i16 [[TMP103]], i64 3 4166; CHECK-NEXT: store <4 x i16> [[TMP104]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 4167; CHECK-NEXT: ret void 4168; 4169; GFX6-LABEL: srem_v4i16: 4170; GFX6: ; %bb.0: 4171; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4172; 
GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 4173; GFX6-NEXT: s_mov_b32 s7, 0xf000 4174; GFX6-NEXT: s_mov_b32 s6, -1 4175; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4176; GFX6-NEXT: s_sext_i32_i16 s8, s2 4177; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s8 4178; GFX6-NEXT: s_sext_i32_i16 s9, s0 4179; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9 4180; GFX6-NEXT: s_xor_b32 s8, s9, s8 4181; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 4182; GFX6-NEXT: s_ashr_i32 s8, s8, 30 4183; GFX6-NEXT: s_or_b32 s8, s8, 1 4184; GFX6-NEXT: v_mov_b32_e32 v3, s8 4185; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 4186; GFX6-NEXT: v_trunc_f32_e32 v2, v2 4187; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 4188; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 4189; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 4190; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 4191; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 4192; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 4193; GFX6-NEXT: s_ashr_i32 s2, s2, 16 4194; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s2 4195; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 4196; GFX6-NEXT: s_ashr_i32 s0, s0, 16 4197; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 4198; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 4199; GFX6-NEXT: s_xor_b32 s8, s0, s2 4200; GFX6-NEXT: s_ashr_i32 s8, s8, 30 4201; GFX6-NEXT: s_or_b32 s8, s8, 1 4202; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 4203; GFX6-NEXT: v_trunc_f32_e32 v3, v3 4204; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 4205; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 4206; GFX6-NEXT: v_mov_b32_e32 v4, s8 4207; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| 4208; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc 4209; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 4210; GFX6-NEXT: v_mul_lo_u32 v1, v1, s2 4211; GFX6-NEXT: s_sext_i32_i16 s2, s3 4212; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s2 4213; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s0, v1 4214; GFX6-NEXT: s_sext_i32_i16 s0, s1 4215; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s0 4216; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 4217; GFX6-NEXT: s_xor_b32 s0, s0, s2 4218; GFX6-NEXT: s_ashr_i32 s0, s0, 30 4219; GFX6-NEXT: s_or_b32 s0, s0, 1 4220; 
GFX6-NEXT: v_mul_f32_e32 v4, v1, v4 4221; GFX6-NEXT: v_trunc_f32_e32 v4, v4 4222; GFX6-NEXT: v_mad_f32 v1, -v4, v2, v1 4223; GFX6-NEXT: v_mov_b32_e32 v5, s0 4224; GFX6-NEXT: s_ashr_i32 s0, s3, 16 4225; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 4226; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2| 4227; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 4228; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc 4229; GFX6-NEXT: s_ashr_i32 s2, s1, 16 4230; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 4231; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s2 4232; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 4233; GFX6-NEXT: v_mul_lo_u32 v1, v1, s3 4234; GFX6-NEXT: s_xor_b32 s3, s2, s0 4235; GFX6-NEXT: s_ashr_i32 s3, s3, 30 4236; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 4237; GFX6-NEXT: v_trunc_f32_e32 v5, v5 4238; GFX6-NEXT: v_mad_f32 v4, -v5, v2, v4 4239; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 4240; GFX6-NEXT: s_or_b32 s3, s3, 1 4241; GFX6-NEXT: v_mov_b32_e32 v6, s3 4242; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| 4243; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc 4244; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 4245; GFX6-NEXT: v_mul_lo_u32 v2, v2, s0 4246; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 4247; GFX6-NEXT: s_mov_b32 s0, 0xffff 4248; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 4249; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 4250; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 4251; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 4252; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 4253; GFX6-NEXT: v_and_b32_e32 v0, s0, v0 4254; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 4255; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4256; GFX6-NEXT: s_endpgm 4257; 4258; GFX9-LABEL: srem_v4i16: 4259; GFX9: ; %bb.0: 4260; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4261; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 4262; GFX9-NEXT: v_mov_b32_e32 v2, 0 4263; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4264; GFX9-NEXT: s_sext_i32_i16 s0, s6 4265; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 4266; GFX9-NEXT: s_sext_i32_i16 s1, s4 4267; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s1 4268; GFX9-NEXT: 
s_xor_b32 s0, s1, s0 4269; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 4270; GFX9-NEXT: s_ashr_i32 s0, s0, 30 4271; GFX9-NEXT: s_or_b32 s8, s0, 1 4272; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 4273; GFX9-NEXT: v_trunc_f32_e32 v3, v3 4274; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 4275; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 4276; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 4277; GFX9-NEXT: s_cselect_b32 s0, s8, 0 4278; GFX9-NEXT: s_ashr_i32 s9, s6, 16 4279; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 4280; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s9 4281; GFX9-NEXT: s_ashr_i32 s8, s4, 16 4282; GFX9-NEXT: v_add_u32_e32 v1, s0, v3 4283; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s8 4284; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 4285; GFX9-NEXT: s_xor_b32 s0, s8, s9 4286; GFX9-NEXT: s_ashr_i32 s0, s0, 30 4287; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 4288; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 4289; GFX9-NEXT: v_trunc_f32_e32 v4, v4 4290; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 4291; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 4292; GFX9-NEXT: s_or_b32 s6, s0, 1 4293; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v0| 4294; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 4295; GFX9-NEXT: s_cselect_b32 s0, s6, 0 4296; GFX9-NEXT: v_add_u32_e32 v0, s0, v4 4297; GFX9-NEXT: s_sext_i32_i16 s0, s7 4298; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 4299; GFX9-NEXT: s_sext_i32_i16 s1, s5 4300; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s1 4301; GFX9-NEXT: s_xor_b32 s0, s1, s0 4302; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 4303; GFX9-NEXT: s_ashr_i32 s0, s0, 30 4304; GFX9-NEXT: s_or_b32 s6, s0, 1 4305; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9 4306; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 4307; GFX9-NEXT: v_trunc_f32_e32 v5, v5 4308; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 4309; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| 4310; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 4311; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 4312; GFX9-NEXT: s_cselect_b32 s0, s6, 0 4313; GFX9-NEXT: s_ashr_i32 s6, s7, 16 4314; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s6 4315; GFX9-NEXT: v_add_u32_e32 v3, s0, v5 4316; 
GFX9-NEXT: v_mul_lo_u32 v3, v3, s7 4317; GFX9-NEXT: s_ashr_i32 s7, s5, 16 4318; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s7 4319; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 4320; GFX9-NEXT: s_xor_b32 s0, s7, s6 4321; GFX9-NEXT: s_ashr_i32 s0, s0, 30 4322; GFX9-NEXT: s_or_b32 s9, s0, 1 4323; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 4324; GFX9-NEXT: v_trunc_f32_e32 v6, v6 4325; GFX9-NEXT: v_mad_f32 v5, -v6, v4, v5 4326; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 4327; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4| 4328; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 4329; GFX9-NEXT: s_cselect_b32 s0, s9, 0 4330; GFX9-NEXT: v_add_u32_e32 v4, s0, v6 4331; GFX9-NEXT: v_mul_lo_u32 v4, v4, s6 4332; GFX9-NEXT: v_sub_u32_e32 v5, s4, v1 4333; GFX9-NEXT: v_sub_u32_e32 v1, s5, v3 4334; GFX9-NEXT: v_sub_u32_e32 v0, s8, v0 4335; GFX9-NEXT: v_sub_u32_e32 v3, s7, v4 4336; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff 4337; GFX9-NEXT: v_and_b32_e32 v1, v4, v1 4338; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 4339; GFX9-NEXT: v_and_b32_e32 v3, v4, v5 4340; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v3 4341; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 4342; GFX9-NEXT: s_endpgm 4343; 4344; GFX90A-LABEL: srem_v4i16: 4345; GFX90A: ; %bb.0: 4346; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4347; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 4348; GFX90A-NEXT: v_mov_b32_e32 v2, 0 4349; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 4350; GFX90A-NEXT: s_sext_i32_i16 s0, s6 4351; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s0 4352; GFX90A-NEXT: s_sext_i32_i16 s1, s4 4353; GFX90A-NEXT: v_cvt_f32_i32_e32 v1, s1 4354; GFX90A-NEXT: s_xor_b32 s0, s1, s0 4355; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 4356; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 4357; GFX90A-NEXT: s_or_b32 s8, s0, 1 4358; GFX90A-NEXT: v_mul_f32_e32 v3, v1, v3 4359; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 4360; GFX90A-NEXT: v_mad_f32 v1, -v3, v0, v1 4361; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 4362; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 4363; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 
4364; GFX90A-NEXT: s_cselect_b32 s0, s8, 0 4365; GFX90A-NEXT: s_ashr_i32 s8, s6, 16 4366; GFX90A-NEXT: v_cvt_f32_i32_e32 v1, s8 4367; GFX90A-NEXT: v_add_u32_e32 v0, s0, v3 4368; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s6 4369; GFX90A-NEXT: v_sub_u32_e32 v0, s4, v0 4370; GFX90A-NEXT: s_ashr_i32 s4, s4, 16 4371; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s4 4372; GFX90A-NEXT: v_rcp_iflag_f32_e32 v4, v1 4373; GFX90A-NEXT: s_xor_b32 s0, s4, s8 4374; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 4375; GFX90A-NEXT: s_or_b32 s6, s0, 1 4376; GFX90A-NEXT: v_mul_f32_e32 v4, v3, v4 4377; GFX90A-NEXT: v_trunc_f32_e32 v4, v4 4378; GFX90A-NEXT: v_mad_f32 v3, -v4, v1, v3 4379; GFX90A-NEXT: v_cvt_i32_f32_e32 v4, v4 4380; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v1| 4381; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 4382; GFX90A-NEXT: s_cselect_b32 s0, s6, 0 4383; GFX90A-NEXT: v_add_u32_e32 v1, s0, v4 4384; GFX90A-NEXT: s_sext_i32_i16 s0, s7 4385; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s0 4386; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s8 4387; GFX90A-NEXT: s_sext_i32_i16 s1, s5 4388; GFX90A-NEXT: v_sub_u32_e32 v4, s4, v1 4389; GFX90A-NEXT: v_cvt_f32_i32_e32 v1, s1 4390; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v3 4391; GFX90A-NEXT: s_xor_b32 s0, s1, s0 4392; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 4393; GFX90A-NEXT: s_or_b32 s4, s0, 1 4394; GFX90A-NEXT: v_mul_f32_e32 v5, v1, v5 4395; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 4396; GFX90A-NEXT: v_mad_f32 v1, -v5, v3, v1 4397; GFX90A-NEXT: v_cvt_i32_f32_e32 v5, v5 4398; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v3| 4399; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 4400; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 4401; GFX90A-NEXT: s_ashr_i32 s4, s7, 16 4402; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s4 4403; GFX90A-NEXT: v_add_u32_e32 v1, s0, v5 4404; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s7 4405; GFX90A-NEXT: v_sub_u32_e32 v1, s5, v1 4406; GFX90A-NEXT: s_ashr_i32 s5, s5, 16 4407; GFX90A-NEXT: v_cvt_f32_i32_e32 v5, s5 4408; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v3 4409; GFX90A-NEXT: s_xor_b32 
s0, s5, s4 4410; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 4411; GFX90A-NEXT: s_or_b32 s6, s0, 1 4412; GFX90A-NEXT: v_mul_f32_e32 v6, v5, v6 4413; GFX90A-NEXT: v_trunc_f32_e32 v6, v6 4414; GFX90A-NEXT: v_mad_f32 v5, -v6, v3, v5 4415; GFX90A-NEXT: v_cvt_i32_f32_e32 v6, v6 4416; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v3| 4417; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 4418; GFX90A-NEXT: s_cselect_b32 s0, s6, 0 4419; GFX90A-NEXT: v_add_u32_e32 v3, s0, v6 4420; GFX90A-NEXT: v_mul_lo_u32 v3, v3, s4 4421; GFX90A-NEXT: v_mov_b32_e32 v5, 0xffff 4422; GFX90A-NEXT: v_sub_u32_e32 v3, s5, v3 4423; GFX90A-NEXT: v_and_b32_e32 v1, v5, v1 4424; GFX90A-NEXT: v_and_b32_e32 v0, v5, v0 4425; GFX90A-NEXT: v_lshl_or_b32 v1, v3, 16, v1 4426; GFX90A-NEXT: v_lshl_or_b32 v0, v4, 16, v0 4427; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 4428; GFX90A-NEXT: s_endpgm 4429 %r = srem <4 x i16> %x, %y 4430 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 4431 ret void 4432} 4433 4434define amdgpu_kernel void @udiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 4435; CHECK-LABEL: @udiv_i3( 4436; CHECK-NEXT: [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32 4437; CHECK-NEXT: [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32 4438; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 4439; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 4440; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 4441; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 4442; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 4443; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 4444; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 4445; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 4446; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 4447; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 4448; CHECK-NEXT: [[TMP13:%.*]] = fcmp 
fast oge float [[TMP11]], [[TMP12]] 4449; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 4450; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 4451; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 7 4452; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i3 4453; CHECK-NEXT: store i3 [[TMP17]], i3 addrspace(1)* [[OUT:%.*]], align 1 4454; CHECK-NEXT: ret void 4455; 4456; GFX6-LABEL: udiv_i3: 4457; GFX6: ; %bb.0: 4458; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4459; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 4460; GFX6-NEXT: s_mov_b32 s7, 0xf000 4461; GFX6-NEXT: s_mov_b32 s6, -1 4462; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4463; GFX6-NEXT: s_bfe_u32 s1, s0, 0x30008 4464; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, s1 4465; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 4466; GFX6-NEXT: s_and_b32 s0, s0, 7 4467; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 4468; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 4469; GFX6-NEXT: v_trunc_f32_e32 v1, v1 4470; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 4471; GFX6-NEXT: v_mad_f32 v1, -v1, v0, v2 4472; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 4473; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 4474; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 4475; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 4476; GFX6-NEXT: s_endpgm 4477; 4478; GFX9-LABEL: udiv_i3: 4479; GFX9: ; %bb.0: 4480; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4481; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 4482; GFX9-NEXT: v_mov_b32_e32 v2, 0 4483; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4484; GFX9-NEXT: s_bfe_u32 s0, s4, 0x30008 4485; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 4486; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 4487; GFX9-NEXT: s_and_b32 s0, s4, 7 4488; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 4489; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 4490; GFX9-NEXT: v_trunc_f32_e32 v1, v1 4491; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v1 4492; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v3 4493; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 4494; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc 4495; GFX9-NEXT: v_and_b32_e32 v0, 7, 
v0 4496; GFX9-NEXT: global_store_byte v2, v0, s[2:3] 4497; GFX9-NEXT: s_endpgm 4498; 4499; GFX90A-LABEL: udiv_i3: 4500; GFX90A: ; %bb.0: 4501; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4502; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 4503; GFX90A-NEXT: v_mov_b32_e32 v2, 0 4504; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 4505; GFX90A-NEXT: s_bfe_u32 s0, s4, 0x30008 4506; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 4507; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v0 4508; GFX90A-NEXT: s_and_b32 s0, s4, 7 4509; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 4510; GFX90A-NEXT: v_mul_f32_e32 v1, v3, v1 4511; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 4512; GFX90A-NEXT: v_cvt_u32_f32_e32 v4, v1 4513; GFX90A-NEXT: v_mad_f32 v1, -v1, v0, v3 4514; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 4515; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc 4516; GFX90A-NEXT: v_and_b32_e32 v0, 7, v0 4517; GFX90A-NEXT: global_store_byte v2, v0, s[2:3] 4518; GFX90A-NEXT: s_endpgm 4519 %r = udiv i3 %x, %y 4520 store i3 %r, i3 addrspace(1)* %out 4521 ret void 4522} 4523 4524define amdgpu_kernel void @urem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 4525; CHECK-LABEL: @urem_i3( 4526; CHECK-NEXT: [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32 4527; CHECK-NEXT: [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32 4528; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 4529; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 4530; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 4531; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 4532; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 4533; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 4534; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 4535; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 4536; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 4537; CHECK-NEXT: [[TMP12:%.*]] = call fast float 
@llvm.fabs.f32(float [[TMP4]]) 4538; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 4539; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 4540; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 4541; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]] 4542; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]] 4543; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 7 4544; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i3 4545; CHECK-NEXT: store i3 [[TMP19]], i3 addrspace(1)* [[OUT:%.*]], align 1 4546; CHECK-NEXT: ret void 4547; 4548; GFX6-LABEL: urem_i3: 4549; GFX6: ; %bb.0: 4550; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4551; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 4552; GFX6-NEXT: s_mov_b32 s7, 0xf000 4553; GFX6-NEXT: s_mov_b32 s6, -1 4554; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4555; GFX6-NEXT: s_bfe_u32 s1, s0, 0x30008 4556; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, s1 4557; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 4558; GFX6-NEXT: s_and_b32 s2, s0, 7 4559; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 4560; GFX6-NEXT: s_lshr_b32 s1, s0, 8 4561; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 4562; GFX6-NEXT: v_trunc_f32_e32 v1, v1 4563; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 4564; GFX6-NEXT: v_mad_f32 v1, -v1, v0, v2 4565; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 4566; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 4567; GFX6-NEXT: v_mul_lo_u32 v0, v0, s1 4568; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 4569; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 4570; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 4571; GFX6-NEXT: s_endpgm 4572; 4573; GFX9-LABEL: urem_i3: 4574; GFX9: ; %bb.0: 4575; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 4576; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4577; GFX9-NEXT: s_bfe_u32 s3, s2, 0x30008 4578; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 4579; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 4580; GFX9-NEXT: s_and_b32 s4, s2, 7 4581; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 4582; GFX9-NEXT: s_lshr_b32 s3, s2, 8 4583; GFX9-NEXT: v_mul_f32_e32 v1, v2, 
v1 4584; GFX9-NEXT: v_trunc_f32_e32 v1, v1 4585; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v1 4586; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v2 4587; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 4588; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4589; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 4590; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 4591; GFX9-NEXT: v_mov_b32_e32 v1, 0 4592; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 4593; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 4594; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4595; GFX9-NEXT: global_store_byte v1, v0, s[0:1] 4596; GFX9-NEXT: s_endpgm 4597; 4598; GFX90A-LABEL: urem_i3: 4599; GFX90A: ; %bb.0: 4600; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4601; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 4602; GFX90A-NEXT: v_mov_b32_e32 v0, 0 4603; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 4604; GFX90A-NEXT: s_bfe_u32 s0, s4, 0x30008 4605; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 4606; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v1 4607; GFX90A-NEXT: s_and_b32 s1, s4, 7 4608; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v3, s1 4609; GFX90A-NEXT: s_lshr_b32 s0, s4, 8 4610; GFX90A-NEXT: v_mul_f32_e32 v2, v3, v2 4611; GFX90A-NEXT: v_trunc_f32_e32 v2, v2 4612; GFX90A-NEXT: v_cvt_u32_f32_e32 v4, v2 4613; GFX90A-NEXT: v_mad_f32 v2, -v2, v1, v3 4614; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v1 4615; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc 4616; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s0 4617; GFX90A-NEXT: v_sub_u32_e32 v1, s4, v1 4618; GFX90A-NEXT: v_and_b32_e32 v1, 7, v1 4619; GFX90A-NEXT: global_store_byte v0, v1, s[2:3] 4620; GFX90A-NEXT: s_endpgm 4621 %r = urem i3 %x, %y 4622 store i3 %r, i3 addrspace(1)* %out 4623 ret void 4624} 4625 4626define amdgpu_kernel void @sdiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 4627; CHECK-LABEL: @sdiv_i3( 4628; CHECK-NEXT: [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32 4629; CHECK-NEXT: [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32 4630; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 4631; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 4632; 
CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 4633; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 4634; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 4635; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 4636; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 4637; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 4638; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 4639; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 4640; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 4641; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 4642; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 4643; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 4644; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 4645; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 4646; CHECK-NEXT: [[TMP19:%.*]] = shl i32 [[TMP18]], 29 4647; CHECK-NEXT: [[TMP20:%.*]] = ashr i32 [[TMP19]], 29 4648; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i3 4649; CHECK-NEXT: store i3 [[TMP21]], i3 addrspace(1)* [[OUT:%.*]], align 1 4650; CHECK-NEXT: ret void 4651; 4652; GFX6-LABEL: sdiv_i3: 4653; GFX6: ; %bb.0: 4654; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4655; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 4656; GFX6-NEXT: s_mov_b32 s7, 0xf000 4657; GFX6-NEXT: s_mov_b32 s6, -1 4658; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4659; GFX6-NEXT: s_bfe_i32 s1, s0, 0x30008 4660; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s1 4661; GFX6-NEXT: s_bfe_i32 s0, s0, 0x30000 4662; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s0 4663; GFX6-NEXT: s_xor_b32 s0, s0, s1 4664; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 4665; GFX6-NEXT: s_ashr_i32 s0, s0, 30 4666; GFX6-NEXT: s_or_b32 s0, s0, 1 4667; GFX6-NEXT: v_mov_b32_e32 v3, s0 4668; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 4669; GFX6-NEXT: 
v_trunc_f32_e32 v2, v2 4670; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 4671; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 4672; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 4673; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 4674; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 4675; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 4676; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 4677; GFX6-NEXT: s_endpgm 4678; 4679; GFX9-LABEL: sdiv_i3: 4680; GFX9: ; %bb.0: 4681; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4682; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 4683; GFX9-NEXT: v_mov_b32_e32 v1, 0 4684; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4685; GFX9-NEXT: s_bfe_i32 s0, s4, 0x30008 4686; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 4687; GFX9-NEXT: s_bfe_i32 s1, s4, 0x30000 4688; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 4689; GFX9-NEXT: s_xor_b32 s0, s1, s0 4690; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 4691; GFX9-NEXT: s_ashr_i32 s0, s0, 30 4692; GFX9-NEXT: s_or_b32 s4, s0, 1 4693; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 4694; GFX9-NEXT: v_trunc_f32_e32 v3, v3 4695; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 4696; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 4697; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 4698; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 4699; GFX9-NEXT: s_cselect_b32 s0, s4, 0 4700; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 4701; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 4702; GFX9-NEXT: global_store_byte v1, v0, s[2:3] 4703; GFX9-NEXT: s_endpgm 4704; 4705; GFX90A-LABEL: sdiv_i3: 4706; GFX90A: ; %bb.0: 4707; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4708; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 4709; GFX90A-NEXT: v_mov_b32_e32 v1, 0 4710; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 4711; GFX90A-NEXT: s_bfe_i32 s0, s4, 0x30008 4712; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s0 4713; GFX90A-NEXT: s_bfe_i32 s1, s4, 0x30000 4714; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s1 4715; GFX90A-NEXT: s_xor_b32 s0, s1, s0 4716; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 4717; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 4718; GFX90A-NEXT: s_or_b32 s4, s0, 1 4719; 
GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 4720; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 4721; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 4722; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 4723; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 4724; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 4725; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 4726; GFX90A-NEXT: v_add_u32_e32 v0, s0, v3 4727; GFX90A-NEXT: v_and_b32_e32 v0, 7, v0 4728; GFX90A-NEXT: global_store_byte v1, v0, s[2:3] 4729; GFX90A-NEXT: s_endpgm 4730 %r = sdiv i3 %x, %y 4731 store i3 %r, i3 addrspace(1)* %out 4732 ret void 4733} 4734 4735define amdgpu_kernel void @srem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 4736; CHECK-LABEL: @srem_i3( 4737; CHECK-NEXT: [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32 4738; CHECK-NEXT: [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32 4739; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 4740; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 4741; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 4742; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 4743; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 4744; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 4745; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 4746; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 4747; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 4748; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 4749; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 4750; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 4751; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 4752; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 4753; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 4754; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 4755; CHECK-NEXT: [[TMP19:%.*]] = mul i32 
[[TMP18]], [[TMP2]] 4756; CHECK-NEXT: [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]] 4757; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 29 4758; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 29 4759; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i3 4760; CHECK-NEXT: store i3 [[TMP23]], i3 addrspace(1)* [[OUT:%.*]], align 1 4761; CHECK-NEXT: ret void 4762; 4763; GFX6-LABEL: srem_i3: 4764; GFX6: ; %bb.0: 4765; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4766; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 4767; GFX6-NEXT: s_mov_b32 s7, 0xf000 4768; GFX6-NEXT: s_mov_b32 s6, -1 4769; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4770; GFX6-NEXT: s_bfe_i32 s1, s0, 0x30008 4771; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s1 4772; GFX6-NEXT: s_bfe_i32 s3, s0, 0x30000 4773; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s3 4774; GFX6-NEXT: s_xor_b32 s1, s3, s1 4775; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 4776; GFX6-NEXT: s_ashr_i32 s1, s1, 30 4777; GFX6-NEXT: s_or_b32 s1, s1, 1 4778; GFX6-NEXT: v_mov_b32_e32 v3, s1 4779; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 4780; GFX6-NEXT: v_trunc_f32_e32 v2, v2 4781; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 4782; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 4783; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 4784; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 4785; GFX6-NEXT: s_lshr_b32 s2, s0, 8 4786; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 4787; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 4788; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 4789; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 4790; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 4791; GFX6-NEXT: s_endpgm 4792; 4793; GFX9-LABEL: srem_i3: 4794; GFX9: ; %bb.0: 4795; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 4796; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4797; GFX9-NEXT: s_bfe_i32 s2, s4, 0x30008 4798; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 4799; GFX9-NEXT: s_bfe_i32 s3, s4, 0x30000 4800; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s3 4801; GFX9-NEXT: s_xor_b32 s2, s3, s2 4802; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 4803; GFX9-NEXT: s_ashr_i32 s2, s2, 30 4804; GFX9-NEXT: 
s_lshr_b32 s5, s4, 8 4805; GFX9-NEXT: s_or_b32 s6, s2, 1 4806; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 4807; GFX9-NEXT: v_trunc_f32_e32 v2, v2 4808; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 4809; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 4810; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| 4811; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec 4812; GFX9-NEXT: s_cselect_b32 s2, s6, 0 4813; GFX9-NEXT: v_add_u32_e32 v0, s2, v2 4814; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 4815; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4816; GFX9-NEXT: v_mov_b32_e32 v1, 0 4817; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 4818; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 4819; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4820; GFX9-NEXT: global_store_byte v1, v0, s[0:1] 4821; GFX9-NEXT: s_endpgm 4822; 4823; GFX90A-LABEL: srem_i3: 4824; GFX90A: ; %bb.0: 4825; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4826; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 4827; GFX90A-NEXT: v_mov_b32_e32 v0, 0 4828; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 4829; GFX90A-NEXT: s_bfe_i32 s0, s4, 0x30008 4830; GFX90A-NEXT: v_cvt_f32_i32_e32 v1, s0 4831; GFX90A-NEXT: s_bfe_i32 s1, s4, 0x30000 4832; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s1 4833; GFX90A-NEXT: s_xor_b32 s0, s1, s0 4834; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v1 4835; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 4836; GFX90A-NEXT: s_lshr_b32 s5, s4, 8 4837; GFX90A-NEXT: s_or_b32 s6, s0, 1 4838; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 4839; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 4840; GFX90A-NEXT: v_mad_f32 v2, -v3, v1, v2 4841; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 4842; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v1| 4843; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 4844; GFX90A-NEXT: s_cselect_b32 s0, s6, 0 4845; GFX90A-NEXT: v_add_u32_e32 v1, s0, v3 4846; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s5 4847; GFX90A-NEXT: v_sub_u32_e32 v1, s4, v1 4848; GFX90A-NEXT: v_and_b32_e32 v1, 7, v1 4849; GFX90A-NEXT: global_store_byte v0, v1, s[2:3] 4850; GFX90A-NEXT: s_endpgm 4851 %r = srem i3 %x, %y 4852 store i3 %r, i3 addrspace(1)* 
%out 4853 ret void 4854} 4855 4856define amdgpu_kernel void @udiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 4857; CHECK-LABEL: @udiv_v3i16( 4858; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 4859; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 4860; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 4861; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 4862; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 4863; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 4864; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 4865; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 4866; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 4867; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 4868; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 4869; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 4870; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 4871; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 4872; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 4873; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 4874; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 4875; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 65535 4876; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16 4877; CHECK-NEXT: [[TMP20:%.*]] = insertelement <3 x i16> undef, i16 [[TMP19]], i64 0 4878; CHECK-NEXT: [[TMP21:%.*]] = extractelement <3 x i16> [[X]], i64 1 4879; CHECK-NEXT: [[TMP22:%.*]] = extractelement <3 x i16> [[Y]], i64 1 4880; CHECK-NEXT: [[TMP23:%.*]] = zext i16 [[TMP21]] to i32 4881; CHECK-NEXT: [[TMP24:%.*]] = zext i16 [[TMP22]] to i32 4882; CHECK-NEXT: [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float 4883; CHECK-NEXT: [[TMP26:%.*]] = uitofp i32 [[TMP24]] to 
float 4884; CHECK-NEXT: [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]]) 4885; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]] 4886; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]]) 4887; CHECK-NEXT: [[TMP30:%.*]] = fneg fast float [[TMP29]] 4888; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]]) 4889; CHECK-NEXT: [[TMP32:%.*]] = fptoui float [[TMP29]] to i32 4890; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]]) 4891; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]]) 4892; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]] 4893; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0 4894; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]] 4895; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 65535 4896; CHECK-NEXT: [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16 4897; CHECK-NEXT: [[TMP40:%.*]] = insertelement <3 x i16> [[TMP20]], i16 [[TMP39]], i64 1 4898; CHECK-NEXT: [[TMP41:%.*]] = extractelement <3 x i16> [[X]], i64 2 4899; CHECK-NEXT: [[TMP42:%.*]] = extractelement <3 x i16> [[Y]], i64 2 4900; CHECK-NEXT: [[TMP43:%.*]] = zext i16 [[TMP41]] to i32 4901; CHECK-NEXT: [[TMP44:%.*]] = zext i16 [[TMP42]] to i32 4902; CHECK-NEXT: [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float 4903; CHECK-NEXT: [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float 4904; CHECK-NEXT: [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]]) 4905; CHECK-NEXT: [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]] 4906; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]]) 4907; CHECK-NEXT: [[TMP50:%.*]] = fneg fast float [[TMP49]] 4908; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]]) 4909; CHECK-NEXT: [[TMP52:%.*]] = fptoui float [[TMP49]] to i32 4910; CHECK-NEXT: [[TMP53:%.*]] = 
call fast float @llvm.fabs.f32(float [[TMP51]]) 4911; CHECK-NEXT: [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]]) 4912; CHECK-NEXT: [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]] 4913; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0 4914; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]] 4915; CHECK-NEXT: [[TMP58:%.*]] = and i32 [[TMP57]], 65535 4916; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16 4917; CHECK-NEXT: [[TMP60:%.*]] = insertelement <3 x i16> [[TMP40]], i16 [[TMP59]], i64 2 4918; CHECK-NEXT: store <3 x i16> [[TMP60]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 4919; CHECK-NEXT: ret void 4920; 4921; GFX6-LABEL: udiv_v3i16: 4922; GFX6: ; %bb.0: 4923; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4924; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 4925; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 4926; GFX6-NEXT: s_mov_b32 s8, 0xffff 4927; GFX6-NEXT: s_mov_b32 s7, 0xf000 4928; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4929; GFX6-NEXT: s_and_b32 s6, s0, s8 4930; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 4931; GFX6-NEXT: s_and_b32 s6, s2, s8 4932; GFX6-NEXT: s_lshr_b32 s0, s0, 16 4933; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s6 4934; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 4935; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s0 4936; GFX6-NEXT: s_lshr_b32 s0, s2, 16 4937; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s0 4938; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 4939; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 4940; GFX6-NEXT: v_trunc_f32_e32 v2, v2 4941; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 4942; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 4943; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 4944; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 4945; GFX6-NEXT: v_trunc_f32_e32 v1, v1 4946; GFX6-NEXT: s_and_b32 s0, s1, s8 4947; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc 4948; GFX6-NEXT: v_mad_f32 v2, -v1, v3, v4 4949; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s0 4950; GFX6-NEXT: s_and_b32 s0, s3, s8 4951; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s0 4952; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 4953; 
GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 4954; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 4955; GFX6-NEXT: s_mov_b32 s6, -1 4956; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 4957; GFX6-NEXT: v_mul_f32_e32 v2, v5, v6 4958; GFX6-NEXT: v_trunc_f32_e32 v2, v2 4959; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 4960; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v5 4961; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 4962; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 4963; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc 4964; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 4965; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 4966; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 4967; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 4968; GFX6-NEXT: s_endpgm 4969; 4970; GFX9-LABEL: udiv_v3i16: 4971; GFX9: ; %bb.0: 4972; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4973; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 4974; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 4975; GFX9-NEXT: s_mov_b32 s8, 0xffff 4976; GFX9-NEXT: v_mov_b32_e32 v1, 0 4977; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4978; GFX9-NEXT: s_and_b32 s0, s6, s8 4979; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 4980; GFX9-NEXT: s_and_b32 s0, s4, s8 4981; GFX9-NEXT: s_lshr_b32 s1, s6, 16 4982; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s0 4983; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 4984; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s1 4985; GFX9-NEXT: s_lshr_b32 s0, s4, 16 4986; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 4987; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 4988; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 4989; GFX9-NEXT: v_trunc_f32_e32 v3, v3 4990; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 4991; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 4992; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 4993; GFX9-NEXT: v_mul_f32_e32 v2, v5, v6 4994; GFX9-NEXT: v_trunc_f32_e32 v2, v2 4995; GFX9-NEXT: s_and_b32 s0, s7, s8 4996; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 4997; GFX9-NEXT: v_mad_f32 v3, -v2, v4, v5 4998; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 4999; GFX9-NEXT: s_and_b32 s0, s5, s8 5000; GFX9-NEXT: v_cvt_f32_u32_e32 
v6, s0 5001; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 5002; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 5003; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 5004; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 5005; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc 5006; GFX9-NEXT: v_mul_f32_e32 v3, v6, v7 5007; GFX9-NEXT: v_trunc_f32_e32 v3, v3 5008; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3 5009; GFX9-NEXT: v_mad_f32 v3, -v3, v5, v6 5010; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 5011; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 5012; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 5013; GFX9-NEXT: global_store_short v1, v3, s[2:3] offset:4 5014; GFX9-NEXT: global_store_dword v1, v0, s[2:3] 5015; GFX9-NEXT: s_endpgm 5016; 5017; GFX90A-LABEL: udiv_v3i16: 5018; GFX90A: ; %bb.0: 5019; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5020; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 5021; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 5022; GFX90A-NEXT: s_mov_b32 s8, 0xffff 5023; GFX90A-NEXT: v_mov_b32_e32 v1, 0 5024; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 5025; GFX90A-NEXT: s_and_b32 s0, s6, s8 5026; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s0 5027; GFX90A-NEXT: s_and_b32 s0, s4, s8 5028; GFX90A-NEXT: s_lshr_b32 s1, s6, 16 5029; GFX90A-NEXT: v_cvt_f32_u32_e32 v2, s0 5030; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 5031; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s1 5032; GFX90A-NEXT: s_lshr_b32 s0, s4, 16 5033; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s0 5034; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 5035; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v4 5036; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 5037; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 5038; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 5039; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 5040; GFX90A-NEXT: v_mul_f32_e32 v2, v5, v6 5041; GFX90A-NEXT: v_trunc_f32_e32 v2, v2 5042; GFX90A-NEXT: s_and_b32 s0, s7, s8 5043; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 5044; GFX90A-NEXT: v_mad_f32 v3, -v2, v4, v5 5045; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s0 5046; GFX90A-NEXT: s_and_b32 s0, 
s5, s8 5047; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s0 5048; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 5049; GFX90A-NEXT: v_rcp_iflag_f32_e32 v7, v5 5050; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 5051; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v0 5052; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc 5053; GFX90A-NEXT: v_mul_f32_e32 v3, v6, v7 5054; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 5055; GFX90A-NEXT: v_cvt_u32_f32_e32 v4, v3 5056; GFX90A-NEXT: v_mad_f32 v3, -v3, v5, v6 5057; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 5058; GFX90A-NEXT: v_lshl_or_b32 v0, v2, 16, v0 5059; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 5060; GFX90A-NEXT: global_store_short v1, v3, s[2:3] offset:4 5061; GFX90A-NEXT: global_store_dword v1, v0, s[2:3] 5062; GFX90A-NEXT: s_endpgm 5063 %r = udiv <3 x i16> %x, %y 5064 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 5065 ret void 5066} 5067 5068define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 5069; CHECK-LABEL: @urem_v3i16( 5070; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 5071; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 5072; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 5073; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 5074; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 5075; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 5076; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 5077; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 5078; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 5079; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 5080; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 5081; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 5082; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 5083; CHECK-NEXT: [[TMP14:%.*]] 
= call fast float @llvm.fabs.f32(float [[TMP6]]) 5084; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 5085; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 5086; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 5087; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]] 5088; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]] 5089; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 65535 5090; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16 5091; CHECK-NEXT: [[TMP22:%.*]] = insertelement <3 x i16> undef, i16 [[TMP21]], i64 0 5092; CHECK-NEXT: [[TMP23:%.*]] = extractelement <3 x i16> [[X]], i64 1 5093; CHECK-NEXT: [[TMP24:%.*]] = extractelement <3 x i16> [[Y]], i64 1 5094; CHECK-NEXT: [[TMP25:%.*]] = zext i16 [[TMP23]] to i32 5095; CHECK-NEXT: [[TMP26:%.*]] = zext i16 [[TMP24]] to i32 5096; CHECK-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float 5097; CHECK-NEXT: [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float 5098; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]]) 5099; CHECK-NEXT: [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]] 5100; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]]) 5101; CHECK-NEXT: [[TMP32:%.*]] = fneg fast float [[TMP31]] 5102; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]]) 5103; CHECK-NEXT: [[TMP34:%.*]] = fptoui float [[TMP31]] to i32 5104; CHECK-NEXT: [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 5105; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]]) 5106; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]] 5107; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0 5108; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]] 5109; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]] 5110; CHECK-NEXT: [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]] 5111; CHECK-NEXT: [[TMP42:%.*]] = 
and i32 [[TMP41]], 65535 5112; CHECK-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16 5113; CHECK-NEXT: [[TMP44:%.*]] = insertelement <3 x i16> [[TMP22]], i16 [[TMP43]], i64 1 5114; CHECK-NEXT: [[TMP45:%.*]] = extractelement <3 x i16> [[X]], i64 2 5115; CHECK-NEXT: [[TMP46:%.*]] = extractelement <3 x i16> [[Y]], i64 2 5116; CHECK-NEXT: [[TMP47:%.*]] = zext i16 [[TMP45]] to i32 5117; CHECK-NEXT: [[TMP48:%.*]] = zext i16 [[TMP46]] to i32 5118; CHECK-NEXT: [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float 5119; CHECK-NEXT: [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float 5120; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]]) 5121; CHECK-NEXT: [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]] 5122; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]]) 5123; CHECK-NEXT: [[TMP54:%.*]] = fneg fast float [[TMP53]] 5124; CHECK-NEXT: [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]]) 5125; CHECK-NEXT: [[TMP56:%.*]] = fptoui float [[TMP53]] to i32 5126; CHECK-NEXT: [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]]) 5127; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]]) 5128; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]] 5129; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0 5130; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]] 5131; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]] 5132; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]] 5133; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 65535 5134; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16 5135; CHECK-NEXT: [[TMP66:%.*]] = insertelement <3 x i16> [[TMP44]], i16 [[TMP65]], i64 2 5136; CHECK-NEXT: store <3 x i16> [[TMP66]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 5137; CHECK-NEXT: ret void 5138; 5139; GFX6-LABEL: urem_v3i16: 5140; GFX6: ; %bb.0: 5141; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5142; 
GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 5143; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 5144; GFX6-NEXT: s_mov_b32 s8, 0xffff 5145; GFX6-NEXT: s_mov_b32 s7, 0xf000 5146; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5147; GFX6-NEXT: v_mov_b32_e32 v1, s2 5148; GFX6-NEXT: s_and_b32 s6, s0, s8 5149; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 5150; GFX6-NEXT: s_and_b32 s6, s2, s8 5151; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 5152; GFX6-NEXT: v_mov_b32_e32 v4, s0 5153; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 5154; GFX6-NEXT: v_alignbit_b32 v4, s1, v4, 16 5155; GFX6-NEXT: v_and_b32_e32 v5, s8, v4 5156; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16 5157; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 5158; GFX6-NEXT: v_trunc_f32_e32 v3, v3 5159; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 5160; GFX6-NEXT: v_mad_f32 v2, -v3, v0, v2 5161; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 5162; GFX6-NEXT: v_cvt_f32_u32_e32 v2, v5 5163; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc 5164; GFX6-NEXT: v_and_b32_e32 v3, s8, v1 5165; GFX6-NEXT: v_mul_lo_u32 v0, v0, s0 5166; GFX6-NEXT: v_cvt_f32_u32_e32 v3, v3 5167; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 5168; GFX6-NEXT: s_and_b32 s0, s1, s8 5169; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s0 5170; GFX6-NEXT: s_and_b32 s0, s3, s8 5171; GFX6-NEXT: v_mul_f32_e32 v5, v3, v5 5172; GFX6-NEXT: v_trunc_f32_e32 v5, v5 5173; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s0 5174; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v6 5175; GFX6-NEXT: v_mad_f32 v3, -v5, v2, v3 5176; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v5 5177; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 5178; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 5179; GFX6-NEXT: v_mul_f32_e32 v3, v7, v8 5180; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc 5181; GFX6-NEXT: v_trunc_f32_e32 v3, v3 5182; GFX6-NEXT: v_mul_lo_u32 v2, v2, v4 5183; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v3 5184; GFX6-NEXT: v_mad_f32 v3, -v3, v6, v7 5185; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v6 5186; GFX6-NEXT: s_mov_b32 s6, -1 5187; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 5188; GFX6-NEXT: v_mul_lo_u32 v3, 
v3, s1 5189; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 5190; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 5191; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s3, v3 5192; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 5193; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 5194; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 5195; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 5196; GFX6-NEXT: s_endpgm 5197; 5198; GFX9-LABEL: urem_v3i16: 5199; GFX9: ; %bb.0: 5200; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5201; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 5202; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 5203; GFX9-NEXT: s_mov_b32 s8, 0xffff 5204; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5205; GFX9-NEXT: s_and_b32 s0, s4, s8 5206; GFX9-NEXT: s_and_b32 s1, s6, s8 5207; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 5208; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s0 5209; GFX9-NEXT: s_lshr_b32 s6, s6, 16 5210; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 5211; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 5212; GFX9-NEXT: s_lshr_b32 s4, s4, 16 5213; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s4 5214; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v2 5215; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 5216; GFX9-NEXT: v_trunc_f32_e32 v3, v3 5217; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v3 5218; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 5219; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 5220; GFX9-NEXT: v_mul_f32_e32 v1, v4, v5 5221; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc 5222; GFX9-NEXT: v_mul_lo_u32 v0, v0, s1 5223; GFX9-NEXT: v_trunc_f32_e32 v1, v1 5224; GFX9-NEXT: s_and_b32 s1, s7, s8 5225; GFX9-NEXT: v_mad_f32 v3, -v1, v2, v4 5226; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s1 5227; GFX9-NEXT: s_and_b32 s5, s5, s8 5228; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s5 5229; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 5230; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 5231; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 5232; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 5233; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 5234; GFX9-NEXT: v_mul_f32_e32 v2, v5, v6 5235; GFX9-NEXT: v_trunc_f32_e32 v2, v2 5236; 
GFX9-NEXT: v_cvt_u32_f32_e32 v3, v2 5237; GFX9-NEXT: v_mad_f32 v2, -v2, v4, v5 5238; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 5239; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 5240; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc 5241; GFX9-NEXT: v_mul_lo_u32 v2, v2, s1 5242; GFX9-NEXT: v_mov_b32_e32 v3, 0 5243; GFX9-NEXT: v_sub_u32_e32 v1, s4, v1 5244; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 5245; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 5246; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 5247; GFX9-NEXT: global_store_short v3, v2, s[2:3] offset:4 5248; GFX9-NEXT: global_store_dword v3, v0, s[2:3] 5249; GFX9-NEXT: s_endpgm 5250; 5251; GFX90A-LABEL: urem_v3i16: 5252; GFX90A: ; %bb.0: 5253; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5254; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 5255; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 5256; GFX90A-NEXT: s_mov_b32 s8, 0xffff 5257; GFX90A-NEXT: v_mov_b32_e32 v1, 0 5258; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 5259; GFX90A-NEXT: s_and_b32 s1, s4, s8 5260; GFX90A-NEXT: s_and_b32 s0, s6, s8 5261; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s0 5262; GFX90A-NEXT: v_cvt_f32_u32_e32 v2, s1 5263; GFX90A-NEXT: s_lshr_b32 s6, s6, 16 5264; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s6 5265; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 5266; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 5267; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s4 5268; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v4 5269; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 5270; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 5271; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 5272; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 5273; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 5274; GFX90A-NEXT: v_mul_f32_e32 v2, v5, v6 5275; GFX90A-NEXT: v_trunc_f32_e32 v2, v2 5276; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 5277; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s0 5278; GFX90A-NEXT: s_and_b32 s0, s7, s8 5279; GFX90A-NEXT: v_mad_f32 v3, -v2, v4, v5 5280; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s0 5281; GFX90A-NEXT: v_sub_u32_e32 v0, s1, v0 5282; GFX90A-NEXT: 
s_and_b32 s1, s5, s8 5283; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s1 5284; GFX90A-NEXT: v_rcp_iflag_f32_e32 v7, v5 5285; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 5286; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 5287; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v0 5288; GFX90A-NEXT: v_mul_f32_e32 v3, v6, v7 5289; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 5290; GFX90A-NEXT: v_cvt_u32_f32_e32 v4, v3 5291; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc 5292; GFX90A-NEXT: v_mad_f32 v3, -v3, v5, v6 5293; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 5294; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 5295; GFX90A-NEXT: v_mul_lo_u32 v2, v2, s6 5296; GFX90A-NEXT: v_mul_lo_u32 v3, v3, s0 5297; GFX90A-NEXT: v_sub_u32_e32 v2, s4, v2 5298; GFX90A-NEXT: v_sub_u32_e32 v3, s1, v3 5299; GFX90A-NEXT: v_lshl_or_b32 v0, v2, 16, v0 5300; GFX90A-NEXT: global_store_short v1, v3, s[2:3] offset:4 5301; GFX90A-NEXT: global_store_dword v1, v0, s[2:3] 5302; GFX90A-NEXT: s_endpgm 5303 %r = urem <3 x i16> %x, %y 5304 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 5305 ret void 5306} 5307 5308define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 5309; CHECK-LABEL: @sdiv_v3i16( 5310; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 5311; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 5312; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 5313; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 5314; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 5315; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 5316; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 5317; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 5318; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 5319; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 5320; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 5321; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 
5322; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 5323; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 5324; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 5325; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 5326; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 5327; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 5328; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 5329; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 5330; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 16 5331; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 16 5332; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16 5333; CHECK-NEXT: [[TMP24:%.*]] = insertelement <3 x i16> undef, i16 [[TMP23]], i64 0 5334; CHECK-NEXT: [[TMP25:%.*]] = extractelement <3 x i16> [[X]], i64 1 5335; CHECK-NEXT: [[TMP26:%.*]] = extractelement <3 x i16> [[Y]], i64 1 5336; CHECK-NEXT: [[TMP27:%.*]] = sext i16 [[TMP25]] to i32 5337; CHECK-NEXT: [[TMP28:%.*]] = sext i16 [[TMP26]] to i32 5338; CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]] 5339; CHECK-NEXT: [[TMP30:%.*]] = ashr i32 [[TMP29]], 30 5340; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 1 5341; CHECK-NEXT: [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float 5342; CHECK-NEXT: [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float 5343; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 5344; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]] 5345; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]]) 5346; CHECK-NEXT: [[TMP37:%.*]] = fneg fast float [[TMP36]] 5347; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]]) 5348; CHECK-NEXT: [[TMP39:%.*]] = fptosi float [[TMP36]] to i32 5349; CHECK-NEXT: [[TMP40:%.*]] = call fast float 
@llvm.fabs.f32(float [[TMP38]]) 5350; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 5351; CHECK-NEXT: [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]] 5352; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0 5353; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]] 5354; CHECK-NEXT: [[TMP45:%.*]] = shl i32 [[TMP44]], 16 5355; CHECK-NEXT: [[TMP46:%.*]] = ashr i32 [[TMP45]], 16 5356; CHECK-NEXT: [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16 5357; CHECK-NEXT: [[TMP48:%.*]] = insertelement <3 x i16> [[TMP24]], i16 [[TMP47]], i64 1 5358; CHECK-NEXT: [[TMP49:%.*]] = extractelement <3 x i16> [[X]], i64 2 5359; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x i16> [[Y]], i64 2 5360; CHECK-NEXT: [[TMP51:%.*]] = sext i16 [[TMP49]] to i32 5361; CHECK-NEXT: [[TMP52:%.*]] = sext i16 [[TMP50]] to i32 5362; CHECK-NEXT: [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]] 5363; CHECK-NEXT: [[TMP54:%.*]] = ashr i32 [[TMP53]], 30 5364; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 1 5365; CHECK-NEXT: [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float 5366; CHECK-NEXT: [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float 5367; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]]) 5368; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]] 5369; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]]) 5370; CHECK-NEXT: [[TMP61:%.*]] = fneg fast float [[TMP60]] 5371; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]]) 5372; CHECK-NEXT: [[TMP63:%.*]] = fptosi float [[TMP60]] to i32 5373; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]]) 5374; CHECK-NEXT: [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]]) 5375; CHECK-NEXT: [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]] 5376; CHECK-NEXT: [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0 5377; CHECK-NEXT: [[TMP68:%.*]] = 
add i32 [[TMP63]], [[TMP67]] 5378; CHECK-NEXT: [[TMP69:%.*]] = shl i32 [[TMP68]], 16 5379; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 16 5380; CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16 5381; CHECK-NEXT: [[TMP72:%.*]] = insertelement <3 x i16> [[TMP48]], i16 [[TMP71]], i64 2 5382; CHECK-NEXT: store <3 x i16> [[TMP72]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 5383; CHECK-NEXT: ret void 5384; 5385; GFX6-LABEL: sdiv_v3i16: 5386; GFX6: ; %bb.0: 5387; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5388; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 5389; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 5390; GFX6-NEXT: s_mov_b32 s7, 0xf000 5391; GFX6-NEXT: s_mov_b32 s6, -1 5392; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5393; GFX6-NEXT: s_sext_i32_i16 s9, s2 5394; GFX6-NEXT: s_sext_i32_i16 s8, s0 5395; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s8 5396; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9 5397; GFX6-NEXT: s_xor_b32 s8, s9, s8 5398; GFX6-NEXT: s_ashr_i32 s0, s0, 16 5399; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 5400; GFX6-NEXT: s_ashr_i32 s8, s8, 30 5401; GFX6-NEXT: s_or_b32 s8, s8, 1 5402; GFX6-NEXT: v_mov_b32_e32 v3, s8 5403; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 5404; GFX6-NEXT: v_trunc_f32_e32 v2, v2 5405; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 5406; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 5407; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 5408; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s0 5409; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 5410; GFX6-NEXT: s_ashr_i32 s2, s2, 16 5411; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 5412; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s2 5413; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 5414; GFX6-NEXT: s_xor_b32 s0, s2, s0 5415; GFX6-NEXT: s_ashr_i32 s0, s0, 30 5416; GFX6-NEXT: s_or_b32 s0, s0, 1 5417; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 5418; GFX6-NEXT: v_trunc_f32_e32 v3, v3 5419; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 5420; GFX6-NEXT: v_mov_b32_e32 v4, s0 5421; GFX6-NEXT: s_sext_i32_i16 s0, s1 5422; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 5423; GFX6-NEXT: 
v_cmp_ge_f32_e64 vcc, |v2|, |v1| 5424; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 5425; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc 5426; GFX6-NEXT: s_sext_i32_i16 s1, s3 5427; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 5428; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s1 5429; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 5430; GFX6-NEXT: s_xor_b32 s0, s1, s0 5431; GFX6-NEXT: s_ashr_i32 s0, s0, 30 5432; GFX6-NEXT: s_or_b32 s0, s0, 1 5433; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 5434; GFX6-NEXT: v_trunc_f32_e32 v4, v4 5435; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 5436; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 5437; GFX6-NEXT: v_mov_b32_e32 v5, s0 5438; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| 5439; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc 5440; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 5441; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 5442; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 5443; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 5444; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 5445; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 5446; GFX6-NEXT: s_endpgm 5447; 5448; GFX9-LABEL: sdiv_v3i16: 5449; GFX9: ; %bb.0: 5450; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5451; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 5452; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 5453; GFX9-NEXT: v_mov_b32_e32 v1, 0 5454; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5455; GFX9-NEXT: s_sext_i32_i16 s1, s4 5456; GFX9-NEXT: s_sext_i32_i16 s0, s6 5457; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 5458; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 5459; GFX9-NEXT: s_xor_b32 s0, s1, s0 5460; GFX9-NEXT: s_ashr_i32 s0, s0, 30 5461; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 5462; GFX9-NEXT: s_or_b32 s8, s0, 1 5463; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 5464; GFX9-NEXT: v_trunc_f32_e32 v3, v3 5465; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 5466; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 5467; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 5468; GFX9-NEXT: s_cselect_b32 s0, s8, 0 5469; GFX9-NEXT: s_ashr_i32 s1, s6, 16 5470; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 
5471; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 5472; GFX9-NEXT: s_ashr_i32 s4, s4, 16 5473; GFX9-NEXT: v_add_u32_e32 v2, s0, v3 5474; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s4 5475; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 5476; GFX9-NEXT: s_xor_b32 s0, s4, s1 5477; GFX9-NEXT: s_ashr_i32 s0, s0, 30 5478; GFX9-NEXT: s_or_b32 s4, s0, 1 5479; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 5480; GFX9-NEXT: v_trunc_f32_e32 v4, v4 5481; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 5482; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v0| 5483; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 5484; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 5485; GFX9-NEXT: s_sext_i32_i16 s1, s7 5486; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 5487; GFX9-NEXT: s_cselect_b32 s0, s4, 0 5488; GFX9-NEXT: v_add_u32_e32 v3, s0, v4 5489; GFX9-NEXT: s_sext_i32_i16 s0, s5 5490; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s0 5491; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 5492; GFX9-NEXT: s_xor_b32 s0, s0, s1 5493; GFX9-NEXT: s_ashr_i32 s0, s0, 30 5494; GFX9-NEXT: s_or_b32 s4, s0, 1 5495; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 5496; GFX9-NEXT: v_trunc_f32_e32 v5, v5 5497; GFX9-NEXT: v_mad_f32 v4, -v5, v0, v4 5498; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 5499; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v0| 5500; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 5501; GFX9-NEXT: s_cselect_b32 s0, s4, 0 5502; GFX9-NEXT: v_add_u32_e32 v0, s0, v5 5503; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 5504; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 5505; GFX9-NEXT: global_store_short v1, v0, s[2:3] offset:4 5506; GFX9-NEXT: global_store_dword v1, v2, s[2:3] 5507; GFX9-NEXT: s_endpgm 5508; 5509; GFX90A-LABEL: sdiv_v3i16: 5510; GFX90A: ; %bb.0: 5511; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5512; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 5513; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 5514; GFX90A-NEXT: v_mov_b32_e32 v1, 0 5515; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 5516; GFX90A-NEXT: s_sext_i32_i16 s1, s4 5517; GFX90A-NEXT: s_sext_i32_i16 s0, s6 5518; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, 
s0 5519; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s1 5520; GFX90A-NEXT: s_xor_b32 s0, s1, s0 5521; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 5522; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 5523; GFX90A-NEXT: s_or_b32 s8, s0, 1 5524; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 5525; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 5526; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 5527; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 5528; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 5529; GFX90A-NEXT: s_cselect_b32 s0, s8, 0 5530; GFX90A-NEXT: s_ashr_i32 s1, s6, 16 5531; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 5532; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s1 5533; GFX90A-NEXT: s_ashr_i32 s4, s4, 16 5534; GFX90A-NEXT: v_add_u32_e32 v2, s0, v3 5535; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s4 5536; GFX90A-NEXT: v_rcp_iflag_f32_e32 v4, v0 5537; GFX90A-NEXT: s_xor_b32 s0, s4, s1 5538; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 5539; GFX90A-NEXT: s_or_b32 s4, s0, 1 5540; GFX90A-NEXT: v_mul_f32_e32 v4, v3, v4 5541; GFX90A-NEXT: v_trunc_f32_e32 v4, v4 5542; GFX90A-NEXT: v_mad_f32 v3, -v4, v0, v3 5543; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v0| 5544; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 5545; GFX90A-NEXT: v_cvt_i32_f32_e32 v4, v4 5546; GFX90A-NEXT: s_sext_i32_i16 s1, s7 5547; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s1 5548; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 5549; GFX90A-NEXT: v_add_u32_e32 v3, s0, v4 5550; GFX90A-NEXT: s_sext_i32_i16 s0, s5 5551; GFX90A-NEXT: v_cvt_f32_i32_e32 v4, s0 5552; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v0 5553; GFX90A-NEXT: s_xor_b32 s0, s0, s1 5554; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 5555; GFX90A-NEXT: s_or_b32 s4, s0, 1 5556; GFX90A-NEXT: v_mul_f32_e32 v5, v4, v5 5557; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 5558; GFX90A-NEXT: v_mad_f32 v4, -v5, v0, v4 5559; GFX90A-NEXT: v_cvt_i32_f32_e32 v5, v5 5560; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v0| 5561; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 5562; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 5563; GFX90A-NEXT: v_add_u32_e32 v0, s0, v5 5564; 
GFX90A-NEXT: v_and_b32_e32 v2, 0xffff, v2 5565; GFX90A-NEXT: v_lshl_or_b32 v2, v3, 16, v2 5566; GFX90A-NEXT: global_store_short v1, v0, s[2:3] offset:4 5567; GFX90A-NEXT: global_store_dword v1, v2, s[2:3] 5568; GFX90A-NEXT: s_endpgm 5569 %r = sdiv <3 x i16> %x, %y 5570 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 5571 ret void 5572} 5573 5574define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 5575; CHECK-LABEL: @srem_v3i16( 5576; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 5577; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 5578; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 5579; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 5580; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 5581; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 5582; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 5583; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 5584; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 5585; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 5586; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 5587; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 5588; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 5589; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 5590; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 5591; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 5592; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 5593; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 5594; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 5595; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 5596; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]] 5597; CHECK-NEXT: [[TMP22:%.*]] = sub 
i32 [[TMP3]], [[TMP21]] 5598; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 16 5599; CHECK-NEXT: [[TMP24:%.*]] = ashr i32 [[TMP23]], 16 5600; CHECK-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16 5601; CHECK-NEXT: [[TMP26:%.*]] = insertelement <3 x i16> undef, i16 [[TMP25]], i64 0 5602; CHECK-NEXT: [[TMP27:%.*]] = extractelement <3 x i16> [[X]], i64 1 5603; CHECK-NEXT: [[TMP28:%.*]] = extractelement <3 x i16> [[Y]], i64 1 5604; CHECK-NEXT: [[TMP29:%.*]] = sext i16 [[TMP27]] to i32 5605; CHECK-NEXT: [[TMP30:%.*]] = sext i16 [[TMP28]] to i32 5606; CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]] 5607; CHECK-NEXT: [[TMP32:%.*]] = ashr i32 [[TMP31]], 30 5608; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP32]], 1 5609; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float 5610; CHECK-NEXT: [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float 5611; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 5612; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]] 5613; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]]) 5614; CHECK-NEXT: [[TMP39:%.*]] = fneg fast float [[TMP38]] 5615; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]]) 5616; CHECK-NEXT: [[TMP41:%.*]] = fptosi float [[TMP38]] to i32 5617; CHECK-NEXT: [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]]) 5618; CHECK-NEXT: [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]]) 5619; CHECK-NEXT: [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]] 5620; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0 5621; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]] 5622; CHECK-NEXT: [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]] 5623; CHECK-NEXT: [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]] 5624; CHECK-NEXT: [[TMP49:%.*]] = shl i32 [[TMP48]], 16 5625; CHECK-NEXT: [[TMP50:%.*]] = ashr i32 [[TMP49]], 16 5626; CHECK-NEXT: [[TMP51:%.*]] = trunc 
i32 [[TMP50]] to i16 5627; CHECK-NEXT: [[TMP52:%.*]] = insertelement <3 x i16> [[TMP26]], i16 [[TMP51]], i64 1 5628; CHECK-NEXT: [[TMP53:%.*]] = extractelement <3 x i16> [[X]], i64 2 5629; CHECK-NEXT: [[TMP54:%.*]] = extractelement <3 x i16> [[Y]], i64 2 5630; CHECK-NEXT: [[TMP55:%.*]] = sext i16 [[TMP53]] to i32 5631; CHECK-NEXT: [[TMP56:%.*]] = sext i16 [[TMP54]] to i32 5632; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]] 5633; CHECK-NEXT: [[TMP58:%.*]] = ashr i32 [[TMP57]], 30 5634; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 1 5635; CHECK-NEXT: [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float 5636; CHECK-NEXT: [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float 5637; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]]) 5638; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]] 5639; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]]) 5640; CHECK-NEXT: [[TMP65:%.*]] = fneg fast float [[TMP64]] 5641; CHECK-NEXT: [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]]) 5642; CHECK-NEXT: [[TMP67:%.*]] = fptosi float [[TMP64]] to i32 5643; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 5644; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]]) 5645; CHECK-NEXT: [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]] 5646; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0 5647; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]] 5648; CHECK-NEXT: [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]] 5649; CHECK-NEXT: [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]] 5650; CHECK-NEXT: [[TMP75:%.*]] = shl i32 [[TMP74]], 16 5651; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 [[TMP75]], 16 5652; CHECK-NEXT: [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16 5653; CHECK-NEXT: [[TMP78:%.*]] = insertelement <3 x i16> [[TMP52]], i16 [[TMP77]], i64 2 5654; CHECK-NEXT: store <3 x i16> [[TMP78]], <3 x i16> 
addrspace(1)* [[OUT:%.*]], align 8 5655; CHECK-NEXT: ret void 5656; 5657; GFX6-LABEL: srem_v3i16: 5658; GFX6: ; %bb.0: 5659; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5660; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 5661; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 5662; GFX6-NEXT: s_mov_b32 s7, 0xf000 5663; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5664; GFX6-NEXT: s_sext_i32_i16 s8, s2 5665; GFX6-NEXT: s_sext_i32_i16 s6, s0 5666; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s6 5667; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s8 5668; GFX6-NEXT: s_xor_b32 s6, s8, s6 5669; GFX6-NEXT: s_ashr_i32 s6, s6, 30 5670; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 5671; GFX6-NEXT: s_or_b32 s6, s6, 1 5672; GFX6-NEXT: v_mov_b32_e32 v3, s6 5673; GFX6-NEXT: s_mov_b32 s6, -1 5674; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 5675; GFX6-NEXT: v_trunc_f32_e32 v2, v2 5676; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 5677; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 5678; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 5679; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 5680; GFX6-NEXT: v_mov_b32_e32 v1, s2 5681; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 5682; GFX6-NEXT: v_mov_b32_e32 v2, s0 5683; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 16 5684; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16 5685; GFX6-NEXT: v_cvt_f32_i32_e32 v4, v3 5686; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16 5687; GFX6-NEXT: v_bfe_i32 v5, v1, 0, 16 5688; GFX6-NEXT: v_cvt_f32_i32_e32 v6, v5 5689; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v4 5690; GFX6-NEXT: v_mul_lo_u32 v0, v0, s0 5691; GFX6-NEXT: v_xor_b32_e32 v3, v5, v3 5692; GFX6-NEXT: s_sext_i32_i16 s0, s1 5693; GFX6-NEXT: v_mul_f32_e32 v5, v6, v7 5694; GFX6-NEXT: v_trunc_f32_e32 v5, v5 5695; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 5696; GFX6-NEXT: v_mad_f32 v6, -v5, v4, v6 5697; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 5698; GFX6-NEXT: v_ashrrev_i32_e32 v3, 30, v3 5699; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, |v4| 5700; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s0 5701; GFX6-NEXT: v_or_b32_e32 v3, 1, v3 5702; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc 
5703; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 5704; GFX6-NEXT: s_sext_i32_i16 s2, s3 5705; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 5706; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s2 5707; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v4 5708; GFX6-NEXT: s_xor_b32 s0, s2, s0 5709; GFX6-NEXT: s_ashr_i32 s0, s0, 30 5710; GFX6-NEXT: s_or_b32 s0, s0, 1 5711; GFX6-NEXT: v_mul_f32_e32 v5, v3, v5 5712; GFX6-NEXT: v_trunc_f32_e32 v5, v5 5713; GFX6-NEXT: v_mad_f32 v3, -v5, v4, v3 5714; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 5715; GFX6-NEXT: v_mov_b32_e32 v6, s0 5716; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v4| 5717; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc 5718; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 5719; GFX6-NEXT: v_mul_lo_u32 v3, v3, s1 5720; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 5721; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 5722; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s3, v3 5723; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 5724; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 5725; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 5726; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 5727; GFX6-NEXT: s_endpgm 5728; 5729; GFX9-LABEL: srem_v3i16: 5730; GFX9: ; %bb.0: 5731; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 5732; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 5733; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c 5734; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5735; GFX9-NEXT: s_sext_i32_i16 s8, s2 5736; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s8 5737; GFX9-NEXT: s_sext_i32_i16 s9, s6 5738; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s9 5739; GFX9-NEXT: s_xor_b32 s0, s9, s8 5740; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 5741; GFX9-NEXT: s_ashr_i32 s0, s0, 30 5742; GFX9-NEXT: s_or_b32 s10, s0, 1 5743; GFX9-NEXT: s_sext_i32_i16 s3, s3 5744; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 5745; GFX9-NEXT: v_trunc_f32_e32 v2, v2 5746; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 5747; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 5748; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 5749; GFX9-NEXT: s_cselect_b32 s0, s10, 0 5750; GFX9-NEXT: s_ashr_i32 
s2, s2, 16 5751; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 5752; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 5753; GFX9-NEXT: s_ashr_i32 s6, s6, 16 5754; GFX9-NEXT: v_add_u32_e32 v1, s0, v2 5755; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s6 5756; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 5757; GFX9-NEXT: s_xor_b32 s0, s6, s2 5758; GFX9-NEXT: s_ashr_i32 s0, s0, 30 5759; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8 5760; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 5761; GFX9-NEXT: v_trunc_f32_e32 v3, v3 5762; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 5763; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 5764; GFX9-NEXT: s_or_b32 s8, s0, 1 5765; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 5766; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 5767; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s3 5768; GFX9-NEXT: s_cselect_b32 s0, s8, 0 5769; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 5770; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2 5771; GFX9-NEXT: s_sext_i32_i16 s2, s7 5772; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s2 5773; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2 5774; GFX9-NEXT: s_xor_b32 s0, s2, s3 5775; GFX9-NEXT: s_ashr_i32 s0, s0, 30 5776; GFX9-NEXT: s_or_b32 s7, s0, 1 5777; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 5778; GFX9-NEXT: v_trunc_f32_e32 v4, v4 5779; GFX9-NEXT: v_mad_f32 v3, -v4, v2, v3 5780; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 5781; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v2| 5782; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 5783; GFX9-NEXT: s_cselect_b32 s0, s7, 0 5784; GFX9-NEXT: v_add_u32_e32 v2, s0, v4 5785; GFX9-NEXT: v_mul_lo_u32 v2, v2, s3 5786; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 5787; GFX9-NEXT: v_mov_b32_e32 v3, 0 5788; GFX9-NEXT: v_sub_u32_e32 v0, s6, v0 5789; GFX9-NEXT: v_sub_u32_e32 v2, s2, v2 5790; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 5791; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 5792; GFX9-NEXT: global_store_short v3, v2, s[4:5] offset:4 5793; GFX9-NEXT: global_store_dword v3, v0, s[4:5] 5794; GFX9-NEXT: s_endpgm 5795; 5796; GFX90A-LABEL: srem_v3i16: 5797; GFX90A: ; %bb.0: 5798; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5799; 
GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 5800; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 5801; GFX90A-NEXT: v_mov_b32_e32 v1, 0 5802; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 5803; GFX90A-NEXT: s_sext_i32_i16 s9, s4 5804; GFX90A-NEXT: s_sext_i32_i16 s8, s6 5805; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s8 5806; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s9 5807; GFX90A-NEXT: s_xor_b32 s0, s9, s8 5808; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 5809; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 5810; GFX90A-NEXT: s_or_b32 s10, s0, 1 5811; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 5812; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 5813; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 5814; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 5815; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 5816; GFX90A-NEXT: s_cselect_b32 s0, s10, 0 5817; GFX90A-NEXT: s_ashr_i32 s6, s6, 16 5818; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 5819; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s6 5820; GFX90A-NEXT: s_ashr_i32 s4, s4, 16 5821; GFX90A-NEXT: v_add_u32_e32 v0, s0, v3 5822; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s4 5823; GFX90A-NEXT: v_rcp_iflag_f32_e32 v4, v2 5824; GFX90A-NEXT: s_xor_b32 s0, s4, s6 5825; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 5826; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s8 5827; GFX90A-NEXT: v_mul_f32_e32 v4, v3, v4 5828; GFX90A-NEXT: v_trunc_f32_e32 v4, v4 5829; GFX90A-NEXT: v_mad_f32 v3, -v4, v2, v3 5830; GFX90A-NEXT: v_cvt_i32_f32_e32 v4, v4 5831; GFX90A-NEXT: s_or_b32 s8, s0, 1 5832; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v2| 5833; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 5834; GFX90A-NEXT: s_cselect_b32 s0, s8, 0 5835; GFX90A-NEXT: v_add_u32_e32 v2, s0, v4 5836; GFX90A-NEXT: v_mul_lo_u32 v2, v2, s6 5837; GFX90A-NEXT: s_sext_i32_i16 s6, s7 5838; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s6 5839; GFX90A-NEXT: v_sub_u32_e32 v2, s4, v2 5840; GFX90A-NEXT: s_sext_i32_i16 s4, s5 5841; GFX90A-NEXT: v_cvt_f32_i32_e32 v4, s4 5842; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v3 5843; GFX90A-NEXT: s_xor_b32 s0, s4, s6 5844; GFX90A-NEXT: 
s_ashr_i32 s0, s0, 30 5845; GFX90A-NEXT: s_or_b32 s5, s0, 1 5846; GFX90A-NEXT: v_mul_f32_e32 v5, v4, v5 5847; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 5848; GFX90A-NEXT: v_mad_f32 v4, -v5, v3, v4 5849; GFX90A-NEXT: v_cvt_i32_f32_e32 v5, v5 5850; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| 5851; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 5852; GFX90A-NEXT: s_cselect_b32 s0, s5, 0 5853; GFX90A-NEXT: v_add_u32_e32 v3, s0, v5 5854; GFX90A-NEXT: v_sub_u32_e32 v0, s9, v0 5855; GFX90A-NEXT: v_mul_lo_u32 v3, v3, s6 5856; GFX90A-NEXT: v_sub_u32_e32 v3, s4, v3 5857; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v0 5858; GFX90A-NEXT: v_lshl_or_b32 v0, v2, 16, v0 5859; GFX90A-NEXT: global_store_short v1, v3, s[2:3] offset:4 5860; GFX90A-NEXT: global_store_dword v1, v0, s[2:3] 5861; GFX90A-NEXT: s_endpgm 5862 %r = srem <3 x i16> %x, %y 5863 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 5864 ret void 5865} 5866 5867define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 5868; CHECK-LABEL: @udiv_v3i15( 5869; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 5870; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 5871; CHECK-NEXT: [[TMP3:%.*]] = zext i15 [[TMP1]] to i32 5872; CHECK-NEXT: [[TMP4:%.*]] = zext i15 [[TMP2]] to i32 5873; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 5874; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 5875; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 5876; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 5877; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 5878; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 5879; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 5880; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 5881; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 
5882; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 5883; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 5884; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 5885; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 5886; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 32767 5887; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i15 5888; CHECK-NEXT: [[TMP20:%.*]] = insertelement <3 x i15> undef, i15 [[TMP19]], i64 0 5889; CHECK-NEXT: [[TMP21:%.*]] = extractelement <3 x i15> [[X]], i64 1 5890; CHECK-NEXT: [[TMP22:%.*]] = extractelement <3 x i15> [[Y]], i64 1 5891; CHECK-NEXT: [[TMP23:%.*]] = zext i15 [[TMP21]] to i32 5892; CHECK-NEXT: [[TMP24:%.*]] = zext i15 [[TMP22]] to i32 5893; CHECK-NEXT: [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float 5894; CHECK-NEXT: [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float 5895; CHECK-NEXT: [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]]) 5896; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]] 5897; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]]) 5898; CHECK-NEXT: [[TMP30:%.*]] = fneg fast float [[TMP29]] 5899; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]]) 5900; CHECK-NEXT: [[TMP32:%.*]] = fptoui float [[TMP29]] to i32 5901; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]]) 5902; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]]) 5903; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]] 5904; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0 5905; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]] 5906; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 32767 5907; CHECK-NEXT: [[TMP39:%.*]] = trunc i32 [[TMP38]] to i15 5908; CHECK-NEXT: [[TMP40:%.*]] = insertelement <3 x i15> [[TMP20]], i15 [[TMP39]], i64 1 5909; CHECK-NEXT: [[TMP41:%.*]] = 
extractelement <3 x i15> [[X]], i64 2 5910; CHECK-NEXT: [[TMP42:%.*]] = extractelement <3 x i15> [[Y]], i64 2 5911; CHECK-NEXT: [[TMP43:%.*]] = zext i15 [[TMP41]] to i32 5912; CHECK-NEXT: [[TMP44:%.*]] = zext i15 [[TMP42]] to i32 5913; CHECK-NEXT: [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float 5914; CHECK-NEXT: [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float 5915; CHECK-NEXT: [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]]) 5916; CHECK-NEXT: [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]] 5917; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]]) 5918; CHECK-NEXT: [[TMP50:%.*]] = fneg fast float [[TMP49]] 5919; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]]) 5920; CHECK-NEXT: [[TMP52:%.*]] = fptoui float [[TMP49]] to i32 5921; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]]) 5922; CHECK-NEXT: [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]]) 5923; CHECK-NEXT: [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]] 5924; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0 5925; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]] 5926; CHECK-NEXT: [[TMP58:%.*]] = and i32 [[TMP57]], 32767 5927; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i15 5928; CHECK-NEXT: [[TMP60:%.*]] = insertelement <3 x i15> [[TMP40]], i15 [[TMP59]], i64 2 5929; CHECK-NEXT: store <3 x i15> [[TMP60]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 5930; CHECK-NEXT: ret void 5931; 5932; GFX6-LABEL: udiv_v3i15: 5933; GFX6: ; %bb.0: 5934; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5935; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 5936; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 5937; GFX6-NEXT: s_mov_b32 s7, 0xf000 5938; GFX6-NEXT: s_mov_b32 s6, -1 5939; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5940; GFX6-NEXT: v_mov_b32_e32 v0, s2 5941; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 5942; GFX6-NEXT: s_movk_i32 s3, 0x7fff 
5943; GFX6-NEXT: s_and_b32 s9, s0, s3 5944; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 5945; GFX6-NEXT: s_and_b32 s8, s2, s3 5946; GFX6-NEXT: v_mov_b32_e32 v2, s0 5947; GFX6-NEXT: s_bfe_u32 s0, s0, 0xf000f 5948; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s8 5949; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 5950; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s0 5951; GFX6-NEXT: s_bfe_u32 s2, s2, 0xf000f 5952; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 5953; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 5954; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s2 5955; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 5956; GFX6-NEXT: v_and_b32_e32 v2, s3, v2 5957; GFX6-NEXT: v_trunc_f32_e32 v4, v4 5958; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 5959; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 5960; GFX6-NEXT: v_cvt_f32_u32_e32 v2, v2 5961; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 5962; GFX6-NEXT: v_mul_f32_e32 v1, v6, v7 5963; GFX6-NEXT: v_and_b32_e32 v0, s3, v0 5964; GFX6-NEXT: v_trunc_f32_e32 v1, v1 5965; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 5966; GFX6-NEXT: v_mad_f32 v4, -v1, v5, v6 5967; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 5968; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0 5969; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v2 5970; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v5 5971; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 5972; GFX6-NEXT: v_mul_f32_e32 v1, v0, v6 5973; GFX6-NEXT: v_trunc_f32_e32 v1, v1 5974; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v1 5975; GFX6-NEXT: v_mad_f32 v0, -v1, v2, v0 5976; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v2 5977; GFX6-NEXT: v_and_b32_e32 v2, s3, v3 5978; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc 5979; GFX6-NEXT: v_and_b32_e32 v3, s3, v4 5980; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 5981; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 5982; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 5983; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 5984; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 5985; GFX6-NEXT: s_waitcnt expcnt(0) 5986; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 5987; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 5988; GFX6-NEXT: 
s_endpgm 5989; 5990; GFX9-LABEL: udiv_v3i15: 5991; GFX9: ; %bb.0: 5992; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5993; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 5994; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 5995; GFX9-NEXT: s_movk_i32 s8, 0x7fff 5996; GFX9-NEXT: v_mov_b32_e32 v2, 0 5997; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5998; GFX9-NEXT: s_and_b32 s0, s4, s8 5999; GFX9-NEXT: s_and_b32 s1, s6, s8 6000; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s1 6001; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0 6002; GFX9-NEXT: s_bfe_u32 s0, s6, 0xf000f 6003; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 6004; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 6005; GFX9-NEXT: s_bfe_u32 s1, s4, 0xf000f 6006; GFX9-NEXT: v_mov_b32_e32 v3, s6 6007; GFX9-NEXT: v_alignbit_b32 v3, s7, v3, 30 6008; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 6009; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 6010; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 6011; GFX9-NEXT: v_and_b32_e32 v3, s8, v3 6012; GFX9-NEXT: v_trunc_f32_e32 v5, v5 6013; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4 6014; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 6015; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v3 6016; GFX9-NEXT: v_mov_b32_e32 v0, s4 6017; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 30 6018; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 6019; GFX9-NEXT: v_mul_f32_e32 v1, v7, v8 6020; GFX9-NEXT: v_and_b32_e32 v0, s8, v0 6021; GFX9-NEXT: v_trunc_f32_e32 v1, v1 6022; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc 6023; GFX9-NEXT: v_mad_f32 v5, -v1, v6, v7 6024; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 6025; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 6026; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v3 6027; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v6 6028; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc 6029; GFX9-NEXT: v_mul_f32_e32 v1, v0, v7 6030; GFX9-NEXT: v_trunc_f32_e32 v1, v1 6031; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v1 6032; GFX9-NEXT: v_mad_f32 v0, -v1, v3, v0 6033; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v3 6034; GFX9-NEXT: v_and_b32_e32 v3, s8, v4 6035; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc 6036; 
GFX9-NEXT: v_and_b32_e32 v4, s8, v5 6037; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 6038; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 6039; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 6040; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 6041; GFX9-NEXT: global_store_dword v2, v0, s[2:3] 6042; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 6043; GFX9-NEXT: global_store_short v2, v0, s[2:3] offset:4 6044; GFX9-NEXT: s_endpgm 6045; 6046; GFX90A-LABEL: udiv_v3i15: 6047; GFX90A: ; %bb.0: 6048; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6049; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 6050; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 6051; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 6052; GFX90A-NEXT: v_mov_b32_e32 v2, 0 6053; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 6054; GFX90A-NEXT: s_and_b32 s0, s4, s8 6055; GFX90A-NEXT: s_and_b32 s1, s6, s8 6056; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s1 6057; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s0 6058; GFX90A-NEXT: s_bfe_u32 s0, s6, 0xf000f 6059; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s0 6060; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v1 6061; GFX90A-NEXT: s_bfe_u32 s1, s4, 0xf000f 6062; GFX90A-NEXT: v_mov_b32_e32 v3, s6 6063; GFX90A-NEXT: v_alignbit_b32 v3, s7, v3, 30 6064; GFX90A-NEXT: v_mul_f32_e32 v5, v4, v5 6065; GFX90A-NEXT: v_cvt_f32_u32_e32 v7, s1 6066; GFX90A-NEXT: v_rcp_iflag_f32_e32 v8, v6 6067; GFX90A-NEXT: v_and_b32_e32 v3, s8, v3 6068; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 6069; GFX90A-NEXT: v_mad_f32 v4, -v5, v1, v4 6070; GFX90A-NEXT: v_cvt_u32_f32_e32 v5, v5 6071; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, v3 6072; GFX90A-NEXT: v_mov_b32_e32 v0, s4 6073; GFX90A-NEXT: v_alignbit_b32 v0, s5, v0, 30 6074; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 6075; GFX90A-NEXT: v_mul_f32_e32 v1, v7, v8 6076; GFX90A-NEXT: v_and_b32_e32 v0, s8, v0 6077; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 6078; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc 6079; GFX90A-NEXT: v_mad_f32 v5, -v1, v6, v7 6080; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 6081; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, v0 6082; 
GFX90A-NEXT: v_rcp_iflag_f32_e32 v7, v3 6083; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v6 6084; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc 6085; GFX90A-NEXT: v_mul_f32_e32 v1, v0, v7 6086; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 6087; GFX90A-NEXT: v_cvt_u32_f32_e32 v6, v1 6088; GFX90A-NEXT: v_mad_f32 v0, -v1, v3, v0 6089; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v3 6090; GFX90A-NEXT: v_and_b32_e32 v3, s8, v4 6091; GFX90A-NEXT: v_and_b32_e32 v4, s8, v5 6092; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc 6093; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 15, v4 6094; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 6095; GFX90A-NEXT: v_or_b32_e32 v3, v3, v4 6096; GFX90A-NEXT: v_or_b32_e32 v0, v3, v0 6097; GFX90A-NEXT: global_store_dword v2, v0, s[2:3] 6098; GFX90A-NEXT: v_and_b32_e32 v0, 0x1fff, v1 6099; GFX90A-NEXT: global_store_short v2, v0, s[2:3] offset:4 6100; GFX90A-NEXT: s_endpgm 6101 %r = udiv <3 x i15> %x, %y 6102 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 6103 ret void 6104} 6105 6106define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 6107; CHECK-LABEL: @urem_v3i15( 6108; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 6109; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 6110; CHECK-NEXT: [[TMP3:%.*]] = zext i15 [[TMP1]] to i32 6111; CHECK-NEXT: [[TMP4:%.*]] = zext i15 [[TMP2]] to i32 6112; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 6113; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 6114; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 6115; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 6116; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 6117; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 6118; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 6119; CHECK-NEXT: [[TMP12:%.*]] = fptoui float 
[[TMP9]] to i32 6120; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 6121; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 6122; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 6123; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 6124; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 6125; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]] 6126; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]] 6127; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 32767 6128; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i15 6129; CHECK-NEXT: [[TMP22:%.*]] = insertelement <3 x i15> undef, i15 [[TMP21]], i64 0 6130; CHECK-NEXT: [[TMP23:%.*]] = extractelement <3 x i15> [[X]], i64 1 6131; CHECK-NEXT: [[TMP24:%.*]] = extractelement <3 x i15> [[Y]], i64 1 6132; CHECK-NEXT: [[TMP25:%.*]] = zext i15 [[TMP23]] to i32 6133; CHECK-NEXT: [[TMP26:%.*]] = zext i15 [[TMP24]] to i32 6134; CHECK-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float 6135; CHECK-NEXT: [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float 6136; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]]) 6137; CHECK-NEXT: [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]] 6138; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]]) 6139; CHECK-NEXT: [[TMP32:%.*]] = fneg fast float [[TMP31]] 6140; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]]) 6141; CHECK-NEXT: [[TMP34:%.*]] = fptoui float [[TMP31]] to i32 6142; CHECK-NEXT: [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 6143; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]]) 6144; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]] 6145; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0 6146; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]] 6147; CHECK-NEXT: 
[[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]] 6148; CHECK-NEXT: [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]] 6149; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], 32767 6150; CHECK-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i15 6151; CHECK-NEXT: [[TMP44:%.*]] = insertelement <3 x i15> [[TMP22]], i15 [[TMP43]], i64 1 6152; CHECK-NEXT: [[TMP45:%.*]] = extractelement <3 x i15> [[X]], i64 2 6153; CHECK-NEXT: [[TMP46:%.*]] = extractelement <3 x i15> [[Y]], i64 2 6154; CHECK-NEXT: [[TMP47:%.*]] = zext i15 [[TMP45]] to i32 6155; CHECK-NEXT: [[TMP48:%.*]] = zext i15 [[TMP46]] to i32 6156; CHECK-NEXT: [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float 6157; CHECK-NEXT: [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float 6158; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]]) 6159; CHECK-NEXT: [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]] 6160; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]]) 6161; CHECK-NEXT: [[TMP54:%.*]] = fneg fast float [[TMP53]] 6162; CHECK-NEXT: [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]]) 6163; CHECK-NEXT: [[TMP56:%.*]] = fptoui float [[TMP53]] to i32 6164; CHECK-NEXT: [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]]) 6165; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]]) 6166; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]] 6167; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0 6168; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]] 6169; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]] 6170; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]] 6171; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 32767 6172; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i15 6173; CHECK-NEXT: [[TMP66:%.*]] = insertelement <3 x i15> [[TMP44]], i15 [[TMP65]], i64 2 6174; CHECK-NEXT: store <3 x i15> [[TMP66]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 
6175; CHECK-NEXT: ret void 6176; 6177; GFX6-LABEL: urem_v3i15: 6178; GFX6: ; %bb.0: 6179; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 6180; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 6181; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 6182; GFX6-NEXT: s_mov_b32 s7, 0xf000 6183; GFX6-NEXT: s_mov_b32 s6, -1 6184; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6185; GFX6-NEXT: v_mov_b32_e32 v0, s2 6186; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 6187; GFX6-NEXT: s_movk_i32 s3, 0x7fff 6188; GFX6-NEXT: s_and_b32 s10, s0, s3 6189; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s10 6190; GFX6-NEXT: s_and_b32 s9, s2, s3 6191; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s9 6192; GFX6-NEXT: v_mov_b32_e32 v2, s0 6193; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 6194; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 6195; GFX6-NEXT: s_bfe_u32 s1, s0, 0xf000f 6196; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s1 6197; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 6198; GFX6-NEXT: v_trunc_f32_e32 v4, v4 6199; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 6200; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 6201; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 6202; GFX6-NEXT: s_bfe_u32 s10, s2, 0xf000f 6203; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s10 6204; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc 6205; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 6206; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 6207; GFX6-NEXT: v_and_b32_e32 v2, s3, v2 6208; GFX6-NEXT: v_and_b32_e32 v0, s3, v0 6209; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s2, v1 6210; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 6211; GFX6-NEXT: v_cvt_f32_u32_e32 v4, v2 6212; GFX6-NEXT: v_cvt_f32_u32_e32 v7, v0 6213; GFX6-NEXT: v_trunc_f32_e32 v1, v1 6214; GFX6-NEXT: v_mad_f32 v3, -v1, v5, v3 6215; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v4 6216; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 6217; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 6218; GFX6-NEXT: s_lshr_b32 s0, s0, 15 6219; GFX6-NEXT: v_mul_f32_e32 v3, v7, v8 6220; GFX6-NEXT: v_trunc_f32_e32 v3, v3 6221; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v3 6222; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 6223; GFX6-NEXT: 
v_mad_f32 v3, -v3, v4, v7 6224; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 6225; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 6226; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 6227; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 6228; GFX6-NEXT: s_lshr_b32 s8, s2, 15 6229; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s8, v1 6230; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, v2, v0 6231; GFX6-NEXT: v_and_b32_e32 v3, s3, v3 6232; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 6233; GFX6-NEXT: v_and_b32_e32 v2, s3, v6 6234; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 6235; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 6236; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 6237; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 6238; GFX6-NEXT: s_waitcnt expcnt(0) 6239; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 6240; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 6241; GFX6-NEXT: s_endpgm 6242; 6243; GFX9-LABEL: urem_v3i15: 6244; GFX9: ; %bb.0: 6245; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6246; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 6247; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 6248; GFX9-NEXT: s_movk_i32 s8, 0x7fff 6249; GFX9-NEXT: v_mov_b32_e32 v2, 0 6250; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6251; GFX9-NEXT: v_mov_b32_e32 v0, s4 6252; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 30 6253; GFX9-NEXT: s_and_b32 s5, s6, s8 6254; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 6255; GFX9-NEXT: s_and_b32 s0, s4, s8 6256; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0 6257; GFX9-NEXT: s_bfe_u32 s5, s6, 0xf000f 6258; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 6259; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s5 6260; GFX9-NEXT: v_mov_b32_e32 v3, s6 6261; GFX9-NEXT: v_alignbit_b32 v3, s7, v3, 30 6262; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 6263; GFX9-NEXT: v_trunc_f32_e32 v5, v5 6264; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4 6265; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 6266; GFX9-NEXT: s_bfe_u32 s1, s4, 0xf000f 6267; GFX9-NEXT: v_and_b32_e32 v3, s8, v3 6268; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 6269; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 6270; GFX9-NEXT: v_rcp_iflag_f32_e32 
v8, v6 6271; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc 6272; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v3 6273; GFX9-NEXT: v_and_b32_e32 v0, s8, v0 6274; GFX9-NEXT: v_mul_f32_e32 v4, v7, v8 6275; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v0 6276; GFX9-NEXT: v_rcp_iflag_f32_e32 v9, v5 6277; GFX9-NEXT: v_trunc_f32_e32 v4, v4 6278; GFX9-NEXT: v_mad_f32 v7, -v4, v6, v7 6279; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 6280; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, v6 6281; GFX9-NEXT: v_mul_f32_e32 v6, v8, v9 6282; GFX9-NEXT: v_trunc_f32_e32 v6, v6 6283; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v6 6284; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc 6285; GFX9-NEXT: v_mad_f32 v6, -v6, v5, v8 6286; GFX9-NEXT: s_lshr_b32 s0, s6, 15 6287; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 6288; GFX9-NEXT: v_mul_lo_u32 v4, v4, s0 6289; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc 6290; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 6291; GFX9-NEXT: v_mul_lo_u32 v3, v5, v3 6292; GFX9-NEXT: s_lshr_b32 s0, s4, 15 6293; GFX9-NEXT: v_sub_u32_e32 v4, s0, v4 6294; GFX9-NEXT: v_sub_u32_e32 v5, s4, v1 6295; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 6296; GFX9-NEXT: v_and_b32_e32 v4, s8, v4 6297; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 6298; GFX9-NEXT: v_and_b32_e32 v3, s8, v5 6299; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 6300; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 6301; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 6302; GFX9-NEXT: global_store_dword v2, v0, s[2:3] 6303; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 6304; GFX9-NEXT: global_store_short v2, v0, s[2:3] offset:4 6305; GFX9-NEXT: s_endpgm 6306; 6307; GFX90A-LABEL: urem_v3i15: 6308; GFX90A: ; %bb.0: 6309; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6310; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 6311; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 6312; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 6313; GFX90A-NEXT: v_mov_b32_e32 v2, 0 6314; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 6315; GFX90A-NEXT: s_and_b32 s1, s4, s8 6316; GFX90A-NEXT: s_and_b32 s9, s6, s8 6317; GFX90A-NEXT: 
v_cvt_f32_u32_e32 v1, s9 6318; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s1 6319; GFX90A-NEXT: v_mov_b32_e32 v3, s6 6320; GFX90A-NEXT: v_alignbit_b32 v3, s7, v3, 30 6321; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v1 6322; GFX90A-NEXT: s_bfe_u32 s7, s6, 0xf000f 6323; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s7 6324; GFX90A-NEXT: v_mov_b32_e32 v0, s4 6325; GFX90A-NEXT: v_mul_f32_e32 v5, v4, v5 6326; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 6327; GFX90A-NEXT: v_mad_f32 v4, -v5, v1, v4 6328; GFX90A-NEXT: v_cvt_u32_f32_e32 v5, v5 6329; GFX90A-NEXT: v_alignbit_b32 v0, s5, v0, 30 6330; GFX90A-NEXT: s_bfe_u32 s5, s4, 0xf000f 6331; GFX90A-NEXT: v_cvt_f32_u32_e32 v7, s5 6332; GFX90A-NEXT: v_rcp_iflag_f32_e32 v8, v6 6333; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 6334; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc 6335; GFX90A-NEXT: v_and_b32_e32 v3, s8, v3 6336; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s6 6337; GFX90A-NEXT: v_sub_u32_e32 v4, s4, v1 6338; GFX90A-NEXT: v_mul_f32_e32 v1, v7, v8 6339; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, v3 6340; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 6341; GFX90A-NEXT: v_mad_f32 v7, -v1, v6, v7 6342; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 6343; GFX90A-NEXT: v_and_b32_e32 v0, s8, v0 6344; GFX90A-NEXT: v_cvt_f32_u32_e32 v8, v0 6345; GFX90A-NEXT: v_rcp_iflag_f32_e32 v9, v5 6346; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, v6 6347; GFX90A-NEXT: s_lshr_b32 s1, s6, 15 6348; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 6349; GFX90A-NEXT: s_lshr_b32 s0, s4, 15 6350; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s1 6351; GFX90A-NEXT: v_sub_u32_e32 v6, s0, v1 6352; GFX90A-NEXT: v_mul_f32_e32 v1, v8, v9 6353; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 6354; GFX90A-NEXT: v_cvt_u32_f32_e32 v7, v1 6355; GFX90A-NEXT: v_mad_f32 v1, -v1, v5, v8 6356; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v5 6357; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc 6358; GFX90A-NEXT: v_mul_lo_u32 v1, v1, v3 6359; GFX90A-NEXT: v_and_b32_e32 v3, s8, v4 6360; GFX90A-NEXT: v_and_b32_e32 v4, s8, v6 6361; GFX90A-NEXT: 
v_sub_u32_e32 v0, v0, v1 6362; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 15, v4 6363; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 6364; GFX90A-NEXT: v_or_b32_e32 v3, v3, v4 6365; GFX90A-NEXT: v_or_b32_e32 v0, v3, v0 6366; GFX90A-NEXT: global_store_dword v2, v0, s[2:3] 6367; GFX90A-NEXT: v_and_b32_e32 v0, 0x1fff, v1 6368; GFX90A-NEXT: global_store_short v2, v0, s[2:3] offset:4 6369; GFX90A-NEXT: s_endpgm 6370 %r = urem <3 x i15> %x, %y 6371 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 6372 ret void 6373} 6374
; sdiv_v3i15: signed division of two <3 x i15> kernel arguments; the quotient
; vector is stored to %out. Per the opt assertions that follow, each lane is
; sign-extended to i32 and divided through the float path (sitofp,
; llvm.amdgcn.rcp.f32, fmul, llvm.trunc.f32, llvm.amdgcn.fmad.ftz.f32). The
; truncated quotient q is then adjusted by ((x ^ y) >> 30) | 1 when
; |x - q*y| >= |y|, sign-extended in-register (shl 17 / ashr 17) and truncated
; back to i15 before being inserted into the result vector.
; The GFX6, GFX9 and GFX90A assertions are the llc output for the RUN lines at
; the top of the file. All assertions are autogenerated: regenerate them with
; the update scripts named in the file header instead of editing them by hand.
6375define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 6376; CHECK-LABEL: @sdiv_v3i15( 6377; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 6378; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 6379; CHECK-NEXT: [[TMP3:%.*]] = sext i15 [[TMP1]] to i32 6380; CHECK-NEXT: [[TMP4:%.*]] = sext i15 [[TMP2]] to i32 6381; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 6382; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 6383; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 6384; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 6385; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 6386; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 6387; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 6388; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 6389; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 6390; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 6391; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 6392; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 6393; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 6394; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 6395; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], 
i32 0 6396; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 6397; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 17 6398; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 17 6399; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i15 6400; CHECK-NEXT: [[TMP24:%.*]] = insertelement <3 x i15> undef, i15 [[TMP23]], i64 0 6401; CHECK-NEXT: [[TMP25:%.*]] = extractelement <3 x i15> [[X]], i64 1 6402; CHECK-NEXT: [[TMP26:%.*]] = extractelement <3 x i15> [[Y]], i64 1 6403; CHECK-NEXT: [[TMP27:%.*]] = sext i15 [[TMP25]] to i32 6404; CHECK-NEXT: [[TMP28:%.*]] = sext i15 [[TMP26]] to i32 6405; CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]] 6406; CHECK-NEXT: [[TMP30:%.*]] = ashr i32 [[TMP29]], 30 6407; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 1 6408; CHECK-NEXT: [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float 6409; CHECK-NEXT: [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float 6410; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 6411; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]] 6412; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]]) 6413; CHECK-NEXT: [[TMP37:%.*]] = fneg fast float [[TMP36]] 6414; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]]) 6415; CHECK-NEXT: [[TMP39:%.*]] = fptosi float [[TMP36]] to i32 6416; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]]) 6417; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 6418; CHECK-NEXT: [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]] 6419; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0 6420; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]] 6421; CHECK-NEXT: [[TMP45:%.*]] = shl i32 [[TMP44]], 17 6422; CHECK-NEXT: [[TMP46:%.*]] = ashr i32 [[TMP45]], 17 6423; CHECK-NEXT: [[TMP47:%.*]] = trunc i32 [[TMP46]] to i15 6424; CHECK-NEXT: [[TMP48:%.*]] = insertelement <3 x i15> 
[[TMP24]], i15 [[TMP47]], i64 1 6425; CHECK-NEXT: [[TMP49:%.*]] = extractelement <3 x i15> [[X]], i64 2 6426; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x i15> [[Y]], i64 2 6427; CHECK-NEXT: [[TMP51:%.*]] = sext i15 [[TMP49]] to i32 6428; CHECK-NEXT: [[TMP52:%.*]] = sext i15 [[TMP50]] to i32 6429; CHECK-NEXT: [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]] 6430; CHECK-NEXT: [[TMP54:%.*]] = ashr i32 [[TMP53]], 30 6431; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 1 6432; CHECK-NEXT: [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float 6433; CHECK-NEXT: [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float 6434; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]]) 6435; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]] 6436; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]]) 6437; CHECK-NEXT: [[TMP61:%.*]] = fneg fast float [[TMP60]] 6438; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]]) 6439; CHECK-NEXT: [[TMP63:%.*]] = fptosi float [[TMP60]] to i32 6440; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]]) 6441; CHECK-NEXT: [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]]) 6442; CHECK-NEXT: [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]] 6443; CHECK-NEXT: [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0 6444; CHECK-NEXT: [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]] 6445; CHECK-NEXT: [[TMP69:%.*]] = shl i32 [[TMP68]], 17 6446; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 17 6447; CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i15 6448; CHECK-NEXT: [[TMP72:%.*]] = insertelement <3 x i15> [[TMP48]], i15 [[TMP71]], i64 2 6449; CHECK-NEXT: store <3 x i15> [[TMP72]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 6450; CHECK-NEXT: ret void 6451; 6452; GFX6-LABEL: sdiv_v3i15: 6453; GFX6: ; %bb.0: 6454; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 6455; GFX6-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0xb 6456; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 6457; GFX6-NEXT: s_mov_b32 s7, 0xf000 6458; GFX6-NEXT: s_mov_b32 s6, -1 6459; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6460; GFX6-NEXT: v_mov_b32_e32 v0, s2 6461; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 6462; GFX6-NEXT: s_bfe_i32 s3, s0, 0xf0000 6463; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s3 6464; GFX6-NEXT: v_mov_b32_e32 v1, s0 6465; GFX6-NEXT: v_alignbit_b32 v1, s1, v1, 30 6466; GFX6-NEXT: s_bfe_i32 s1, s2, 0xf0000 6467; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s1 6468; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 6469; GFX6-NEXT: s_xor_b32 s1, s1, s3 6470; GFX6-NEXT: s_bfe_i32 s0, s0, 0xf000f 6471; GFX6-NEXT: s_ashr_i32 s1, s1, 30 6472; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 6473; GFX6-NEXT: v_trunc_f32_e32 v4, v4 6474; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 6475; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 6476; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| 6477; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s0 6478; GFX6-NEXT: s_or_b32 s1, s1, 1 6479; GFX6-NEXT: v_mov_b32_e32 v5, s1 6480; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc 6481; GFX6-NEXT: s_bfe_i32 s1, s2, 0xf000f 6482; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 6483; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s1 6484; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 6485; GFX6-NEXT: s_xor_b32 s0, s1, s0 6486; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 15 6487; GFX6-NEXT: s_ashr_i32 s0, s0, 30 6488; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 6489; GFX6-NEXT: v_trunc_f32_e32 v5, v5 6490; GFX6-NEXT: v_mad_f32 v4, -v5, v3, v4 6491; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 6492; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v3| 6493; GFX6-NEXT: v_cvt_f32_i32_e32 v4, v1 6494; GFX6-NEXT: s_or_b32 s0, s0, 1 6495; GFX6-NEXT: v_mov_b32_e32 v6, s0 6496; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc 6497; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15 6498; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 6499; GFX6-NEXT: v_cvt_f32_i32_e32 v5, v0 6500; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 6501; GFX6-NEXT: v_xor_b32_e32 v0, v0, v1 6502; GFX6-NEXT: v_ashrrev_i32_e32 v0, 30, v0 
6503; GFX6-NEXT: v_or_b32_e32 v0, 1, v0 6504; GFX6-NEXT: v_mul_f32_e32 v1, v5, v6 6505; GFX6-NEXT: v_trunc_f32_e32 v1, v1 6506; GFX6-NEXT: v_mad_f32 v5, -v1, v4, v5 6507; GFX6-NEXT: v_cvt_i32_f32_e32 v1, v1 6508; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v4| 6509; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 6510; GFX6-NEXT: s_movk_i32 s0, 0x7fff 6511; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 6512; GFX6-NEXT: v_and_b32_e32 v3, s0, v3 6513; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 6514; GFX6-NEXT: v_and_b32_e32 v2, s0, v2 6515; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 6516; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 6517; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 6518; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 6519; GFX6-NEXT: s_waitcnt expcnt(0) 6520; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 6521; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 6522; GFX6-NEXT: s_endpgm 6523; 6524; GFX9-LABEL: sdiv_v3i15: 6525; GFX9: ; %bb.0: 6526; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6527; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 6528; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 6529; GFX9-NEXT: v_mov_b32_e32 v2, 0 6530; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6531; GFX9-NEXT: s_bfe_i32 s1, s4, 0xf0000 6532; GFX9-NEXT: s_bfe_i32 s0, s6, 0xf0000 6533; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 6534; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s1 6535; GFX9-NEXT: s_xor_b32 s0, s1, s0 6536; GFX9-NEXT: v_mov_b32_e32 v0, s4 6537; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 6538; GFX9-NEXT: s_ashr_i32 s0, s0, 30 6539; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 30 6540; GFX9-NEXT: s_or_b32 s5, s0, 1 6541; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 6542; GFX9-NEXT: v_trunc_f32_e32 v5, v5 6543; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 6544; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| 6545; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 6546; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 6547; GFX9-NEXT: s_cselect_b32 s0, s5, 0 6548; GFX9-NEXT: s_bfe_i32 s1, s6, 0xf000f 6549; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s1 6550; GFX9-NEXT: 
v_add_u32_e32 v4, s0, v5 6551; GFX9-NEXT: s_bfe_i32 s0, s4, 0xf000f 6552; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 6553; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 6554; GFX9-NEXT: v_mov_b32_e32 v1, s6 6555; GFX9-NEXT: v_alignbit_b32 v1, s7, v1, 30 6556; GFX9-NEXT: s_xor_b32 s0, s0, s1 6557; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 6558; GFX9-NEXT: v_trunc_f32_e32 v6, v6 6559; GFX9-NEXT: s_ashr_i32 s0, s0, 30 6560; GFX9-NEXT: v_mad_f32 v5, -v6, v3, v5 6561; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 6562; GFX9-NEXT: s_or_b32 s4, s0, 1 6563; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 6564; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v3| 6565; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v1 6566; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 6567; GFX9-NEXT: s_cselect_b32 s0, s4, 0 6568; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 15 6569; GFX9-NEXT: v_add_u32_e32 v5, s0, v6 6570; GFX9-NEXT: v_cvt_f32_i32_e32 v6, v0 6571; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v3 6572; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1 6573; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v0 6574; GFX9-NEXT: v_or_b32_e32 v0, 1, v0 6575; GFX9-NEXT: v_mul_f32_e32 v1, v6, v7 6576; GFX9-NEXT: v_trunc_f32_e32 v1, v1 6577; GFX9-NEXT: v_cvt_i32_f32_e32 v7, v1 6578; GFX9-NEXT: v_mad_f32 v1, -v1, v3, v6 6579; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v3| 6580; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 6581; GFX9-NEXT: s_movk_i32 s0, 0x7fff 6582; GFX9-NEXT: v_add_u32_e32 v0, v7, v0 6583; GFX9-NEXT: v_and_b32_e32 v3, s0, v4 6584; GFX9-NEXT: v_and_b32_e32 v4, s0, v5 6585; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 6586; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 6587; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 6588; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 6589; GFX9-NEXT: global_store_dword v2, v0, s[2:3] 6590; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 6591; GFX9-NEXT: global_store_short v2, v0, s[2:3] offset:4 6592; GFX9-NEXT: s_endpgm 6593; 6594; GFX90A-LABEL: sdiv_v3i15: 6595; GFX90A: ; %bb.0: 6596; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6597; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 
0x2c 6598; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 6599; GFX90A-NEXT: v_mov_b32_e32 v2, 0 6600; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 6601; GFX90A-NEXT: s_bfe_i32 s1, s4, 0xf0000 6602; GFX90A-NEXT: s_bfe_i32 s0, s6, 0xf0000 6603; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s0 6604; GFX90A-NEXT: v_cvt_f32_i32_e32 v4, s1 6605; GFX90A-NEXT: s_xor_b32 s0, s1, s0 6606; GFX90A-NEXT: v_mov_b32_e32 v0, s4 6607; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v3 6608; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 6609; GFX90A-NEXT: v_alignbit_b32 v0, s5, v0, 30 6610; GFX90A-NEXT: s_or_b32 s5, s0, 1 6611; GFX90A-NEXT: v_mul_f32_e32 v5, v4, v5 6612; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 6613; GFX90A-NEXT: v_mad_f32 v4, -v5, v3, v4 6614; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| 6615; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 6616; GFX90A-NEXT: v_cvt_i32_f32_e32 v5, v5 6617; GFX90A-NEXT: s_cselect_b32 s0, s5, 0 6618; GFX90A-NEXT: s_bfe_i32 s1, s6, 0xf000f 6619; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s1 6620; GFX90A-NEXT: v_add_u32_e32 v4, s0, v5 6621; GFX90A-NEXT: s_bfe_i32 s0, s4, 0xf000f 6622; GFX90A-NEXT: v_cvt_f32_i32_e32 v5, s0 6623; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v3 6624; GFX90A-NEXT: v_mov_b32_e32 v1, s6 6625; GFX90A-NEXT: v_alignbit_b32 v1, s7, v1, 30 6626; GFX90A-NEXT: s_xor_b32 s0, s0, s1 6627; GFX90A-NEXT: v_mul_f32_e32 v6, v5, v6 6628; GFX90A-NEXT: v_trunc_f32_e32 v6, v6 6629; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 6630; GFX90A-NEXT: v_mad_f32 v5, -v6, v3, v5 6631; GFX90A-NEXT: v_bfe_i32 v1, v1, 0, 15 6632; GFX90A-NEXT: s_or_b32 s4, s0, 1 6633; GFX90A-NEXT: v_cvt_i32_f32_e32 v6, v6 6634; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v3| 6635; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, v1 6636; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 6637; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 6638; GFX90A-NEXT: v_bfe_i32 v0, v0, 0, 15 6639; GFX90A-NEXT: v_add_u32_e32 v5, s0, v6 6640; GFX90A-NEXT: v_cvt_f32_i32_e32 v6, v0 6641; GFX90A-NEXT: v_rcp_iflag_f32_e32 v7, v3 6642; GFX90A-NEXT: v_xor_b32_e32 v0, v0, 
v1 6643; GFX90A-NEXT: v_ashrrev_i32_e32 v0, 30, v0 6644; GFX90A-NEXT: v_or_b32_e32 v0, 1, v0 6645; GFX90A-NEXT: v_mul_f32_e32 v1, v6, v7 6646; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 6647; GFX90A-NEXT: v_cvt_i32_f32_e32 v7, v1 6648; GFX90A-NEXT: v_mad_f32 v1, -v1, v3, v6 6649; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v3| 6650; GFX90A-NEXT: s_movk_i32 s0, 0x7fff 6651; GFX90A-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 6652; GFX90A-NEXT: v_and_b32_e32 v3, s0, v4 6653; GFX90A-NEXT: v_and_b32_e32 v4, s0, v5 6654; GFX90A-NEXT: v_add_u32_e32 v0, v7, v0 6655; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 15, v4 6656; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 6657; GFX90A-NEXT: v_or_b32_e32 v3, v3, v4 6658; GFX90A-NEXT: v_or_b32_e32 v0, v3, v0 6659; GFX90A-NEXT: global_store_dword v2, v0, s[2:3] 6660; GFX90A-NEXT: v_and_b32_e32 v0, 0x1fff, v1 6661; GFX90A-NEXT: global_store_short v2, v0, s[2:3] offset:4 6662; GFX90A-NEXT: s_endpgm 6663 %r = sdiv <3 x i15> %x, %y 6664 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 6665 ret void 6666} 6667
; srem_v3i15: signed remainder of two <3 x i15> kernel arguments; the remainder
; vector is stored to %out. Per the opt assertions that follow, each lane is
; sign-extended to i32 and a quotient q is formed through the float path
; (sitofp, llvm.amdgcn.rcp.f32, fmul, llvm.trunc.f32, llvm.amdgcn.fmad.ftz.f32)
; with the ((x ^ y) >> 30) | 1 adjustment when |x - q*y| >= |y|. The remainder
; is then computed as x - q*y (mul + sub), sign-extended in-register
; (shl 17 / ashr 17) and truncated back to i15 before insertion into the result.
; The GFX6, GFX9 and GFX90A assertions are the llc output for the RUN lines at
; the top of the file. All assertions are autogenerated: regenerate them with
; the update scripts named in the file header instead of editing them by hand.
6668define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 6669; CHECK-LABEL: @srem_v3i15( 6670; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 6671; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 6672; CHECK-NEXT: [[TMP3:%.*]] = sext i15 [[TMP1]] to i32 6673; CHECK-NEXT: [[TMP4:%.*]] = sext i15 [[TMP2]] to i32 6674; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 6675; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 6676; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 6677; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 6678; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 6679; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 6680; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 6681; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 6682; CHECK-NEXT: [[TMP13:%.*]] = 
fneg fast float [[TMP12]] 6683; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 6684; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 6685; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 6686; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 6687; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 6688; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 6689; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 6690; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]] 6691; CHECK-NEXT: [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]] 6692; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 17 6693; CHECK-NEXT: [[TMP24:%.*]] = ashr i32 [[TMP23]], 17 6694; CHECK-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i15 6695; CHECK-NEXT: [[TMP26:%.*]] = insertelement <3 x i15> undef, i15 [[TMP25]], i64 0 6696; CHECK-NEXT: [[TMP27:%.*]] = extractelement <3 x i15> [[X]], i64 1 6697; CHECK-NEXT: [[TMP28:%.*]] = extractelement <3 x i15> [[Y]], i64 1 6698; CHECK-NEXT: [[TMP29:%.*]] = sext i15 [[TMP27]] to i32 6699; CHECK-NEXT: [[TMP30:%.*]] = sext i15 [[TMP28]] to i32 6700; CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]] 6701; CHECK-NEXT: [[TMP32:%.*]] = ashr i32 [[TMP31]], 30 6702; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP32]], 1 6703; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float 6704; CHECK-NEXT: [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float 6705; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 6706; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]] 6707; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]]) 6708; CHECK-NEXT: [[TMP39:%.*]] = fneg fast float [[TMP38]] 6709; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]]) 6710; CHECK-NEXT: 
[[TMP41:%.*]] = fptosi float [[TMP38]] to i32 6711; CHECK-NEXT: [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]]) 6712; CHECK-NEXT: [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]]) 6713; CHECK-NEXT: [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]] 6714; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0 6715; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]] 6716; CHECK-NEXT: [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]] 6717; CHECK-NEXT: [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]] 6718; CHECK-NEXT: [[TMP49:%.*]] = shl i32 [[TMP48]], 17 6719; CHECK-NEXT: [[TMP50:%.*]] = ashr i32 [[TMP49]], 17 6720; CHECK-NEXT: [[TMP51:%.*]] = trunc i32 [[TMP50]] to i15 6721; CHECK-NEXT: [[TMP52:%.*]] = insertelement <3 x i15> [[TMP26]], i15 [[TMP51]], i64 1 6722; CHECK-NEXT: [[TMP53:%.*]] = extractelement <3 x i15> [[X]], i64 2 6723; CHECK-NEXT: [[TMP54:%.*]] = extractelement <3 x i15> [[Y]], i64 2 6724; CHECK-NEXT: [[TMP55:%.*]] = sext i15 [[TMP53]] to i32 6725; CHECK-NEXT: [[TMP56:%.*]] = sext i15 [[TMP54]] to i32 6726; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]] 6727; CHECK-NEXT: [[TMP58:%.*]] = ashr i32 [[TMP57]], 30 6728; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 1 6729; CHECK-NEXT: [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float 6730; CHECK-NEXT: [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float 6731; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]]) 6732; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]] 6733; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]]) 6734; CHECK-NEXT: [[TMP65:%.*]] = fneg fast float [[TMP64]] 6735; CHECK-NEXT: [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]]) 6736; CHECK-NEXT: [[TMP67:%.*]] = fptosi float [[TMP64]] to i32 6737; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 6738; CHECK-NEXT: [[TMP69:%.*]] = call fast 
float @llvm.fabs.f32(float [[TMP61]]) 6739; CHECK-NEXT: [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]] 6740; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0 6741; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]] 6742; CHECK-NEXT: [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]] 6743; CHECK-NEXT: [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]] 6744; CHECK-NEXT: [[TMP75:%.*]] = shl i32 [[TMP74]], 17 6745; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 [[TMP75]], 17 6746; CHECK-NEXT: [[TMP77:%.*]] = trunc i32 [[TMP76]] to i15 6747; CHECK-NEXT: [[TMP78:%.*]] = insertelement <3 x i15> [[TMP52]], i15 [[TMP77]], i64 2 6748; CHECK-NEXT: store <3 x i15> [[TMP78]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 6749; CHECK-NEXT: ret void 6750; 6751; GFX6-LABEL: srem_v3i15: 6752; GFX6: ; %bb.0: 6753; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 6754; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 6755; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 6756; GFX6-NEXT: s_mov_b32 s7, 0xf000 6757; GFX6-NEXT: s_mov_b32 s6, -1 6758; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6759; GFX6-NEXT: v_mov_b32_e32 v0, s2 6760; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 6761; GFX6-NEXT: s_movk_i32 s3, 0x7fff 6762; GFX6-NEXT: s_and_b32 s11, s0, s3 6763; GFX6-NEXT: s_bfe_i32 s11, s11, 0xf0000 6764; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s11 6765; GFX6-NEXT: s_and_b32 s9, s2, s3 6766; GFX6-NEXT: s_bfe_i32 s9, s9, 0xf0000 6767; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s9 6768; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 6769; GFX6-NEXT: s_xor_b32 s9, s9, s11 6770; GFX6-NEXT: s_ashr_i32 s9, s9, 30 6771; GFX6-NEXT: s_or_b32 s9, s9, 1 6772; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 6773; GFX6-NEXT: v_trunc_f32_e32 v4, v4 6774; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 6775; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 6776; GFX6-NEXT: v_mov_b32_e32 v5, s9 6777; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| 6778; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc 6779; GFX6-NEXT: v_mov_b32_e32 v1, s0 6780; GFX6-NEXT: s_bfe_u32 s12, s0, 
0xf000f 6781; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 6782; GFX6-NEXT: v_alignbit_b32 v1, s1, v1, 30 6783; GFX6-NEXT: s_lshr_b32 s1, s0, 15 6784; GFX6-NEXT: v_mul_lo_u32 v2, v2, s0 6785; GFX6-NEXT: s_bfe_i32 s0, s12, 0xf0000 6786; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s0 6787; GFX6-NEXT: s_bfe_u32 s10, s2, 0xf000f 6788; GFX6-NEXT: s_lshr_b32 s8, s2, 15 6789; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 6790; GFX6-NEXT: s_bfe_i32 s2, s10, 0xf0000 6791; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s2 6792; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 6793; GFX6-NEXT: s_xor_b32 s0, s2, s0 6794; GFX6-NEXT: s_ashr_i32 s0, s0, 30 6795; GFX6-NEXT: s_or_b32 s0, s0, 1 6796; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 6797; GFX6-NEXT: v_trunc_f32_e32 v5, v5 6798; GFX6-NEXT: v_mad_f32 v4, -v5, v3, v4 6799; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 6800; GFX6-NEXT: v_and_b32_e32 v1, s3, v1 6801; GFX6-NEXT: v_mov_b32_e32 v6, s0 6802; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v3| 6803; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc 6804; GFX6-NEXT: v_bfe_i32 v4, v1, 0, 15 6805; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 6806; GFX6-NEXT: v_cvt_f32_i32_e32 v5, v4 6807; GFX6-NEXT: v_and_b32_e32 v0, s3, v0 6808; GFX6-NEXT: v_bfe_i32 v6, v0, 0, 15 6809; GFX6-NEXT: v_cvt_f32_i32_e32 v7, v6 6810; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v5 6811; GFX6-NEXT: v_xor_b32_e32 v4, v6, v4 6812; GFX6-NEXT: v_ashrrev_i32_e32 v4, 30, v4 6813; GFX6-NEXT: v_or_b32_e32 v4, 1, v4 6814; GFX6-NEXT: v_mul_f32_e32 v6, v7, v8 6815; GFX6-NEXT: v_trunc_f32_e32 v6, v6 6816; GFX6-NEXT: v_mad_f32 v7, -v6, v5, v7 6817; GFX6-NEXT: v_cvt_i32_f32_e32 v6, v6 6818; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v5| 6819; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc 6820; GFX6-NEXT: v_mul_lo_u32 v3, v3, s1 6821; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 6822; GFX6-NEXT: v_mul_lo_u32 v1, v4, v1 6823; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s8, v3 6824; GFX6-NEXT: v_and_b32_e32 v3, s3, v3 6825; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 6826; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 
6827; GFX6-NEXT: v_and_b32_e32 v2, s3, v2 6828; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 6829; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 6830; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 6831; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 6832; GFX6-NEXT: s_waitcnt expcnt(0) 6833; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 6834; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 6835; GFX6-NEXT: s_endpgm 6836; 6837; GFX9-LABEL: srem_v3i15: 6838; GFX9: ; %bb.0: 6839; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6840; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 6841; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 6842; GFX9-NEXT: s_movk_i32 s8, 0x7fff 6843; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6844; GFX9-NEXT: s_and_b32 s0, s4, s8 6845; GFX9-NEXT: s_and_b32 s1, s6, s8 6846; GFX9-NEXT: s_bfe_i32 s1, s1, 0xf0000 6847; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 6848; GFX9-NEXT: s_bfe_i32 s0, s0, 0xf0000 6849; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 6850; GFX9-NEXT: s_xor_b32 s0, s0, s1 6851; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2 6852; GFX9-NEXT: v_mov_b32_e32 v0, s4 6853; GFX9-NEXT: v_mov_b32_e32 v1, s6 6854; GFX9-NEXT: s_ashr_i32 s0, s0, 30 6855; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 6856; GFX9-NEXT: v_trunc_f32_e32 v4, v4 6857; GFX9-NEXT: v_mad_f32 v3, -v4, v2, v3 6858; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 6859; GFX9-NEXT: s_lshr_b32 s9, s4, 15 6860; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 30 6861; GFX9-NEXT: s_bfe_u32 s5, s4, 0xf000f 6862; GFX9-NEXT: v_alignbit_b32 v1, s7, v1, 30 6863; GFX9-NEXT: s_lshr_b32 s7, s6, 15 6864; GFX9-NEXT: s_bfe_u32 s10, s6, 0xf000f 6865; GFX9-NEXT: s_or_b32 s11, s0, 1 6866; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v2| 6867; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 6868; GFX9-NEXT: s_cselect_b32 s0, s11, 0 6869; GFX9-NEXT: v_add_u32_e32 v2, s0, v4 6870; GFX9-NEXT: s_bfe_i32 s0, s10, 0xf0000 6871; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 6872; GFX9-NEXT: s_bfe_i32 s1, s5, 0xf0000 6873; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s1 6874; GFX9-NEXT: s_xor_b32 s0, s1, s0 6875; 
GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 6876; GFX9-NEXT: s_ashr_i32 s0, s0, 30 6877; GFX9-NEXT: s_or_b32 s5, s0, 1 6878; GFX9-NEXT: v_and_b32_e32 v1, s8, v1 6879; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 6880; GFX9-NEXT: v_trunc_f32_e32 v5, v5 6881; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 6882; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 6883; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| 6884; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 6885; GFX9-NEXT: s_cselect_b32 s0, s5, 0 6886; GFX9-NEXT: v_bfe_i32 v4, v1, 0, 15 6887; GFX9-NEXT: v_add_u32_e32 v3, s0, v5 6888; GFX9-NEXT: v_cvt_f32_i32_e32 v5, v4 6889; GFX9-NEXT: v_and_b32_e32 v0, s8, v0 6890; GFX9-NEXT: v_bfe_i32 v6, v0, 0, 15 6891; GFX9-NEXT: v_cvt_f32_i32_e32 v7, v6 6892; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5 6893; GFX9-NEXT: v_xor_b32_e32 v4, v6, v4 6894; GFX9-NEXT: v_ashrrev_i32_e32 v4, 30, v4 6895; GFX9-NEXT: v_or_b32_e32 v4, 1, v4 6896; GFX9-NEXT: v_mul_f32_e32 v6, v7, v8 6897; GFX9-NEXT: v_trunc_f32_e32 v6, v6 6898; GFX9-NEXT: v_cvt_i32_f32_e32 v8, v6 6899; GFX9-NEXT: v_mad_f32 v6, -v6, v5, v7 6900; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, |v5| 6901; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc 6902; GFX9-NEXT: v_mul_lo_u32 v3, v3, s7 6903; GFX9-NEXT: v_add_u32_e32 v4, v8, v4 6904; GFX9-NEXT: v_mul_lo_u32 v2, v2, s6 6905; GFX9-NEXT: v_mul_lo_u32 v1, v4, v1 6906; GFX9-NEXT: v_sub_u32_e32 v3, s9, v3 6907; GFX9-NEXT: v_and_b32_e32 v3, s8, v3 6908; GFX9-NEXT: v_sub_u32_e32 v2, s4, v2 6909; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1 6910; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 6911; GFX9-NEXT: v_and_b32_e32 v2, s8, v2 6912; GFX9-NEXT: v_lshlrev_b32_e32 v3, 15, v3 6913; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 6914; GFX9-NEXT: v_mov_b32_e32 v4, 0 6915; GFX9-NEXT: v_or_b32_e32 v0, v2, v0 6916; GFX9-NEXT: global_store_dword v4, v0, s[2:3] 6917; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 6918; GFX9-NEXT: global_store_short v4, v0, s[2:3] offset:4 6919; GFX9-NEXT: s_endpgm 6920; 6921; GFX90A-LABEL: srem_v3i15: 6922; GFX90A: ; %bb.0: 6923; 
GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6924; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 6925; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 6926; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 6927; GFX90A-NEXT: v_mov_b32_e32 v2, 0 6928; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 6929; GFX90A-NEXT: s_and_b32 s0, s4, s8 6930; GFX90A-NEXT: s_and_b32 s1, s6, s8 6931; GFX90A-NEXT: s_bfe_i32 s1, s1, 0xf0000 6932; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s1 6933; GFX90A-NEXT: s_bfe_i32 s0, s0, 0xf0000 6934; GFX90A-NEXT: v_cvt_f32_i32_e32 v4, s0 6935; GFX90A-NEXT: s_xor_b32 s0, s0, s1 6936; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v3 6937; GFX90A-NEXT: v_mov_b32_e32 v0, s4 6938; GFX90A-NEXT: v_mov_b32_e32 v1, s6 6939; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 6940; GFX90A-NEXT: v_mul_f32_e32 v5, v4, v5 6941; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 6942; GFX90A-NEXT: v_mad_f32 v4, -v5, v3, v4 6943; GFX90A-NEXT: v_cvt_i32_f32_e32 v5, v5 6944; GFX90A-NEXT: v_alignbit_b32 v0, s5, v0, 30 6945; GFX90A-NEXT: s_lshr_b32 s5, s4, 15 6946; GFX90A-NEXT: s_bfe_u32 s9, s4, 0xf000f 6947; GFX90A-NEXT: v_alignbit_b32 v1, s7, v1, 30 6948; GFX90A-NEXT: s_lshr_b32 s7, s6, 15 6949; GFX90A-NEXT: s_bfe_u32 s10, s6, 0xf000f 6950; GFX90A-NEXT: s_or_b32 s11, s0, 1 6951; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| 6952; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 6953; GFX90A-NEXT: s_cselect_b32 s0, s11, 0 6954; GFX90A-NEXT: v_add_u32_e32 v3, s0, v5 6955; GFX90A-NEXT: s_bfe_i32 s0, s10, 0xf0000 6956; GFX90A-NEXT: v_cvt_f32_i32_e32 v4, s0 6957; GFX90A-NEXT: s_bfe_i32 s1, s9, 0xf0000 6958; GFX90A-NEXT: v_cvt_f32_i32_e32 v5, s1 6959; GFX90A-NEXT: s_xor_b32 s0, s1, s0 6960; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v4 6961; GFX90A-NEXT: v_mul_lo_u32 v3, v3, s6 6962; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 6963; GFX90A-NEXT: v_sub_u32_e32 v3, s4, v3 6964; GFX90A-NEXT: v_mul_f32_e32 v6, v5, v6 6965; GFX90A-NEXT: v_trunc_f32_e32 v6, v6 6966; GFX90A-NEXT: v_mad_f32 v5, -v6, v4, v5 6967; GFX90A-NEXT: v_cvt_i32_f32_e32 v6, v6 6968; 
GFX90A-NEXT: s_or_b32 s4, s0, 1 6969; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4| 6970; GFX90A-NEXT: v_and_b32_e32 v1, s8, v1 6971; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 6972; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 6973; GFX90A-NEXT: v_bfe_i32 v5, v1, 0, 15 6974; GFX90A-NEXT: v_add_u32_e32 v4, s0, v6 6975; GFX90A-NEXT: v_cvt_f32_i32_e32 v6, v5 6976; GFX90A-NEXT: v_and_b32_e32 v0, s8, v0 6977; GFX90A-NEXT: v_bfe_i32 v7, v0, 0, 15 6978; GFX90A-NEXT: v_cvt_f32_i32_e32 v8, v7 6979; GFX90A-NEXT: v_rcp_iflag_f32_e32 v9, v6 6980; GFX90A-NEXT: v_xor_b32_e32 v5, v7, v5 6981; GFX90A-NEXT: v_ashrrev_i32_e32 v5, 30, v5 6982; GFX90A-NEXT: v_or_b32_e32 v5, 1, v5 6983; GFX90A-NEXT: v_mul_f32_e32 v7, v8, v9 6984; GFX90A-NEXT: v_trunc_f32_e32 v7, v7 6985; GFX90A-NEXT: v_cvt_i32_f32_e32 v9, v7 6986; GFX90A-NEXT: v_mad_f32 v7, -v7, v6, v8 6987; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6| 6988; GFX90A-NEXT: v_mul_lo_u32 v4, v4, s7 6989; GFX90A-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc 6990; GFX90A-NEXT: v_sub_u32_e32 v4, s5, v4 6991; GFX90A-NEXT: v_add_u32_e32 v5, v9, v5 6992; GFX90A-NEXT: v_mul_lo_u32 v1, v5, v1 6993; GFX90A-NEXT: v_and_b32_e32 v4, s8, v4 6994; GFX90A-NEXT: v_sub_u32_e32 v0, v0, v1 6995; GFX90A-NEXT: v_and_b32_e32 v3, s8, v3 6996; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 15, v4 6997; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 6998; GFX90A-NEXT: v_or_b32_e32 v3, v3, v4 6999; GFX90A-NEXT: v_or_b32_e32 v0, v3, v0 7000; GFX90A-NEXT: global_store_dword v2, v0, s[2:3] 7001; GFX90A-NEXT: v_and_b32_e32 v0, 0x1fff, v1 7002; GFX90A-NEXT: global_store_short v2, v0, s[2:3] offset:4 7003; GFX90A-NEXT: s_endpgm 7004 %r = srem <3 x i15> %x, %y 7005 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 7006 ret void 7007} 7008 7009define amdgpu_kernel void @udiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 7010; CHECK-LABEL: @udiv_i32_oddk_denom( 7011; CHECK-NEXT: [[R:%.*]] = udiv i32 [[X:%.*]], 1235195 7012; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 
7013; CHECK-NEXT: ret void 7014; 7015; GFX6-LABEL: udiv_i32_oddk_denom: 7016; GFX6: ; %bb.0: 7017; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 7018; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 7019; GFX6-NEXT: v_mov_b32_e32 v0, 0xb2a50881 7020; GFX6-NEXT: s_mov_b32 s7, 0xf000 7021; GFX6-NEXT: s_mov_b32 s6, -1 7022; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7023; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 7024; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s0, v0 7025; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 7026; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 7027; GFX6-NEXT: v_lshrrev_b32_e32 v0, 20, v0 7028; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 7029; GFX6-NEXT: s_endpgm 7030; 7031; GFX9-LABEL: udiv_i32_oddk_denom: 7032; GFX9: ; %bb.0: 7033; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7034; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 7035; GFX9-NEXT: v_mov_b32_e32 v0, 0 7036; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7037; GFX9-NEXT: s_mul_hi_u32 s0, s4, 0xb2a50881 7038; GFX9-NEXT: s_sub_i32 s1, s4, s0 7039; GFX9-NEXT: s_lshr_b32 s1, s1, 1 7040; GFX9-NEXT: s_add_i32 s1, s1, s0 7041; GFX9-NEXT: s_lshr_b32 s0, s1, 20 7042; GFX9-NEXT: v_mov_b32_e32 v1, s0 7043; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 7044; GFX9-NEXT: s_endpgm 7045; 7046; GFX90A-LABEL: udiv_i32_oddk_denom: 7047; GFX90A: ; %bb.0: 7048; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7049; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 7050; GFX90A-NEXT: v_mov_b32_e32 v0, 0 7051; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 7052; GFX90A-NEXT: s_mul_hi_u32 s0, s4, 0xb2a50881 7053; GFX90A-NEXT: s_sub_i32 s1, s4, s0 7054; GFX90A-NEXT: s_lshr_b32 s1, s1, 1 7055; GFX90A-NEXT: s_add_i32 s1, s1, s0 7056; GFX90A-NEXT: s_lshr_b32 s0, s1, 20 7057; GFX90A-NEXT: v_mov_b32_e32 v1, s0 7058; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] 7059; GFX90A-NEXT: s_endpgm 7060 %r = udiv i32 %x, 1235195 7061 store i32 %r, i32 addrspace(1)* %out 7062 ret void 7063} 7064 7065define amdgpu_kernel void @udiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 7066; 
CHECK-LABEL: @udiv_i32_pow2k_denom( 7067; CHECK-NEXT: [[R:%.*]] = udiv i32 [[X:%.*]], 4096 7068; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 7069; CHECK-NEXT: ret void 7070; 7071; GFX6-LABEL: udiv_i32_pow2k_denom: 7072; GFX6: ; %bb.0: 7073; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 7074; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 7075; GFX6-NEXT: s_mov_b32 s7, 0xf000 7076; GFX6-NEXT: s_mov_b32 s6, -1 7077; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7078; GFX6-NEXT: s_lshr_b32 s0, s0, 12 7079; GFX6-NEXT: v_mov_b32_e32 v0, s0 7080; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 7081; GFX6-NEXT: s_endpgm 7082; 7083; GFX9-LABEL: udiv_i32_pow2k_denom: 7084; GFX9: ; %bb.0: 7085; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7086; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 7087; GFX9-NEXT: v_mov_b32_e32 v0, 0 7088; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7089; GFX9-NEXT: s_lshr_b32 s0, s4, 12 7090; GFX9-NEXT: v_mov_b32_e32 v1, s0 7091; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 7092; GFX9-NEXT: s_endpgm 7093; 7094; GFX90A-LABEL: udiv_i32_pow2k_denom: 7095; GFX90A: ; %bb.0: 7096; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7097; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 7098; GFX90A-NEXT: v_mov_b32_e32 v0, 0 7099; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 7100; GFX90A-NEXT: s_lshr_b32 s0, s4, 12 7101; GFX90A-NEXT: v_mov_b32_e32 v1, s0 7102; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] 7103; GFX90A-NEXT: s_endpgm 7104 %r = udiv i32 %x, 4096 7105 store i32 %r, i32 addrspace(1)* %out 7106 ret void 7107} 7108 7109define amdgpu_kernel void @udiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 7110; CHECK-LABEL: @udiv_i32_pow2_shl_denom( 7111; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 7112; CHECK-NEXT: [[R:%.*]] = udiv i32 [[X:%.*]], [[SHL_Y]] 7113; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 7114; CHECK-NEXT: ret void 7115; 7116; GFX6-LABEL: udiv_i32_pow2_shl_denom: 7117; GFX6: ; %bb.0: 7118; GFX6-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x9 7119; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 7120; GFX6-NEXT: s_mov_b32 s7, 0xf000 7121; GFX6-NEXT: s_mov_b32 s6, -1 7122; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7123; GFX6-NEXT: s_add_i32 s1, s1, 12 7124; GFX6-NEXT: s_lshr_b32 s0, s0, s1 7125; GFX6-NEXT: v_mov_b32_e32 v0, s0 7126; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 7127; GFX6-NEXT: s_endpgm 7128; 7129; GFX9-LABEL: udiv_i32_pow2_shl_denom: 7130; GFX9: ; %bb.0: 7131; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7132; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 7133; GFX9-NEXT: v_mov_b32_e32 v0, 0 7134; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7135; GFX9-NEXT: s_add_i32 s0, s5, 12 7136; GFX9-NEXT: s_lshr_b32 s0, s4, s0 7137; GFX9-NEXT: v_mov_b32_e32 v1, s0 7138; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 7139; GFX9-NEXT: s_endpgm 7140; 7141; GFX90A-LABEL: udiv_i32_pow2_shl_denom: 7142; GFX90A: ; %bb.0: 7143; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7144; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 7145; GFX90A-NEXT: v_mov_b32_e32 v0, 0 7146; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 7147; GFX90A-NEXT: s_add_i32 s0, s5, 12 7148; GFX90A-NEXT: s_lshr_b32 s0, s4, s0 7149; GFX90A-NEXT: v_mov_b32_e32 v1, s0 7150; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] 7151; GFX90A-NEXT: s_endpgm 7152 %shl.y = shl i32 4096, %y 7153 %r = udiv i32 %x, %shl.y 7154 store i32 %r, i32 addrspace(1)* %out 7155 ret void 7156} 7157 7158define amdgpu_kernel void @udiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 7159; CHECK-LABEL: @udiv_v2i32_pow2k_denom( 7160; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 7161; CHECK-NEXT: [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096 7162; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 7163; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 7164; CHECK-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP4]], 4096 7165; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 7166; 
CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 7167; CHECK-NEXT: ret void 7168; 7169; GFX6-LABEL: udiv_v2i32_pow2k_denom: 7170; GFX6: ; %bb.0: 7171; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 7172; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 7173; GFX6-NEXT: s_mov_b32 s7, 0xf000 7174; GFX6-NEXT: s_mov_b32 s6, -1 7175; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7176; GFX6-NEXT: s_lshr_b32 s0, s0, 12 7177; GFX6-NEXT: s_lshr_b32 s1, s1, 12 7178; GFX6-NEXT: v_mov_b32_e32 v0, s0 7179; GFX6-NEXT: v_mov_b32_e32 v1, s1 7180; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 7181; GFX6-NEXT: s_endpgm 7182; 7183; GFX9-LABEL: udiv_v2i32_pow2k_denom: 7184; GFX9: ; %bb.0: 7185; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7186; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 7187; GFX9-NEXT: v_mov_b32_e32 v2, 0 7188; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7189; GFX9-NEXT: s_lshr_b32 s0, s4, 12 7190; GFX9-NEXT: s_lshr_b32 s1, s5, 12 7191; GFX9-NEXT: v_mov_b32_e32 v0, s0 7192; GFX9-NEXT: v_mov_b32_e32 v1, s1 7193; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 7194; GFX9-NEXT: s_endpgm 7195; 7196; GFX90A-LABEL: udiv_v2i32_pow2k_denom: 7197; GFX90A: ; %bb.0: 7198; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7199; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 7200; GFX90A-NEXT: v_mov_b32_e32 v2, 0 7201; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 7202; GFX90A-NEXT: s_lshr_b32 s0, s4, 12 7203; GFX90A-NEXT: s_lshr_b32 s1, s5, 12 7204; GFX90A-NEXT: v_mov_b32_e32 v0, s0 7205; GFX90A-NEXT: v_mov_b32_e32 v1, s1 7206; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 7207; GFX90A-NEXT: s_endpgm 7208 %r = udiv <2 x i32> %x, <i32 4096, i32 4096> 7209 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 7210 ret void 7211} 7212 7213define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 7214; CHECK-LABEL: @udiv_v2i32_mixed_pow2k_denom( 7215; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 7216; 
CHECK-NEXT: [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096 7217; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 7218; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 7219; CHECK-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP4]], 4095 7220; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 7221; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 7222; CHECK-NEXT: ret void 7223; 7224; GFX6-LABEL: udiv_v2i32_mixed_pow2k_denom: 7225; GFX6: ; %bb.0: 7226; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 7227; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 7228; GFX6-NEXT: v_mov_b32_e32 v0, 0x100101 7229; GFX6-NEXT: s_mov_b32 s7, 0xf000 7230; GFX6-NEXT: s_mov_b32 s6, -1 7231; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7232; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 7233; GFX6-NEXT: s_lshr_b32 s0, s0, 12 7234; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v0 7235; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 7236; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 7237; GFX6-NEXT: v_lshrrev_b32_e32 v1, 11, v0 7238; GFX6-NEXT: v_mov_b32_e32 v0, s0 7239; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 7240; GFX6-NEXT: s_endpgm 7241; 7242; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom: 7243; GFX9: ; %bb.0: 7244; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7245; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 7246; GFX9-NEXT: v_mov_b32_e32 v2, 0 7247; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7248; GFX9-NEXT: s_mul_hi_u32 s1, s5, 0x100101 7249; GFX9-NEXT: s_lshr_b32 s0, s4, 12 7250; GFX9-NEXT: s_sub_i32 s4, s5, s1 7251; GFX9-NEXT: s_lshr_b32 s4, s4, 1 7252; GFX9-NEXT: s_add_i32 s4, s4, s1 7253; GFX9-NEXT: s_lshr_b32 s1, s4, 11 7254; GFX9-NEXT: v_mov_b32_e32 v0, s0 7255; GFX9-NEXT: v_mov_b32_e32 v1, s1 7256; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 7257; GFX9-NEXT: s_endpgm 7258; 7259; GFX90A-LABEL: udiv_v2i32_mixed_pow2k_denom: 7260; GFX90A: ; %bb.0: 7261; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7262; GFX90A-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x2c 7263; GFX90A-NEXT: v_mov_b32_e32 v2, 0 7264; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 7265; GFX90A-NEXT: s_mul_hi_u32 s1, s5, 0x100101 7266; GFX90A-NEXT: s_lshr_b32 s0, s4, 12 7267; GFX90A-NEXT: s_sub_i32 s4, s5, s1 7268; GFX90A-NEXT: s_lshr_b32 s4, s4, 1 7269; GFX90A-NEXT: s_add_i32 s4, s4, s1 7270; GFX90A-NEXT: s_lshr_b32 s1, s4, 11 7271; GFX90A-NEXT: v_mov_b32_e32 v0, s0 7272; GFX90A-NEXT: v_mov_b32_e32 v1, s1 7273; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 7274; GFX90A-NEXT: s_endpgm 7275 %r = udiv <2 x i32> %x, <i32 4096, i32 4095> 7276 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 7277 ret void 7278} 7279 7280define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 7281; CHECK-LABEL: @udiv_v2i32_pow2_shl_denom( 7282; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 7283; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 7284; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 7285; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 7286; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 7287; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 7288; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 7289; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 7290; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 7291; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 7292; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 7293; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 7294; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 7295; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 7296; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 7297; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 7298; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 7299; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 
7300; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 7301; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 7302; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 7303; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 7304; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 7305; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 7306; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 7307; CHECK-NEXT: [[TMP25:%.*]] = add i32 [[TMP21]], 1 7308; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]] 7309; CHECK-NEXT: [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]] 7310; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]] 7311; CHECK-NEXT: [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]] 7312; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP26]], 1 7313; CHECK-NEXT: [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]] 7314; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x i32> undef, i32 [[TMP31]], i64 0 7315; CHECK-NEXT: [[TMP33:%.*]] = extractelement <2 x i32> [[X]], i64 1 7316; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 7317; CHECK-NEXT: [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float 7318; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 7319; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000 7320; CHECK-NEXT: [[TMP38:%.*]] = fptoui float [[TMP37]] to i32 7321; CHECK-NEXT: [[TMP39:%.*]] = sub i32 0, [[TMP34]] 7322; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]] 7323; CHECK-NEXT: [[TMP41:%.*]] = zext i32 [[TMP38]] to i64 7324; CHECK-NEXT: [[TMP42:%.*]] = zext i32 [[TMP40]] to i64 7325; CHECK-NEXT: [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]] 7326; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 7327; CHECK-NEXT: [[TMP45:%.*]] = lshr i64 [[TMP43]], 32 7328; CHECK-NEXT: [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32 7329; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]] 
7330; CHECK-NEXT: [[TMP48:%.*]] = zext i32 [[TMP33]] to i64 7331; CHECK-NEXT: [[TMP49:%.*]] = zext i32 [[TMP47]] to i64 7332; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]] 7333; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 7334; CHECK-NEXT: [[TMP52:%.*]] = lshr i64 [[TMP50]], 32 7335; CHECK-NEXT: [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32 7336; CHECK-NEXT: [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]] 7337; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]] 7338; CHECK-NEXT: [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]] 7339; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP53]], 1 7340; CHECK-NEXT: [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]] 7341; CHECK-NEXT: [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]] 7342; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]] 7343; CHECK-NEXT: [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]] 7344; CHECK-NEXT: [[TMP62:%.*]] = add i32 [[TMP58]], 1 7345; CHECK-NEXT: [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]] 7346; CHECK-NEXT: [[TMP64:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP63]], i64 1 7347; CHECK-NEXT: store <2 x i32> [[TMP64]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 7348; CHECK-NEXT: ret void 7349; 7350; GFX6-LABEL: udiv_v2i32_pow2_shl_denom: 7351; GFX6: ; %bb.0: 7352; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 7353; GFX6-NEXT: s_movk_i32 s4, 0x1000 7354; GFX6-NEXT: s_mov_b32 s7, 0xf000 7355; GFX6-NEXT: s_mov_b32 s6, -1 7356; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7357; GFX6-NEXT: s_lshl_b32 s8, s4, s2 7358; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 7359; GFX6-NEXT: s_lshl_b32 s9, s4, s3 7360; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 7361; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 7362; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 7363; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 7364; GFX6-NEXT: s_mov_b32 s0, 0x4f7ffffe 7365; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 7366; GFX6-NEXT: v_mul_f32_e32 v0, s0, v0 7367; GFX6-NEXT: 
v_cvt_u32_f32_e32 v0, v0 7368; GFX6-NEXT: v_mul_f32_e32 v1, s0, v1 7369; GFX6-NEXT: s_sub_i32 s0, 0, s8 7370; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 7371; GFX6-NEXT: v_mul_lo_u32 v2, s0, v0 7372; GFX6-NEXT: s_sub_i32 s0, 0, s9 7373; GFX6-NEXT: v_mul_lo_u32 v3, s0, v1 7374; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 7375; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 7376; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 7377; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7378; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 7379; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 7380; GFX6-NEXT: v_mul_hi_u32 v1, s3, v1 7381; GFX6-NEXT: v_mul_lo_u32 v2, v0, s8 7382; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 7383; GFX6-NEXT: v_mul_lo_u32 v4, v1, s9 7384; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 7385; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v2 7386; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] 7387; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v2 7388; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 7389; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 7390; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 7391; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 7392; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s3, v4 7393; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 7394; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2 7395; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 7396; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s9, v2 7397; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 7398; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 7399; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 7400; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 7401; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 7402; GFX6-NEXT: s_endpgm 7403; 7404; GFX9-LABEL: udiv_v2i32_pow2_shl_denom: 7405; GFX9: ; %bb.0: 7406; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 7407; GFX9-NEXT: s_movk_i32 s4, 0x1000 7408; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7409; GFX9-NEXT: s_lshl_b32 s5, s4, s3 7410; GFX9-NEXT: s_lshl_b32 s4, s4, s2 7411; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 7412; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 7413; GFX9-NEXT: 
s_mov_b32 s2, 0x4f7ffffe 7414; GFX9-NEXT: s_sub_i32 s3, 0, s5 7415; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 7416; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 7417; GFX9-NEXT: v_mul_f32_e32 v0, s2, v0 7418; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 7419; GFX9-NEXT: v_mul_f32_e32 v1, s2, v1 7420; GFX9-NEXT: s_sub_i32 s2, 0, s4 7421; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 7422; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 7423; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1 7424; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 7425; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 7426; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 7427; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 7428; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 7429; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7430; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 7431; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 7432; GFX9-NEXT: v_mul_hi_u32 v1, s3, v1 7433; GFX9-NEXT: v_mov_b32_e32 v2, 0 7434; GFX9-NEXT: v_mul_lo_u32 v3, v0, s4 7435; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 7436; GFX9-NEXT: v_mul_lo_u32 v4, v1, s5 7437; GFX9-NEXT: v_add_u32_e32 v6, 1, v1 7438; GFX9-NEXT: v_sub_u32_e32 v3, s2, v3 7439; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 7440; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc 7441; GFX9-NEXT: v_subrev_u32_e32 v5, s4, v3 7442; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 7443; GFX9-NEXT: v_sub_u32_e32 v4, s3, v4 7444; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 7445; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 7446; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc 7447; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v4 7448; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v4 7449; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 7450; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc 7451; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 7452; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 7453; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 7454; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 7455; GFX9-NEXT: s_endpgm 7456; 7457; GFX90A-LABEL: udiv_v2i32_pow2_shl_denom: 7458; GFX90A: ; %bb.0: 7459; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 7460; 
GFX90A-NEXT: s_movk_i32 s8, 0x1000 7461; GFX90A-NEXT: s_mov_b32 s9, 0x4f7ffffe 7462; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 7463; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c 7464; GFX90A-NEXT: v_mov_b32_e32 v2, 0 7465; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 7466; GFX90A-NEXT: s_lshl_b32 s2, s8, s2 7467; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s2 7468; GFX90A-NEXT: s_lshl_b32 s0, s8, s3 7469; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s0 7470; GFX90A-NEXT: s_sub_i32 s1, 0, s2 7471; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 7472; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v1 7473; GFX90A-NEXT: v_mul_f32_e32 v0, s9, v0 7474; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 7475; GFX90A-NEXT: v_mul_f32_e32 v1, s9, v1 7476; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 7477; GFX90A-NEXT: v_mul_lo_u32 v3, s1, v0 7478; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v3 7479; GFX90A-NEXT: v_add_u32_e32 v0, v0, v3 7480; GFX90A-NEXT: v_mul_hi_u32 v0, s6, v0 7481; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s2 7482; GFX90A-NEXT: v_sub_u32_e32 v3, s6, v3 7483; GFX90A-NEXT: v_add_u32_e32 v4, 1, v0 7484; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 7485; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 7486; GFX90A-NEXT: v_subrev_u32_e32 v4, s2, v3 7487; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 7488; GFX90A-NEXT: s_sub_i32 s1, 0, s0 7489; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 7490; GFX90A-NEXT: v_mul_lo_u32 v3, s1, v1 7491; GFX90A-NEXT: v_mul_hi_u32 v3, v1, v3 7492; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 7493; GFX90A-NEXT: v_mul_hi_u32 v1, s7, v1 7494; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s0 7495; GFX90A-NEXT: v_add_u32_e32 v4, 1, v0 7496; GFX90A-NEXT: v_sub_u32_e32 v3, s7, v3 7497; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 7498; GFX90A-NEXT: v_add_u32_e32 v4, 1, v1 7499; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v3 7500; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 7501; GFX90A-NEXT: v_subrev_u32_e32 v4, s0, v3 7502; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 7503; GFX90A-NEXT: v_add_u32_e32 v4, 1, v1 7504; GFX90A-NEXT: 
v_cmp_le_u32_e32 vcc, s0, v3 7505; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 7506; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 7507; GFX90A-NEXT: s_endpgm 7508 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 7509 %r = udiv <2 x i32> %x, %shl.y 7510 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 7511 ret void 7512} 7513 7514define amdgpu_kernel void @urem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 7515; CHECK-LABEL: @urem_i32_oddk_denom( 7516; CHECK-NEXT: [[R:%.*]] = urem i32 [[X:%.*]], 1235195 7517; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 7518; CHECK-NEXT: ret void 7519; 7520; GFX6-LABEL: urem_i32_oddk_denom: 7521; GFX6: ; %bb.0: 7522; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 7523; GFX6-NEXT: v_mov_b32_e32 v0, 0xb2a50881 7524; GFX6-NEXT: s_mov_b32 s2, 0x12d8fb 7525; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 7526; GFX6-NEXT: s_mov_b32 s3, 0xf000 7527; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7528; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 7529; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v0 7530; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 7531; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 7532; GFX6-NEXT: v_lshrrev_b32_e32 v0, 20, v0 7533; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 7534; GFX6-NEXT: s_mov_b32 s2, -1 7535; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 7536; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 7537; GFX6-NEXT: s_endpgm 7538; 7539; GFX9-LABEL: urem_i32_oddk_denom: 7540; GFX9: ; %bb.0: 7541; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7542; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 7543; GFX9-NEXT: v_mov_b32_e32 v0, 0 7544; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7545; GFX9-NEXT: s_mul_hi_u32 s0, s4, 0xb2a50881 7546; GFX9-NEXT: s_sub_i32 s1, s4, s0 7547; GFX9-NEXT: s_lshr_b32 s1, s1, 1 7548; GFX9-NEXT: s_add_i32 s1, s1, s0 7549; GFX9-NEXT: s_lshr_b32 s0, s1, 20 7550; GFX9-NEXT: s_mul_i32 s0, s0, 0x12d8fb 7551; GFX9-NEXT: s_sub_i32 s0, s4, s0 7552; GFX9-NEXT: v_mov_b32_e32 v1, s0 7553; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 7554; 
GFX9-NEXT: s_endpgm 7555; 7556; GFX90A-LABEL: urem_i32_oddk_denom: 7557; GFX90A: ; %bb.0: 7558; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7559; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 7560; GFX90A-NEXT: v_mov_b32_e32 v0, 0 7561; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 7562; GFX90A-NEXT: s_mul_hi_u32 s0, s4, 0xb2a50881 7563; GFX90A-NEXT: s_sub_i32 s1, s4, s0 7564; GFX90A-NEXT: s_lshr_b32 s1, s1, 1 7565; GFX90A-NEXT: s_add_i32 s1, s1, s0 7566; GFX90A-NEXT: s_lshr_b32 s0, s1, 20 7567; GFX90A-NEXT: s_mul_i32 s0, s0, 0x12d8fb 7568; GFX90A-NEXT: s_sub_i32 s0, s4, s0 7569; GFX90A-NEXT: v_mov_b32_e32 v1, s0 7570; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] 7571; GFX90A-NEXT: s_endpgm 7572 %r = urem i32 %x, 1235195 7573 store i32 %r, i32 addrspace(1)* %out 7574 ret void 7575} 7576 7577define amdgpu_kernel void @urem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 7578; CHECK-LABEL: @urem_i32_pow2k_denom( 7579; CHECK-NEXT: [[R:%.*]] = urem i32 [[X:%.*]], 4096 7580; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 7581; CHECK-NEXT: ret void 7582; 7583; GFX6-LABEL: urem_i32_pow2k_denom: 7584; GFX6: ; %bb.0: 7585; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 7586; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 7587; GFX6-NEXT: s_mov_b32 s7, 0xf000 7588; GFX6-NEXT: s_mov_b32 s6, -1 7589; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7590; GFX6-NEXT: s_and_b32 s0, s0, 0xfff 7591; GFX6-NEXT: v_mov_b32_e32 v0, s0 7592; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 7593; GFX6-NEXT: s_endpgm 7594; 7595; GFX9-LABEL: urem_i32_pow2k_denom: 7596; GFX9: ; %bb.0: 7597; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7598; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 7599; GFX9-NEXT: v_mov_b32_e32 v0, 0 7600; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7601; GFX9-NEXT: s_and_b32 s0, s4, 0xfff 7602; GFX9-NEXT: v_mov_b32_e32 v1, s0 7603; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 7604; GFX9-NEXT: s_endpgm 7605; 7606; GFX90A-LABEL: urem_i32_pow2k_denom: 7607; GFX90A: ; %bb.0: 7608; GFX90A-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x24 7609; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 7610; GFX90A-NEXT: v_mov_b32_e32 v0, 0 7611; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 7612; GFX90A-NEXT: s_and_b32 s0, s4, 0xfff 7613; GFX90A-NEXT: v_mov_b32_e32 v1, s0 7614; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] 7615; GFX90A-NEXT: s_endpgm 7616 %r = urem i32 %x, 4096 7617 store i32 %r, i32 addrspace(1)* %out 7618 ret void 7619} 7620 7621define amdgpu_kernel void @urem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 7622; CHECK-LABEL: @urem_i32_pow2_shl_denom( 7623; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 7624; CHECK-NEXT: [[R:%.*]] = urem i32 [[X:%.*]], [[SHL_Y]] 7625; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 7626; CHECK-NEXT: ret void 7627; 7628; GFX6-LABEL: urem_i32_pow2_shl_denom: 7629; GFX6: ; %bb.0: 7630; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 7631; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 7632; GFX6-NEXT: s_mov_b32 s7, 0xf000 7633; GFX6-NEXT: s_mov_b32 s6, -1 7634; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7635; GFX6-NEXT: s_lshl_b32 s1, 0x1000, s1 7636; GFX6-NEXT: s_add_i32 s1, s1, -1 7637; GFX6-NEXT: s_and_b32 s0, s0, s1 7638; GFX6-NEXT: v_mov_b32_e32 v0, s0 7639; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 7640; GFX6-NEXT: s_endpgm 7641; 7642; GFX9-LABEL: urem_i32_pow2_shl_denom: 7643; GFX9: ; %bb.0: 7644; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7645; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 7646; GFX9-NEXT: v_mov_b32_e32 v0, 0 7647; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7648; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s5 7649; GFX9-NEXT: s_add_i32 s0, s0, -1 7650; GFX9-NEXT: s_and_b32 s0, s4, s0 7651; GFX9-NEXT: v_mov_b32_e32 v1, s0 7652; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 7653; GFX9-NEXT: s_endpgm 7654; 7655; GFX90A-LABEL: urem_i32_pow2_shl_denom: 7656; GFX90A: ; %bb.0: 7657; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7658; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 7659; GFX90A-NEXT: 
v_mov_b32_e32 v0, 0 7660; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 7661; GFX90A-NEXT: s_lshl_b32 s0, 0x1000, s5 7662; GFX90A-NEXT: s_add_i32 s0, s0, -1 7663; GFX90A-NEXT: s_and_b32 s0, s4, s0 7664; GFX90A-NEXT: v_mov_b32_e32 v1, s0 7665; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] 7666; GFX90A-NEXT: s_endpgm 7667 %shl.y = shl i32 4096, %y 7668 %r = urem i32 %x, %shl.y 7669 store i32 %r, i32 addrspace(1)* %out 7670 ret void 7671} 7672 7673define amdgpu_kernel void @urem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 7674; CHECK-LABEL: @urem_v2i32_pow2k_denom( 7675; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 7676; CHECK-NEXT: [[TMP2:%.*]] = urem i32 [[TMP1]], 4096 7677; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 7678; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 7679; CHECK-NEXT: [[TMP5:%.*]] = urem i32 [[TMP4]], 4096 7680; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 7681; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 7682; CHECK-NEXT: ret void 7683; 7684; GFX6-LABEL: urem_v2i32_pow2k_denom: 7685; GFX6: ; %bb.0: 7686; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 7687; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 7688; GFX6-NEXT: s_movk_i32 s2, 0xfff 7689; GFX6-NEXT: s_mov_b32 s7, 0xf000 7690; GFX6-NEXT: s_mov_b32 s6, -1 7691; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7692; GFX6-NEXT: s_and_b32 s0, s0, s2 7693; GFX6-NEXT: s_and_b32 s1, s1, s2 7694; GFX6-NEXT: v_mov_b32_e32 v0, s0 7695; GFX6-NEXT: v_mov_b32_e32 v1, s1 7696; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 7697; GFX6-NEXT: s_endpgm 7698; 7699; GFX9-LABEL: urem_v2i32_pow2k_denom: 7700; GFX9: ; %bb.0: 7701; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7702; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 7703; GFX9-NEXT: s_movk_i32 s0, 0xfff 7704; GFX9-NEXT: v_mov_b32_e32 v2, 0 7705; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7706; GFX9-NEXT: s_and_b32 
s1, s4, s0 7707; GFX9-NEXT: s_and_b32 s0, s5, s0 7708; GFX9-NEXT: v_mov_b32_e32 v0, s1 7709; GFX9-NEXT: v_mov_b32_e32 v1, s0 7710; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 7711; GFX9-NEXT: s_endpgm 7712; 7713; GFX90A-LABEL: urem_v2i32_pow2k_denom: 7714; GFX90A: ; %bb.0: 7715; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7716; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 7717; GFX90A-NEXT: s_movk_i32 s0, 0xfff 7718; GFX90A-NEXT: v_mov_b32_e32 v2, 0 7719; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 7720; GFX90A-NEXT: s_and_b32 s1, s4, s0 7721; GFX90A-NEXT: s_and_b32 s0, s5, s0 7722; GFX90A-NEXT: v_mov_b32_e32 v0, s1 7723; GFX90A-NEXT: v_mov_b32_e32 v1, s0 7724; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 7725; GFX90A-NEXT: s_endpgm 7726 %r = urem <2 x i32> %x, <i32 4096, i32 4096> 7727 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 7728 ret void 7729} 7730 7731define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 7732; CHECK-LABEL: @urem_v2i32_pow2_shl_denom( 7733; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 7734; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 7735; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 7736; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 7737; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 7738; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 7739; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 7740; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 7741; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 7742; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 7743; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 7744; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 7745; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 7746; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 7747; CHECK-NEXT: 
[[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 7748; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 7749; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 7750; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 7751; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 7752; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 7753; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 7754; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 7755; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 7756; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 7757; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 7758; CHECK-NEXT: [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]] 7759; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]] 7760; CHECK-NEXT: [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]] 7761; CHECK-NEXT: [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]] 7762; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]] 7763; CHECK-NEXT: [[TMP30:%.*]] = insertelement <2 x i32> undef, i32 [[TMP29]], i64 0 7764; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x i32> [[X]], i64 1 7765; CHECK-NEXT: [[TMP32:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 7766; CHECK-NEXT: [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float 7767; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 7768; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000 7769; CHECK-NEXT: [[TMP36:%.*]] = fptoui float [[TMP35]] to i32 7770; CHECK-NEXT: [[TMP37:%.*]] = sub i32 0, [[TMP32]] 7771; CHECK-NEXT: [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]] 7772; CHECK-NEXT: [[TMP39:%.*]] = zext i32 [[TMP36]] to i64 7773; CHECK-NEXT: [[TMP40:%.*]] = zext i32 [[TMP38]] to i64 7774; CHECK-NEXT: [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]] 7775; CHECK-NEXT: [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32 7776; CHECK-NEXT: [[TMP43:%.*]] = lshr i64 [[TMP41]], 32 7777; CHECK-NEXT: [[TMP44:%.*]] = 
trunc i64 [[TMP43]] to i32 7778; CHECK-NEXT: [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]] 7779; CHECK-NEXT: [[TMP46:%.*]] = zext i32 [[TMP31]] to i64 7780; CHECK-NEXT: [[TMP47:%.*]] = zext i32 [[TMP45]] to i64 7781; CHECK-NEXT: [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]] 7782; CHECK-NEXT: [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32 7783; CHECK-NEXT: [[TMP50:%.*]] = lshr i64 [[TMP48]], 32 7784; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 7785; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]] 7786; CHECK-NEXT: [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]] 7787; CHECK-NEXT: [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]] 7788; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]] 7789; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]] 7790; CHECK-NEXT: [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]] 7791; CHECK-NEXT: [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]] 7792; CHECK-NEXT: [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]] 7793; CHECK-NEXT: [[TMP60:%.*]] = insertelement <2 x i32> [[TMP30]], i32 [[TMP59]], i64 1 7794; CHECK-NEXT: store <2 x i32> [[TMP60]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 7795; CHECK-NEXT: ret void 7796; 7797; GFX6-LABEL: urem_v2i32_pow2_shl_denom: 7798; GFX6: ; %bb.0: 7799; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 7800; GFX6-NEXT: s_movk_i32 s4, 0x1000 7801; GFX6-NEXT: s_mov_b32 s5, 0x4f7ffffe 7802; GFX6-NEXT: s_mov_b32 s7, 0xf000 7803; GFX6-NEXT: s_mov_b32 s6, -1 7804; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7805; GFX6-NEXT: s_lshl_b32 s2, s4, s2 7806; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 7807; GFX6-NEXT: s_lshl_b32 s3, s4, s3 7808; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 7809; GFX6-NEXT: s_sub_i32 s4, 0, s2 7810; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 7811; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 7812; GFX6-NEXT: v_mul_f32_e32 v0, s5, v0 7813; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 7814; GFX6-NEXT: v_mul_f32_e32 v1, s5, v1 7815; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 7816; 
GFX6-NEXT: v_mul_lo_u32 v2, s4, v0 7817; GFX6-NEXT: s_sub_i32 s4, 0, s3 7818; GFX6-NEXT: v_mul_lo_u32 v3, s4, v1 7819; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 7820; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 7821; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 7822; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 7823; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 7824; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7825; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 7826; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 7827; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 7828; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 7829; GFX6-NEXT: v_mul_lo_u32 v1, v1, s3 7830; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 7831; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s2, v0 7832; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 7833; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 7834; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s2, v0 7835; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 7836; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 7837; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 7838; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 7839; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 7840; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 7841; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 7842; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 7843; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 7844; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 7845; GFX6-NEXT: s_endpgm 7846; 7847; GFX9-LABEL: urem_v2i32_pow2_shl_denom: 7848; GFX9: ; %bb.0: 7849; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 7850; GFX9-NEXT: s_movk_i32 s4, 0x1000 7851; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7852; GFX9-NEXT: s_lshl_b32 s5, s4, s3 7853; GFX9-NEXT: s_lshl_b32 s4, s4, s2 7854; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 7855; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 7856; GFX9-NEXT: s_mov_b32 s2, 0x4f7ffffe 7857; GFX9-NEXT: s_sub_i32 s3, 0, s5 7858; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 7859; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 7860; GFX9-NEXT: v_mul_f32_e32 v0, s2, v0 7861; GFX9-NEXT: v_mul_f32_e32 v1, s2, v1 7862; GFX9-NEXT: 
v_cvt_u32_f32_e32 v0, v0 7863; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 7864; GFX9-NEXT: s_sub_i32 s2, 0, s4 7865; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 7866; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1 7867; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 7868; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 7869; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 7870; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 7871; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 7872; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 7873; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7874; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 7875; GFX9-NEXT: v_mul_hi_u32 v1, s3, v1 7876; GFX9-NEXT: v_mov_b32_e32 v2, 0 7877; GFX9-NEXT: v_mul_lo_u32 v0, v0, s4 7878; GFX9-NEXT: v_mul_lo_u32 v1, v1, s5 7879; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 7880; GFX9-NEXT: v_sub_u32_e32 v1, s3, v1 7881; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v0 7882; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 7883; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v1 7884; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 7885; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 7886; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 7887; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v0 7888; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 7889; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v1 7890; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 7891; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 7892; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 7893; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 7894; GFX9-NEXT: s_endpgm 7895; 7896; GFX90A-LABEL: urem_v2i32_pow2_shl_denom: 7897; GFX90A: ; %bb.0: 7898; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 7899; GFX90A-NEXT: s_movk_i32 s8, 0x1000 7900; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 7901; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c 7902; GFX90A-NEXT: v_mov_b32_e32 v2, 0 7903; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 7904; GFX90A-NEXT: s_lshl_b32 s2, s8, s2 7905; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s2 7906; GFX90A-NEXT: s_lshl_b32 s0, s8, s3 7907; GFX90A-NEXT: s_mov_b32 s3, 0x4f7ffffe 7908; GFX90A-NEXT: v_cvt_f32_u32_e32 
v1, s0 7909; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 7910; GFX90A-NEXT: s_sub_i32 s1, 0, s2 7911; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v1 7912; GFX90A-NEXT: v_mul_f32_e32 v0, s3, v0 7913; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 7914; GFX90A-NEXT: v_mul_f32_e32 v1, s3, v1 7915; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 7916; GFX90A-NEXT: v_mul_lo_u32 v3, s1, v0 7917; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v3 7918; GFX90A-NEXT: v_add_u32_e32 v0, v0, v3 7919; GFX90A-NEXT: v_mul_hi_u32 v0, s6, v0 7920; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s2 7921; GFX90A-NEXT: v_sub_u32_e32 v0, s6, v0 7922; GFX90A-NEXT: v_subrev_u32_e32 v3, s2, v0 7923; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 7924; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 7925; GFX90A-NEXT: v_subrev_u32_e32 v3, s2, v0 7926; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 7927; GFX90A-NEXT: s_sub_i32 s1, 0, s0 7928; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 7929; GFX90A-NEXT: v_mul_lo_u32 v3, s1, v1 7930; GFX90A-NEXT: v_mul_hi_u32 v3, v1, v3 7931; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 7932; GFX90A-NEXT: v_mul_hi_u32 v1, s7, v1 7933; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s0 7934; GFX90A-NEXT: v_sub_u32_e32 v1, s7, v1 7935; GFX90A-NEXT: v_subrev_u32_e32 v3, s0, v1 7936; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v1 7937; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 7938; GFX90A-NEXT: v_subrev_u32_e32 v3, s0, v1 7939; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v1 7940; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 7941; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 7942; GFX90A-NEXT: s_endpgm 7943 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 7944 %r = urem <2 x i32> %x, %shl.y 7945 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 7946 ret void 7947} 7948 7949define amdgpu_kernel void @sdiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 7950; CHECK-LABEL: @sdiv_i32_oddk_denom( 7951; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], 1235195 7952; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 7953; CHECK-NEXT: ret 
void 7954; 7955; GFX6-LABEL: sdiv_i32_oddk_denom: 7956; GFX6: ; %bb.0: 7957; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 7958; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 7959; GFX6-NEXT: v_mov_b32_e32 v0, 0xd9528441 7960; GFX6-NEXT: s_mov_b32 s7, 0xf000 7961; GFX6-NEXT: s_mov_b32 s6, -1 7962; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7963; GFX6-NEXT: v_mul_hi_i32 v0, s0, v0 7964; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 7965; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0 7966; GFX6-NEXT: v_ashrrev_i32_e32 v0, 20, v0 7967; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 7968; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 7969; GFX6-NEXT: s_endpgm 7970; 7971; GFX9-LABEL: sdiv_i32_oddk_denom: 7972; GFX9: ; %bb.0: 7973; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7974; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 7975; GFX9-NEXT: v_mov_b32_e32 v0, 0 7976; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7977; GFX9-NEXT: s_mul_hi_i32 s0, s4, 0xd9528441 7978; GFX9-NEXT: s_add_i32 s0, s0, s4 7979; GFX9-NEXT: s_lshr_b32 s1, s0, 31 7980; GFX9-NEXT: s_ashr_i32 s0, s0, 20 7981; GFX9-NEXT: s_add_i32 s0, s0, s1 7982; GFX9-NEXT: v_mov_b32_e32 v1, s0 7983; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 7984; GFX9-NEXT: s_endpgm 7985; 7986; GFX90A-LABEL: sdiv_i32_oddk_denom: 7987; GFX90A: ; %bb.0: 7988; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7989; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 7990; GFX90A-NEXT: v_mov_b32_e32 v0, 0 7991; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 7992; GFX90A-NEXT: s_mul_hi_i32 s0, s4, 0xd9528441 7993; GFX90A-NEXT: s_add_i32 s0, s0, s4 7994; GFX90A-NEXT: s_lshr_b32 s1, s0, 31 7995; GFX90A-NEXT: s_ashr_i32 s0, s0, 20 7996; GFX90A-NEXT: s_add_i32 s0, s0, s1 7997; GFX90A-NEXT: v_mov_b32_e32 v1, s0 7998; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] 7999; GFX90A-NEXT: s_endpgm 8000 %r = sdiv i32 %x, 1235195 8001 store i32 %r, i32 addrspace(1)* %out 8002 ret void 8003} 8004 8005define amdgpu_kernel void @sdiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 8006; CHECK-LABEL: 
@sdiv_i32_pow2k_denom( 8007; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], 4096 8008; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 8009; CHECK-NEXT: ret void 8010; 8011; GFX6-LABEL: sdiv_i32_pow2k_denom: 8012; GFX6: ; %bb.0: 8013; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 8014; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 8015; GFX6-NEXT: s_mov_b32 s7, 0xf000 8016; GFX6-NEXT: s_mov_b32 s6, -1 8017; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8018; GFX6-NEXT: s_ashr_i32 s1, s0, 31 8019; GFX6-NEXT: s_lshr_b32 s1, s1, 20 8020; GFX6-NEXT: s_add_i32 s0, s0, s1 8021; GFX6-NEXT: s_ashr_i32 s0, s0, 12 8022; GFX6-NEXT: v_mov_b32_e32 v0, s0 8023; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 8024; GFX6-NEXT: s_endpgm 8025; 8026; GFX9-LABEL: sdiv_i32_pow2k_denom: 8027; GFX9: ; %bb.0: 8028; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8029; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 8030; GFX9-NEXT: v_mov_b32_e32 v0, 0 8031; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8032; GFX9-NEXT: s_ashr_i32 s0, s4, 31 8033; GFX9-NEXT: s_lshr_b32 s0, s0, 20 8034; GFX9-NEXT: s_add_i32 s4, s4, s0 8035; GFX9-NEXT: s_ashr_i32 s0, s4, 12 8036; GFX9-NEXT: v_mov_b32_e32 v1, s0 8037; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 8038; GFX9-NEXT: s_endpgm 8039; 8040; GFX90A-LABEL: sdiv_i32_pow2k_denom: 8041; GFX90A: ; %bb.0: 8042; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8043; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 8044; GFX90A-NEXT: v_mov_b32_e32 v0, 0 8045; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 8046; GFX90A-NEXT: s_ashr_i32 s0, s4, 31 8047; GFX90A-NEXT: s_lshr_b32 s0, s0, 20 8048; GFX90A-NEXT: s_add_i32 s4, s4, s0 8049; GFX90A-NEXT: s_ashr_i32 s0, s4, 12 8050; GFX90A-NEXT: v_mov_b32_e32 v1, s0 8051; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] 8052; GFX90A-NEXT: s_endpgm 8053 %r = sdiv i32 %x, 4096 8054 store i32 %r, i32 addrspace(1)* %out 8055 ret void 8056} 8057 8058define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 8059; CHECK-LABEL: 
@sdiv_i32_pow2_shl_denom( 8060; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 8061; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], [[SHL_Y]] 8062; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 8063; CHECK-NEXT: ret void 8064; 8065; GFX6-LABEL: sdiv_i32_pow2_shl_denom: 8066; GFX6: ; %bb.0: 8067; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 8068; GFX6-NEXT: s_mov_b32 s7, 0xf000 8069; GFX6-NEXT: s_mov_b32 s6, -1 8070; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8071; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 8072; GFX6-NEXT: s_ashr_i32 s8, s3, 31 8073; GFX6-NEXT: s_add_i32 s3, s3, s8 8074; GFX6-NEXT: s_xor_b32 s3, s3, s8 8075; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 8076; GFX6-NEXT: s_sub_i32 s4, 0, s3 8077; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 8078; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 8079; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 8080; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 8081; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 8082; GFX6-NEXT: s_ashr_i32 s0, s2, 31 8083; GFX6-NEXT: s_add_i32 s1, s2, s0 8084; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 8085; GFX6-NEXT: s_xor_b32 s1, s1, s0 8086; GFX6-NEXT: s_xor_b32 s2, s0, s8 8087; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 8088; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 8089; GFX6-NEXT: v_mul_lo_u32 v1, v0, s3 8090; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 8091; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 8092; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1 8093; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 8094; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 8095; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 8096; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] 8097; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 8098; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 8099; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 8100; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 8101; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8102; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 8103; GFX6-NEXT: s_endpgm 8104; 8105; GFX9-LABEL: sdiv_i32_pow2_shl_denom: 8106; GFX9: ; %bb.0: 8107; 
GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 8108; GFX9-NEXT: v_mov_b32_e32 v2, 0 8109; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 8110; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8111; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 8112; GFX9-NEXT: s_ashr_i32 s4, s3, 31 8113; GFX9-NEXT: s_add_i32 s3, s3, s4 8114; GFX9-NEXT: s_xor_b32 s3, s3, s4 8115; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 8116; GFX9-NEXT: s_sub_i32 s5, 0, s3 8117; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 8118; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 8119; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 8120; GFX9-NEXT: v_mul_lo_u32 v1, s5, v0 8121; GFX9-NEXT: s_ashr_i32 s5, s2, 31 8122; GFX9-NEXT: s_add_i32 s2, s2, s5 8123; GFX9-NEXT: s_xor_b32 s2, s2, s5 8124; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 8125; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 8126; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 8127; GFX9-NEXT: v_mul_lo_u32 v1, v0, s3 8128; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 8129; GFX9-NEXT: v_sub_u32_e32 v1, s2, v1 8130; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 8131; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 8132; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v1 8133; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 8134; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 8135; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 8136; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 8137; GFX9-NEXT: s_xor_b32 s2, s5, s4 8138; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 8139; GFX9-NEXT: v_subrev_u32_e32 v0, s2, v0 8140; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 8141; GFX9-NEXT: s_endpgm 8142; 8143; GFX90A-LABEL: sdiv_i32_pow2_shl_denom: 8144; GFX90A: ; %bb.0: 8145; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 8146; GFX90A-NEXT: v_mov_b32_e32 v1, 0 8147; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 8148; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 8149; GFX90A-NEXT: s_lshl_b32 s3, 0x1000, s3 8150; GFX90A-NEXT: s_ashr_i32 s4, s3, 31 8151; GFX90A-NEXT: s_add_i32 s3, s3, s4 8152; GFX90A-NEXT: s_xor_b32 s3, s3, s4 8153; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 8154; GFX90A-NEXT: s_sub_i32 s6, 0, s3 
8155; GFX90A-NEXT: s_ashr_i32 s5, s2, 31 8156; GFX90A-NEXT: s_add_i32 s2, s2, s5 8157; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 8158; GFX90A-NEXT: s_xor_b32 s2, s2, s5 8159; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 8160; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 8161; GFX90A-NEXT: v_mul_lo_u32 v2, s6, v0 8162; GFX90A-NEXT: v_mul_hi_u32 v2, v0, v2 8163; GFX90A-NEXT: v_add_u32_e32 v0, v0, v2 8164; GFX90A-NEXT: v_mul_hi_u32 v0, s2, v0 8165; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s3 8166; GFX90A-NEXT: v_sub_u32_e32 v3, s2, v3 8167; GFX90A-NEXT: v_add_u32_e32 v2, 1, v0 8168; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 8169; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 8170; GFX90A-NEXT: v_subrev_u32_e32 v2, s3, v3 8171; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc 8172; GFX90A-NEXT: v_add_u32_e32 v4, 1, v0 8173; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 8174; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 8175; GFX90A-NEXT: s_xor_b32 s2, s5, s4 8176; GFX90A-NEXT: v_xor_b32_e32 v0, s2, v0 8177; GFX90A-NEXT: v_subrev_u32_e32 v0, s2, v0 8178; GFX90A-NEXT: global_store_dword v1, v0, s[0:1] 8179; GFX90A-NEXT: s_endpgm 8180 %shl.y = shl i32 4096, %y 8181 %r = sdiv i32 %x, %shl.y 8182 store i32 %r, i32 addrspace(1)* %out 8183 ret void 8184} 8185 8186define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 8187; CHECK-LABEL: @sdiv_v2i32_pow2k_denom( 8188; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 8189; CHECK-NEXT: [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096 8190; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 8191; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 8192; CHECK-NEXT: [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4096 8193; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 8194; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 8195; CHECK-NEXT: ret void 8196; 8197; GFX6-LABEL: sdiv_v2i32_pow2k_denom: 8198; 
GFX6: ; %bb.0: 8199; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 8200; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 8201; GFX6-NEXT: s_mov_b32 s7, 0xf000 8202; GFX6-NEXT: s_mov_b32 s6, -1 8203; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8204; GFX6-NEXT: s_ashr_i32 s2, s0, 31 8205; GFX6-NEXT: s_ashr_i32 s3, s1, 31 8206; GFX6-NEXT: s_lshr_b32 s2, s2, 20 8207; GFX6-NEXT: s_add_i32 s0, s0, s2 8208; GFX6-NEXT: s_lshr_b32 s2, s3, 20 8209; GFX6-NEXT: s_add_i32 s1, s1, s2 8210; GFX6-NEXT: s_ashr_i32 s0, s0, 12 8211; GFX6-NEXT: s_ashr_i32 s1, s1, 12 8212; GFX6-NEXT: v_mov_b32_e32 v0, s0 8213; GFX6-NEXT: v_mov_b32_e32 v1, s1 8214; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 8215; GFX6-NEXT: s_endpgm 8216; 8217; GFX9-LABEL: sdiv_v2i32_pow2k_denom: 8218; GFX9: ; %bb.0: 8219; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8220; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 8221; GFX9-NEXT: v_mov_b32_e32 v2, 0 8222; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8223; GFX9-NEXT: s_ashr_i32 s0, s4, 31 8224; GFX9-NEXT: s_ashr_i32 s1, s5, 31 8225; GFX9-NEXT: s_lshr_b32 s0, s0, 20 8226; GFX9-NEXT: s_lshr_b32 s1, s1, 20 8227; GFX9-NEXT: s_add_i32 s0, s4, s0 8228; GFX9-NEXT: s_add_i32 s1, s5, s1 8229; GFX9-NEXT: s_ashr_i32 s0, s0, 12 8230; GFX9-NEXT: s_ashr_i32 s1, s1, 12 8231; GFX9-NEXT: v_mov_b32_e32 v0, s0 8232; GFX9-NEXT: v_mov_b32_e32 v1, s1 8233; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 8234; GFX9-NEXT: s_endpgm 8235; 8236; GFX90A-LABEL: sdiv_v2i32_pow2k_denom: 8237; GFX90A: ; %bb.0: 8238; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8239; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 8240; GFX90A-NEXT: v_mov_b32_e32 v2, 0 8241; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 8242; GFX90A-NEXT: s_ashr_i32 s0, s4, 31 8243; GFX90A-NEXT: s_ashr_i32 s1, s5, 31 8244; GFX90A-NEXT: s_lshr_b32 s0, s0, 20 8245; GFX90A-NEXT: s_lshr_b32 s1, s1, 20 8246; GFX90A-NEXT: s_add_i32 s0, s4, s0 8247; GFX90A-NEXT: s_add_i32 s1, s5, s1 8248; GFX90A-NEXT: s_ashr_i32 s0, s0, 12 8249; GFX90A-NEXT: s_ashr_i32 
s1, s1, 12 8250; GFX90A-NEXT: v_mov_b32_e32 v0, s0 8251; GFX90A-NEXT: v_mov_b32_e32 v1, s1 8252; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 8253; GFX90A-NEXT: s_endpgm 8254 %r = sdiv <2 x i32> %x, <i32 4096, i32 4096> 8255 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 8256 ret void 8257} 8258 8259define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 8260; CHECK-LABEL: @ssdiv_v2i32_mixed_pow2k_denom( 8261; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 8262; CHECK-NEXT: [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096 8263; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 8264; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 8265; CHECK-NEXT: [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4095 8266; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 8267; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 8268; CHECK-NEXT: ret void 8269; 8270; GFX6-LABEL: ssdiv_v2i32_mixed_pow2k_denom: 8271; GFX6: ; %bb.0: 8272; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 8273; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 8274; GFX6-NEXT: v_mov_b32_e32 v0, 0x80080081 8275; GFX6-NEXT: s_mov_b32 s7, 0xf000 8276; GFX6-NEXT: s_mov_b32 s6, -1 8277; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8278; GFX6-NEXT: v_mul_hi_i32 v0, s1, v0 8279; GFX6-NEXT: s_ashr_i32 s2, s0, 31 8280; GFX6-NEXT: s_lshr_b32 s2, s2, 20 8281; GFX6-NEXT: s_add_i32 s0, s0, s2 8282; GFX6-NEXT: v_add_i32_e32 v0, vcc, s1, v0 8283; GFX6-NEXT: s_ashr_i32 s0, s0, 12 8284; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0 8285; GFX6-NEXT: v_ashrrev_i32_e32 v0, 11, v0 8286; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v0 8287; GFX6-NEXT: v_mov_b32_e32 v0, s0 8288; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 8289; GFX6-NEXT: s_endpgm 8290; 8291; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom: 8292; GFX9: ; %bb.0: 8293; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8294; 
GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 8295; GFX9-NEXT: v_mov_b32_e32 v2, 0 8296; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8297; GFX9-NEXT: s_ashr_i32 s0, s4, 31 8298; GFX9-NEXT: s_mul_hi_i32 s1, s5, 0x80080081 8299; GFX9-NEXT: s_lshr_b32 s0, s0, 20 8300; GFX9-NEXT: s_add_i32 s1, s1, s5 8301; GFX9-NEXT: s_add_i32 s0, s4, s0 8302; GFX9-NEXT: s_lshr_b32 s4, s1, 31 8303; GFX9-NEXT: s_ashr_i32 s1, s1, 11 8304; GFX9-NEXT: s_ashr_i32 s0, s0, 12 8305; GFX9-NEXT: s_add_i32 s1, s1, s4 8306; GFX9-NEXT: v_mov_b32_e32 v0, s0 8307; GFX9-NEXT: v_mov_b32_e32 v1, s1 8308; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 8309; GFX9-NEXT: s_endpgm 8310; 8311; GFX90A-LABEL: ssdiv_v2i32_mixed_pow2k_denom: 8312; GFX90A: ; %bb.0: 8313; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8314; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 8315; GFX90A-NEXT: v_mov_b32_e32 v2, 0 8316; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 8317; GFX90A-NEXT: s_ashr_i32 s0, s4, 31 8318; GFX90A-NEXT: s_mul_hi_i32 s1, s5, 0x80080081 8319; GFX90A-NEXT: s_lshr_b32 s0, s0, 20 8320; GFX90A-NEXT: s_add_i32 s1, s1, s5 8321; GFX90A-NEXT: s_add_i32 s0, s4, s0 8322; GFX90A-NEXT: s_lshr_b32 s4, s1, 31 8323; GFX90A-NEXT: s_ashr_i32 s1, s1, 11 8324; GFX90A-NEXT: s_ashr_i32 s0, s0, 12 8325; GFX90A-NEXT: s_add_i32 s1, s1, s4 8326; GFX90A-NEXT: v_mov_b32_e32 v0, s0 8327; GFX90A-NEXT: v_mov_b32_e32 v1, s1 8328; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 8329; GFX90A-NEXT: s_endpgm 8330 %r = sdiv <2 x i32> %x, <i32 4096, i32 4095> 8331 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 8332 ret void 8333} 8334 8335define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 8336; CHECK-LABEL: @sdiv_v2i32_pow2_shl_denom( 8337; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 8338; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 8339; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 8340; CHECK-NEXT: 
[[TMP3:%.*]] = ashr i32 [[TMP1]], 31 8341; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 8342; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 8343; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]] 8344; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]] 8345; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]] 8346; CHECK-NEXT: [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]] 8347; CHECK-NEXT: [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float 8348; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]]) 8349; CHECK-NEXT: [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000 8350; CHECK-NEXT: [[TMP13:%.*]] = fptoui float [[TMP12]] to i32 8351; CHECK-NEXT: [[TMP14:%.*]] = sub i32 0, [[TMP9]] 8352; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]] 8353; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP13]] to i64 8354; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 8355; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 8356; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 8357; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 8358; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 8359; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]] 8360; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP8]] to i64 8361; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[TMP22]] to i64 8362; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]] 8363; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32 8364; CHECK-NEXT: [[TMP27:%.*]] = lshr i64 [[TMP25]], 32 8365; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 8366; CHECK-NEXT: [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]] 8367; CHECK-NEXT: [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]] 8368; CHECK-NEXT: [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]] 8369; CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP28]], 1 8370; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]] 8371; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]] 8372; CHECK-NEXT: 
[[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]] 8373; CHECK-NEXT: [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]] 8374; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP33]], 1 8375; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]] 8376; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]] 8377; CHECK-NEXT: [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]] 8378; CHECK-NEXT: [[TMP41:%.*]] = insertelement <2 x i32> undef, i32 [[TMP40]], i64 0 8379; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x i32> [[X]], i64 1 8380; CHECK-NEXT: [[TMP43:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 8381; CHECK-NEXT: [[TMP44:%.*]] = ashr i32 [[TMP42]], 31 8382; CHECK-NEXT: [[TMP45:%.*]] = ashr i32 [[TMP43]], 31 8383; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]] 8384; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]] 8385; CHECK-NEXT: [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]] 8386; CHECK-NEXT: [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]] 8387; CHECK-NEXT: [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]] 8388; CHECK-NEXT: [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float 8389; CHECK-NEXT: [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]]) 8390; CHECK-NEXT: [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000 8391; CHECK-NEXT: [[TMP54:%.*]] = fptoui float [[TMP53]] to i32 8392; CHECK-NEXT: [[TMP55:%.*]] = sub i32 0, [[TMP50]] 8393; CHECK-NEXT: [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]] 8394; CHECK-NEXT: [[TMP57:%.*]] = zext i32 [[TMP54]] to i64 8395; CHECK-NEXT: [[TMP58:%.*]] = zext i32 [[TMP56]] to i64 8396; CHECK-NEXT: [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]] 8397; CHECK-NEXT: [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32 8398; CHECK-NEXT: [[TMP61:%.*]] = lshr i64 [[TMP59]], 32 8399; CHECK-NEXT: [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32 8400; CHECK-NEXT: [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]] 8401; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[TMP49]] to i64 8402; CHECK-NEXT: [[TMP65:%.*]] = zext 
i32 [[TMP63]] to i64 8403; CHECK-NEXT: [[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]] 8404; CHECK-NEXT: [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32 8405; CHECK-NEXT: [[TMP68:%.*]] = lshr i64 [[TMP66]], 32 8406; CHECK-NEXT: [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32 8407; CHECK-NEXT: [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]] 8408; CHECK-NEXT: [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]] 8409; CHECK-NEXT: [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]] 8410; CHECK-NEXT: [[TMP73:%.*]] = add i32 [[TMP69]], 1 8411; CHECK-NEXT: [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]] 8412; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]] 8413; CHECK-NEXT: [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]] 8414; CHECK-NEXT: [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]] 8415; CHECK-NEXT: [[TMP78:%.*]] = add i32 [[TMP74]], 1 8416; CHECK-NEXT: [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]] 8417; CHECK-NEXT: [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]] 8418; CHECK-NEXT: [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]] 8419; CHECK-NEXT: [[TMP82:%.*]] = insertelement <2 x i32> [[TMP41]], i32 [[TMP81]], i64 1 8420; CHECK-NEXT: store <2 x i32> [[TMP82]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 8421; CHECK-NEXT: ret void 8422; 8423; GFX6-LABEL: sdiv_v2i32_pow2_shl_denom: 8424; GFX6: ; %bb.0: 8425; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 8426; GFX6-NEXT: s_movk_i32 s10, 0x1000 8427; GFX6-NEXT: s_mov_b32 s12, 0x4f7ffffe 8428; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 8429; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb 8430; GFX6-NEXT: s_mov_b32 s7, 0xf000 8431; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8432; GFX6-NEXT: s_lshl_b32 s2, s10, s2 8433; GFX6-NEXT: s_ashr_i32 s11, s2, 31 8434; GFX6-NEXT: s_add_i32 s2, s2, s11 8435; GFX6-NEXT: s_xor_b32 s2, s2, s11 8436; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 8437; GFX6-NEXT: s_lshl_b32 s0, s10, s3 8438; GFX6-NEXT: s_sub_i32 s10, 0, s2 8439; GFX6-NEXT: s_ashr_i32 s3, s0, 31 8440; 
GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 8441; GFX6-NEXT: s_add_i32 s0, s0, s3 8442; GFX6-NEXT: s_ashr_i32 s1, s8, 31 8443; GFX6-NEXT: s_mov_b32 s6, -1 8444; GFX6-NEXT: v_mul_f32_e32 v0, s12, v0 8445; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 8446; GFX6-NEXT: v_mul_lo_u32 v1, s10, v0 8447; GFX6-NEXT: s_xor_b32 s10, s0, s3 8448; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s10 8449; GFX6-NEXT: s_add_i32 s0, s8, s1 8450; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 8451; GFX6-NEXT: s_xor_b32 s0, s0, s1 8452; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 8453; GFX6-NEXT: s_xor_b32 s8, s1, s11 8454; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 8455; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 8456; GFX6-NEXT: v_mul_f32_e32 v1, s12, v2 8457; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 8458; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 8459; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 8460; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s0, v2 8461; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 8462; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] 8463; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v2 8464; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 8465; GFX6-NEXT: s_sub_i32 s0, 0, s10 8466; GFX6-NEXT: v_mul_lo_u32 v3, s0, v1 8467; GFX6-NEXT: s_ashr_i32 s0, s9, 31 8468; GFX6-NEXT: s_add_i32 s1, s9, s0 8469; GFX6-NEXT: s_xor_b32 s1, s1, s0 8470; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 8471; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v0 8472; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 8473; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 8474; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 8475; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 8476; GFX6-NEXT: s_xor_b32 s2, s0, s3 8477; GFX6-NEXT: v_mul_lo_u32 v2, v1, s10 8478; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 8479; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 8480; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 8481; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2 8482; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 8483; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s10, v2 8484; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 8485; GFX6-NEXT: v_cndmask_b32_e64 v2, 
v2, v3, s[0:1] 8486; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 8487; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 8488; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 8489; GFX6-NEXT: v_xor_b32_e32 v1, s2, v1 8490; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s2, v1 8491; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 8492; GFX6-NEXT: s_endpgm 8493; 8494; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom: 8495; GFX9: ; %bb.0: 8496; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 8497; GFX9-NEXT: s_movk_i32 s8, 0x1000 8498; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 8499; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c 8500; GFX9-NEXT: s_mov_b32 s10, 0x4f7ffffe 8501; GFX9-NEXT: v_mov_b32_e32 v2, 0 8502; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8503; GFX9-NEXT: s_lshl_b32 s2, s8, s2 8504; GFX9-NEXT: s_ashr_i32 s9, s2, 31 8505; GFX9-NEXT: s_add_i32 s2, s2, s9 8506; GFX9-NEXT: s_xor_b32 s2, s2, s9 8507; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 8508; GFX9-NEXT: s_lshl_b32 s0, s8, s3 8509; GFX9-NEXT: s_ashr_i32 s1, s0, 31 8510; GFX9-NEXT: s_add_i32 s0, s0, s1 8511; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 8512; GFX9-NEXT: s_xor_b32 s0, s0, s1 8513; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s0 8514; GFX9-NEXT: s_sub_i32 s3, 0, s2 8515; GFX9-NEXT: v_mul_f32_e32 v0, s10, v0 8516; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 8517; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 8518; GFX9-NEXT: s_sub_i32 s8, 0, s0 8519; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 8520; GFX9-NEXT: v_mul_f32_e32 v1, s10, v1 8521; GFX9-NEXT: s_ashr_i32 s3, s6, 31 8522; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 8523; GFX9-NEXT: v_mul_hi_u32 v3, v0, v3 8524; GFX9-NEXT: s_add_i32 s6, s6, s3 8525; GFX9-NEXT: s_xor_b32 s6, s6, s3 8526; GFX9-NEXT: s_xor_b32 s3, s3, s9 8527; GFX9-NEXT: v_add_u32_e32 v0, v0, v3 8528; GFX9-NEXT: v_mul_hi_u32 v0, s6, v0 8529; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1 8530; GFX9-NEXT: s_ashr_i32 s8, s7, 31 8531; GFX9-NEXT: s_xor_b32 s1, s8, s1 8532; GFX9-NEXT: v_mul_lo_u32 v4, v0, s2 8533; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 8534; GFX9-NEXT: v_add_u32_e32 
v5, 1, v0 8535; GFX9-NEXT: v_sub_u32_e32 v4, s6, v4 8536; GFX9-NEXT: s_add_i32 s6, s7, s8 8537; GFX9-NEXT: s_xor_b32 s6, s6, s8 8538; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 8539; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v4 8540; GFX9-NEXT: v_mul_hi_u32 v1, s6, v1 8541; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc 8542; GFX9-NEXT: v_subrev_u32_e32 v5, s2, v4 8543; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc 8544; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 8545; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v4 8546; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 8547; GFX9-NEXT: v_mul_lo_u32 v3, v1, s0 8548; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 8549; GFX9-NEXT: v_xor_b32_e32 v0, s3, v0 8550; GFX9-NEXT: v_subrev_u32_e32 v0, s3, v0 8551; GFX9-NEXT: v_sub_u32_e32 v3, s6, v3 8552; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v3 8553; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 8554; GFX9-NEXT: v_subrev_u32_e32 v4, s0, v3 8555; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 8556; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 8557; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v3 8558; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 8559; GFX9-NEXT: v_xor_b32_e32 v1, s1, v1 8560; GFX9-NEXT: v_subrev_u32_e32 v1, s1, v1 8561; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 8562; GFX9-NEXT: s_endpgm 8563; 8564; GFX90A-LABEL: sdiv_v2i32_pow2_shl_denom: 8565; GFX90A: ; %bb.0: 8566; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 8567; GFX90A-NEXT: s_movk_i32 s8, 0x1000 8568; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 8569; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c 8570; GFX90A-NEXT: s_mov_b32 s10, 0x4f7ffffe 8571; GFX90A-NEXT: v_mov_b32_e32 v2, 0 8572; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 8573; GFX90A-NEXT: s_lshl_b32 s2, s8, s2 8574; GFX90A-NEXT: s_ashr_i32 s9, s2, 31 8575; GFX90A-NEXT: s_add_i32 s2, s2, s9 8576; GFX90A-NEXT: s_xor_b32 s2, s2, s9 8577; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s2 8578; GFX90A-NEXT: s_ashr_i32 s1, s6, 31 8579; GFX90A-NEXT: s_lshl_b32 s0, s8, s3 8580; GFX90A-NEXT: s_add_i32 s3, s6, s1 8581; 
GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 8582; GFX90A-NEXT: s_xor_b32 s6, s1, s9 8583; GFX90A-NEXT: s_xor_b32 s1, s3, s1 8584; GFX90A-NEXT: s_sub_i32 s3, 0, s2 8585; GFX90A-NEXT: v_mul_f32_e32 v0, s10, v0 8586; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 8587; GFX90A-NEXT: v_mul_lo_u32 v1, s3, v0 8588; GFX90A-NEXT: v_mul_hi_u32 v1, v0, v1 8589; GFX90A-NEXT: v_add_u32_e32 v0, v0, v1 8590; GFX90A-NEXT: v_mul_hi_u32 v0, s1, v0 8591; GFX90A-NEXT: v_mul_lo_u32 v1, v0, s2 8592; GFX90A-NEXT: v_sub_u32_e32 v1, s1, v1 8593; GFX90A-NEXT: s_ashr_i32 s1, s0, 31 8594; GFX90A-NEXT: s_add_i32 s0, s0, s1 8595; GFX90A-NEXT: s_xor_b32 s0, s0, s1 8596; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s0 8597; GFX90A-NEXT: v_add_u32_e32 v3, 1, v0 8598; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 8599; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 8600; GFX90A-NEXT: v_subrev_u32_e32 v3, s2, v1 8601; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 8602; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 8603; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v4 8604; GFX90A-NEXT: s_ashr_i32 s2, s7, 31 8605; GFX90A-NEXT: s_add_i32 s3, s7, s2 8606; GFX90A-NEXT: v_add_u32_e32 v3, 1, v0 8607; GFX90A-NEXT: v_mul_f32_e32 v1, s10, v1 8608; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 8609; GFX90A-NEXT: s_xor_b32 s1, s2, s1 8610; GFX90A-NEXT: s_xor_b32 s2, s3, s2 8611; GFX90A-NEXT: s_sub_i32 s3, 0, s0 8612; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 8613; GFX90A-NEXT: v_mul_lo_u32 v3, s3, v1 8614; GFX90A-NEXT: v_mul_hi_u32 v3, v1, v3 8615; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 8616; GFX90A-NEXT: v_mul_hi_u32 v1, s2, v1 8617; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s0 8618; GFX90A-NEXT: v_sub_u32_e32 v3, s2, v3 8619; GFX90A-NEXT: v_add_u32_e32 v4, 1, v1 8620; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v3 8621; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 8622; GFX90A-NEXT: v_subrev_u32_e32 v4, s0, v3 8623; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 8624; GFX90A-NEXT: v_add_u32_e32 v4, 1, v1 8625; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v3 8626; 
GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 8627; GFX90A-NEXT: v_xor_b32_e32 v0, s6, v0 8628; GFX90A-NEXT: v_xor_b32_e32 v1, s1, v1 8629; GFX90A-NEXT: v_subrev_u32_e32 v0, s6, v0 8630; GFX90A-NEXT: v_subrev_u32_e32 v1, s1, v1 8631; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 8632; GFX90A-NEXT: s_endpgm 8633 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 8634 %r = sdiv <2 x i32> %x, %shl.y 8635 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 8636 ret void 8637} 8638 8639define amdgpu_kernel void @srem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 8640; CHECK-LABEL: @srem_i32_oddk_denom( 8641; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], 1235195 8642; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 8643; CHECK-NEXT: ret void 8644; 8645; GFX6-LABEL: srem_i32_oddk_denom: 8646; GFX6: ; %bb.0: 8647; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 8648; GFX6-NEXT: v_mov_b32_e32 v0, 0xd9528441 8649; GFX6-NEXT: s_mov_b32 s2, 0x12d8fb 8650; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 8651; GFX6-NEXT: s_mov_b32 s3, 0xf000 8652; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8653; GFX6-NEXT: v_mul_hi_i32 v0, s4, v0 8654; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0 8655; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0 8656; GFX6-NEXT: v_ashrrev_i32_e32 v0, 20, v0 8657; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 8658; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 8659; GFX6-NEXT: s_mov_b32 s2, -1 8660; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 8661; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 8662; GFX6-NEXT: s_endpgm 8663; 8664; GFX9-LABEL: srem_i32_oddk_denom: 8665; GFX9: ; %bb.0: 8666; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8667; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 8668; GFX9-NEXT: v_mov_b32_e32 v0, 0 8669; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8670; GFX9-NEXT: s_mul_hi_i32 s0, s4, 0xd9528441 8671; GFX9-NEXT: s_add_i32 s0, s0, s4 8672; GFX9-NEXT: s_lshr_b32 s1, s0, 31 8673; GFX9-NEXT: s_ashr_i32 s0, s0, 20 8674; GFX9-NEXT: s_add_i32 s0, s0, s1 8675; GFX9-NEXT: s_mul_i32 s0, 
s0, 0x12d8fb 8676; GFX9-NEXT: s_sub_i32 s0, s4, s0 8677; GFX9-NEXT: v_mov_b32_e32 v1, s0 8678; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 8679; GFX9-NEXT: s_endpgm 8680; 8681; GFX90A-LABEL: srem_i32_oddk_denom: 8682; GFX90A: ; %bb.0: 8683; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8684; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 8685; GFX90A-NEXT: v_mov_b32_e32 v0, 0 8686; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 8687; GFX90A-NEXT: s_mul_hi_i32 s0, s4, 0xd9528441 8688; GFX90A-NEXT: s_add_i32 s0, s0, s4 8689; GFX90A-NEXT: s_lshr_b32 s1, s0, 31 8690; GFX90A-NEXT: s_ashr_i32 s0, s0, 20 8691; GFX90A-NEXT: s_add_i32 s0, s0, s1 8692; GFX90A-NEXT: s_mul_i32 s0, s0, 0x12d8fb 8693; GFX90A-NEXT: s_sub_i32 s0, s4, s0 8694; GFX90A-NEXT: v_mov_b32_e32 v1, s0 8695; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] 8696; GFX90A-NEXT: s_endpgm 8697 %r = srem i32 %x, 1235195 8698 store i32 %r, i32 addrspace(1)* %out 8699 ret void 8700} 8701 8702define amdgpu_kernel void @srem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 8703; CHECK-LABEL: @srem_i32_pow2k_denom( 8704; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], 4096 8705; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 8706; CHECK-NEXT: ret void 8707; 8708; GFX6-LABEL: srem_i32_pow2k_denom: 8709; GFX6: ; %bb.0: 8710; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 8711; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 8712; GFX6-NEXT: s_mov_b32 s7, 0xf000 8713; GFX6-NEXT: s_mov_b32 s6, -1 8714; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8715; GFX6-NEXT: s_ashr_i32 s1, s0, 31 8716; GFX6-NEXT: s_lshr_b32 s1, s1, 20 8717; GFX6-NEXT: s_add_i32 s1, s0, s1 8718; GFX6-NEXT: s_and_b32 s1, s1, 0xfffff000 8719; GFX6-NEXT: s_sub_i32 s0, s0, s1 8720; GFX6-NEXT: v_mov_b32_e32 v0, s0 8721; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 8722; GFX6-NEXT: s_endpgm 8723; 8724; GFX9-LABEL: srem_i32_pow2k_denom: 8725; GFX9: ; %bb.0: 8726; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8727; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 8728; GFX9-NEXT: 
v_mov_b32_e32 v0, 0 8729; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8730; GFX9-NEXT: s_ashr_i32 s0, s4, 31 8731; GFX9-NEXT: s_lshr_b32 s0, s0, 20 8732; GFX9-NEXT: s_add_i32 s0, s4, s0 8733; GFX9-NEXT: s_and_b32 s0, s0, 0xfffff000 8734; GFX9-NEXT: s_sub_i32 s0, s4, s0 8735; GFX9-NEXT: v_mov_b32_e32 v1, s0 8736; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 8737; GFX9-NEXT: s_endpgm 8738; 8739; GFX90A-LABEL: srem_i32_pow2k_denom: 8740; GFX90A: ; %bb.0: 8741; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8742; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 8743; GFX90A-NEXT: v_mov_b32_e32 v0, 0 8744; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 8745; GFX90A-NEXT: s_ashr_i32 s0, s4, 31 8746; GFX90A-NEXT: s_lshr_b32 s0, s0, 20 8747; GFX90A-NEXT: s_add_i32 s0, s4, s0 8748; GFX90A-NEXT: s_and_b32 s0, s0, 0xfffff000 8749; GFX90A-NEXT: s_sub_i32 s0, s4, s0 8750; GFX90A-NEXT: v_mov_b32_e32 v1, s0 8751; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] 8752; GFX90A-NEXT: s_endpgm 8753 %r = srem i32 %x, 4096 8754 store i32 %r, i32 addrspace(1)* %out 8755 ret void 8756} 8757 8758define amdgpu_kernel void @srem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 8759; CHECK-LABEL: @srem_i32_pow2_shl_denom( 8760; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 8761; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], [[SHL_Y]] 8762; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 8763; CHECK-NEXT: ret void 8764; 8765; GFX6-LABEL: srem_i32_pow2_shl_denom: 8766; GFX6: ; %bb.0: 8767; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 8768; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 8769; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8770; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 8771; GFX6-NEXT: s_ashr_i32 s4, s3, 31 8772; GFX6-NEXT: s_add_i32 s3, s3, s4 8773; GFX6-NEXT: s_xor_b32 s4, s3, s4 8774; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s4 8775; GFX6-NEXT: s_sub_i32 s3, 0, s4 8776; GFX6-NEXT: s_ashr_i32 s5, s2, 31 8777; GFX6-NEXT: s_add_i32 s2, s2, s5 8778; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 8779; 
GFX6-NEXT: s_xor_b32 s6, s2, s5 8780; GFX6-NEXT: s_mov_b32 s2, -1 8781; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 8782; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 8783; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 8784; GFX6-NEXT: s_mov_b32 s3, 0xf000 8785; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 8786; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 8787; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 8788; GFX6-NEXT: v_mul_lo_u32 v0, v0, s4 8789; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 8790; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0 8791; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 8792; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 8793; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0 8794; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 8795; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 8796; GFX6-NEXT: v_xor_b32_e32 v0, s5, v0 8797; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s5, v0 8798; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 8799; GFX6-NEXT: s_endpgm 8800; 8801; GFX9-LABEL: srem_i32_pow2_shl_denom: 8802; GFX9: ; %bb.0: 8803; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 8804; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8805; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 8806; GFX9-NEXT: s_ashr_i32 s4, s3, 31 8807; GFX9-NEXT: s_add_i32 s3, s3, s4 8808; GFX9-NEXT: s_xor_b32 s3, s3, s4 8809; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 8810; GFX9-NEXT: s_sub_i32 s4, 0, s3 8811; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 8812; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 8813; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 8814; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 8815; GFX9-NEXT: v_mul_lo_u32 v1, s4, v0 8816; GFX9-NEXT: s_ashr_i32 s4, s2, 31 8817; GFX9-NEXT: s_add_i32 s2, s2, s4 8818; GFX9-NEXT: s_xor_b32 s2, s2, s4 8819; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 8820; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 8821; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 8822; GFX9-NEXT: v_mov_b32_e32 v1, 0 8823; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 8824; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 8825; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v0 8826; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 
8827; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 8828; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v0 8829; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 8830; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 8831; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 8832; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 8833; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8834; GFX9-NEXT: global_store_dword v1, v0, s[0:1] 8835; GFX9-NEXT: s_endpgm 8836; 8837; GFX90A-LABEL: srem_i32_pow2_shl_denom: 8838; GFX90A: ; %bb.0: 8839; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 8840; GFX90A-NEXT: v_mov_b32_e32 v1, 0 8841; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 8842; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 8843; GFX90A-NEXT: s_lshl_b32 s3, 0x1000, s3 8844; GFX90A-NEXT: s_ashr_i32 s4, s3, 31 8845; GFX90A-NEXT: s_add_i32 s3, s3, s4 8846; GFX90A-NEXT: s_xor_b32 s3, s3, s4 8847; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 8848; GFX90A-NEXT: s_sub_i32 s5, 0, s3 8849; GFX90A-NEXT: s_ashr_i32 s4, s2, 31 8850; GFX90A-NEXT: s_add_i32 s2, s2, s4 8851; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 8852; GFX90A-NEXT: s_xor_b32 s2, s2, s4 8853; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 8854; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 8855; GFX90A-NEXT: v_mul_lo_u32 v2, s5, v0 8856; GFX90A-NEXT: v_mul_hi_u32 v2, v0, v2 8857; GFX90A-NEXT: v_add_u32_e32 v0, v0, v2 8858; GFX90A-NEXT: v_mul_hi_u32 v0, s2, v0 8859; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s3 8860; GFX90A-NEXT: v_sub_u32_e32 v0, s2, v0 8861; GFX90A-NEXT: v_subrev_u32_e32 v2, s3, v0 8862; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 8863; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 8864; GFX90A-NEXT: v_subrev_u32_e32 v2, s3, v0 8865; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 8866; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 8867; GFX90A-NEXT: v_xor_b32_e32 v0, s4, v0 8868; GFX90A-NEXT: v_subrev_u32_e32 v0, s4, v0 8869; GFX90A-NEXT: global_store_dword v1, v0, s[0:1] 8870; GFX90A-NEXT: s_endpgm 8871 %shl.y = shl i32 4096, %y 8872 %r = srem i32 %x, %shl.y 8873 store i32 %r, i32 addrspace(1)* 
%out 8874 ret void 8875} 8876 8877define amdgpu_kernel void @srem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 8878; CHECK-LABEL: @srem_v2i32_pow2k_denom( 8879; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 8880; CHECK-NEXT: [[TMP2:%.*]] = srem i32 [[TMP1]], 4096 8881; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 8882; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 8883; CHECK-NEXT: [[TMP5:%.*]] = srem i32 [[TMP4]], 4096 8884; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 8885; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 8886; CHECK-NEXT: ret void 8887; 8888; GFX6-LABEL: srem_v2i32_pow2k_denom: 8889; GFX6: ; %bb.0: 8890; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 8891; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 8892; GFX6-NEXT: s_movk_i32 s2, 0xf000 8893; GFX6-NEXT: s_mov_b32 s7, 0xf000 8894; GFX6-NEXT: s_mov_b32 s6, -1 8895; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8896; GFX6-NEXT: s_ashr_i32 s3, s0, 31 8897; GFX6-NEXT: s_lshr_b32 s3, s3, 20 8898; GFX6-NEXT: s_add_i32 s3, s0, s3 8899; GFX6-NEXT: s_and_b32 s3, s3, s2 8900; GFX6-NEXT: s_sub_i32 s0, s0, s3 8901; GFX6-NEXT: s_ashr_i32 s3, s1, 31 8902; GFX6-NEXT: s_lshr_b32 s3, s3, 20 8903; GFX6-NEXT: s_add_i32 s3, s1, s3 8904; GFX6-NEXT: s_and_b32 s2, s3, s2 8905; GFX6-NEXT: s_sub_i32 s1, s1, s2 8906; GFX6-NEXT: v_mov_b32_e32 v0, s0 8907; GFX6-NEXT: v_mov_b32_e32 v1, s1 8908; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 8909; GFX6-NEXT: s_endpgm 8910; 8911; GFX9-LABEL: srem_v2i32_pow2k_denom: 8912; GFX9: ; %bb.0: 8913; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8914; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 8915; GFX9-NEXT: s_movk_i32 s6, 0xf000 8916; GFX9-NEXT: v_mov_b32_e32 v2, 0 8917; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8918; GFX9-NEXT: s_ashr_i32 s0, s4, 31 8919; GFX9-NEXT: s_ashr_i32 s1, s5, 31 8920; GFX9-NEXT: s_lshr_b32 s0, s0, 20 
8921; GFX9-NEXT: s_lshr_b32 s1, s1, 20 8922; GFX9-NEXT: s_add_i32 s0, s4, s0 8923; GFX9-NEXT: s_add_i32 s1, s5, s1 8924; GFX9-NEXT: s_and_b32 s0, s0, s6 8925; GFX9-NEXT: s_and_b32 s1, s1, s6 8926; GFX9-NEXT: s_sub_i32 s0, s4, s0 8927; GFX9-NEXT: s_sub_i32 s1, s5, s1 8928; GFX9-NEXT: v_mov_b32_e32 v0, s0 8929; GFX9-NEXT: v_mov_b32_e32 v1, s1 8930; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 8931; GFX9-NEXT: s_endpgm 8932; 8933; GFX90A-LABEL: srem_v2i32_pow2k_denom: 8934; GFX90A: ; %bb.0: 8935; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8936; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 8937; GFX90A-NEXT: s_movk_i32 s6, 0xf000 8938; GFX90A-NEXT: v_mov_b32_e32 v2, 0 8939; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 8940; GFX90A-NEXT: s_ashr_i32 s0, s4, 31 8941; GFX90A-NEXT: s_ashr_i32 s1, s5, 31 8942; GFX90A-NEXT: s_lshr_b32 s0, s0, 20 8943; GFX90A-NEXT: s_lshr_b32 s1, s1, 20 8944; GFX90A-NEXT: s_add_i32 s0, s4, s0 8945; GFX90A-NEXT: s_add_i32 s1, s5, s1 8946; GFX90A-NEXT: s_and_b32 s0, s0, s6 8947; GFX90A-NEXT: s_and_b32 s1, s1, s6 8948; GFX90A-NEXT: s_sub_i32 s0, s4, s0 8949; GFX90A-NEXT: s_sub_i32 s1, s5, s1 8950; GFX90A-NEXT: v_mov_b32_e32 v0, s0 8951; GFX90A-NEXT: v_mov_b32_e32 v1, s1 8952; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 8953; GFX90A-NEXT: s_endpgm 8954 %r = srem <2 x i32> %x, <i32 4096, i32 4096> 8955 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 8956 ret void 8957} 8958 8959define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 8960; CHECK-LABEL: @srem_v2i32_pow2_shl_denom( 8961; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 8962; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 8963; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 8964; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 8965; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 8966; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]] 8967; 
CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]] 8968; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]] 8969; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]] 8970; CHECK-NEXT: [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float 8971; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 8972; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000 8973; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP11]] to i32 8974; CHECK-NEXT: [[TMP13:%.*]] = sub i32 0, [[TMP8]] 8975; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]] 8976; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP12]] to i64 8977; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 8978; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]] 8979; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 8980; CHECK-NEXT: [[TMP19:%.*]] = lshr i64 [[TMP17]], 32 8981; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 8982; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]] 8983; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP7]] to i64 8984; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP21]] to i64 8985; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]] 8986; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32 8987; CHECK-NEXT: [[TMP26:%.*]] = lshr i64 [[TMP24]], 32 8988; CHECK-NEXT: [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32 8989; CHECK-NEXT: [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]] 8990; CHECK-NEXT: [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]] 8991; CHECK-NEXT: [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]] 8992; CHECK-NEXT: [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]] 8993; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP30]], i32 [[TMP31]], i32 [[TMP29]] 8994; CHECK-NEXT: [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]] 8995; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]] 8996; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]] 8997; CHECK-NEXT: [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]] 8998; CHECK-NEXT: 
[[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]] 8999; CHECK-NEXT: [[TMP38:%.*]] = insertelement <2 x i32> undef, i32 [[TMP37]], i64 0 9000; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i32> [[X]], i64 1 9001; CHECK-NEXT: [[TMP40:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 9002; CHECK-NEXT: [[TMP41:%.*]] = ashr i32 [[TMP39]], 31 9003; CHECK-NEXT: [[TMP42:%.*]] = ashr i32 [[TMP40]], 31 9004; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]] 9005; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]] 9006; CHECK-NEXT: [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]] 9007; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]] 9008; CHECK-NEXT: [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float 9009; CHECK-NEXT: [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]]) 9010; CHECK-NEXT: [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000 9011; CHECK-NEXT: [[TMP50:%.*]] = fptoui float [[TMP49]] to i32 9012; CHECK-NEXT: [[TMP51:%.*]] = sub i32 0, [[TMP46]] 9013; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]] 9014; CHECK-NEXT: [[TMP53:%.*]] = zext i32 [[TMP50]] to i64 9015; CHECK-NEXT: [[TMP54:%.*]] = zext i32 [[TMP52]] to i64 9016; CHECK-NEXT: [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]] 9017; CHECK-NEXT: [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32 9018; CHECK-NEXT: [[TMP57:%.*]] = lshr i64 [[TMP55]], 32 9019; CHECK-NEXT: [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32 9020; CHECK-NEXT: [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]] 9021; CHECK-NEXT: [[TMP60:%.*]] = zext i32 [[TMP45]] to i64 9022; CHECK-NEXT: [[TMP61:%.*]] = zext i32 [[TMP59]] to i64 9023; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]] 9024; CHECK-NEXT: [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32 9025; CHECK-NEXT: [[TMP64:%.*]] = lshr i64 [[TMP62]], 32 9026; CHECK-NEXT: [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32 9027; CHECK-NEXT: [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]] 9028; CHECK-NEXT: [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]] 9029; CHECK-NEXT: 
[[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]] 9030; CHECK-NEXT: [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]] 9031; CHECK-NEXT: [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]] 9032; CHECK-NEXT: [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]] 9033; CHECK-NEXT: [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]] 9034; CHECK-NEXT: [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]] 9035; CHECK-NEXT: [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]] 9036; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]] 9037; CHECK-NEXT: [[TMP76:%.*]] = insertelement <2 x i32> [[TMP38]], i32 [[TMP75]], i64 1 9038; CHECK-NEXT: store <2 x i32> [[TMP76]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 9039; CHECK-NEXT: ret void 9040; 9041; GFX6-LABEL: srem_v2i32_pow2_shl_denom: 9042; GFX6: ; %bb.0: 9043; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 9044; GFX6-NEXT: s_movk_i32 s6, 0x1000 9045; GFX6-NEXT: s_mov_b32 s10, 0x4f7ffffe 9046; GFX6-NEXT: s_mov_b32 s7, 0xf000 9047; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9048; GFX6-NEXT: s_lshl_b32 s2, s6, s2 9049; GFX6-NEXT: s_ashr_i32 s4, s2, 31 9050; GFX6-NEXT: s_add_i32 s2, s2, s4 9051; GFX6-NEXT: s_xor_b32 s2, s2, s4 9052; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 9053; GFX6-NEXT: s_lshl_b32 s3, s6, s3 9054; GFX6-NEXT: s_ashr_i32 s6, s3, 31 9055; GFX6-NEXT: s_add_i32 s3, s3, s6 9056; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 9057; GFX6-NEXT: s_sub_i32 s9, 0, s2 9058; GFX6-NEXT: s_xor_b32 s3, s3, s6 9059; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s3 9060; GFX6-NEXT: v_mul_f32_e32 v0, s10, v0 9061; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 9062; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 9063; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 9064; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 9065; GFX6-NEXT: s_mov_b32 s6, -1 9066; GFX6-NEXT: v_mul_lo_u32 v1, s9, v0 9067; GFX6-NEXT: s_sub_i32 s9, 0, s3 9068; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9069; GFX6-NEXT: s_ashr_i32 s8, s0, 31 9070; GFX6-NEXT: s_add_i32 s0, s0, s8 9071; GFX6-NEXT: v_mul_hi_u32 v1, v0, 
v1 9072; GFX6-NEXT: s_xor_b32 s0, s0, s8 9073; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 9074; GFX6-NEXT: v_mul_f32_e32 v1, s10, v2 9075; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 9076; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 9077; GFX6-NEXT: v_mul_lo_u32 v2, s9, v1 9078; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 9079; GFX6-NEXT: v_mul_hi_u32 v2, v1, v2 9080; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 9081; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v0 9082; GFX6-NEXT: s_ashr_i32 s0, s1, 31 9083; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 9084; GFX6-NEXT: s_add_i32 s1, s1, s0 9085; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 9086; GFX6-NEXT: s_xor_b32 s1, s1, s0 9087; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 9088; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 9089; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v0 9090; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 9091; GFX6-NEXT: v_mul_lo_u32 v1, v1, s3 9092; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 9093; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 9094; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 9095; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 9096; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 9097; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 9098; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 9099; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 9100; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 9101; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 9102; GFX6-NEXT: v_xor_b32_e32 v1, s0, v1 9103; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s0, v1 9104; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 9105; GFX6-NEXT: s_endpgm 9106; 9107; GFX9-LABEL: srem_v2i32_pow2_shl_denom: 9108; GFX9: ; %bb.0: 9109; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 9110; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 9111; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 9112; GFX9-NEXT: s_movk_i32 s8, 0x1000 9113; GFX9-NEXT: s_mov_b32 s9, 0x4f7ffffe 9114; GFX9-NEXT: v_mov_b32_e32 v2, 0 9115; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9116; GFX9-NEXT: s_lshl_b32 s0, s8, s6 9117; GFX9-NEXT: s_ashr_i32 s1, s0, 31 
9118; GFX9-NEXT: s_add_i32 s0, s0, s1 9119; GFX9-NEXT: s_xor_b32 s0, s0, s1 9120; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 9121; GFX9-NEXT: s_lshl_b32 s1, s8, s7 9122; GFX9-NEXT: s_ashr_i32 s6, s1, 31 9123; GFX9-NEXT: s_add_i32 s1, s1, s6 9124; GFX9-NEXT: s_xor_b32 s1, s1, s6 9125; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 9126; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s1 9127; GFX9-NEXT: s_sub_i32 s7, 0, s0 9128; GFX9-NEXT: s_ashr_i32 s6, s4, 31 9129; GFX9-NEXT: v_mul_f32_e32 v0, s9, v0 9130; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 9131; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 9132; GFX9-NEXT: s_add_i32 s4, s4, s6 9133; GFX9-NEXT: s_xor_b32 s4, s4, s6 9134; GFX9-NEXT: v_mul_f32_e32 v1, s9, v1 9135; GFX9-NEXT: v_mul_lo_u32 v3, s7, v0 9136; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 9137; GFX9-NEXT: s_sub_i32 s7, 0, s1 9138; GFX9-NEXT: v_mul_hi_u32 v3, v0, v3 9139; GFX9-NEXT: v_mul_lo_u32 v4, s7, v1 9140; GFX9-NEXT: s_ashr_i32 s7, s5, 31 9141; GFX9-NEXT: s_add_i32 s5, s5, s7 9142; GFX9-NEXT: v_add_u32_e32 v0, v0, v3 9143; GFX9-NEXT: v_mul_hi_u32 v3, v1, v4 9144; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 9145; GFX9-NEXT: s_xor_b32 s5, s5, s7 9146; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 9147; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 9148; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0 9149; GFX9-NEXT: v_mul_lo_u32 v1, v1, s1 9150; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 9151; GFX9-NEXT: v_subrev_u32_e32 v3, s0, v0 9152; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v0 9153; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 9154; GFX9-NEXT: v_subrev_u32_e32 v3, s0, v0 9155; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v0 9156; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 9157; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 9158; GFX9-NEXT: v_subrev_u32_e32 v3, s1, v1 9159; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v1 9160; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 9161; GFX9-NEXT: v_subrev_u32_e32 v3, s1, v1 9162; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v1 9163; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 9164; GFX9-NEXT: v_xor_b32_e32 v0, s6, v0 9165; GFX9-NEXT: 
v_xor_b32_e32 v1, s7, v1 9166; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 9167; GFX9-NEXT: v_subrev_u32_e32 v1, s7, v1 9168; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 9169; GFX9-NEXT: s_endpgm 9170; 9171; GFX90A-LABEL: srem_v2i32_pow2_shl_denom: 9172; GFX90A: ; %bb.0: 9173; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 9174; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 9175; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 9176; GFX90A-NEXT: s_movk_i32 s8, 0x1000 9177; GFX90A-NEXT: s_mov_b32 s9, 0x4f7ffffe 9178; GFX90A-NEXT: v_mov_b32_e32 v2, 0 9179; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 9180; GFX90A-NEXT: s_lshl_b32 s0, s8, s6 9181; GFX90A-NEXT: s_ashr_i32 s1, s0, 31 9182; GFX90A-NEXT: s_add_i32 s0, s0, s1 9183; GFX90A-NEXT: s_xor_b32 s0, s0, s1 9184; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s0 9185; GFX90A-NEXT: s_lshl_b32 s1, s8, s7 9186; GFX90A-NEXT: s_sub_i32 s8, 0, s0 9187; GFX90A-NEXT: s_ashr_i32 s6, s4, 31 9188; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 9189; GFX90A-NEXT: s_add_i32 s4, s4, s6 9190; GFX90A-NEXT: s_xor_b32 s4, s4, s6 9191; GFX90A-NEXT: s_ashr_i32 s7, s1, 31 9192; GFX90A-NEXT: v_mul_f32_e32 v0, s9, v0 9193; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 9194; GFX90A-NEXT: s_add_i32 s1, s1, s7 9195; GFX90A-NEXT: s_xor_b32 s1, s1, s7 9196; GFX90A-NEXT: v_mul_lo_u32 v1, s8, v0 9197; GFX90A-NEXT: v_mul_hi_u32 v1, v0, v1 9198; GFX90A-NEXT: v_add_u32_e32 v0, v0, v1 9199; GFX90A-NEXT: v_mul_hi_u32 v0, s4, v0 9200; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s0 9201; GFX90A-NEXT: v_sub_u32_e32 v0, s4, v0 9202; GFX90A-NEXT: v_subrev_u32_e32 v1, s0, v0 9203; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v0 9204; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 9205; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s1 9206; GFX90A-NEXT: v_subrev_u32_e32 v3, s0, v0 9207; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v0 9208; GFX90A-NEXT: s_ashr_i32 s0, s5, 31 9209; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v1 9210; GFX90A-NEXT: s_add_i32 s4, s5, s0 9211; GFX90A-NEXT: s_sub_i32 s5, 0, s1 9212; GFX90A-NEXT: 
v_cndmask_b32_e32 v0, v0, v3, vcc 9213; GFX90A-NEXT: v_mul_f32_e32 v1, s9, v1 9214; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 9215; GFX90A-NEXT: s_xor_b32 s4, s4, s0 9216; GFX90A-NEXT: v_xor_b32_e32 v0, s6, v0 9217; GFX90A-NEXT: v_subrev_u32_e32 v0, s6, v0 9218; GFX90A-NEXT: v_mul_lo_u32 v3, s5, v1 9219; GFX90A-NEXT: v_mul_hi_u32 v3, v1, v3 9220; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 9221; GFX90A-NEXT: v_mul_hi_u32 v1, s4, v1 9222; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s1 9223; GFX90A-NEXT: v_sub_u32_e32 v1, s4, v1 9224; GFX90A-NEXT: v_subrev_u32_e32 v3, s1, v1 9225; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s1, v1 9226; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 9227; GFX90A-NEXT: v_subrev_u32_e32 v3, s1, v1 9228; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s1, v1 9229; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 9230; GFX90A-NEXT: v_xor_b32_e32 v1, s0, v1 9231; GFX90A-NEXT: v_subrev_u32_e32 v1, s0, v1 9232; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 9233; GFX90A-NEXT: s_endpgm 9234 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 9235 %r = srem <2 x i32> %x, %shl.y 9236 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 9237 ret void 9238} 9239 9240define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 9241; CHECK-LABEL: @udiv_i64_oddk_denom( 9242; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], 1235195949943 9243; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 9244; CHECK-NEXT: ret void 9245; 9246; GFX6-LABEL: udiv_i64_oddk_denom: 9247; GFX6: ; %bb.0: 9248; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f176a73 9249; GFX6-NEXT: v_mov_b32_e32 v1, 0x4f800000 9250; GFX6-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 9251; GFX6-NEXT: v_rcp_f32_e32 v0, v0 9252; GFX6-NEXT: s_movk_i32 s4, 0xfee0 9253; GFX6-NEXT: s_mov_b32 s5, 0x68958c89 9254; GFX6-NEXT: v_mov_b32_e32 v8, 0 9255; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 9256; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 9257; GFX6-NEXT: v_trunc_f32_e32 v1, v1 9258; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, 
v1 9259; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 9260; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 9261; GFX6-NEXT: v_mov_b32_e32 v7, 0 9262; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 9263; GFX6-NEXT: v_mul_lo_u32 v2, v0, s4 9264; GFX6-NEXT: v_mul_hi_u32 v3, v0, s5 9265; GFX6-NEXT: v_mul_lo_u32 v4, v1, s5 9266; GFX6-NEXT: s_movk_i32 s8, 0x11f 9267; GFX6-NEXT: s_mov_b32 s9, 0x976a7377 9268; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9269; GFX6-NEXT: v_mul_lo_u32 v3, v0, s5 9270; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 9271; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 9272; GFX6-NEXT: v_mul_hi_u32 v6, v0, v3 9273; GFX6-NEXT: v_mul_hi_u32 v4, v0, v2 9274; GFX6-NEXT: v_mul_hi_u32 v9, v1, v2 9275; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 9276; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 9277; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 9278; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 9279; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc 9280; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 9281; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc 9282; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc 9283; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9284; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 9285; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 9286; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 9287; GFX6-NEXT: v_mul_lo_u32 v2, v0, s4 9288; GFX6-NEXT: v_mul_hi_u32 v3, v0, s5 9289; GFX6-NEXT: v_mul_lo_u32 v4, v1, s5 9290; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9291; GFX6-NEXT: s_mov_b32 s4, s0 9292; GFX6-NEXT: s_mov_b32 s7, 0xf000 9293; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9294; GFX6-NEXT: v_mul_lo_u32 v3, v0, s5 9295; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 9296; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 9297; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 9298; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 9299; GFX6-NEXT: v_mul_hi_u32 v9, v1, v2 9300; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 9301; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 9302; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v8, v6, vcc 9303; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 9304; GFX6-NEXT: v_mul_hi_u32 
v3, v1, v3 9305; GFX6-NEXT: s_mov_b32 s5, s1 9306; GFX6-NEXT: s_mov_b32 s6, -1 9307; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 9308; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc 9309; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc 9310; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9311; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 9312; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 9313; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 9314; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 9315; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 9316; GFX6-NEXT: v_mul_hi_u32 v4, s2, v1 9317; GFX6-NEXT: v_mul_hi_u32 v5, s3, v1 9318; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1 9319; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9320; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 9321; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 9322; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 9323; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 9324; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 9325; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc 9326; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 9327; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc 9328; GFX6-NEXT: v_mul_lo_u32 v2, v0, s8 9329; GFX6-NEXT: v_mul_hi_u32 v3, v0, s9 9330; GFX6-NEXT: v_mul_lo_u32 v4, v1, s9 9331; GFX6-NEXT: v_mov_b32_e32 v5, s8 9332; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9333; GFX6-NEXT: v_mul_lo_u32 v3, v0, s9 9334; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 9335; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s3, v2 9336; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s2, v3 9337; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc 9338; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s9, v3 9339; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] 9340; GFX6-NEXT: s_movk_i32 s2, 0x11e 9341; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s2, v4 9342; GFX6-NEXT: s_mov_b32 s9, 0x976a7376 9343; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 9344; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s9, v5 9345; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 9346; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s8, v4 9347; GFX6-NEXT: v_cndmask_b32_e64 
v4, v6, v5, s[0:1] 9348; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 9349; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] 9350; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 1, v0 9351; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] 9352; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 9353; GFX6-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] 9354; GFX6-NEXT: v_mov_b32_e32 v6, s3 9355; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc 9356; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s2, v2 9357; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 9358; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s9, v3 9359; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 9360; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s8, v2 9361; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc 9362; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 9363; GFX6-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] 9364; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 9365; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 9366; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 9367; GFX6-NEXT: s_endpgm 9368; 9369; GFX9-LABEL: udiv_i64_oddk_denom: 9370; GFX9: ; %bb.0: 9371; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f176a73 9372; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000 9373; GFX9-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 9374; GFX9-NEXT: v_rcp_f32_e32 v0, v0 9375; GFX9-NEXT: s_movk_i32 s2, 0xfee0 9376; GFX9-NEXT: s_mov_b32 s3, 0x68958c89 9377; GFX9-NEXT: v_mov_b32_e32 v8, 0 9378; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 9379; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 9380; GFX9-NEXT: v_trunc_f32_e32 v1, v1 9381; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 9382; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 9383; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 9384; GFX9-NEXT: v_mov_b32_e32 v5, 0 9385; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 9386; GFX9-NEXT: v_mul_lo_u32 v2, v0, s2 9387; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 9388; GFX9-NEXT: v_mul_lo_u32 v4, v1, s3 9389; GFX9-NEXT: v_mul_lo_u32 v6, v0, s3 9390; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 9391; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 9392; GFX9-NEXT: 
v_mul_lo_u32 v3, v0, v2 9393; GFX9-NEXT: v_mul_hi_u32 v4, v0, v6 9394; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 9395; GFX9-NEXT: v_mul_hi_u32 v9, v1, v2 9396; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 9397; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 9398; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v7, vcc 9399; GFX9-NEXT: v_mul_lo_u32 v7, v1, v6 9400; GFX9-NEXT: v_mul_hi_u32 v6, v1, v6 9401; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 9402; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v6, vcc 9403; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc 9404; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 9405; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v4, vcc 9406; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 9407; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 9408; GFX9-NEXT: v_mul_lo_u32 v2, v0, s2 9409; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 9410; GFX9-NEXT: v_mul_lo_u32 v4, v1, s3 9411; GFX9-NEXT: v_mul_lo_u32 v6, v0, s3 9412; GFX9-NEXT: s_movk_i32 s2, 0x11f 9413; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 9414; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 9415; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 9416; GFX9-NEXT: v_mul_hi_u32 v4, v0, v6 9417; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 9418; GFX9-NEXT: v_mul_hi_u32 v9, v1, v2 9419; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 9420; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 9421; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v7, vcc 9422; GFX9-NEXT: v_mul_lo_u32 v7, v1, v6 9423; GFX9-NEXT: v_mul_hi_u32 v6, v1, v6 9424; GFX9-NEXT: s_mov_b32 s3, 0x976a7377 9425; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 9426; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v6, vcc 9427; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc 9428; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 9429; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v4, vcc 9430; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 9431; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 9432; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9433; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 9434; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 9435; GFX9-NEXT: v_mul_hi_u32 v4, 
s6, v1 9436; GFX9-NEXT: v_mul_hi_u32 v6, s7, v1 9437; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 9438; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 9439; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v4, vcc 9440; GFX9-NEXT: v_mul_lo_u32 v4, s7, v0 9441; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 9442; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 9443; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 9444; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v5, vcc 9445; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 9446; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v2, vcc 9447; GFX9-NEXT: v_mul_lo_u32 v2, v0, s2 9448; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 9449; GFX9-NEXT: v_mul_lo_u32 v4, v1, s3 9450; GFX9-NEXT: v_mov_b32_e32 v6, s2 9451; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 9452; GFX9-NEXT: v_mul_lo_u32 v3, v0, s3 9453; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 9454; GFX9-NEXT: v_sub_u32_e32 v4, s7, v2 9455; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s6, v3 9456; GFX9-NEXT: v_subb_co_u32_e64 v4, s[0:1], v4, v6, vcc 9457; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s3, v3 9458; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1] 9459; GFX9-NEXT: s_movk_i32 s3, 0x11e 9460; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s3, v4 9461; GFX9-NEXT: s_mov_b32 s6, 0x976a7376 9462; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 9463; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s6, v6 9464; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 9465; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v4 9466; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v6, s[0:1] 9467; GFX9-NEXT: v_mov_b32_e32 v7, s7 9468; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v7, v2, vcc 9469; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s3, v2 9470; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 9471; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 9472; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s6, v3 9473; GFX9-NEXT: v_cndmask_b32_e64 v4, 1, 2, s[0:1] 9474; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 9475; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 9476; GFX9-NEXT: v_add_co_u32_e64 v4, s[0:1], v0, v4 9477; GFX9-NEXT: 
v_cndmask_b32_e32 v2, v7, v3, vcc 9478; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], 0, v1, s[0:1] 9479; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 9480; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 9481; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 9482; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] 9483; GFX9-NEXT: s_endpgm 9484; 9485; GFX90A-LABEL: udiv_i64_oddk_denom: 9486; GFX90A: ; %bb.0: 9487; GFX90A-NEXT: v_mov_b32_e32 v0, 0x4f176a73 9488; GFX90A-NEXT: v_mov_b32_e32 v1, 0x4f800000 9489; GFX90A-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 9490; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 9491; GFX90A-NEXT: s_movk_i32 s2, 0xfee0 9492; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 9493; GFX90A-NEXT: s_mov_b32 s0, 0x68958c89 9494; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 9495; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 9496; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 9497; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 9498; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 9499; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 9500; GFX90A-NEXT: v_mov_b32_e32 v8, 0 9501; GFX90A-NEXT: v_mov_b32_e32 v2, 0 9502; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s2 9503; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s0 9504; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 9505; GFX90A-NEXT: v_mul_lo_u32 v4, v1, s0 9506; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 9507; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s0 9508; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 9509; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 9510; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 9511; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 9512; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v4, vcc 9513; GFX90A-NEXT: v_mul_hi_u32 v9, v1, v6 9514; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 9515; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 9516; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 9517; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v9, vcc 9518; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc 9519; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 9520; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 9521; GFX90A-NEXT: 
v_addc_co_u32_e32 v4, vcc, v8, v5, vcc 9522; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 9523; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc 9524; GFX90A-NEXT: v_mul_lo_u32 v4, v0, s2 9525; GFX90A-NEXT: v_mul_hi_u32 v5, v0, s0 9526; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s0 9527; GFX90A-NEXT: v_add_u32_e32 v4, v5, v4 9528; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 9529; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s0 9530; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 9531; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 9532; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 9533; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 9534; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v4, vcc 9535; GFX90A-NEXT: v_mul_hi_u32 v9, v1, v6 9536; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 9537; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 9538; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 9539; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v9, vcc 9540; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc 9541; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 9542; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 9543; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v5, vcc 9544; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 9545; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc 9546; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 9547; GFX90A-NEXT: v_mul_lo_u32 v4, s6, v1 9548; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v0 9549; GFX90A-NEXT: v_mul_hi_u32 v3, s6, v1 9550; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 9551; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc 9552; GFX90A-NEXT: v_mul_hi_u32 v6, s7, v0 9553; GFX90A-NEXT: v_mul_lo_u32 v0, s7, v0 9554; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 9555; GFX90A-NEXT: v_mul_hi_u32 v5, s7, v1 9556; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v6, vcc 9557; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v2, vcc 9558; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 9559; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 9560; GFX90A-NEXT: s_movk_i32 s2, 0x11f 9561; GFX90A-NEXT: s_mov_b32 s3, 0x976a7377 9562; GFX90A-NEXT: v_addc_co_u32_e32 
v1, vcc, v8, v3, vcc 9563; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s2 9564; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s3 9565; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 9566; GFX90A-NEXT: v_mul_lo_u32 v4, v1, s3 9567; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 9568; GFX90A-NEXT: v_mul_lo_u32 v5, v0, s3 9569; GFX90A-NEXT: v_sub_u32_e32 v4, s7, v3 9570; GFX90A-NEXT: v_mov_b32_e32 v6, s2 9571; GFX90A-NEXT: v_sub_co_u32_e32 v5, vcc, s6, v5 9572; GFX90A-NEXT: v_subb_co_u32_e64 v4, s[0:1], v4, v6, vcc 9573; GFX90A-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s3, v5 9574; GFX90A-NEXT: v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1] 9575; GFX90A-NEXT: s_movk_i32 s3, 0x11e 9576; GFX90A-NEXT: v_cmp_lt_u32_e64 s[0:1], s3, v4 9577; GFX90A-NEXT: s_mov_b32 s6, 0x976a7376 9578; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 9579; GFX90A-NEXT: v_cmp_lt_u32_e64 s[0:1], s6, v6 9580; GFX90A-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 9581; GFX90A-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v4 9582; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v6, s[0:1] 9583; GFX90A-NEXT: v_mov_b32_e32 v7, s7 9584; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc 9585; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s3, v3 9586; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 9587; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 9588; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s6, v5 9589; GFX90A-NEXT: v_cndmask_b32_e64 v4, 1, 2, s[0:1] 9590; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 9591; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s2, v3 9592; GFX90A-NEXT: v_add_co_u32_e64 v4, s[0:1], v0, v4 9593; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc 9594; GFX90A-NEXT: v_addc_co_u32_e64 v6, s[0:1], 0, v1, s[0:1] 9595; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 9596; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 9597; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 9598; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 9599; GFX90A-NEXT: s_endpgm 9600 %r = udiv i64 %x, 1235195949943 9601 store i64 %r, i64 addrspace(1)* %out 9602 ret void 9603} 9604 9605define amdgpu_kernel void 
@udiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 9606; CHECK-LABEL: @udiv_i64_pow2k_denom( 9607; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], 4096 9608; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 9609; CHECK-NEXT: ret void 9610; 9611; GFX6-LABEL: udiv_i64_pow2k_denom: 9612; GFX6: ; %bb.0: 9613; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 9614; GFX6-NEXT: s_mov_b32 s7, 0xf000 9615; GFX6-NEXT: s_mov_b32 s6, -1 9616; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9617; GFX6-NEXT: s_mov_b32 s4, s0 9618; GFX6-NEXT: s_mov_b32 s5, s1 9619; GFX6-NEXT: s_lshr_b64 s[0:1], s[2:3], 12 9620; GFX6-NEXT: v_mov_b32_e32 v0, s0 9621; GFX6-NEXT: v_mov_b32_e32 v1, s1 9622; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 9623; GFX6-NEXT: s_endpgm 9624; 9625; GFX9-LABEL: udiv_i64_pow2k_denom: 9626; GFX9: ; %bb.0: 9627; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 9628; GFX9-NEXT: v_mov_b32_e32 v2, 0 9629; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9630; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 12 9631; GFX9-NEXT: v_mov_b32_e32 v0, s2 9632; GFX9-NEXT: v_mov_b32_e32 v1, s3 9633; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 9634; GFX9-NEXT: s_endpgm 9635; 9636; GFX90A-LABEL: udiv_i64_pow2k_denom: 9637; GFX90A: ; %bb.0: 9638; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 9639; GFX90A-NEXT: v_mov_b32_e32 v2, 0 9640; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 9641; GFX90A-NEXT: s_lshr_b64 s[2:3], s[2:3], 12 9642; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] 9643; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 9644; GFX90A-NEXT: s_endpgm 9645 %r = udiv i64 %x, 4096 9646 store i64 %r, i64 addrspace(1)* %out 9647 ret void 9648} 9649 9650define amdgpu_kernel void @udiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 9651; CHECK-LABEL: @udiv_i64_pow2_shl_denom( 9652; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 9653; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], [[SHL_Y]] 9654; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 
9655; CHECK-NEXT: ret void 9656; 9657; GFX6-LABEL: udiv_i64_pow2_shl_denom: 9658; GFX6: ; %bb.0: 9659; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 9660; GFX6-NEXT: s_load_dword s8, s[0:1], 0xd 9661; GFX6-NEXT: s_mov_b32 s3, 0xf000 9662; GFX6-NEXT: s_mov_b32 s2, -1 9663; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9664; GFX6-NEXT: s_mov_b32 s0, s4 9665; GFX6-NEXT: s_add_i32 s8, s8, 12 9666; GFX6-NEXT: s_mov_b32 s1, s5 9667; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], s8 9668; GFX6-NEXT: v_mov_b32_e32 v0, s4 9669; GFX6-NEXT: v_mov_b32_e32 v1, s5 9670; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 9671; GFX6-NEXT: s_endpgm 9672; 9673; GFX9-LABEL: udiv_i64_pow2_shl_denom: 9674; GFX9: ; %bb.0: 9675; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 9676; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 9677; GFX9-NEXT: v_mov_b32_e32 v2, 0 9678; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9679; GFX9-NEXT: s_add_i32 s2, s2, 12 9680; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], s2 9681; GFX9-NEXT: v_mov_b32_e32 v0, s0 9682; GFX9-NEXT: v_mov_b32_e32 v1, s1 9683; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 9684; GFX9-NEXT: s_endpgm 9685; 9686; GFX90A-LABEL: udiv_i64_pow2_shl_denom: 9687; GFX90A: ; %bb.0: 9688; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 9689; GFX90A-NEXT: s_load_dword s2, s[0:1], 0x34 9690; GFX90A-NEXT: v_mov_b32_e32 v2, 0 9691; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 9692; GFX90A-NEXT: s_add_i32 s2, s2, 12 9693; GFX90A-NEXT: s_lshr_b64 s[0:1], s[6:7], s2 9694; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 9695; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 9696; GFX90A-NEXT: s_endpgm 9697 %shl.y = shl i64 4096, %y 9698 %r = udiv i64 %x, %shl.y 9699 store i64 %r, i64 addrspace(1)* %out 9700 ret void 9701} 9702 9703define amdgpu_kernel void @udiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 9704; CHECK-LABEL: @udiv_v2i64_pow2k_denom( 9705; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 9706; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 
[[TMP1]], 4096 9707; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 9708; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 9709; CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[TMP4]], 4096 9710; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 9711; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 9712; CHECK-NEXT: ret void 9713; 9714; GFX6-LABEL: udiv_v2i64_pow2k_denom: 9715; GFX6: ; %bb.0: 9716; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 9717; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 9718; GFX6-NEXT: s_mov_b32 s7, 0xf000 9719; GFX6-NEXT: s_mov_b32 s6, -1 9720; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9721; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], 12 9722; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], 12 9723; GFX6-NEXT: v_mov_b32_e32 v0, s0 9724; GFX6-NEXT: v_mov_b32_e32 v1, s1 9725; GFX6-NEXT: v_mov_b32_e32 v2, s2 9726; GFX6-NEXT: v_mov_b32_e32 v3, s3 9727; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 9728; GFX6-NEXT: s_endpgm 9729; 9730; GFX9-LABEL: udiv_v2i64_pow2k_denom: 9731; GFX9: ; %bb.0: 9732; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 9733; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 9734; GFX9-NEXT: v_mov_b32_e32 v4, 0 9735; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9736; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 12 9737; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], 12 9738; GFX9-NEXT: v_mov_b32_e32 v0, s0 9739; GFX9-NEXT: v_mov_b32_e32 v1, s1 9740; GFX9-NEXT: v_mov_b32_e32 v2, s4 9741; GFX9-NEXT: v_mov_b32_e32 v3, s5 9742; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 9743; GFX9-NEXT: s_endpgm 9744; 9745; GFX90A-LABEL: udiv_v2i64_pow2k_denom: 9746; GFX90A: ; %bb.0: 9747; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 9748; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 9749; GFX90A-NEXT: v_mov_b32_e32 v4, 0 9750; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 9751; GFX90A-NEXT: s_lshr_b64 s[0:1], s[4:5], 12 9752; GFX90A-NEXT: s_lshr_b64 s[4:5], s[6:7], 12 9753; 
GFX90A-NEXT: v_mov_b32_e32 v0, s0 9754; GFX90A-NEXT: v_mov_b32_e32 v1, s1 9755; GFX90A-NEXT: v_mov_b32_e32 v2, s4 9756; GFX90A-NEXT: v_mov_b32_e32 v3, s5 9757; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 9758; GFX90A-NEXT: s_endpgm 9759 %r = udiv <2 x i64> %x, <i64 4096, i64 4096> 9760 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 9761 ret void 9762} 9763 9764define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 9765; CHECK-LABEL: @udiv_v2i64_mixed_pow2k_denom( 9766; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 9767; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096 9768; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 9769; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 9770; CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[TMP4]], 4095 9771; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 9772; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 9773; CHECK-NEXT: ret void 9774; 9775; GFX6-LABEL: udiv_v2i64_mixed_pow2k_denom: 9776; GFX6: ; %bb.0: 9777; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f800000 9778; GFX6-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 9779; GFX6-NEXT: v_rcp_f32_e32 v0, v0 9780; GFX6-NEXT: s_movk_i32 s6, 0xf001 9781; GFX6-NEXT: v_mov_b32_e32 v8, 0 9782; GFX6-NEXT: v_mov_b32_e32 v7, 0 9783; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 9784; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 9785; GFX6-NEXT: v_trunc_f32_e32 v1, v1 9786; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 9787; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 9788; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 9789; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 9790; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 9791; GFX6-NEXT: s_mov_b32 s7, 0xf000 9792; GFX6-NEXT: v_mul_hi_u32 v2, v0, s6 9793; GFX6-NEXT: v_mul_lo_u32 v3, v1, s6 9794; GFX6-NEXT: v_mul_lo_u32 v4, v0, s6 9795; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9796; GFX6-NEXT: 
s_lshr_b64 s[8:9], s[0:1], 12 9797; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 9798; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9799; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 9800; GFX6-NEXT: v_mul_hi_u32 v6, v0, v4 9801; GFX6-NEXT: v_mul_hi_u32 v3, v0, v2 9802; GFX6-NEXT: v_mul_hi_u32 v9, v1, v2 9803; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 9804; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 9805; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 9806; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 9807; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v3, vcc 9808; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 9809; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc 9810; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc 9811; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9812; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 9813; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 9814; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 9815; GFX6-NEXT: v_mul_hi_u32 v2, v0, s6 9816; GFX6-NEXT: v_mul_lo_u32 v3, v1, s6 9817; GFX6-NEXT: v_mul_lo_u32 v4, v0, s6 9818; GFX6-NEXT: s_movk_i32 s0, 0xfff 9819; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 9820; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 9821; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 9822; GFX6-NEXT: v_mul_hi_u32 v5, v0, v4 9823; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 9824; GFX6-NEXT: v_mul_hi_u32 v9, v1, v2 9825; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 9826; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 9827; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v8, v6, vcc 9828; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 9829; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 9830; GFX6-NEXT: s_mov_b32 s6, -1 9831; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 9832; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc 9833; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc 9834; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9835; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 9836; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 9837; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 9838; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 9839; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 9840; GFX6-NEXT: 
v_mul_hi_u32 v4, s2, v1 9841; GFX6-NEXT: v_mul_hi_u32 v5, s3, v1 9842; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1 9843; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9844; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 9845; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 9846; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 9847; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 9848; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 9849; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc 9850; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 9851; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc 9852; GFX6-NEXT: v_mul_lo_u32 v4, v1, s0 9853; GFX6-NEXT: v_mul_hi_u32 v5, v0, s0 9854; GFX6-NEXT: v_add_i32_e32 v2, vcc, 2, v0 9855; GFX6-NEXT: v_mul_lo_u32 v8, v0, s0 9856; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 9857; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v0 9858; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc 9859; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 9860; GFX6-NEXT: v_mov_b32_e32 v5, s3 9861; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s2, v8 9862; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc 9863; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s0, v8 9864; GFX6-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v4, vcc 9865; GFX6-NEXT: s_movk_i32 s0, 0xffe 9866; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s0, v5 9867; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 9868; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 9869; GFX6-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc 9870; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v8 9871; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 9872; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 9873; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 9874; GFX6-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[0:1] 9875; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc 9876; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 9877; GFX6-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[0:1] 9878; GFX6-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc 9879; GFX6-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[0:1] 9880; GFX6-NEXT: v_mov_b32_e32 v0, s8 9881; GFX6-NEXT: v_mov_b32_e32 v1, s9 9882; GFX6-NEXT: 
buffer_store_dwordx4 v[0:3], off, s[4:7], 0 9883; GFX6-NEXT: s_endpgm 9884; 9885; GFX9-LABEL: udiv_v2i64_mixed_pow2k_denom: 9886; GFX9: ; %bb.0: 9887; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f800000 9888; GFX9-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 9889; GFX9-NEXT: v_rcp_f32_e32 v0, v0 9890; GFX9-NEXT: s_movk_i32 s2, 0xf001 9891; GFX9-NEXT: v_mov_b32_e32 v7, 0 9892; GFX9-NEXT: v_mov_b32_e32 v5, 0 9893; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 9894; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 9895; GFX9-NEXT: v_trunc_f32_e32 v1, v1 9896; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 9897; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 9898; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 9899; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 9900; GFX9-NEXT: s_movk_i32 s8, 0xfff 9901; GFX9-NEXT: v_mul_hi_u32 v2, v0, s2 9902; GFX9-NEXT: v_mul_lo_u32 v4, v1, s2 9903; GFX9-NEXT: v_mul_lo_u32 v3, v0, s2 9904; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 9905; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 9906; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 9907; GFX9-NEXT: v_mul_hi_u32 v6, v0, v3 9908; GFX9-NEXT: v_mul_lo_u32 v4, v0, v2 9909; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 9910; GFX9-NEXT: v_mul_hi_u32 v9, v1, v2 9911; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 9912; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v6, v4 9913; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v8, vcc 9914; GFX9-NEXT: v_mul_lo_u32 v8, v1, v3 9915; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 9916; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 9917; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v3, vcc 9918; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc 9919; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 9920; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v4, vcc 9921; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 9922; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 9923; GFX9-NEXT: v_mul_hi_u32 v2, v0, s2 9924; GFX9-NEXT: v_mul_lo_u32 v3, v1, s2 9925; GFX9-NEXT: v_mul_lo_u32 v4, v0, s2 9926; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9927; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], 12 9928; 
GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 9929; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 9930; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 9931; GFX9-NEXT: v_mul_hi_u32 v6, v0, v4 9932; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 9933; GFX9-NEXT: v_mul_hi_u32 v9, v1, v2 9934; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 9935; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v6, v3 9936; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v8, vcc 9937; GFX9-NEXT: v_mul_lo_u32 v8, v1, v4 9938; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 9939; GFX9-NEXT: s_movk_i32 s4, 0xffe 9940; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8 9941; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc 9942; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc 9943; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 9944; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v4, vcc 9945; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 9946; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 9947; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 9948; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 9949; GFX9-NEXT: v_mul_hi_u32 v4, s6, v1 9950; GFX9-NEXT: v_mul_hi_u32 v6, s7, v1 9951; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 9952; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 9953; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v4, vcc 9954; GFX9-NEXT: v_mul_lo_u32 v4, s7, v0 9955; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 9956; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 9957; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 9958; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v5, vcc 9959; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 9960; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v2, vcc 9961; GFX9-NEXT: v_mul_lo_u32 v2, v1, s8 9962; GFX9-NEXT: v_mul_hi_u32 v3, v0, s8 9963; GFX9-NEXT: v_mul_lo_u32 v4, v0, s8 9964; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 9965; GFX9-NEXT: v_mov_b32_e32 v3, s7 9966; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, s6, v4 9967; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v3, v2, vcc 9968; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s8, v4 9969; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc 9970; GFX9-NEXT: v_cmp_lt_u32_e32 
vcc, s4, v3 9971; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 9972; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 9973; GFX9-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc 9974; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 9975; GFX9-NEXT: v_cndmask_b32_e64 v3, 1, 2, vcc 9976; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v0, v3 9977; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc 9978; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s4, v4 9979; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc 9980; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 9981; GFX9-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc 9982; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 9983; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v3, vcc 9984; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v6, vcc 9985; GFX9-NEXT: v_mov_b32_e32 v0, s2 9986; GFX9-NEXT: v_mov_b32_e32 v1, s3 9987; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] 9988; GFX9-NEXT: s_endpgm 9989; 9990; GFX90A-LABEL: udiv_v2i64_mixed_pow2k_denom: 9991; GFX90A: ; %bb.0: 9992; GFX90A-NEXT: v_mov_b32_e32 v0, 0x4f800000 9993; GFX90A-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 9994; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 9995; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 9996; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 9997; GFX90A-NEXT: v_mov_b32_e32 v8, 0 9998; GFX90A-NEXT: v_mov_b32_e32 v4, 0 9999; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 10000; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 10001; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 10002; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 10003; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 10004; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 10005; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 10006; GFX90A-NEXT: s_lshr_b64 s[0:1], s[4:5], 12 10007; GFX90A-NEXT: s_movk_i32 s4, 0xf001 10008; GFX90A-NEXT: v_mul_hi_u32 v2, v0, s4 10009; GFX90A-NEXT: v_sub_u32_e32 v2, v2, v0 10010; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s4 10011; GFX90A-NEXT: v_add_u32_e32 v2, v2, v3 10012; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s4 10013; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v2 10014; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 
10015; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v2 10016; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 10017; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc 10018; GFX90A-NEXT: v_mul_hi_u32 v9, v1, v6 10019; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 10020; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 10021; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v2 10022; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v9, vcc 10023; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v4, vcc 10024; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 10025; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 10026; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v5, vcc 10027; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 10028; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 10029; GFX90A-NEXT: v_mul_hi_u32 v3, v0, s4 10030; GFX90A-NEXT: v_mul_lo_u32 v2, v1, s4 10031; GFX90A-NEXT: v_sub_u32_e32 v3, v3, v0 10032; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 10033; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s4 10034; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v2 10035; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 10036; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v2 10037; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 10038; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc 10039; GFX90A-NEXT: v_mul_hi_u32 v9, v1, v6 10040; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 10041; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 10042; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v2 10043; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v9, vcc 10044; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v4, vcc 10045; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 10046; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 10047; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v5, vcc 10048; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 10049; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 10050; GFX90A-NEXT: v_mul_lo_u32 v3, s6, v1 10051; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v0 10052; GFX90A-NEXT: v_mul_hi_u32 v2, s6, v1 10053; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 10054; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v8, 
v2, vcc 10055; GFX90A-NEXT: v_mul_hi_u32 v6, s7, v0 10056; GFX90A-NEXT: v_mul_lo_u32 v0, s7, v0 10057; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 10058; GFX90A-NEXT: v_mul_hi_u32 v5, s7, v1 10059; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v2, v6, vcc 10060; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v4, vcc 10061; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 10062; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 10063; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v2, vcc 10064; GFX90A-NEXT: s_movk_i32 s4, 0xfff 10065; GFX90A-NEXT: v_mul_lo_u32 v2, v1, s4 10066; GFX90A-NEXT: v_mul_hi_u32 v3, v0, s4 10067; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 10068; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s4 10069; GFX90A-NEXT: v_mov_b32_e32 v5, s7 10070; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, s6, v3 10071; GFX90A-NEXT: v_subb_co_u32_e32 v2, vcc, v5, v2, vcc 10072; GFX90A-NEXT: v_subrev_co_u32_e32 v5, vcc, s4, v3 10073; GFX90A-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc 10074; GFX90A-NEXT: s_movk_i32 s4, 0xffe 10075; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s4, v5 10076; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 10077; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 10078; GFX90A-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc 10079; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 10080; GFX90A-NEXT: v_cndmask_b32_e64 v5, 1, 2, vcc 10081; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v0, v5 10082; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc 10083; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s4, v3 10084; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 10085; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 10086; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v3, vcc 10087; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 10088; GFX90A-NEXT: v_cndmask_b32_e32 v2, v0, v5, vcc 10089; GFX90A-NEXT: v_cndmask_b32_e32 v3, v1, v6, vcc 10090; GFX90A-NEXT: v_mov_b32_e32 v0, s0 10091; GFX90A-NEXT: v_mov_b32_e32 v1, s1 10092; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 10093; GFX90A-NEXT: s_endpgm 10094 %r = udiv <2 x i64> %x, <i64 4096, i64 4095> 10095 
store <2 x i64> %r, <2 x i64> addrspace(1)* %out 10096 ret void 10097} 10098 10099define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 10100; CHECK-LABEL: @udiv_v2i64_pow2_shl_denom( 10101; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 10102; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 10103; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 10104; CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[TMP1]], [[TMP2]] 10105; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 10106; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 10107; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 10108; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP5]], [[TMP6]] 10109; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 10110; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 10111; CHECK-NEXT: ret void 10112; 10113; GFX6-LABEL: udiv_v2i64_pow2_shl_denom: 10114; GFX6: ; %bb.0: 10115; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 10116; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 10117; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 10118; GFX6-NEXT: s_mov_b32 s7, 0xf000 10119; GFX6-NEXT: s_mov_b32 s6, -1 10120; GFX6-NEXT: s_waitcnt lgkmcnt(0) 10121; GFX6-NEXT: s_add_i32 s0, s0, 12 10122; GFX6-NEXT: s_add_i32 s2, s2, 12 10123; GFX6-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 10124; GFX6-NEXT: s_lshr_b64 s[2:3], s[10:11], s2 10125; GFX6-NEXT: v_mov_b32_e32 v0, s0 10126; GFX6-NEXT: v_mov_b32_e32 v1, s1 10127; GFX6-NEXT: v_mov_b32_e32 v2, s2 10128; GFX6-NEXT: v_mov_b32_e32 v3, s3 10129; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 10130; GFX6-NEXT: s_endpgm 10131; 10132; GFX9-LABEL: udiv_v2i64_pow2_shl_denom: 10133; GFX9: ; %bb.0: 10134; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 10135; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 10136; 
GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 10137; GFX9-NEXT: v_mov_b32_e32 v4, 0 10138; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10139; GFX9-NEXT: s_add_i32 s0, s8, 12 10140; GFX9-NEXT: s_add_i32 s8, s10, 12 10141; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], s0 10142; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], s8 10143; GFX9-NEXT: v_mov_b32_e32 v0, s0 10144; GFX9-NEXT: v_mov_b32_e32 v1, s1 10145; GFX9-NEXT: v_mov_b32_e32 v2, s4 10146; GFX9-NEXT: v_mov_b32_e32 v3, s5 10147; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 10148; GFX9-NEXT: s_endpgm 10149; 10150; GFX90A-LABEL: udiv_v2i64_pow2_shl_denom: 10151; GFX90A: ; %bb.0: 10152; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 10153; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 10154; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 10155; GFX90A-NEXT: v_mov_b32_e32 v4, 0 10156; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 10157; GFX90A-NEXT: s_add_i32 s0, s8, 12 10158; GFX90A-NEXT: s_add_i32 s8, s10, 12 10159; GFX90A-NEXT: s_lshr_b64 s[0:1], s[4:5], s0 10160; GFX90A-NEXT: s_lshr_b64 s[4:5], s[6:7], s8 10161; GFX90A-NEXT: v_mov_b32_e32 v0, s0 10162; GFX90A-NEXT: v_mov_b32_e32 v1, s1 10163; GFX90A-NEXT: v_mov_b32_e32 v2, s4 10164; GFX90A-NEXT: v_mov_b32_e32 v3, s5 10165; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 10166; GFX90A-NEXT: s_endpgm 10167 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 10168 %r = udiv <2 x i64> %x, %shl.y 10169 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 10170 ret void 10171} 10172 10173define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 10174; CHECK-LABEL: @urem_i64_oddk_denom( 10175; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], 1235195393993 10176; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 10177; CHECK-NEXT: ret void 10178; 10179; GFX6-LABEL: urem_i64_oddk_denom: 10180; GFX6: ; %bb.0: 10181; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f1761f8 10182; GFX6-NEXT: v_mov_b32_e32 v1, 0x4f800000 10183; GFX6-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 10184; 
GFX6-NEXT: v_rcp_f32_e32 v0, v0 10185; GFX6-NEXT: s_movk_i32 s2, 0xfee0 10186; GFX6-NEXT: s_mov_b32 s3, 0x689e0837 10187; GFX6-NEXT: v_mov_b32_e32 v8, 0 10188; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 10189; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 10190; GFX6-NEXT: v_trunc_f32_e32 v1, v1 10191; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 10192; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 10193; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 10194; GFX6-NEXT: v_mov_b32_e32 v7, 0 10195; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 10196; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 10197; GFX6-NEXT: v_mul_hi_u32 v3, v0, s3 10198; GFX6-NEXT: v_mul_lo_u32 v4, v1, s3 10199; GFX6-NEXT: s_mov_b32 s12, 0x9761f7c9 10200; GFX6-NEXT: s_waitcnt lgkmcnt(0) 10201; GFX6-NEXT: s_mov_b32 s8, s4 10202; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10203; GFX6-NEXT: v_mul_lo_u32 v3, v0, s3 10204; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 10205; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 10206; GFX6-NEXT: v_mul_hi_u32 v6, v0, v3 10207; GFX6-NEXT: v_mul_hi_u32 v4, v0, v2 10208; GFX6-NEXT: v_mul_hi_u32 v9, v1, v2 10209; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 10210; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 10211; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 10212; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 10213; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc 10214; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 10215; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc 10216; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc 10217; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10218; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 10219; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 10220; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 10221; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 10222; GFX6-NEXT: v_mul_hi_u32 v3, v0, s3 10223; GFX6-NEXT: v_mul_lo_u32 v4, v1, s3 10224; GFX6-NEXT: s_movk_i32 s4, 0x11f 10225; GFX6-NEXT: s_mov_b32 s9, s5 10226; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10227; GFX6-NEXT: v_mul_lo_u32 v3, v0, s3 10228; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 
10229; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 10230; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 10231; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 10232; GFX6-NEXT: v_mul_hi_u32 v9, v1, v2 10233; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 10234; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 10235; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v8, v6, vcc 10236; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 10237; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 10238; GFX6-NEXT: s_movk_i32 s5, 0x11e 10239; GFX6-NEXT: s_mov_b32 s11, 0xf000 10240; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 10241; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc 10242; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc 10243; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10244; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 10245; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 10246; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 10247; GFX6-NEXT: v_mul_lo_u32 v2, s6, v1 10248; GFX6-NEXT: v_mul_hi_u32 v3, s6, v0 10249; GFX6-NEXT: v_mul_hi_u32 v4, s6, v1 10250; GFX6-NEXT: v_mul_hi_u32 v5, s7, v1 10251; GFX6-NEXT: v_mul_lo_u32 v1, s7, v1 10252; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10253; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 10254; GFX6-NEXT: v_mul_lo_u32 v4, s7, v0 10255; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 10256; GFX6-NEXT: s_mov_b32 s10, -1 10257; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 10258; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 10259; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc 10260; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 10261; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc 10262; GFX6-NEXT: v_mul_lo_u32 v2, v0, s4 10263; GFX6-NEXT: v_mul_hi_u32 v3, v0, s12 10264; GFX6-NEXT: v_mul_lo_u32 v1, v1, s12 10265; GFX6-NEXT: v_mul_lo_u32 v0, v0, s12 10266; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10267; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 10268; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s7, v1 10269; GFX6-NEXT: v_mov_b32_e32 v3, s4 10270; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 10271; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc 10272; 
GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s12, v0 10273; GFX6-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] 10274; GFX6-NEXT: v_cmp_lt_u32_e64 s[2:3], s5, v5 10275; GFX6-NEXT: s_mov_b32 s6, 0x9761f7c8 10276; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] 10277; GFX6-NEXT: v_cmp_lt_u32_e64 s[2:3], s6, v4 10278; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] 10279; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 10280; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, v5 10281; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s12, v4 10282; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] 10283; GFX6-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] 10284; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 10285; GFX6-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] 10286; GFX6-NEXT: v_mov_b32_e32 v5, s7 10287; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc 10288; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s5, v1 10289; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 10290; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s6, v0 10291; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 10292; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s4, v1 10293; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 10294; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 10295; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 10296; GFX6-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] 10297; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 10298; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 10299; GFX6-NEXT: s_endpgm 10300; 10301; GFX9-LABEL: urem_i64_oddk_denom: 10302; GFX9: ; %bb.0: 10303; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f1761f8 10304; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000 10305; GFX9-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 10306; GFX9-NEXT: v_rcp_f32_e32 v0, v0 10307; GFX9-NEXT: s_movk_i32 s2, 0xfee0 10308; GFX9-NEXT: s_mov_b32 s3, 0x689e0837 10309; GFX9-NEXT: v_mov_b32_e32 v8, 0 10310; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 10311; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 10312; GFX9-NEXT: v_trunc_f32_e32 v1, v1 10313; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 
10314; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 10315; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 10316; GFX9-NEXT: v_mov_b32_e32 v5, 0 10317; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 10318; GFX9-NEXT: v_mul_lo_u32 v2, v0, s2 10319; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 10320; GFX9-NEXT: v_mul_lo_u32 v4, v1, s3 10321; GFX9-NEXT: v_mul_lo_u32 v6, v0, s3 10322; GFX9-NEXT: s_movk_i32 s8, 0x11f 10323; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 10324; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 10325; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 10326; GFX9-NEXT: v_mul_hi_u32 v4, v0, v6 10327; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 10328; GFX9-NEXT: v_mul_hi_u32 v9, v1, v2 10329; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 10330; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 10331; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v7, vcc 10332; GFX9-NEXT: v_mul_lo_u32 v7, v1, v6 10333; GFX9-NEXT: v_mul_hi_u32 v6, v1, v6 10334; GFX9-NEXT: s_mov_b32 s9, 0x9761f7c9 10335; GFX9-NEXT: s_mov_b32 s10, 0x9761f7c8 10336; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 10337; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v6, vcc 10338; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc 10339; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 10340; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v4, vcc 10341; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 10342; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 10343; GFX9-NEXT: v_mul_lo_u32 v2, v0, s2 10344; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 10345; GFX9-NEXT: v_mul_lo_u32 v4, v1, s3 10346; GFX9-NEXT: v_mul_lo_u32 v6, v0, s3 10347; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 10348; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 10349; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 10350; GFX9-NEXT: v_mul_hi_u32 v4, v0, v6 10351; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 10352; GFX9-NEXT: v_mul_hi_u32 v9, v1, v2 10353; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 10354; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 10355; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v7, vcc 10356; GFX9-NEXT: v_mul_lo_u32 v7, v1, v6 10357; GFX9-NEXT: v_mul_hi_u32 v6, v1, v6 
10358; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 10359; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v6, vcc 10360; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc 10361; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 10362; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v4, vcc 10363; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 10364; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 10365; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10366; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 10367; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 10368; GFX9-NEXT: v_mul_hi_u32 v4, s6, v1 10369; GFX9-NEXT: v_mul_hi_u32 v6, s7, v1 10370; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 10371; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 10372; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v4, vcc 10373; GFX9-NEXT: v_mul_lo_u32 v4, s7, v0 10374; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 10375; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 10376; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 10377; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v5, vcc 10378; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 10379; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v2, vcc 10380; GFX9-NEXT: v_mul_lo_u32 v2, v0, s8 10381; GFX9-NEXT: v_mul_hi_u32 v3, v0, s9 10382; GFX9-NEXT: v_mul_lo_u32 v1, v1, s9 10383; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9 10384; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 10385; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 10386; GFX9-NEXT: v_sub_u32_e32 v2, s7, v1 10387; GFX9-NEXT: v_mov_b32_e32 v3, s8 10388; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 10389; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, vcc 10390; GFX9-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s9, v0 10391; GFX9-NEXT: v_subbrev_co_u32_e64 v6, s[2:3], 0, v2, s[0:1] 10392; GFX9-NEXT: s_movk_i32 s6, 0x11e 10393; GFX9-NEXT: v_cmp_lt_u32_e64 s[2:3], s6, v6 10394; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 10395; GFX9-NEXT: v_cmp_lt_u32_e64 s[2:3], s10, v4 10396; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1] 10397; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] 10398; GFX9-NEXT: 
v_cmp_eq_u32_e64 s[2:3], s8, v6 10399; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s9, v4 10400; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] 10401; GFX9-NEXT: v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1] 10402; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 10403; GFX9-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] 10404; GFX9-NEXT: v_mov_b32_e32 v4, s7 10405; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v4, v1, vcc 10406; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s6, v1 10407; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc 10408; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s10, v0 10409; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[0:1] 10410; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 10411; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s8, v1 10412; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc 10413; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 10414; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 10415; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 10416; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] 10417; GFX9-NEXT: s_endpgm 10418; 10419; GFX90A-LABEL: urem_i64_oddk_denom: 10420; GFX90A: ; %bb.0: 10421; GFX90A-NEXT: v_mov_b32_e32 v0, 0x4f1761f8 10422; GFX90A-NEXT: v_mov_b32_e32 v1, 0x4f800000 10423; GFX90A-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 10424; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 10425; GFX90A-NEXT: s_movk_i32 s2, 0xfee0 10426; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 10427; GFX90A-NEXT: s_mov_b32 s0, 0x689e0837 10428; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 10429; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 10430; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 10431; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 10432; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 10433; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 10434; GFX90A-NEXT: v_mov_b32_e32 v8, 0 10435; GFX90A-NEXT: v_mov_b32_e32 v2, 0 10436; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s2 10437; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s0 10438; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 10439; GFX90A-NEXT: v_mul_lo_u32 v4, v1, s0 10440; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 
10441; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s0 10442; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 10443; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 10444; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 10445; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 10446; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v4, vcc 10447; GFX90A-NEXT: v_mul_hi_u32 v9, v1, v6 10448; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 10449; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 10450; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 10451; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v9, vcc 10452; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc 10453; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 10454; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 10455; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v5, vcc 10456; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 10457; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc 10458; GFX90A-NEXT: v_mul_lo_u32 v4, v0, s2 10459; GFX90A-NEXT: v_mul_hi_u32 v5, v0, s0 10460; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s0 10461; GFX90A-NEXT: v_add_u32_e32 v4, v5, v4 10462; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 10463; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s0 10464; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 10465; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 10466; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 10467; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 10468; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v4, vcc 10469; GFX90A-NEXT: v_mul_hi_u32 v9, v1, v6 10470; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 10471; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 10472; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 10473; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v9, vcc 10474; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc 10475; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 10476; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 10477; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v5, vcc 10478; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 10479; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc 10480; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 10481; 
GFX90A-NEXT: v_mul_lo_u32 v4, s6, v1 10482; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v0 10483; GFX90A-NEXT: v_mul_hi_u32 v3, s6, v1 10484; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 10485; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc 10486; GFX90A-NEXT: v_mul_hi_u32 v6, s7, v0 10487; GFX90A-NEXT: v_mul_lo_u32 v0, s7, v0 10488; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 10489; GFX90A-NEXT: v_mul_hi_u32 v5, s7, v1 10490; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v6, vcc 10491; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v2, vcc 10492; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 10493; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 10494; GFX90A-NEXT: s_movk_i32 s8, 0x11f 10495; GFX90A-NEXT: s_mov_b32 s9, 0x9761f7c9 10496; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v3, vcc 10497; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s8 10498; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s9 10499; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 10500; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s9 10501; GFX90A-NEXT: v_add_u32_e32 v1, v3, v1 10502; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s9 10503; GFX90A-NEXT: v_sub_u32_e32 v3, s7, v1 10504; GFX90A-NEXT: v_mov_b32_e32 v4, s8 10505; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 10506; GFX90A-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, vcc 10507; GFX90A-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s9, v0 10508; GFX90A-NEXT: v_subbrev_co_u32_e64 v6, s[2:3], 0, v3, s[0:1] 10509; GFX90A-NEXT: s_movk_i32 s6, 0x11e 10510; GFX90A-NEXT: v_cmp_lt_u32_e64 s[2:3], s6, v6 10511; GFX90A-NEXT: s_mov_b32 s10, 0x9761f7c8 10512; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 10513; GFX90A-NEXT: v_cmp_lt_u32_e64 s[2:3], s10, v5 10514; GFX90A-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1] 10515; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] 10516; GFX90A-NEXT: v_cmp_eq_u32_e64 s[2:3], s8, v6 10517; GFX90A-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s9, v5 10518; GFX90A-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] 10519; GFX90A-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] 10520; GFX90A-NEXT: 
v_cmp_ne_u32_e64 s[0:1], 0, v7 10521; GFX90A-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[0:1] 10522; GFX90A-NEXT: v_mov_b32_e32 v5, s7 10523; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v5, v1, vcc 10524; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s6, v1 10525; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 10526; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s10, v0 10527; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[0:1] 10528; GFX90A-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 10529; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s8, v1 10530; GFX90A-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 10531; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 10532; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 10533; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 10534; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 10535; GFX90A-NEXT: s_endpgm 10536 %r = urem i64 %x, 1235195393993 10537 store i64 %r, i64 addrspace(1)* %out 10538 ret void 10539} 10540 10541define amdgpu_kernel void @urem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 10542; CHECK-LABEL: @urem_i64_pow2k_denom( 10543; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], 4096 10544; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 10545; CHECK-NEXT: ret void 10546; 10547; GFX6-LABEL: urem_i64_pow2k_denom: 10548; GFX6: ; %bb.0: 10549; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 10550; GFX6-NEXT: s_mov_b32 s3, 0xf000 10551; GFX6-NEXT: s_mov_b32 s2, -1 10552; GFX6-NEXT: v_mov_b32_e32 v1, 0 10553; GFX6-NEXT: s_waitcnt lgkmcnt(0) 10554; GFX6-NEXT: s_mov_b32 s0, s4 10555; GFX6-NEXT: s_and_b32 s4, s6, 0xfff 10556; GFX6-NEXT: s_mov_b32 s1, s5 10557; GFX6-NEXT: v_mov_b32_e32 v0, s4 10558; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 10559; GFX6-NEXT: s_endpgm 10560; 10561; GFX9-LABEL: urem_i64_pow2k_denom: 10562; GFX9: ; %bb.0: 10563; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 10564; GFX9-NEXT: v_mov_b32_e32 v1, 0 10565; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10566; GFX9-NEXT: s_and_b32 s2, s2, 0xfff 10567; GFX9-NEXT: v_mov_b32_e32 v0, s2 10568; 
GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] 10569; GFX9-NEXT: s_endpgm 10570; 10571; GFX90A-LABEL: urem_i64_pow2k_denom: 10572; GFX90A: ; %bb.0: 10573; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 10574; GFX90A-NEXT: v_mov_b32_e32 v1, 0 10575; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 10576; GFX90A-NEXT: s_and_b32 s2, s2, 0xfff 10577; GFX90A-NEXT: v_mov_b32_e32 v0, s2 10578; GFX90A-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] 10579; GFX90A-NEXT: s_endpgm 10580 %r = urem i64 %x, 4096 10581 store i64 %r, i64 addrspace(1)* %out 10582 ret void 10583} 10584 10585define amdgpu_kernel void @urem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 10586; CHECK-LABEL: @urem_i64_pow2_shl_denom( 10587; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 10588; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], [[SHL_Y]] 10589; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 10590; CHECK-NEXT: ret void 10591; 10592; GFX6-LABEL: urem_i64_pow2_shl_denom: 10593; GFX6: ; %bb.0: 10594; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 10595; GFX6-NEXT: s_load_dword s8, s[0:1], 0xd 10596; GFX6-NEXT: s_mov_b32 s3, 0xf000 10597; GFX6-NEXT: s_mov_b32 s2, -1 10598; GFX6-NEXT: s_waitcnt lgkmcnt(0) 10599; GFX6-NEXT: s_mov_b32 s0, s4 10600; GFX6-NEXT: s_mov_b32 s1, s5 10601; GFX6-NEXT: s_mov_b64 s[4:5], 0x1000 10602; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s8 10603; GFX6-NEXT: s_add_u32 s4, s4, -1 10604; GFX6-NEXT: s_addc_u32 s5, s5, -1 10605; GFX6-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] 10606; GFX6-NEXT: v_mov_b32_e32 v0, s4 10607; GFX6-NEXT: v_mov_b32_e32 v1, s5 10608; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 10609; GFX6-NEXT: s_endpgm 10610; 10611; GFX9-LABEL: urem_i64_pow2_shl_denom: 10612; GFX9: ; %bb.0: 10613; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 10614; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 10615; GFX9-NEXT: s_mov_b64 s[0:1], 0x1000 10616; GFX9-NEXT: v_mov_b32_e32 v2, 0 10617; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10618; GFX9-NEXT: s_lshl_b64 
s[0:1], s[0:1], s2 10619; GFX9-NEXT: s_add_u32 s0, s0, -1 10620; GFX9-NEXT: s_addc_u32 s1, s1, -1 10621; GFX9-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1] 10622; GFX9-NEXT: v_mov_b32_e32 v0, s0 10623; GFX9-NEXT: v_mov_b32_e32 v1, s1 10624; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 10625; GFX9-NEXT: s_endpgm 10626; 10627; GFX90A-LABEL: urem_i64_pow2_shl_denom: 10628; GFX90A: ; %bb.0: 10629; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 10630; GFX90A-NEXT: s_load_dword s2, s[0:1], 0x34 10631; GFX90A-NEXT: s_mov_b64 s[0:1], 0x1000 10632; GFX90A-NEXT: v_mov_b32_e32 v2, 0 10633; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 10634; GFX90A-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 10635; GFX90A-NEXT: s_add_u32 s0, s0, -1 10636; GFX90A-NEXT: s_addc_u32 s1, s1, -1 10637; GFX90A-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1] 10638; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 10639; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 10640; GFX90A-NEXT: s_endpgm 10641 %shl.y = shl i64 4096, %y 10642 %r = urem i64 %x, %shl.y 10643 store i64 %r, i64 addrspace(1)* %out 10644 ret void 10645} 10646 10647define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 10648; CHECK-LABEL: @urem_v2i64_pow2k_denom( 10649; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 10650; CHECK-NEXT: [[TMP2:%.*]] = urem i64 [[TMP1]], 4096 10651; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 10652; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 10653; CHECK-NEXT: [[TMP5:%.*]] = urem i64 [[TMP4]], 4096 10654; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 10655; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 10656; CHECK-NEXT: ret void 10657; 10658; GFX6-LABEL: urem_v2i64_pow2k_denom: 10659; GFX6: ; %bb.0: 10660; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 10661; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 10662; GFX6-NEXT: 
s_movk_i32 s8, 0xfff 10663; GFX6-NEXT: v_mov_b32_e32 v1, 0 10664; GFX6-NEXT: s_mov_b32 s7, 0xf000 10665; GFX6-NEXT: s_mov_b32 s6, -1 10666; GFX6-NEXT: s_waitcnt lgkmcnt(0) 10667; GFX6-NEXT: s_and_b32 s0, s0, s8 10668; GFX6-NEXT: s_and_b32 s1, s2, s8 10669; GFX6-NEXT: v_mov_b32_e32 v0, s0 10670; GFX6-NEXT: v_mov_b32_e32 v2, s1 10671; GFX6-NEXT: v_mov_b32_e32 v3, v1 10672; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 10673; GFX6-NEXT: s_endpgm 10674; 10675; GFX9-LABEL: urem_v2i64_pow2k_denom: 10676; GFX9: ; %bb.0: 10677; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 10678; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 10679; GFX9-NEXT: s_movk_i32 s0, 0xfff 10680; GFX9-NEXT: v_mov_b32_e32 v1, 0 10681; GFX9-NEXT: v_mov_b32_e32 v3, v1 10682; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10683; GFX9-NEXT: s_and_b32 s1, s4, s0 10684; GFX9-NEXT: s_and_b32 s0, s6, s0 10685; GFX9-NEXT: v_mov_b32_e32 v0, s1 10686; GFX9-NEXT: v_mov_b32_e32 v2, s0 10687; GFX9-NEXT: global_store_dwordx4 v1, v[0:3], s[2:3] 10688; GFX9-NEXT: s_endpgm 10689; 10690; GFX90A-LABEL: urem_v2i64_pow2k_denom: 10691; GFX90A: ; %bb.0: 10692; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 10693; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 10694; GFX90A-NEXT: s_movk_i32 s0, 0xfff 10695; GFX90A-NEXT: v_mov_b32_e32 v1, 0 10696; GFX90A-NEXT: v_mov_b32_e32 v3, v1 10697; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 10698; GFX90A-NEXT: s_and_b32 s1, s4, s0 10699; GFX90A-NEXT: s_and_b32 s0, s6, s0 10700; GFX90A-NEXT: v_mov_b32_e32 v0, s1 10701; GFX90A-NEXT: v_mov_b32_e32 v2, s0 10702; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[2:3] 10703; GFX90A-NEXT: s_endpgm 10704 %r = urem <2 x i64> %x, <i64 4096, i64 4096> 10705 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 10706 ret void 10707} 10708 10709define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 10710; CHECK-LABEL: @urem_v2i64_pow2_shl_denom( 10711; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 
4096>, [[Y:%.*]] 10712; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 10713; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 10714; CHECK-NEXT: [[TMP3:%.*]] = urem i64 [[TMP1]], [[TMP2]] 10715; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 10716; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 10717; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 10718; CHECK-NEXT: [[TMP7:%.*]] = urem i64 [[TMP5]], [[TMP6]] 10719; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 10720; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 10721; CHECK-NEXT: ret void 10722; 10723; GFX6-LABEL: urem_v2i64_pow2_shl_denom: 10724; GFX6: ; %bb.0: 10725; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 10726; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 10727; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 10728; GFX6-NEXT: s_mov_b64 s[12:13], 0x1000 10729; GFX6-NEXT: s_mov_b32 s7, 0xf000 10730; GFX6-NEXT: s_mov_b32 s6, -1 10731; GFX6-NEXT: s_waitcnt lgkmcnt(0) 10732; GFX6-NEXT: s_lshl_b64 s[2:3], s[12:13], s2 10733; GFX6-NEXT: s_lshl_b64 s[0:1], s[12:13], s0 10734; GFX6-NEXT: s_add_u32 s0, s0, -1 10735; GFX6-NEXT: s_addc_u32 s1, s1, -1 10736; GFX6-NEXT: s_and_b64 s[0:1], s[8:9], s[0:1] 10737; GFX6-NEXT: s_add_u32 s2, s2, -1 10738; GFX6-NEXT: s_addc_u32 s3, s3, -1 10739; GFX6-NEXT: s_and_b64 s[2:3], s[10:11], s[2:3] 10740; GFX6-NEXT: v_mov_b32_e32 v0, s0 10741; GFX6-NEXT: v_mov_b32_e32 v1, s1 10742; GFX6-NEXT: v_mov_b32_e32 v2, s2 10743; GFX6-NEXT: v_mov_b32_e32 v3, s3 10744; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 10745; GFX6-NEXT: s_endpgm 10746; 10747; GFX9-LABEL: urem_v2i64_pow2_shl_denom: 10748; GFX9: ; %bb.0: 10749; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 10750; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 10751; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 10752; GFX9-NEXT: s_mov_b64 
s[0:1], 0x1000 10753; GFX9-NEXT: v_mov_b32_e32 v4, 0 10754; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10755; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], s10 10756; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 10757; GFX9-NEXT: s_add_u32 s0, s0, -1 10758; GFX9-NEXT: s_addc_u32 s1, s1, -1 10759; GFX9-NEXT: s_and_b64 s[0:1], s[4:5], s[0:1] 10760; GFX9-NEXT: s_add_u32 s4, s10, -1 10761; GFX9-NEXT: s_addc_u32 s5, s11, -1 10762; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] 10763; GFX9-NEXT: v_mov_b32_e32 v0, s0 10764; GFX9-NEXT: v_mov_b32_e32 v1, s1 10765; GFX9-NEXT: v_mov_b32_e32 v2, s4 10766; GFX9-NEXT: v_mov_b32_e32 v3, s5 10767; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 10768; GFX9-NEXT: s_endpgm 10769; 10770; GFX90A-LABEL: urem_v2i64_pow2_shl_denom: 10771; GFX90A: ; %bb.0: 10772; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 10773; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 10774; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 10775; GFX90A-NEXT: s_mov_b64 s[0:1], 0x1000 10776; GFX90A-NEXT: v_mov_b32_e32 v4, 0 10777; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 10778; GFX90A-NEXT: s_lshl_b64 s[10:11], s[0:1], s10 10779; GFX90A-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 10780; GFX90A-NEXT: s_add_u32 s0, s0, -1 10781; GFX90A-NEXT: s_addc_u32 s1, s1, -1 10782; GFX90A-NEXT: s_and_b64 s[0:1], s[4:5], s[0:1] 10783; GFX90A-NEXT: s_add_u32 s4, s10, -1 10784; GFX90A-NEXT: s_addc_u32 s5, s11, -1 10785; GFX90A-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] 10786; GFX90A-NEXT: v_mov_b32_e32 v0, s0 10787; GFX90A-NEXT: v_mov_b32_e32 v1, s1 10788; GFX90A-NEXT: v_mov_b32_e32 v2, s4 10789; GFX90A-NEXT: v_mov_b32_e32 v3, s5 10790; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 10791; GFX90A-NEXT: s_endpgm 10792 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 10793 %r = urem <2 x i64> %x, %shl.y 10794 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 10795 ret void 10796} 10797 10798define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 10799; CHECK-LABEL: @sdiv_i64_oddk_denom( 
10800; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], 1235195 10801; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 10802; CHECK-NEXT: ret void 10803; 10804; GFX6-LABEL: sdiv_i64_oddk_denom: 10805; GFX6: ; %bb.0: 10806; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f800000 10807; GFX6-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 10808; GFX6-NEXT: v_rcp_f32_e32 v0, v0 10809; GFX6-NEXT: s_mov_b32 s5, 0xffed2705 10810; GFX6-NEXT: v_mov_b32_e32 v8, 0 10811; GFX6-NEXT: v_mov_b32_e32 v7, 0 10812; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 10813; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 10814; GFX6-NEXT: v_trunc_f32_e32 v1, v1 10815; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 10816; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 10817; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 10818; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 10819; GFX6-NEXT: s_mov_b32 s7, 0xf000 10820; GFX6-NEXT: v_mul_lo_u32 v2, v1, s5 10821; GFX6-NEXT: v_mul_hi_u32 v3, v0, s5 10822; GFX6-NEXT: v_mul_lo_u32 v4, v0, s5 10823; GFX6-NEXT: s_waitcnt lgkmcnt(0) 10824; GFX6-NEXT: s_ashr_i32 s8, s3, 31 10825; GFX6-NEXT: s_add_u32 s2, s2, s8 10826; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10827; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 10828; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 10829; GFX6-NEXT: v_mul_hi_u32 v6, v0, v4 10830; GFX6-NEXT: v_mul_hi_u32 v3, v0, v2 10831; GFX6-NEXT: v_mul_hi_u32 v9, v1, v2 10832; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 10833; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 10834; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 10835; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 10836; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v3, vcc 10837; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 10838; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc 10839; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc 10840; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10841; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 10842; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 10843; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 10844; GFX6-NEXT: v_mul_lo_u32 v2, v1, 
s5 10845; GFX6-NEXT: v_mul_hi_u32 v3, v0, s5 10846; GFX6-NEXT: s_mov_b32 s9, s8 10847; GFX6-NEXT: s_addc_u32 s3, s3, s8 10848; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[8:9] 10849; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10850; GFX6-NEXT: v_mul_lo_u32 v3, v0, s5 10851; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 10852; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 10853; GFX6-NEXT: v_mul_hi_u32 v9, v0, v3 10854; GFX6-NEXT: v_mul_hi_u32 v10, v0, v2 10855; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 10856; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 10857; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 10858; GFX6-NEXT: v_add_i32_e32 v6, vcc, v9, v6 10859; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc 10860; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 10861; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 10862; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v9, v5, vcc 10863; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v7, vcc 10864; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10865; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 10866; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 10867; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 10868; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 10869; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 10870; GFX6-NEXT: v_mul_hi_u32 v4, s2, v1 10871; GFX6-NEXT: v_mul_hi_u32 v5, s3, v1 10872; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1 10873; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10874; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 10875; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 10876; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 10877; GFX6-NEXT: s_mov_b32 s4, s0 10878; GFX6-NEXT: s_mov_b32 s0, 0x12d8fb 10879; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 10880; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 10881; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc 10882; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 10883; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc 10884; GFX6-NEXT: v_mul_lo_u32 v4, v1, s0 10885; GFX6-NEXT: v_mul_hi_u32 v5, v0, s0 10886; GFX6-NEXT: v_add_i32_e32 v2, vcc, 2, v0 10887; GFX6-NEXT: v_mul_lo_u32 v8, v0, s0 10888; GFX6-NEXT: 
v_addc_u32_e32 v3, vcc, 0, v1, vcc 10889; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v0 10890; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc 10891; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 10892; GFX6-NEXT: v_mov_b32_e32 v5, s3 10893; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s2, v8 10894; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc 10895; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s0, v8 10896; GFX6-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v4, vcc 10897; GFX6-NEXT: s_mov_b32 s0, 0x12d8fa 10898; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s0, v5 10899; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 10900; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 10901; GFX6-NEXT: s_mov_b32 s5, s1 10902; GFX6-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc 10903; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v8 10904; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 10905; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 10906; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 10907; GFX6-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[0:1] 10908; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 10909; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 10910; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc 10911; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 10912; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 10913; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 10914; GFX6-NEXT: v_xor_b32_e32 v1, s8, v1 10915; GFX6-NEXT: v_mov_b32_e32 v2, s8 10916; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 10917; GFX6-NEXT: s_mov_b32 s6, -1 10918; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 10919; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 10920; GFX6-NEXT: s_endpgm 10921; 10922; GFX9-LABEL: sdiv_i64_oddk_denom: 10923; GFX9: ; %bb.0: 10924; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f800000 10925; GFX9-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 10926; GFX9-NEXT: v_rcp_f32_e32 v0, v0 10927; GFX9-NEXT: s_mov_b32 s4, 0xffed2705 10928; GFX9-NEXT: v_mov_b32_e32 v7, 0 10929; GFX9-NEXT: v_mov_b32_e32 v5, 0 10930; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 10931; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, 
v0 10932; GFX9-NEXT: v_trunc_f32_e32 v1, v1 10933; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 10934; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 10935; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 10936; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 10937; GFX9-NEXT: v_mul_lo_u32 v2, v1, s4 10938; GFX9-NEXT: v_mul_hi_u32 v3, v0, s4 10939; GFX9-NEXT: v_mul_lo_u32 v4, v0, s4 10940; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 10941; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 10942; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 10943; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 10944; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 10945; GFX9-NEXT: v_mul_hi_u32 v9, v1, v2 10946; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 10947; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 10948; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v8, vcc 10949; GFX9-NEXT: v_mul_lo_u32 v8, v1, v4 10950; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 10951; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8 10952; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc 10953; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc 10954; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 10955; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v4, vcc 10956; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 10957; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 10958; GFX9-NEXT: v_mul_lo_u32 v2, v1, s4 10959; GFX9-NEXT: v_mul_hi_u32 v3, v0, s4 10960; GFX9-NEXT: v_mul_lo_u32 v4, v0, s4 10961; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10962; GFX9-NEXT: s_ashr_i32 s4, s3, 31 10963; GFX9-NEXT: s_add_u32 s2, s2, s4 10964; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 10965; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 10966; GFX9-NEXT: v_mul_lo_u32 v8, v0, v2 10967; GFX9-NEXT: v_mul_hi_u32 v9, v0, v4 10968; GFX9-NEXT: v_mul_hi_u32 v10, v0, v2 10969; GFX9-NEXT: v_mul_hi_u32 v6, v1, v4 10970; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 10971; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 10972; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 10973; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v7, v10, vcc 10974; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 10975; GFX9-NEXT: v_add_co_u32_e32 
v4, vcc, v8, v4 10976; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v6, vcc 10977; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc 10978; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 10979; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v3, vcc 10980; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 10981; GFX9-NEXT: s_mov_b32 s5, s4 10982; GFX9-NEXT: s_addc_u32 s3, s3, s4 10983; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 10984; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] 10985; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 10986; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 10987; GFX9-NEXT: v_mul_hi_u32 v4, s2, v1 10988; GFX9-NEXT: v_mul_hi_u32 v6, s3, v1 10989; GFX9-NEXT: v_mul_lo_u32 v1, s3, v1 10990; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 10991; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v4, vcc 10992; GFX9-NEXT: v_mul_lo_u32 v4, s3, v0 10993; GFX9-NEXT: v_mul_hi_u32 v0, s3, v0 10994; GFX9-NEXT: s_mov_b32 s5, 0x12d8fb 10995; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 10996; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 10997; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v5, vcc 10998; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 10999; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v2, vcc 11000; GFX9-NEXT: v_mul_lo_u32 v2, v1, s5 11001; GFX9-NEXT: v_mul_hi_u32 v3, v0, s5 11002; GFX9-NEXT: v_mul_lo_u32 v4, v0, s5 11003; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 11004; GFX9-NEXT: v_mov_b32_e32 v3, s3 11005; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, s2, v4 11006; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v3, v2, vcc 11007; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s5, v4 11008; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc 11009; GFX9-NEXT: s_mov_b32 s2, 0x12d8fa 11010; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s2, v3 11011; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 11012; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 11013; GFX9-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc 11014; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 11015; GFX9-NEXT: v_cndmask_b32_e64 v3, 1, 2, vcc 11016; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v0, 
v3 11017; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc 11018; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s2, v4 11019; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc 11020; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 11021; GFX9-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc 11022; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 11023; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 11024; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 11025; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 11026; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 11027; GFX9-NEXT: v_mov_b32_e32 v2, s4 11028; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s4, v0 11029; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc 11030; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] 11031; GFX9-NEXT: s_endpgm 11032; 11033; GFX90A-LABEL: sdiv_i64_oddk_denom: 11034; GFX90A: ; %bb.0: 11035; GFX90A-NEXT: v_mov_b32_e32 v0, 0x4f800000 11036; GFX90A-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 11037; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 11038; GFX90A-NEXT: s_mov_b32 s4, 0xffed2705 11039; GFX90A-NEXT: v_mov_b32_e32 v8, 0 11040; GFX90A-NEXT: v_mov_b32_e32 v2, 0 11041; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 11042; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 11043; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 11044; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 11045; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 11046; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 11047; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 11048; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s4 11049; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s4 11050; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 11051; GFX90A-NEXT: v_sub_u32_e32 v3, v3, v0 11052; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s4 11053; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 11054; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 11055; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 11056; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 11057; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v4, vcc 11058; GFX90A-NEXT: v_mul_hi_u32 v9, v1, v6 11059; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 11060; GFX90A-NEXT: 
v_add_co_u32_e32 v5, vcc, v5, v6 11061; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 11062; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v9, vcc 11063; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc 11064; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 11065; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 11066; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v5, vcc 11067; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 11068; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc 11069; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s4 11070; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s4 11071; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 11072; GFX90A-NEXT: v_sub_u32_e32 v3, v3, v0 11073; GFX90A-NEXT: v_mul_lo_u32 v5, v0, s4 11074; GFX90A-NEXT: v_mul_hi_u32 v6, v1, v5 11075; GFX90A-NEXT: v_mul_lo_u32 v7, v1, v5 11076; GFX90A-NEXT: v_mul_lo_u32 v10, v0, v3 11077; GFX90A-NEXT: v_mul_hi_u32 v5, v0, v5 11078; GFX90A-NEXT: v_mul_hi_u32 v9, v0, v3 11079; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v10 11080; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, v8, v9, vcc 11081; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 11082; GFX90A-NEXT: v_mul_hi_u32 v4, v1, v3 11083; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v6, vcc 11084; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v2, vcc 11085; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 11086; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 11087; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 11088; GFX90A-NEXT: s_ashr_i32 s4, s3, 31 11089; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v4, vcc 11090; GFX90A-NEXT: s_add_u32 s2, s2, s4 11091; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 11092; GFX90A-NEXT: s_mov_b32 s5, s4 11093; GFX90A-NEXT: s_addc_u32 s3, s3, s4 11094; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc 11095; GFX90A-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] 11096; GFX90A-NEXT: v_mul_lo_u32 v4, s2, v1 11097; GFX90A-NEXT: v_mul_hi_u32 v5, s2, v0 11098; GFX90A-NEXT: v_mul_hi_u32 v3, s2, v1 11099; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 11100; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, 
vcc 11101; GFX90A-NEXT: v_mul_hi_u32 v6, s3, v0 11102; GFX90A-NEXT: v_mul_lo_u32 v0, s3, v0 11103; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 11104; GFX90A-NEXT: v_mul_hi_u32 v5, s3, v1 11105; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v6, vcc 11106; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v2, vcc 11107; GFX90A-NEXT: v_mul_lo_u32 v1, s3, v1 11108; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 11109; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v3, vcc 11110; GFX90A-NEXT: s_mov_b32 s5, 0x12d8fb 11111; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s5 11112; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s5 11113; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 11114; GFX90A-NEXT: v_mul_lo_u32 v4, v0, s5 11115; GFX90A-NEXT: v_mov_b32_e32 v5, s3 11116; GFX90A-NEXT: v_sub_co_u32_e32 v4, vcc, s2, v4 11117; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v3, vcc 11118; GFX90A-NEXT: v_subrev_co_u32_e32 v5, vcc, s5, v4 11119; GFX90A-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v3, vcc 11120; GFX90A-NEXT: s_mov_b32 s2, 0x12d8fa 11121; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s2, v5 11122; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 11123; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 11124; GFX90A-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc 11125; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 11126; GFX90A-NEXT: v_cndmask_b32_e64 v5, 1, 2, vcc 11127; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v0, v5 11128; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc 11129; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s2, v4 11130; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc 11131; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 11132; GFX90A-NEXT: v_cndmask_b32_e32 v3, -1, v4, vcc 11133; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 11134; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc 11135; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 11136; GFX90A-NEXT: v_xor_b32_e32 v0, s4, v0 11137; GFX90A-NEXT: v_xor_b32_e32 v1, s4, v1 11138; GFX90A-NEXT: v_mov_b32_e32 v3, s4 11139; GFX90A-NEXT: v_subrev_co_u32_e32 v0, vcc, s4, v0 11140; GFX90A-NEXT: v_subb_co_u32_e32 v1, 
vcc, v1, v3, vcc 11141; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 11142; GFX90A-NEXT: s_endpgm 11143 %r = sdiv i64 %x, 1235195 11144 store i64 %r, i64 addrspace(1)* %out 11145 ret void 11146} 11147 11148define amdgpu_kernel void @sdiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 11149; CHECK-LABEL: @sdiv_i64_pow2k_denom( 11150; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], 4096 11151; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 11152; CHECK-NEXT: ret void 11153; 11154; GFX6-LABEL: sdiv_i64_pow2k_denom: 11155; GFX6: ; %bb.0: 11156; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 11157; GFX6-NEXT: s_mov_b32 s7, 0xf000 11158; GFX6-NEXT: s_mov_b32 s6, -1 11159; GFX6-NEXT: s_waitcnt lgkmcnt(0) 11160; GFX6-NEXT: s_mov_b32 s4, s0 11161; GFX6-NEXT: s_ashr_i32 s0, s3, 31 11162; GFX6-NEXT: s_lshr_b32 s0, s0, 20 11163; GFX6-NEXT: s_add_u32 s0, s2, s0 11164; GFX6-NEXT: s_mov_b32 s5, s1 11165; GFX6-NEXT: s_addc_u32 s1, s3, 0 11166; GFX6-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 11167; GFX6-NEXT: v_mov_b32_e32 v0, s0 11168; GFX6-NEXT: v_mov_b32_e32 v1, s1 11169; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 11170; GFX6-NEXT: s_endpgm 11171; 11172; GFX9-LABEL: sdiv_i64_pow2k_denom: 11173; GFX9: ; %bb.0: 11174; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 11175; GFX9-NEXT: v_mov_b32_e32 v2, 0 11176; GFX9-NEXT: s_waitcnt lgkmcnt(0) 11177; GFX9-NEXT: s_ashr_i32 s4, s3, 31 11178; GFX9-NEXT: s_lshr_b32 s4, s4, 20 11179; GFX9-NEXT: s_add_u32 s2, s2, s4 11180; GFX9-NEXT: s_addc_u32 s3, s3, 0 11181; GFX9-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 11182; GFX9-NEXT: v_mov_b32_e32 v0, s2 11183; GFX9-NEXT: v_mov_b32_e32 v1, s3 11184; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 11185; GFX9-NEXT: s_endpgm 11186; 11187; GFX90A-LABEL: sdiv_i64_pow2k_denom: 11188; GFX90A: ; %bb.0: 11189; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 11190; GFX90A-NEXT: v_mov_b32_e32 v2, 0 11191; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 11192; GFX90A-NEXT: s_ashr_i32 s4, s3, 31 
11193; GFX90A-NEXT: s_lshr_b32 s4, s4, 20 11194; GFX90A-NEXT: s_add_u32 s2, s2, s4 11195; GFX90A-NEXT: s_addc_u32 s3, s3, 0 11196; GFX90A-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 11197; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] 11198; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 11199; GFX90A-NEXT: s_endpgm 11200 %r = sdiv i64 %x, 4096 11201 store i64 %r, i64 addrspace(1)* %out 11202 ret void 11203} 11204 11205define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 11206; CHECK-LABEL: @sdiv_i64_pow2_shl_denom( 11207; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 11208; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], [[SHL_Y]] 11209; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 11210; CHECK-NEXT: ret void 11211; 11212; GFX6-LABEL: sdiv_i64_pow2_shl_denom: 11213; GFX6: ; %bb.0: 11214; GFX6-NEXT: s_load_dword s4, s[0:1], 0xd 11215; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000 11216; GFX6-NEXT: s_mov_b32 s7, 0xf000 11217; GFX6-NEXT: s_mov_b32 s6, -1 11218; GFX6-NEXT: s_waitcnt lgkmcnt(0) 11219; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 11220; GFX6-NEXT: s_ashr_i32 s8, s3, 31 11221; GFX6-NEXT: s_add_u32 s2, s2, s8 11222; GFX6-NEXT: s_mov_b32 s9, s8 11223; GFX6-NEXT: s_addc_u32 s3, s3, s8 11224; GFX6-NEXT: s_xor_b64 s[10:11], s[2:3], s[8:9] 11225; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s10 11226; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s11 11227; GFX6-NEXT: s_sub_u32 s4, 0, s10 11228; GFX6-NEXT: s_subb_u32 s5, 0, s11 11229; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 11230; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 11231; GFX6-NEXT: v_rcp_f32_e32 v0, v0 11232; GFX6-NEXT: s_waitcnt lgkmcnt(0) 11233; GFX6-NEXT: s_ashr_i32 s12, s3, 31 11234; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 11235; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 11236; GFX6-NEXT: v_trunc_f32_e32 v1, v1 11237; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 11238; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 11239; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 
11240; GFX6-NEXT: s_add_u32 s2, s2, s12 11241; GFX6-NEXT: s_mov_b32 s13, s12 11242; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 11243; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 11244; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0 11245; GFX6-NEXT: v_mul_lo_u32 v4, s4, v0 11246; GFX6-NEXT: s_addc_u32 s3, s3, s12 11247; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 11248; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 11249; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 11250; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 11251; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 11252; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 11253; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 11254; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 11255; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 11256; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 11257; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 11258; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[12:13] 11259; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 11260; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc 11261; GFX6-NEXT: v_mov_b32_e32 v4, 0 11262; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc 11263; GFX6-NEXT: v_mov_b32_e32 v6, 0 11264; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 11265; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc 11266; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 11267; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 11268; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 11269; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 11270; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0 11271; GFX6-NEXT: s_mov_b32 s5, s1 11272; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 11273; GFX6-NEXT: v_mul_lo_u32 v3, s4, v0 11274; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 11275; GFX6-NEXT: v_mul_lo_u32 v8, v0, v2 11276; GFX6-NEXT: v_mul_hi_u32 v9, v0, v3 11277; GFX6-NEXT: v_mul_hi_u32 v10, v0, v2 11278; GFX6-NEXT: v_mul_hi_u32 v7, v1, v3 11279; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 11280; GFX6-NEXT: v_mul_hi_u32 v5, v1, v2 11281; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 11282; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc 11283; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 11284; GFX6-NEXT: v_add_i32_e32 v3, 
vcc, v8, v3 11285; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v9, v7, vcc 11286; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v4, vcc 11287; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 11288; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc 11289; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 11290; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 11291; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 11292; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 11293; GFX6-NEXT: v_mul_hi_u32 v5, s2, v1 11294; GFX6-NEXT: v_mul_hi_u32 v7, s3, v1 11295; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1 11296; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 11297; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 11298; GFX6-NEXT: v_mul_lo_u32 v5, s3, v0 11299; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 11300; GFX6-NEXT: s_mov_b32 s4, s0 11301; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 11302; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 11303; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc 11304; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 11305; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc 11306; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 11307; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 11308; GFX6-NEXT: v_mul_lo_u32 v4, s11, v0 11309; GFX6-NEXT: v_mov_b32_e32 v5, s11 11310; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 11311; GFX6-NEXT: v_mul_lo_u32 v3, s10, v0 11312; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 11313; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s3, v2 11314; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s2, v3 11315; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc 11316; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s10, v3 11317; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] 11318; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v4 11319; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 11320; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v5 11321; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 11322; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v4 11323; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] 11324; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 11325; GFX6-NEXT: v_addc_u32_e64 v6, 
s[0:1], 0, v1, s[0:1] 11326; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 1, v0 11327; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] 11328; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 11329; GFX6-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] 11330; GFX6-NEXT: v_mov_b32_e32 v6, s3 11331; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc 11332; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v2 11333; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 11334; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 11335; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 11336; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s11, v2 11337; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc 11338; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 11339; GFX6-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] 11340; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 11341; GFX6-NEXT: s_xor_b64 s[0:1], s[12:13], s[8:9] 11342; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 11343; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 11344; GFX6-NEXT: v_xor_b32_e32 v1, s1, v1 11345; GFX6-NEXT: v_mov_b32_e32 v2, s1 11346; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 11347; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 11348; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 11349; GFX6-NEXT: s_endpgm 11350; 11351; GFX9-LABEL: sdiv_i64_pow2_shl_denom: 11352; GFX9: ; %bb.0: 11353; GFX9-NEXT: s_load_dword s4, s[0:1], 0x34 11354; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 11355; GFX9-NEXT: v_mov_b32_e32 v2, 0 11356; GFX9-NEXT: s_waitcnt lgkmcnt(0) 11357; GFX9-NEXT: s_lshl_b64 s[4:5], s[2:3], s4 11358; GFX9-NEXT: s_ashr_i32 s2, s5, 31 11359; GFX9-NEXT: s_add_u32 s4, s4, s2 11360; GFX9-NEXT: s_mov_b32 s3, s2 11361; GFX9-NEXT: s_addc_u32 s5, s5, s2 11362; GFX9-NEXT: s_xor_b64 s[8:9], s[4:5], s[2:3] 11363; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 11364; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 11365; GFX9-NEXT: s_sub_u32 s10, 0, s8 11366; GFX9-NEXT: s_subb_u32 s4, 0, s9 11367; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 11368; GFX9-NEXT: v_rcp_f32_e32 v0, v0 11369; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, 
v0 11370; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 11371; GFX9-NEXT: v_trunc_f32_e32 v1, v1 11372; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 11373; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 11374; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 11375; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1 11376; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0 11377; GFX9-NEXT: v_mul_lo_u32 v6, s4, v0 11378; GFX9-NEXT: v_mul_lo_u32 v5, s10, v0 11379; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 11380; GFX9-NEXT: v_add_u32_e32 v3, v3, v6 11381; GFX9-NEXT: v_mul_hi_u32 v4, v0, v5 11382; GFX9-NEXT: v_mul_lo_u32 v6, v0, v3 11383; GFX9-NEXT: v_mul_hi_u32 v8, v0, v3 11384; GFX9-NEXT: v_mul_hi_u32 v7, v1, v5 11385; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 11386; GFX9-NEXT: v_mul_hi_u32 v9, v1, v3 11387; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 11388; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc 11389; GFX9-NEXT: v_mul_lo_u32 v3, v1, v3 11390; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 11391; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v7, vcc 11392; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v2, vcc 11393; GFX9-NEXT: v_mov_b32_e32 v6, 0 11394; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 11395; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v5, vcc 11396; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 11397; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc 11398; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1 11399; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0 11400; GFX9-NEXT: v_mul_lo_u32 v5, s4, v0 11401; GFX9-NEXT: v_mul_lo_u32 v7, s10, v0 11402; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 11403; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 11404; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 11405; GFX9-NEXT: v_mul_lo_u32 v8, v0, v3 11406; GFX9-NEXT: v_mul_hi_u32 v9, v0, v7 11407; GFX9-NEXT: v_mul_hi_u32 v10, v0, v3 11408; GFX9-NEXT: v_mul_hi_u32 v5, v1, v7 11409; GFX9-NEXT: v_mul_lo_u32 v7, v1, v7 11410; GFX9-NEXT: v_mul_hi_u32 v4, v1, v3 11411; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 11412; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v10, vcc 11413; GFX9-NEXT: 
v_mul_lo_u32 v3, v1, v3 11414; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 11415; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc 11416; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v2, vcc 11417; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 11418; GFX9-NEXT: s_waitcnt lgkmcnt(0) 11419; GFX9-NEXT: s_ashr_i32 s10, s7, 31 11420; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v4, vcc 11421; GFX9-NEXT: s_add_u32 s0, s6, s10 11422; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 11423; GFX9-NEXT: s_mov_b32 s11, s10 11424; GFX9-NEXT: s_addc_u32 s1, s7, s10 11425; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc 11426; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] 11427; GFX9-NEXT: v_mul_lo_u32 v3, s6, v1 11428; GFX9-NEXT: v_mul_hi_u32 v4, s6, v0 11429; GFX9-NEXT: v_mul_hi_u32 v5, s6, v1 11430; GFX9-NEXT: v_mul_hi_u32 v7, s7, v1 11431; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 11432; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 11433; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc 11434; GFX9-NEXT: v_mul_lo_u32 v5, s7, v0 11435; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 11436; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 11437; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v4, v0, vcc 11438; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v2, vcc 11439; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 11440; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v3, vcc 11441; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1 11442; GFX9-NEXT: v_mul_hi_u32 v4, s8, v0 11443; GFX9-NEXT: v_mul_lo_u32 v5, s9, v0 11444; GFX9-NEXT: v_mov_b32_e32 v6, s9 11445; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 11446; GFX9-NEXT: v_mul_lo_u32 v4, s8, v0 11447; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 11448; GFX9-NEXT: v_sub_u32_e32 v5, s7, v3 11449; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, s6, v4 11450; GFX9-NEXT: v_subb_co_u32_e64 v5, s[0:1], v5, v6, vcc 11451; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s8, v4 11452; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1] 11453; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v5 11454; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, 
s[0:1] 11455; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v6 11456; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 11457; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v5 11458; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v6, s[0:1] 11459; GFX9-NEXT: v_mov_b32_e32 v7, s7 11460; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc 11461; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 11462; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 11463; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 11464; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v4 11465; GFX9-NEXT: v_cndmask_b32_e64 v5, 1, 2, s[0:1] 11466; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc 11467; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s9, v3 11468; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v0, v5 11469; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v4, vcc 11470; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], 0, v1, s[0:1] 11471; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 11472; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc 11473; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], s[2:3] 11474; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 11475; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 11476; GFX9-NEXT: v_xor_b32_e32 v1, s1, v1 11477; GFX9-NEXT: v_mov_b32_e32 v3, s1 11478; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 11479; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc 11480; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 11481; GFX9-NEXT: s_endpgm 11482; 11483; GFX90A-LABEL: sdiv_i64_pow2_shl_denom: 11484; GFX90A: ; %bb.0: 11485; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x34 11486; GFX90A-NEXT: s_mov_b64 s[2:3], 0x1000 11487; GFX90A-NEXT: v_mov_b32_e32 v2, 0 11488; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 11489; GFX90A-NEXT: s_lshl_b64 s[4:5], s[2:3], s4 11490; GFX90A-NEXT: s_ashr_i32 s2, s5, 31 11491; GFX90A-NEXT: s_add_u32 s4, s4, s2 11492; GFX90A-NEXT: s_mov_b32 s3, s2 11493; GFX90A-NEXT: s_addc_u32 s5, s5, s2 11494; GFX90A-NEXT: s_xor_b64 s[8:9], s[4:5], s[2:3] 11495; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s8 11496; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s9 11497; GFX90A-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0x24 11498; GFX90A-NEXT: s_sub_u32 s0, 0, s8 11499; GFX90A-NEXT: s_subb_u32 s1, 0, s9 11500; GFX90A-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 11501; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 11502; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 11503; GFX90A-NEXT: s_ashr_i32 s10, s7, 31 11504; GFX90A-NEXT: s_mov_b32 s11, s10 11505; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 11506; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 11507; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 11508; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 11509; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 11510; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 11511; GFX90A-NEXT: v_mul_lo_u32 v3, s0, v1 11512; GFX90A-NEXT: v_mul_hi_u32 v5, s0, v0 11513; GFX90A-NEXT: v_mul_lo_u32 v4, s1, v0 11514; GFX90A-NEXT: v_add_u32_e32 v3, v5, v3 11515; GFX90A-NEXT: v_mul_lo_u32 v6, s0, v0 11516; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 11517; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 11518; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 11519; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 11520; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 11521; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc 11522; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 11523; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 11524; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 11525; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 11526; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v8, vcc 11527; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc 11528; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 11529; GFX90A-NEXT: v_mov_b32_e32 v6, 0 11530; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 11531; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v5, vcc 11532; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 11533; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc 11534; GFX90A-NEXT: v_mul_lo_u32 v3, s0, v1 11535; GFX90A-NEXT: v_mul_hi_u32 v4, s0, v0 11536; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 11537; GFX90A-NEXT: v_mul_lo_u32 v4, s1, v0 11538; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 11539; GFX90A-NEXT: v_mul_lo_u32 v5, s0, v0 11540; 
GFX90A-NEXT: v_mul_hi_u32 v7, v1, v5 11541; GFX90A-NEXT: v_mul_lo_u32 v8, v1, v5 11542; GFX90A-NEXT: v_mul_lo_u32 v10, v0, v3 11543; GFX90A-NEXT: v_mul_hi_u32 v5, v0, v5 11544; GFX90A-NEXT: v_mul_hi_u32 v9, v0, v3 11545; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v10 11546; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc 11547; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 11548; GFX90A-NEXT: v_mul_hi_u32 v4, v1, v3 11549; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v7, vcc 11550; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v2, vcc 11551; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 11552; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 11553; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v4, vcc 11554; GFX90A-NEXT: s_add_u32 s0, s6, s10 11555; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 11556; GFX90A-NEXT: s_addc_u32 s1, s7, s10 11557; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc 11558; GFX90A-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] 11559; GFX90A-NEXT: v_mul_lo_u32 v4, s6, v1 11560; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v0 11561; GFX90A-NEXT: v_mul_hi_u32 v3, s6, v1 11562; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 11563; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 11564; GFX90A-NEXT: v_mul_hi_u32 v7, s7, v0 11565; GFX90A-NEXT: v_mul_lo_u32 v0, s7, v0 11566; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 11567; GFX90A-NEXT: v_mul_hi_u32 v5, s7, v1 11568; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v7, vcc 11569; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v2, vcc 11570; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 11571; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 11572; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v3, vcc 11573; GFX90A-NEXT: v_mul_lo_u32 v3, s8, v1 11574; GFX90A-NEXT: v_mul_hi_u32 v4, s8, v0 11575; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 11576; GFX90A-NEXT: v_mul_lo_u32 v4, s9, v0 11577; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 11578; GFX90A-NEXT: v_mul_lo_u32 v5, s8, v0 11579; GFX90A-NEXT: v_sub_u32_e32 v4, s7, v3 11580; GFX90A-NEXT: v_mov_b32_e32 v6, 
s9 11581; GFX90A-NEXT: v_sub_co_u32_e32 v5, vcc, s6, v5 11582; GFX90A-NEXT: v_subb_co_u32_e64 v4, s[0:1], v4, v6, vcc 11583; GFX90A-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s8, v5 11584; GFX90A-NEXT: v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1] 11585; GFX90A-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v4 11586; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 11587; GFX90A-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v6 11588; GFX90A-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 11589; GFX90A-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v4 11590; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v6, s[0:1] 11591; GFX90A-NEXT: v_mov_b32_e32 v7, s7 11592; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc 11593; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 11594; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 11595; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 11596; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s8, v5 11597; GFX90A-NEXT: v_cndmask_b32_e64 v4, 1, 2, s[0:1] 11598; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 11599; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s9, v3 11600; GFX90A-NEXT: v_add_co_u32_e64 v4, s[0:1], v0, v4 11601; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc 11602; GFX90A-NEXT: v_addc_co_u32_e64 v6, s[0:1], 0, v1, s[0:1] 11603; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 11604; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 11605; GFX90A-NEXT: s_xor_b64 s[0:1], s[10:11], s[2:3] 11606; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 11607; GFX90A-NEXT: v_xor_b32_e32 v0, s0, v0 11608; GFX90A-NEXT: v_xor_b32_e32 v1, s1, v1 11609; GFX90A-NEXT: v_mov_b32_e32 v3, s1 11610; GFX90A-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 11611; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc 11612; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 11613; GFX90A-NEXT: s_endpgm 11614 %shl.y = shl i64 4096, %y 11615 %r = sdiv i64 %x, %shl.y 11616 store i64 %r, i64 addrspace(1)* %out 11617 ret void 11618} 11619 11620define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 11621; 
CHECK-LABEL: @sdiv_v2i64_pow2k_denom( 11622; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 11623; CHECK-NEXT: [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096 11624; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 11625; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 11626; CHECK-NEXT: [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4096 11627; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 11628; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 11629; CHECK-NEXT: ret void 11630; 11631; GFX6-LABEL: sdiv_v2i64_pow2k_denom: 11632; GFX6: ; %bb.0: 11633; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 11634; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 11635; GFX6-NEXT: s_mov_b32 s7, 0xf000 11636; GFX6-NEXT: s_mov_b32 s6, -1 11637; GFX6-NEXT: s_waitcnt lgkmcnt(0) 11638; GFX6-NEXT: s_ashr_i32 s8, s1, 31 11639; GFX6-NEXT: s_lshr_b32 s8, s8, 20 11640; GFX6-NEXT: s_add_u32 s0, s0, s8 11641; GFX6-NEXT: s_addc_u32 s1, s1, 0 11642; GFX6-NEXT: s_ashr_i32 s8, s3, 31 11643; GFX6-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 11644; GFX6-NEXT: s_lshr_b32 s8, s8, 20 11645; GFX6-NEXT: s_add_u32 s2, s2, s8 11646; GFX6-NEXT: s_addc_u32 s3, s3, 0 11647; GFX6-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 11648; GFX6-NEXT: v_mov_b32_e32 v0, s0 11649; GFX6-NEXT: v_mov_b32_e32 v1, s1 11650; GFX6-NEXT: v_mov_b32_e32 v2, s2 11651; GFX6-NEXT: v_mov_b32_e32 v3, s3 11652; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 11653; GFX6-NEXT: s_endpgm 11654; 11655; GFX9-LABEL: sdiv_v2i64_pow2k_denom: 11656; GFX9: ; %bb.0: 11657; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 11658; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 11659; GFX9-NEXT: v_mov_b32_e32 v4, 0 11660; GFX9-NEXT: s_waitcnt lgkmcnt(0) 11661; GFX9-NEXT: s_ashr_i32 s0, s5, 31 11662; GFX9-NEXT: s_lshr_b32 s0, s0, 20 11663; GFX9-NEXT: s_add_u32 s0, s4, s0 11664; GFX9-NEXT: s_addc_u32 s1, s5, 0 11665; GFX9-NEXT: s_ashr_i32 s4, s7, 31 
11666; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 11667; GFX9-NEXT: s_lshr_b32 s4, s4, 20 11668; GFX9-NEXT: s_add_u32 s4, s6, s4 11669; GFX9-NEXT: s_addc_u32 s5, s7, 0 11670; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 12 11671; GFX9-NEXT: v_mov_b32_e32 v0, s0 11672; GFX9-NEXT: v_mov_b32_e32 v1, s1 11673; GFX9-NEXT: v_mov_b32_e32 v2, s4 11674; GFX9-NEXT: v_mov_b32_e32 v3, s5 11675; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 11676; GFX9-NEXT: s_endpgm 11677; 11678; GFX90A-LABEL: sdiv_v2i64_pow2k_denom: 11679; GFX90A: ; %bb.0: 11680; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 11681; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 11682; GFX90A-NEXT: v_mov_b32_e32 v4, 0 11683; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 11684; GFX90A-NEXT: s_ashr_i32 s0, s5, 31 11685; GFX90A-NEXT: s_lshr_b32 s0, s0, 20 11686; GFX90A-NEXT: s_add_u32 s0, s4, s0 11687; GFX90A-NEXT: s_addc_u32 s1, s5, 0 11688; GFX90A-NEXT: s_ashr_i32 s4, s7, 31 11689; GFX90A-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 11690; GFX90A-NEXT: s_lshr_b32 s4, s4, 20 11691; GFX90A-NEXT: s_add_u32 s4, s6, s4 11692; GFX90A-NEXT: s_addc_u32 s5, s7, 0 11693; GFX90A-NEXT: s_ashr_i64 s[4:5], s[4:5], 12 11694; GFX90A-NEXT: v_mov_b32_e32 v0, s0 11695; GFX90A-NEXT: v_mov_b32_e32 v1, s1 11696; GFX90A-NEXT: v_mov_b32_e32 v2, s4 11697; GFX90A-NEXT: v_mov_b32_e32 v3, s5 11698; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 11699; GFX90A-NEXT: s_endpgm 11700 %r = sdiv <2 x i64> %x, <i64 4096, i64 4096> 11701 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 11702 ret void 11703} 11704 11705define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 11706; CHECK-LABEL: @ssdiv_v2i64_mixed_pow2k_denom( 11707; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 11708; CHECK-NEXT: [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096 11709; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 11710; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 
11711; CHECK-NEXT: [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4095 11712; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 11713; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 11714; CHECK-NEXT: ret void 11715; 11716; GFX6-LABEL: ssdiv_v2i64_mixed_pow2k_denom: 11717; GFX6: ; %bb.0: 11718; GFX6-NEXT: v_mov_b32_e32 v0, 0x457ff000 11719; GFX6-NEXT: v_mov_b32_e32 v1, 0x4f800000 11720; GFX6-NEXT: v_mac_f32_e32 v0, 0, v1 11721; GFX6-NEXT: v_rcp_f32_e32 v0, v0 11722; GFX6-NEXT: s_movk_i32 s6, 0xf001 11723; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 11724; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 11725; GFX6-NEXT: s_mov_b32 s7, 0xf000 11726; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 11727; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 11728; GFX6-NEXT: v_trunc_f32_e32 v1, v1 11729; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 11730; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 11731; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 11732; GFX6-NEXT: s_waitcnt lgkmcnt(0) 11733; GFX6-NEXT: s_ashr_i32 s8, s1, 31 11734; GFX6-NEXT: s_lshr_b32 s8, s8, 20 11735; GFX6-NEXT: v_mul_hi_u32 v2, v0, s6 11736; GFX6-NEXT: v_mul_lo_u32 v3, v1, s6 11737; GFX6-NEXT: s_add_u32 s0, s0, s8 11738; GFX6-NEXT: s_addc_u32 s1, s1, 0 11739; GFX6-NEXT: s_ashr_i64 s[8:9], s[0:1], 12 11740; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 11741; GFX6-NEXT: v_mul_lo_u32 v3, v0, s6 11742; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 11743; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 11744; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 11745; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 11746; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 11747; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 11748; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 11749; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 11750; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 11751; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 11752; GFX6-NEXT: s_ashr_i32 s10, s3, 31 11753; GFX6-NEXT: s_add_u32 s0, s2, s10 11754; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 11755; GFX6-NEXT: 
v_addc_u32_e32 v3, vcc, v5, v3, vcc 11756; GFX6-NEXT: v_mov_b32_e32 v4, 0 11757; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc 11758; GFX6-NEXT: v_mov_b32_e32 v6, 0 11759; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 11760; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc 11761; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 11762; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 11763; GFX6-NEXT: v_mul_lo_u32 v2, v1, s6 11764; GFX6-NEXT: v_mul_hi_u32 v3, v0, s6 11765; GFX6-NEXT: s_mov_b32 s11, s10 11766; GFX6-NEXT: s_addc_u32 s1, s3, s10 11767; GFX6-NEXT: s_xor_b64 s[0:1], s[0:1], s[10:11] 11768; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 11769; GFX6-NEXT: v_mul_lo_u32 v3, v0, s6 11770; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 11771; GFX6-NEXT: v_mul_lo_u32 v8, v0, v2 11772; GFX6-NEXT: v_mul_hi_u32 v9, v0, v3 11773; GFX6-NEXT: v_mul_hi_u32 v10, v0, v2 11774; GFX6-NEXT: v_mul_hi_u32 v7, v1, v3 11775; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 11776; GFX6-NEXT: v_mul_hi_u32 v5, v1, v2 11777; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 11778; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc 11779; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 11780; GFX6-NEXT: v_add_i32_e32 v3, vcc, v8, v3 11781; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v9, v7, vcc 11782; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v4, vcc 11783; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 11784; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc 11785; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 11786; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 11787; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 11788; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 11789; GFX6-NEXT: v_mul_hi_u32 v5, s0, v1 11790; GFX6-NEXT: v_mul_hi_u32 v7, s1, v1 11791; GFX6-NEXT: v_mul_lo_u32 v1, s1, v1 11792; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 11793; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 11794; GFX6-NEXT: v_mul_lo_u32 v5, s1, v0 11795; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 11796; GFX6-NEXT: s_movk_i32 s2, 0xfff 11797; GFX6-NEXT: s_mov_b32 s6, -1 11798; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 
11799; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 11800; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc 11801; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 11802; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc 11803; GFX6-NEXT: v_mul_lo_u32 v4, v1, s2 11804; GFX6-NEXT: v_mul_hi_u32 v5, v0, s2 11805; GFX6-NEXT: v_add_i32_e32 v2, vcc, 2, v0 11806; GFX6-NEXT: v_mul_lo_u32 v8, v0, s2 11807; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 11808; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v0 11809; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc 11810; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 11811; GFX6-NEXT: v_mov_b32_e32 v5, s1 11812; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s0, v8 11813; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc 11814; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s2, v8 11815; GFX6-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v4, vcc 11816; GFX6-NEXT: s_movk_i32 s0, 0xffe 11817; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s0, v5 11818; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 11819; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 11820; GFX6-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc 11821; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v8 11822; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 11823; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 11824; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 11825; GFX6-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[0:1] 11826; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 11827; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 11828; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc 11829; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 11830; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 11831; GFX6-NEXT: v_xor_b32_e32 v0, s10, v0 11832; GFX6-NEXT: v_xor_b32_e32 v1, s10, v1 11833; GFX6-NEXT: v_mov_b32_e32 v3, s10 11834; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s10, v0 11835; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc 11836; GFX6-NEXT: v_mov_b32_e32 v0, s8 11837; GFX6-NEXT: v_mov_b32_e32 v1, s9 11838; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 11839; GFX6-NEXT: s_endpgm 11840; 11841; 
GFX9-LABEL: ssdiv_v2i64_mixed_pow2k_denom: 11842; GFX9: ; %bb.0: 11843; GFX9-NEXT: v_mov_b32_e32 v0, 0x457ff000 11844; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000 11845; GFX9-NEXT: v_mac_f32_e32 v0, 0, v1 11846; GFX9-NEXT: v_rcp_f32_e32 v0, v0 11847; GFX9-NEXT: s_movk_i32 s8, 0xf001 11848; GFX9-NEXT: v_mov_b32_e32 v4, 0 11849; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 11850; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 11851; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 11852; GFX9-NEXT: v_trunc_f32_e32 v1, v1 11853; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 11854; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 11855; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 11856; GFX9-NEXT: s_waitcnt lgkmcnt(0) 11857; GFX9-NEXT: s_ashr_i32 s2, s5, 31 11858; GFX9-NEXT: s_lshr_b32 s2, s2, 20 11859; GFX9-NEXT: v_mul_hi_u32 v2, v0, s8 11860; GFX9-NEXT: v_mul_lo_u32 v3, v1, s8 11861; GFX9-NEXT: v_mul_lo_u32 v5, v0, s8 11862; GFX9-NEXT: s_add_u32 s2, s4, s2 11863; GFX9-NEXT: s_addc_u32 s3, s5, 0 11864; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 11865; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 11866; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 11867; GFX9-NEXT: v_mul_hi_u32 v6, v0, v5 11868; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 11869; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 11870; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 11871; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v6, v3 11872; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc 11873; GFX9-NEXT: v_mul_lo_u32 v7, v1, v5 11874; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5 11875; GFX9-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 11876; GFX9-NEXT: s_ashr_i32 s4, s7, 31 11877; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 11878; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc 11879; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v4, vcc 11880; GFX9-NEXT: v_mov_b32_e32 v6, 0 11881; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 11882; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc 11883; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 11884; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 11885; GFX9-NEXT: v_mul_lo_u32 
v2, v1, s8 11886; GFX9-NEXT: v_mul_hi_u32 v3, v0, s8 11887; GFX9-NEXT: v_mul_lo_u32 v5, v0, s8 11888; GFX9-NEXT: s_add_u32 s6, s6, s4 11889; GFX9-NEXT: s_mov_b32 s5, s4 11890; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 11891; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 11892; GFX9-NEXT: v_mul_lo_u32 v8, v0, v2 11893; GFX9-NEXT: v_mul_hi_u32 v9, v0, v5 11894; GFX9-NEXT: v_mul_hi_u32 v10, v0, v2 11895; GFX9-NEXT: v_mul_hi_u32 v7, v1, v5 11896; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 11897; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 11898; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 11899; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v10, vcc 11900; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 11901; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v8, v5 11902; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v7, vcc 11903; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc 11904; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v5, v2 11905; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v3, vcc 11906; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 11907; GFX9-NEXT: s_addc_u32 s7, s7, s4 11908; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 11909; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] 11910; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 11911; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 11912; GFX9-NEXT: v_mul_hi_u32 v5, s6, v1 11913; GFX9-NEXT: v_mul_hi_u32 v7, s7, v1 11914; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 11915; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 11916; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc 11917; GFX9-NEXT: v_mul_lo_u32 v5, s7, v0 11918; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 11919; GFX9-NEXT: s_movk_i32 s5, 0xfff 11920; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 11921; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 11922; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 11923; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v7, v4, vcc 11924; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 11925; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v2, vcc 11926; GFX9-NEXT: v_mul_lo_u32 v2, v1, s5 11927; GFX9-NEXT: v_mul_hi_u32 v3, v0, s5 11928; GFX9-NEXT: 
v_mul_lo_u32 v5, v0, s5 11929; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 11930; GFX9-NEXT: v_mov_b32_e32 v3, s7 11931; GFX9-NEXT: v_sub_co_u32_e32 v5, vcc, s6, v5 11932; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v3, v2, vcc 11933; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s5, v5 11934; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc 11935; GFX9-NEXT: s_movk_i32 s5, 0xffe 11936; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s5, v3 11937; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 11938; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 11939; GFX9-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc 11940; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 11941; GFX9-NEXT: v_cndmask_b32_e64 v3, 1, 2, vcc 11942; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v0, v3 11943; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc 11944; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s5, v5 11945; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 11946; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 11947; GFX9-NEXT: v_cndmask_b32_e32 v2, -1, v5, vcc 11948; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 11949; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 11950; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 11951; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 11952; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 11953; GFX9-NEXT: v_mov_b32_e32 v3, s4 11954; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s4, v0 11955; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc 11956; GFX9-NEXT: v_mov_b32_e32 v0, s2 11957; GFX9-NEXT: v_mov_b32_e32 v1, s3 11958; GFX9-NEXT: s_waitcnt lgkmcnt(0) 11959; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 11960; GFX9-NEXT: s_endpgm 11961; 11962; GFX90A-LABEL: ssdiv_v2i64_mixed_pow2k_denom: 11963; GFX90A: ; %bb.0: 11964; GFX90A-NEXT: v_mov_b32_e32 v0, 0x457ff000 11965; GFX90A-NEXT: v_mov_b32_e32 v1, 0x4f800000 11966; GFX90A-NEXT: v_mac_f32_e32 v0, 0, v1 11967; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 11968; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 11969; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 11970; GFX90A-NEXT: v_mov_b32_e32 v4, 0 11971; GFX90A-NEXT: 
v_mul_f32_e32 v0, 0x5f7ffffc, v0 11972; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 11973; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 11974; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 11975; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 11976; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 11977; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 11978; GFX90A-NEXT: s_ashr_i32 s0, s5, 31 11979; GFX90A-NEXT: s_lshr_b32 s0, s0, 20 11980; GFX90A-NEXT: s_add_u32 s0, s4, s0 11981; GFX90A-NEXT: s_movk_i32 s4, 0xf001 11982; GFX90A-NEXT: v_mul_hi_u32 v2, v0, s4 11983; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s4 11984; GFX90A-NEXT: v_add_u32_e32 v2, v2, v3 11985; GFX90A-NEXT: v_sub_u32_e32 v2, v2, v0 11986; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s4 11987; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v2 11988; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 11989; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v2 11990; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 11991; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 11992; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 11993; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 11994; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 11995; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v2 11996; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v8, vcc 11997; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v4, vcc 11998; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 11999; GFX90A-NEXT: v_mov_b32_e32 v6, 0 12000; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 12001; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc 12002; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 12003; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 12004; GFX90A-NEXT: v_mul_lo_u32 v2, v1, s4 12005; GFX90A-NEXT: v_mul_hi_u32 v3, v0, s4 12006; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 12007; GFX90A-NEXT: v_sub_u32_e32 v2, v2, v0 12008; GFX90A-NEXT: v_mul_lo_u32 v5, v0, s4 12009; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v5 12010; GFX90A-NEXT: v_mul_lo_u32 v8, v1, v5 12011; GFX90A-NEXT: v_mul_lo_u32 v10, v0, v2 12012; GFX90A-NEXT: v_mul_hi_u32 v5, v0, v5 12013; GFX90A-NEXT: v_mul_hi_u32 v9, v0, v2 
12014; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v10 12015; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc 12016; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 12017; GFX90A-NEXT: v_mul_hi_u32 v3, v1, v2 12018; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v7, vcc 12019; GFX90A-NEXT: s_addc_u32 s1, s5, 0 12020; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc 12021; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 12022; GFX90A-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 12023; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v5, v2 12024; GFX90A-NEXT: s_ashr_i32 s4, s7, 31 12025; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v3, vcc 12026; GFX90A-NEXT: s_add_u32 s6, s6, s4 12027; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 12028; GFX90A-NEXT: s_mov_b32 s5, s4 12029; GFX90A-NEXT: s_addc_u32 s7, s7, s4 12030; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 12031; GFX90A-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] 12032; GFX90A-NEXT: v_mul_lo_u32 v3, s6, v1 12033; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v0 12034; GFX90A-NEXT: v_mul_hi_u32 v2, s6, v1 12035; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 12036; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc 12037; GFX90A-NEXT: v_mul_hi_u32 v7, s7, v0 12038; GFX90A-NEXT: v_mul_lo_u32 v0, s7, v0 12039; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 12040; GFX90A-NEXT: v_mul_hi_u32 v5, s7, v1 12041; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v2, v7, vcc 12042; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v4, vcc 12043; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 12044; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 12045; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v2, vcc 12046; GFX90A-NEXT: s_movk_i32 s5, 0xfff 12047; GFX90A-NEXT: v_mul_lo_u32 v2, v1, s5 12048; GFX90A-NEXT: v_mul_hi_u32 v3, v0, s5 12049; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 12050; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s5 12051; GFX90A-NEXT: v_mov_b32_e32 v5, s7 12052; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, s6, v3 12053; GFX90A-NEXT: v_subb_co_u32_e32 v2, vcc, v5, v2, vcc 12054; GFX90A-NEXT: 
v_subrev_co_u32_e32 v5, vcc, s5, v3 12055; GFX90A-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc 12056; GFX90A-NEXT: s_movk_i32 s5, 0xffe 12057; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s5, v5 12058; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 12059; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 12060; GFX90A-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc 12061; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 12062; GFX90A-NEXT: v_cndmask_b32_e64 v5, 1, 2, vcc 12063; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v0, v5 12064; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc 12065; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s5, v3 12066; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 12067; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 12068; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v3, vcc 12069; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 12070; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc 12071; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 12072; GFX90A-NEXT: v_xor_b32_e32 v0, s4, v0 12073; GFX90A-NEXT: v_xor_b32_e32 v1, s4, v1 12074; GFX90A-NEXT: v_mov_b32_e32 v3, s4 12075; GFX90A-NEXT: v_subrev_co_u32_e32 v2, vcc, s4, v0 12076; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc 12077; GFX90A-NEXT: v_mov_b32_e32 v0, s0 12078; GFX90A-NEXT: v_mov_b32_e32 v1, s1 12079; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 12080; GFX90A-NEXT: s_endpgm 12081 %r = sdiv <2 x i64> %x, <i64 4096, i64 4095> 12082 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 12083 ret void 12084} 12085 12086define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 12087; CHECK-LABEL: @sdiv_v2i64_pow2_shl_denom( 12088; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 12089; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 12090; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 12091; CHECK-NEXT: [[TMP3:%.*]] = sdiv i64 [[TMP1]], [[TMP2]] 12092; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 
[[TMP3]], i64 0 12093; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 12094; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 12095; CHECK-NEXT: [[TMP7:%.*]] = sdiv i64 [[TMP5]], [[TMP6]] 12096; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 12097; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 12098; CHECK-NEXT: ret void 12099; 12100; GFX6-LABEL: sdiv_v2i64_pow2_shl_denom: 12101; GFX6: ; %bb.0: 12102; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11 12103; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000 12104; GFX6-NEXT: s_mov_b32 s18, 0x4f800000 12105; GFX6-NEXT: s_mov_b32 s19, 0x5f7ffffc 12106; GFX6-NEXT: s_mov_b32 s20, 0x2f800000 12107; GFX6-NEXT: s_waitcnt lgkmcnt(0) 12108; GFX6-NEXT: s_lshl_b64 s[8:9], s[2:3], s6 12109; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 12110; GFX6-NEXT: s_ashr_i32 s12, s3, 31 12111; GFX6-NEXT: s_add_u32 s2, s2, s12 12112; GFX6-NEXT: s_mov_b32 s13, s12 12113; GFX6-NEXT: s_addc_u32 s3, s3, s12 12114; GFX6-NEXT: s_xor_b64 s[10:11], s[2:3], s[12:13] 12115; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s10 12116; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s11 12117; GFX6-NEXT: s_mov_b32 s21, 0xcf800000 12118; GFX6-NEXT: s_sub_u32 s6, 0, s10 12119; GFX6-NEXT: s_subb_u32 s7, 0, s11 12120; GFX6-NEXT: v_mac_f32_e32 v0, s18, v1 12121; GFX6-NEXT: v_rcp_f32_e32 v0, v0 12122; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 12123; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 12124; GFX6-NEXT: v_mul_f32_e32 v0, s19, v0 12125; GFX6-NEXT: v_mul_f32_e32 v1, s20, v0 12126; GFX6-NEXT: v_trunc_f32_e32 v1, v1 12127; GFX6-NEXT: v_mac_f32_e32 v0, s21, v1 12128; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v1 12129; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v0 12130; GFX6-NEXT: s_waitcnt lgkmcnt(0) 12131; GFX6-NEXT: s_ashr_i32 s14, s1, 31 12132; GFX6-NEXT: s_add_u32 s0, s0, s14 12133; GFX6-NEXT: v_mul_lo_u32 v0, s6, v2 12134; GFX6-NEXT: v_mul_hi_u32 v1, s6, v3 12135; GFX6-NEXT: v_mul_lo_u32 v4, s7, v3 12136; 
GFX6-NEXT: v_mul_lo_u32 v5, s6, v3 12137; GFX6-NEXT: s_mov_b32 s15, s14 12138; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 12139; GFX6-NEXT: v_add_i32_e32 v1, vcc, v0, v4 12140; GFX6-NEXT: v_mul_lo_u32 v0, v3, v1 12141; GFX6-NEXT: v_mul_hi_u32 v4, v3, v5 12142; GFX6-NEXT: v_mul_hi_u32 v6, v3, v1 12143; GFX6-NEXT: v_mul_hi_u32 v7, v2, v1 12144; GFX6-NEXT: s_addc_u32 s1, s1, s14 12145; GFX6-NEXT: v_add_i32_e32 v0, vcc, v4, v0 12146; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc 12147; GFX6-NEXT: v_mul_lo_u32 v6, v2, v5 12148; GFX6-NEXT: v_mul_hi_u32 v5, v2, v5 12149; GFX6-NEXT: s_xor_b64 s[16:17], s[0:1], s[14:15] 12150; GFX6-NEXT: s_xor_b64 s[14:15], s[14:15], s[12:13] 12151; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v6 12152; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc 12153; GFX6-NEXT: v_mul_lo_u32 v5, v2, v1 12154; GFX6-NEXT: v_mov_b32_e32 v0, 0 12155; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v7, v0, vcc 12156; GFX6-NEXT: v_mov_b32_e32 v1, 0 12157; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 12158; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v1, v6, vcc 12159; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 12160; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v2, v5, vcc 12161; GFX6-NEXT: v_mul_lo_u32 v4, s6, v2 12162; GFX6-NEXT: v_mul_hi_u32 v5, s6, v3 12163; GFX6-NEXT: v_mul_lo_u32 v6, s7, v3 12164; GFX6-NEXT: s_ashr_i32 s12, s9, 31 12165; GFX6-NEXT: s_add_u32 s8, s8, s12 12166; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 12167; GFX6-NEXT: v_mul_lo_u32 v5, s6, v3 12168; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 12169; GFX6-NEXT: v_mul_lo_u32 v8, v3, v4 12170; GFX6-NEXT: v_mul_hi_u32 v9, v3, v5 12171; GFX6-NEXT: v_mul_hi_u32 v10, v3, v4 12172; GFX6-NEXT: v_mul_hi_u32 v7, v2, v5 12173; GFX6-NEXT: v_mul_lo_u32 v5, v2, v5 12174; GFX6-NEXT: v_mul_hi_u32 v6, v2, v4 12175; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 12176; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc 12177; GFX6-NEXT: v_mul_lo_u32 v4, v2, v4 12178; GFX6-NEXT: v_add_i32_e32 v5, vcc, v8, v5 12179; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v7, vcc 
12180; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v6, v0, vcc 12181; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 12182; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v1, v6, vcc 12183; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 12184; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v2, v5, vcc 12185; GFX6-NEXT: v_mul_lo_u32 v4, s16, v2 12186; GFX6-NEXT: v_mul_hi_u32 v5, s16, v3 12187; GFX6-NEXT: v_mul_hi_u32 v6, s16, v2 12188; GFX6-NEXT: v_mul_hi_u32 v7, s17, v2 12189; GFX6-NEXT: v_mul_lo_u32 v2, s17, v2 12190; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 12191; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 12192; GFX6-NEXT: v_mul_lo_u32 v6, s17, v3 12193; GFX6-NEXT: v_mul_hi_u32 v3, s17, v3 12194; GFX6-NEXT: s_mov_b32 s13, s12 12195; GFX6-NEXT: s_addc_u32 s9, s9, s12 12196; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 12197; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc 12198; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v7, v0, vcc 12199; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 12200; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v1, v4, vcc 12201; GFX6-NEXT: v_mul_lo_u32 v4, s10, v3 12202; GFX6-NEXT: v_mul_hi_u32 v5, s10, v2 12203; GFX6-NEXT: v_mul_lo_u32 v6, s11, v2 12204; GFX6-NEXT: v_mov_b32_e32 v7, s11 12205; GFX6-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] 12206; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 12207; GFX6-NEXT: v_mul_lo_u32 v5, s10, v2 12208; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 12209; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s17, v4 12210; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s16, v5 12211; GFX6-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v7, vcc 12212; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s10, v5 12213; GFX6-NEXT: v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1] 12214; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v6 12215; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] 12216; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v7 12217; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 12218; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v6 12219; GFX6-NEXT: v_cndmask_b32_e64 v6, v8, v7, s[0:1] 12220; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 2, v2 
12221; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1] 12222; GFX6-NEXT: v_add_i32_e64 v9, s[0:1], 1, v2 12223; GFX6-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] 12224; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 12225; GFX6-NEXT: v_cndmask_b32_e64 v6, v10, v8, s[0:1] 12226; GFX6-NEXT: v_mov_b32_e32 v8, s17 12227; GFX6-NEXT: v_cvt_f32_u32_e32 v10, s8 12228; GFX6-NEXT: v_cvt_f32_u32_e32 v11, s9 12229; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v8, v4, vcc 12230; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v4 12231; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 12232; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v5 12233; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 12234; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s11, v4 12235; GFX6-NEXT: v_mac_f32_e32 v10, s18, v11 12236; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc 12237; GFX6-NEXT: v_rcp_f32_e32 v5, v10 12238; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 12239; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 12240; GFX6-NEXT: v_cndmask_b32_e64 v4, v9, v7, s[0:1] 12241; GFX6-NEXT: v_mul_f32_e32 v5, s19, v5 12242; GFX6-NEXT: v_mul_f32_e32 v6, s20, v5 12243; GFX6-NEXT: v_trunc_f32_e32 v6, v6 12244; GFX6-NEXT: v_mac_f32_e32 v5, s21, v6 12245; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v5 12246; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v6 12247; GFX6-NEXT: s_sub_u32 s0, 0, s8 12248; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 12249; GFX6-NEXT: v_mul_hi_u32 v4, s0, v5 12250; GFX6-NEXT: v_mul_lo_u32 v7, s0, v6 12251; GFX6-NEXT: s_subb_u32 s1, 0, s9 12252; GFX6-NEXT: v_mul_lo_u32 v8, s1, v5 12253; GFX6-NEXT: s_ashr_i32 s10, s3, 31 12254; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v7 12255; GFX6-NEXT: v_mul_lo_u32 v7, s0, v5 12256; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v8 12257; GFX6-NEXT: v_mul_lo_u32 v8, v5, v4 12258; GFX6-NEXT: v_mul_hi_u32 v9, v5, v7 12259; GFX6-NEXT: v_mul_hi_u32 v10, v5, v4 12260; GFX6-NEXT: v_mul_hi_u32 v11, v6, v4 12261; GFX6-NEXT: v_mul_lo_u32 v4, v6, v4 12262; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 12263; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc 
12264; GFX6-NEXT: v_mul_lo_u32 v10, v6, v7 12265; GFX6-NEXT: v_mul_hi_u32 v7, v6, v7 12266; GFX6-NEXT: s_mov_b32 s11, s10 12267; GFX6-NEXT: v_xor_b32_e32 v2, s14, v2 12268; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v10 12269; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc 12270; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v11, v0, vcc 12271; GFX6-NEXT: v_add_i32_e32 v4, vcc, v7, v4 12272; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v1, v8, vcc 12273; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 12274; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v6, v7, vcc 12275; GFX6-NEXT: v_mul_lo_u32 v6, s0, v5 12276; GFX6-NEXT: v_mul_hi_u32 v7, s0, v4 12277; GFX6-NEXT: v_mul_lo_u32 v8, s1, v4 12278; GFX6-NEXT: v_xor_b32_e32 v3, s15, v3 12279; GFX6-NEXT: s_mov_b32 s7, 0xf000 12280; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 12281; GFX6-NEXT: v_mul_lo_u32 v7, s0, v4 12282; GFX6-NEXT: v_add_i32_e32 v6, vcc, v8, v6 12283; GFX6-NEXT: v_mul_lo_u32 v10, v4, v6 12284; GFX6-NEXT: v_mul_hi_u32 v11, v4, v7 12285; GFX6-NEXT: v_mul_hi_u32 v12, v4, v6 12286; GFX6-NEXT: v_mul_hi_u32 v9, v5, v7 12287; GFX6-NEXT: v_mul_lo_u32 v7, v5, v7 12288; GFX6-NEXT: v_mul_hi_u32 v8, v5, v6 12289; GFX6-NEXT: v_add_i32_e32 v10, vcc, v11, v10 12290; GFX6-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc 12291; GFX6-NEXT: v_mul_lo_u32 v6, v5, v6 12292; GFX6-NEXT: v_add_i32_e32 v7, vcc, v10, v7 12293; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc 12294; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc 12295; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 12296; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v1, v8, vcc 12297; GFX6-NEXT: s_add_u32 s0, s2, s10 12298; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 12299; GFX6-NEXT: s_addc_u32 s1, s3, s10 12300; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc 12301; GFX6-NEXT: s_xor_b64 s[2:3], s[0:1], s[10:11] 12302; GFX6-NEXT: v_mul_lo_u32 v6, s2, v5 12303; GFX6-NEXT: v_mul_hi_u32 v7, s2, v4 12304; GFX6-NEXT: v_mul_hi_u32 v9, s2, v5 12305; GFX6-NEXT: v_mul_hi_u32 v10, s3, v5 12306; GFX6-NEXT: v_mul_lo_u32 v5, s3, v5 12307; 
GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 12308; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc 12309; GFX6-NEXT: v_mul_lo_u32 v9, s3, v4 12310; GFX6-NEXT: v_mul_hi_u32 v4, s3, v4 12311; GFX6-NEXT: v_mov_b32_e32 v8, s15 12312; GFX6-NEXT: s_mov_b32 s6, -1 12313; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v9 12314; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v7, v4, vcc 12315; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v10, v0, vcc 12316; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 12317; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v1, v0, vcc 12318; GFX6-NEXT: v_mul_lo_u32 v6, s8, v5 12319; GFX6-NEXT: v_mul_hi_u32 v7, s8, v4 12320; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s14, v2 12321; GFX6-NEXT: v_mul_lo_u32 v2, s9, v4 12322; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v3, v8, vcc 12323; GFX6-NEXT: v_add_i32_e32 v3, vcc, v7, v6 12324; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 12325; GFX6-NEXT: v_mul_lo_u32 v3, s8, v4 12326; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s3, v2 12327; GFX6-NEXT: v_mov_b32_e32 v7, s9 12328; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s2, v3 12329; GFX6-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v7, vcc 12330; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s8, v3 12331; GFX6-NEXT: v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1] 12332; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6 12333; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] 12334; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 12335; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 12336; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6 12337; GFX6-NEXT: v_cndmask_b32_e64 v6, v8, v7, s[0:1] 12338; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 2, v4 12339; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v5, s[0:1] 12340; GFX6-NEXT: v_add_i32_e64 v9, s[0:1], 1, v4 12341; GFX6-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v5, s[0:1] 12342; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 12343; GFX6-NEXT: v_cndmask_b32_e64 v6, v10, v8, s[0:1] 12344; GFX6-NEXT: v_mov_b32_e32 v8, s3 12345; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v8, v2, vcc 12346; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 12347; GFX6-NEXT: 
v_cndmask_b32_e64 v8, 0, -1, vcc 12348; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 12349; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 12350; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s9, v2 12351; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc 12352; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 12353; GFX6-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[0:1] 12354; GFX6-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc 12355; GFX6-NEXT: s_xor_b64 s[0:1], s[10:11], s[12:13] 12356; GFX6-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc 12357; GFX6-NEXT: v_xor_b32_e32 v3, s0, v3 12358; GFX6-NEXT: v_xor_b32_e32 v4, s1, v2 12359; GFX6-NEXT: v_mov_b32_e32 v5, s1 12360; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s0, v3 12361; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v4, v5, vcc 12362; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 12363; GFX6-NEXT: s_endpgm 12364; 12365; GFX9-LABEL: sdiv_v2i64_pow2_shl_denom: 12366; GFX9: ; %bb.0: 12367; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 12368; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 12369; GFX9-NEXT: s_mov_b32 s16, 0x4f800000 12370; GFX9-NEXT: s_mov_b32 s17, 0x5f7ffffc 12371; GFX9-NEXT: s_mov_b32 s18, 0x2f800000 12372; GFX9-NEXT: s_waitcnt lgkmcnt(0) 12373; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s6 12374; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 12375; GFX9-NEXT: s_ashr_i32 s12, s3, 31 12376; GFX9-NEXT: s_add_u32 s2, s2, s12 12377; GFX9-NEXT: s_mov_b32 s13, s12 12378; GFX9-NEXT: s_addc_u32 s3, s3, s12 12379; GFX9-NEXT: s_xor_b64 s[10:11], s[2:3], s[12:13] 12380; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s10 12381; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s11 12382; GFX9-NEXT: s_mov_b32 s19, 0xcf800000 12383; GFX9-NEXT: s_sub_u32 s2, 0, s10 12384; GFX9-NEXT: s_subb_u32 s3, 0, s11 12385; GFX9-NEXT: v_mac_f32_e32 v0, s16, v1 12386; GFX9-NEXT: v_rcp_f32_e32 v0, v0 12387; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 12388; GFX9-NEXT: v_mul_f32_e32 v0, s17, v0 12389; GFX9-NEXT: v_mul_f32_e32 v1, s18, v0 12390; GFX9-NEXT: v_trunc_f32_e32 v1, v1 12391; GFX9-NEXT: v_mac_f32_e32 v0, s19, v1 12392; 
GFX9-NEXT: v_cvt_u32_f32_e32 v2, v1 12393; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 12394; GFX9-NEXT: s_waitcnt lgkmcnt(0) 12395; GFX9-NEXT: s_ashr_i32 s14, s5, 31 12396; GFX9-NEXT: s_mov_b32 s15, s14 12397; GFX9-NEXT: v_mul_lo_u32 v0, s2, v2 12398; GFX9-NEXT: v_mul_hi_u32 v1, s2, v3 12399; GFX9-NEXT: v_mul_lo_u32 v5, s3, v3 12400; GFX9-NEXT: v_mul_lo_u32 v4, s2, v3 12401; GFX9-NEXT: v_add_u32_e32 v0, v1, v0 12402; GFX9-NEXT: v_add_u32_e32 v5, v0, v5 12403; GFX9-NEXT: v_mul_hi_u32 v1, v3, v4 12404; GFX9-NEXT: v_mul_lo_u32 v6, v3, v5 12405; GFX9-NEXT: v_mul_hi_u32 v7, v3, v5 12406; GFX9-NEXT: v_mul_hi_u32 v8, v2, v5 12407; GFX9-NEXT: v_mul_lo_u32 v5, v2, v5 12408; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v6 12409; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc 12410; GFX9-NEXT: v_mul_lo_u32 v7, v2, v4 12411; GFX9-NEXT: v_mul_hi_u32 v4, v2, v4 12412; GFX9-NEXT: v_mov_b32_e32 v0, 0 12413; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v7 12414; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v4, vcc 12415; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v8, v0, vcc 12416; GFX9-NEXT: v_mov_b32_e32 v1, 0 12417; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 12418; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v6, vcc 12419; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 12420; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v5, vcc 12421; GFX9-NEXT: v_mul_lo_u32 v4, s2, v2 12422; GFX9-NEXT: v_mul_hi_u32 v5, s2, v3 12423; GFX9-NEXT: v_mul_lo_u32 v6, s3, v3 12424; GFX9-NEXT: v_mul_lo_u32 v7, s2, v3 12425; GFX9-NEXT: s_add_u32 s2, s4, s14 12426; GFX9-NEXT: v_add_u32_e32 v4, v5, v4 12427; GFX9-NEXT: v_add_u32_e32 v4, v4, v6 12428; GFX9-NEXT: v_mul_lo_u32 v8, v3, v4 12429; GFX9-NEXT: v_mul_hi_u32 v9, v3, v7 12430; GFX9-NEXT: v_mul_hi_u32 v10, v3, v4 12431; GFX9-NEXT: v_mul_hi_u32 v6, v2, v7 12432; GFX9-NEXT: v_mul_lo_u32 v7, v2, v7 12433; GFX9-NEXT: v_mul_hi_u32 v5, v2, v4 12434; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 12435; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v10, vcc 12436; GFX9-NEXT: v_mul_lo_u32 v4, 
v2, v4 12437; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 12438; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v9, v6, vcc 12439; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v0, vcc 12440; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v6, v4 12441; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v5, vcc 12442; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 12443; GFX9-NEXT: s_addc_u32 s3, s5, s14 12444; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v5, vcc 12445; GFX9-NEXT: s_xor_b64 s[4:5], s[2:3], s[14:15] 12446; GFX9-NEXT: v_mul_lo_u32 v4, s4, v2 12447; GFX9-NEXT: v_mul_hi_u32 v5, s4, v3 12448; GFX9-NEXT: v_mul_hi_u32 v6, s4, v2 12449; GFX9-NEXT: v_mul_hi_u32 v7, s5, v2 12450; GFX9-NEXT: v_mul_lo_u32 v2, s5, v2 12451; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 12452; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc 12453; GFX9-NEXT: v_mul_lo_u32 v6, s5, v3 12454; GFX9-NEXT: v_mul_hi_u32 v3, s5, v3 12455; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 12456; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 12457; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc 12458; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v0, vcc 12459; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 12460; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v4, vcc 12461; GFX9-NEXT: v_mul_lo_u32 v4, s10, v3 12462; GFX9-NEXT: v_mul_hi_u32 v5, s10, v2 12463; GFX9-NEXT: v_mul_lo_u32 v6, s11, v2 12464; GFX9-NEXT: v_mov_b32_e32 v7, s11 12465; GFX9-NEXT: v_add_u32_e32 v4, v5, v4 12466; GFX9-NEXT: v_mul_lo_u32 v5, s10, v2 12467; GFX9-NEXT: v_add_u32_e32 v4, v4, v6 12468; GFX9-NEXT: v_sub_u32_e32 v6, s5, v4 12469; GFX9-NEXT: v_sub_co_u32_e32 v5, vcc, s4, v5 12470; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v7, vcc 12471; GFX9-NEXT: v_subrev_co_u32_e64 v7, s[0:1], s10, v5 12472; GFX9-NEXT: v_subbrev_co_u32_e64 v6, s[0:1], 0, v6, s[0:1] 12473; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v6 12474; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] 12475; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v7 12476; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 
12477; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v6 12478; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v7, s[0:1] 12479; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 12480; GFX9-NEXT: v_cndmask_b32_e64 v6, 1, 2, s[0:1] 12481; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], v2, v6 12482; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v3, s[0:1] 12483; GFX9-NEXT: s_xor_b64 s[0:1], s[14:15], s[12:13] 12484; GFX9-NEXT: s_ashr_i32 s4, s9, 31 12485; GFX9-NEXT: s_add_u32 s8, s8, s4 12486; GFX9-NEXT: v_mov_b32_e32 v8, s5 12487; GFX9-NEXT: s_mov_b32 s5, s4 12488; GFX9-NEXT: s_addc_u32 s9, s9, s4 12489; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[4:5] 12490; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v8, v4, vcc 12491; GFX9-NEXT: v_cvt_f32_u32_e32 v8, s8 12492; GFX9-NEXT: v_cvt_f32_u32_e32 v9, s9 12493; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v4 12494; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc 12495; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v5 12496; GFX9-NEXT: v_mac_f32_e32 v8, s16, v9 12497; GFX9-NEXT: v_rcp_f32_e32 v8, v8 12498; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 12499; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s11, v4 12500; GFX9-NEXT: v_cndmask_b32_e32 v4, v10, v5, vcc 12501; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 12502; GFX9-NEXT: v_mul_f32_e32 v4, s17, v8 12503; GFX9-NEXT: v_mul_f32_e32 v5, s18, v4 12504; GFX9-NEXT: v_trunc_f32_e32 v5, v5 12505; GFX9-NEXT: v_mac_f32_e32 v4, s19, v5 12506; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 12507; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 12508; GFX9-NEXT: s_sub_u32 s10, 0, s8 12509; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 12510; GFX9-NEXT: s_subb_u32 s11, 0, s9 12511; GFX9-NEXT: v_mul_hi_u32 v6, s10, v4 12512; GFX9-NEXT: v_mul_lo_u32 v8, s10, v5 12513; GFX9-NEXT: v_mul_lo_u32 v9, s11, v4 12514; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc 12515; GFX9-NEXT: v_mul_lo_u32 v7, s10, v4 12516; GFX9-NEXT: v_add_u32_e32 v6, v6, v8 12517; GFX9-NEXT: v_add_u32_e32 v6, v6, v9 12518; GFX9-NEXT: v_mul_lo_u32 v8, v4, v6 12519; GFX9-NEXT: v_mul_hi_u32 v9, v4, v7 12520; GFX9-NEXT: 
v_mul_hi_u32 v10, v4, v6 12521; GFX9-NEXT: v_mul_hi_u32 v11, v5, v6 12522; GFX9-NEXT: v_mul_lo_u32 v6, v5, v6 12523; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 12524; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v10, vcc 12525; GFX9-NEXT: v_mul_lo_u32 v10, v5, v7 12526; GFX9-NEXT: v_mul_hi_u32 v7, v5, v7 12527; GFX9-NEXT: v_xor_b32_e32 v2, s0, v2 12528; GFX9-NEXT: v_xor_b32_e32 v3, s1, v3 12529; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10 12530; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v9, v7, vcc 12531; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v0, vcc 12532; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 12533; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v1, v8, vcc 12534; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 12535; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc 12536; GFX9-NEXT: v_mul_lo_u32 v6, s10, v5 12537; GFX9-NEXT: v_mul_hi_u32 v7, s10, v4 12538; GFX9-NEXT: v_mul_lo_u32 v8, s11, v4 12539; GFX9-NEXT: v_mul_lo_u32 v9, s10, v4 12540; GFX9-NEXT: s_ashr_i32 s10, s7, 31 12541; GFX9-NEXT: v_add_u32_e32 v6, v7, v6 12542; GFX9-NEXT: v_add_u32_e32 v6, v6, v8 12543; GFX9-NEXT: v_mul_lo_u32 v10, v4, v6 12544; GFX9-NEXT: v_mul_hi_u32 v11, v4, v9 12545; GFX9-NEXT: v_mul_hi_u32 v12, v4, v6 12546; GFX9-NEXT: v_mul_hi_u32 v8, v5, v9 12547; GFX9-NEXT: v_mul_lo_u32 v9, v5, v9 12548; GFX9-NEXT: v_mul_hi_u32 v7, v5, v6 12549; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 12550; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v12, vcc 12551; GFX9-NEXT: v_mul_lo_u32 v6, v5, v6 12552; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 12553; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v8, vcc 12554; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v0, vcc 12555; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v8, v6 12556; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v1, v7, vcc 12557; GFX9-NEXT: s_add_u32 s6, s6, s10 12558; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 12559; GFX9-NEXT: s_mov_b32 s11, s10 12560; GFX9-NEXT: s_addc_u32 s7, s7, s10 12561; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc 12562; GFX9-NEXT: 
s_xor_b64 s[6:7], s[6:7], s[10:11] 12563; GFX9-NEXT: v_mul_lo_u32 v6, s6, v5 12564; GFX9-NEXT: v_mul_hi_u32 v7, s6, v4 12565; GFX9-NEXT: v_mul_hi_u32 v9, s6, v5 12566; GFX9-NEXT: v_mul_hi_u32 v10, s7, v5 12567; GFX9-NEXT: v_mul_lo_u32 v5, s7, v5 12568; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 12569; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v9, vcc 12570; GFX9-NEXT: v_mul_lo_u32 v9, s7, v4 12571; GFX9-NEXT: v_mul_hi_u32 v4, s7, v4 12572; GFX9-NEXT: v_mov_b32_e32 v8, s1 12573; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v9 12574; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc 12575; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v10, v0, vcc 12576; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 12577; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v6, vcc 12578; GFX9-NEXT: v_mul_lo_u32 v6, s8, v5 12579; GFX9-NEXT: v_mul_hi_u32 v7, s8, v4 12580; GFX9-NEXT: v_mul_lo_u32 v9, s9, v4 12581; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s0, v2 12582; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v3, v8, vcc 12583; GFX9-NEXT: v_add_u32_e32 v3, v7, v6 12584; GFX9-NEXT: v_mul_lo_u32 v6, s8, v4 12585; GFX9-NEXT: v_add_u32_e32 v3, v3, v9 12586; GFX9-NEXT: v_sub_u32_e32 v7, s7, v3 12587; GFX9-NEXT: v_mov_b32_e32 v8, s9 12588; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, s6, v6 12589; GFX9-NEXT: v_subb_co_u32_e64 v7, s[0:1], v7, v8, vcc 12590; GFX9-NEXT: v_subrev_co_u32_e64 v8, s[0:1], s8, v6 12591; GFX9-NEXT: v_subbrev_co_u32_e64 v7, s[0:1], 0, v7, s[0:1] 12592; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v7 12593; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] 12594; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8 12595; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] 12596; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v7 12597; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v8, s[0:1] 12598; GFX9-NEXT: v_mov_b32_e32 v9, s7 12599; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v3, vcc 12600; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 12601; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 12602; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc 12603; 
GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v6 12604; GFX9-NEXT: v_cndmask_b32_e64 v7, 1, 2, s[0:1] 12605; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 12606; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s9, v3 12607; GFX9-NEXT: v_add_co_u32_e64 v7, s[0:1], v4, v7 12608; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v6, vcc 12609; GFX9-NEXT: v_addc_co_u32_e64 v8, s[0:1], 0, v5, s[0:1] 12610; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 12611; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v7, vcc 12612; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], s[4:5] 12613; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v8, vcc 12614; GFX9-NEXT: v_xor_b32_e32 v3, s0, v3 12615; GFX9-NEXT: v_xor_b32_e32 v4, s1, v4 12616; GFX9-NEXT: v_mov_b32_e32 v5, s1 12617; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s0, v3 12618; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v5, vcc 12619; GFX9-NEXT: s_waitcnt lgkmcnt(0) 12620; GFX9-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] 12621; GFX9-NEXT: s_endpgm 12622; 12623; GFX90A-LABEL: sdiv_v2i64_pow2_shl_denom: 12624; GFX90A: ; %bb.0: 12625; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 12626; GFX90A-NEXT: s_mov_b64 s[2:3], 0x1000 12627; GFX90A-NEXT: s_mov_b32 s16, 0x4f800000 12628; GFX90A-NEXT: s_mov_b32 s17, 0x5f7ffffc 12629; GFX90A-NEXT: s_mov_b32 s18, 0x2f800000 12630; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 12631; GFX90A-NEXT: s_lshl_b64 s[8:9], s[2:3], s6 12632; GFX90A-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 12633; GFX90A-NEXT: s_ashr_i32 s10, s3, 31 12634; GFX90A-NEXT: s_add_u32 s2, s2, s10 12635; GFX90A-NEXT: s_mov_b32 s11, s10 12636; GFX90A-NEXT: s_addc_u32 s3, s3, s10 12637; GFX90A-NEXT: s_xor_b64 s[12:13], s[2:3], s[10:11] 12638; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s12 12639; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s13 12640; GFX90A-NEXT: s_mov_b32 s19, 0xcf800000 12641; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 12642; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 12643; GFX90A-NEXT: s_sub_u32 s0, 0, s12 12644; GFX90A-NEXT: v_mac_f32_e32 v0, s16, v1 12645; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 12646; 
GFX90A-NEXT: s_subb_u32 s1, 0, s13 12647; GFX90A-NEXT: v_mov_b32_e32 v4, 0 12648; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 12649; GFX90A-NEXT: s_ashr_i32 s14, s5, 31 12650; GFX90A-NEXT: v_mul_f32_e32 v0, s17, v0 12651; GFX90A-NEXT: v_mul_f32_e32 v1, s18, v0 12652; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 12653; GFX90A-NEXT: v_mac_f32_e32 v0, s19, v1 12654; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 12655; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 12656; GFX90A-NEXT: s_mov_b32 s15, s14 12657; GFX90A-NEXT: v_mul_hi_u32 v3, s0, v0 12658; GFX90A-NEXT: v_mul_lo_u32 v5, s0, v1 12659; GFX90A-NEXT: v_mul_lo_u32 v2, s1, v0 12660; GFX90A-NEXT: v_add_u32_e32 v3, v3, v5 12661; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 12662; GFX90A-NEXT: v_mul_lo_u32 v6, s0, v0 12663; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v2 12664; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 12665; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v2 12666; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 12667; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 12668; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 12669; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 12670; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 12671; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v2 12672; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v8, vcc 12673; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v4, vcc 12674; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 12675; GFX90A-NEXT: v_mov_b32_e32 v6, 0 12676; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 12677; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc 12678; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 12679; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 12680; GFX90A-NEXT: v_mul_lo_u32 v2, s0, v1 12681; GFX90A-NEXT: v_mul_hi_u32 v3, s0, v0 12682; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 12683; GFX90A-NEXT: v_mul_lo_u32 v3, s1, v0 12684; GFX90A-NEXT: v_add_u32_e32 v2, v2, v3 12685; GFX90A-NEXT: v_mul_lo_u32 v5, s0, v0 12686; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v5 12687; GFX90A-NEXT: v_mul_lo_u32 v8, v1, v5 12688; GFX90A-NEXT: v_mul_lo_u32 v10, v0, v2 12689; 
GFX90A-NEXT: v_mul_hi_u32 v5, v0, v5 12690; GFX90A-NEXT: v_mul_hi_u32 v9, v0, v2 12691; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v10 12692; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc 12693; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 12694; GFX90A-NEXT: v_mul_hi_u32 v3, v1, v2 12695; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v7, vcc 12696; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc 12697; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 12698; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v5, v2 12699; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v3, vcc 12700; GFX90A-NEXT: s_add_u32 s0, s4, s14 12701; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 12702; GFX90A-NEXT: s_addc_u32 s1, s5, s14 12703; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 12704; GFX90A-NEXT: s_xor_b64 s[4:5], s[0:1], s[14:15] 12705; GFX90A-NEXT: v_mul_lo_u32 v3, s4, v1 12706; GFX90A-NEXT: v_mul_hi_u32 v5, s4, v0 12707; GFX90A-NEXT: v_mul_hi_u32 v2, s4, v1 12708; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 12709; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc 12710; GFX90A-NEXT: v_mul_hi_u32 v7, s5, v0 12711; GFX90A-NEXT: v_mul_lo_u32 v0, s5, v0 12712; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 12713; GFX90A-NEXT: v_mul_hi_u32 v5, s5, v1 12714; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v2, v7, vcc 12715; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v4, vcc 12716; GFX90A-NEXT: v_mul_lo_u32 v1, s5, v1 12717; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 12718; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v2, vcc 12719; GFX90A-NEXT: v_mul_lo_u32 v2, s12, v1 12720; GFX90A-NEXT: v_mul_hi_u32 v3, s12, v0 12721; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 12722; GFX90A-NEXT: v_mul_lo_u32 v3, s13, v0 12723; GFX90A-NEXT: v_add_u32_e32 v2, v2, v3 12724; GFX90A-NEXT: v_mul_lo_u32 v5, s12, v0 12725; GFX90A-NEXT: v_sub_u32_e32 v3, s5, v2 12726; GFX90A-NEXT: v_mov_b32_e32 v7, s13 12727; GFX90A-NEXT: v_sub_co_u32_e32 v5, vcc, s4, v5 12728; GFX90A-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v7, vcc 12729; 
GFX90A-NEXT: v_subrev_co_u32_e64 v7, s[0:1], s12, v5 12730; GFX90A-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] 12731; GFX90A-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v3 12732; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] 12733; GFX90A-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v7 12734; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 12735; GFX90A-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v3 12736; GFX90A-NEXT: v_cndmask_b32_e64 v3, v8, v7, s[0:1] 12737; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3 12738; GFX90A-NEXT: v_cndmask_b32_e64 v3, 1, 2, s[0:1] 12739; GFX90A-NEXT: v_mov_b32_e32 v8, s5 12740; GFX90A-NEXT: v_add_co_u32_e64 v3, s[0:1], v0, v3 12741; GFX90A-NEXT: v_subb_co_u32_e32 v2, vcc, v8, v2, vcc 12742; GFX90A-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1] 12743; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 12744; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 12745; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s12, v5 12746; GFX90A-NEXT: s_xor_b64 s[0:1], s[14:15], s[10:11] 12747; GFX90A-NEXT: s_ashr_i32 s4, s9, 31 12748; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 12749; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s13, v2 12750; GFX90A-NEXT: s_add_u32 s8, s8, s4 12751; GFX90A-NEXT: v_cndmask_b32_e32 v2, v8, v5, vcc 12752; GFX90A-NEXT: s_mov_b32 s5, s4 12753; GFX90A-NEXT: s_addc_u32 s9, s9, s4 12754; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 12755; GFX90A-NEXT: s_xor_b64 s[8:9], s[8:9], s[4:5] 12756; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 12757; GFX90A-NEXT: v_cvt_f32_u32_e32 v2, s8 12758; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, s9 12759; GFX90A-NEXT: v_xor_b32_e32 v0, s0, v0 12760; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc 12761; GFX90A-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 12762; GFX90A-NEXT: v_mac_f32_e32 v2, s16, v3 12763; GFX90A-NEXT: v_rcp_f32_e32 v2, v2 12764; GFX90A-NEXT: s_sub_u32 s0, 0, s8 12765; GFX90A-NEXT: v_xor_b32_e32 v1, s1, v1 12766; GFX90A-NEXT: v_mov_b32_e32 v5, s1 12767; GFX90A-NEXT: v_mul_f32_e32 v2, s17, v2 12768; GFX90A-NEXT: v_mul_f32_e32 v3, 
s18, v2 12769; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 12770; GFX90A-NEXT: v_mac_f32_e32 v2, s19, v3 12771; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 12772; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 12773; GFX90A-NEXT: s_subb_u32 s1, 0, s9 12774; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc 12775; GFX90A-NEXT: v_mul_hi_u32 v7, s0, v2 12776; GFX90A-NEXT: v_mul_lo_u32 v8, s0, v3 12777; GFX90A-NEXT: v_mul_lo_u32 v5, s1, v2 12778; GFX90A-NEXT: v_add_u32_e32 v7, v7, v8 12779; GFX90A-NEXT: v_add_u32_e32 v5, v7, v5 12780; GFX90A-NEXT: v_mul_lo_u32 v9, s0, v2 12781; GFX90A-NEXT: v_mul_lo_u32 v8, v2, v5 12782; GFX90A-NEXT: v_mul_hi_u32 v10, v2, v9 12783; GFX90A-NEXT: v_mul_hi_u32 v7, v2, v5 12784; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v10, v8 12785; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc 12786; GFX90A-NEXT: v_mul_hi_u32 v11, v3, v9 12787; GFX90A-NEXT: v_mul_lo_u32 v9, v3, v9 12788; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v9 12789; GFX90A-NEXT: v_mul_hi_u32 v10, v3, v5 12790; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v11, vcc 12791; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v10, v4, vcc 12792; GFX90A-NEXT: v_mul_lo_u32 v5, v3, v5 12793; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 12794; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v6, v8, vcc 12795; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 12796; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc 12797; GFX90A-NEXT: v_mul_lo_u32 v5, s0, v3 12798; GFX90A-NEXT: v_mul_hi_u32 v7, s0, v2 12799; GFX90A-NEXT: v_add_u32_e32 v5, v7, v5 12800; GFX90A-NEXT: v_mul_lo_u32 v7, s1, v2 12801; GFX90A-NEXT: v_add_u32_e32 v5, v5, v7 12802; GFX90A-NEXT: v_mul_lo_u32 v8, s0, v2 12803; GFX90A-NEXT: v_mul_hi_u32 v9, v3, v8 12804; GFX90A-NEXT: v_mul_lo_u32 v10, v3, v8 12805; GFX90A-NEXT: v_mul_lo_u32 v12, v2, v5 12806; GFX90A-NEXT: v_mul_hi_u32 v8, v2, v8 12807; GFX90A-NEXT: v_mul_hi_u32 v11, v2, v5 12808; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12 12809; GFX90A-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc 12810; GFX90A-NEXT: 
v_add_co_u32_e32 v8, vcc, v8, v10 12811; GFX90A-NEXT: v_mul_hi_u32 v7, v3, v5 12812; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v9, vcc 12813; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v4, vcc 12814; GFX90A-NEXT: v_mul_lo_u32 v5, v3, v5 12815; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v8, v5 12816; GFX90A-NEXT: s_ashr_i32 s10, s7, 31 12817; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v6, v7, vcc 12818; GFX90A-NEXT: s_add_u32 s0, s6, s10 12819; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 12820; GFX90A-NEXT: s_mov_b32 s11, s10 12821; GFX90A-NEXT: s_addc_u32 s1, s7, s10 12822; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc 12823; GFX90A-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] 12824; GFX90A-NEXT: v_mul_lo_u32 v7, s6, v3 12825; GFX90A-NEXT: v_mul_hi_u32 v8, s6, v2 12826; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v3 12827; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 12828; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc 12829; GFX90A-NEXT: v_mul_hi_u32 v9, s7, v2 12830; GFX90A-NEXT: v_mul_lo_u32 v2, s7, v2 12831; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v7, v2 12832; GFX90A-NEXT: v_mul_hi_u32 v8, s7, v3 12833; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v9, vcc 12834; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v4, vcc 12835; GFX90A-NEXT: v_mul_lo_u32 v3, s7, v3 12836; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 12837; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc 12838; GFX90A-NEXT: v_mul_lo_u32 v5, s8, v3 12839; GFX90A-NEXT: v_mul_hi_u32 v6, s8, v2 12840; GFX90A-NEXT: v_add_u32_e32 v5, v6, v5 12841; GFX90A-NEXT: v_mul_lo_u32 v6, s9, v2 12842; GFX90A-NEXT: v_add_u32_e32 v5, v5, v6 12843; GFX90A-NEXT: v_mul_lo_u32 v7, s8, v2 12844; GFX90A-NEXT: v_sub_u32_e32 v6, s7, v5 12845; GFX90A-NEXT: v_mov_b32_e32 v8, s9 12846; GFX90A-NEXT: v_sub_co_u32_e32 v7, vcc, s6, v7 12847; GFX90A-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v8, vcc 12848; GFX90A-NEXT: v_subrev_co_u32_e64 v8, s[0:1], s8, v7 12849; GFX90A-NEXT: v_subbrev_co_u32_e64 v6, s[0:1], 0, v6, s[0:1] 12850; 
GFX90A-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6 12851; GFX90A-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] 12852; GFX90A-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8 12853; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] 12854; GFX90A-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6 12855; GFX90A-NEXT: v_cndmask_b32_e64 v6, v9, v8, s[0:1] 12856; GFX90A-NEXT: v_mov_b32_e32 v9, s7 12857; GFX90A-NEXT: v_subb_co_u32_e32 v5, vcc, v9, v5, vcc 12858; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s9, v5 12859; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 12860; GFX90A-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc 12861; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s8, v7 12862; GFX90A-NEXT: v_cndmask_b32_e64 v6, 1, 2, s[0:1] 12863; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 12864; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s9, v5 12865; GFX90A-NEXT: v_add_co_u32_e64 v6, s[0:1], v2, v6 12866; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v7, vcc 12867; GFX90A-NEXT: v_addc_co_u32_e64 v8, s[0:1], 0, v3, s[0:1] 12868; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 12869; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 12870; GFX90A-NEXT: s_xor_b64 s[0:1], s[10:11], s[4:5] 12871; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc 12872; GFX90A-NEXT: v_xor_b32_e32 v2, s0, v2 12873; GFX90A-NEXT: v_xor_b32_e32 v3, s1, v3 12874; GFX90A-NEXT: v_mov_b32_e32 v5, s1 12875; GFX90A-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v2 12876; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v5, vcc 12877; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 12878; GFX90A-NEXT: s_endpgm 12879 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 12880 %r = sdiv <2 x i64> %x, %shl.y 12881 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 12882 ret void 12883} 12884 12885define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 12886; CHECK-LABEL: @srem_i64_oddk_denom( 12887; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], 1235195 12888; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 12889; CHECK-NEXT: ret void 12890; 12891; GFX6-LABEL: 
srem_i64_oddk_denom: 12892; GFX6: ; %bb.0: 12893; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f800000 12894; GFX6-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 12895; GFX6-NEXT: v_rcp_f32_e32 v0, v0 12896; GFX6-NEXT: s_mov_b32 s4, 0xffed2705 12897; GFX6-NEXT: v_mov_b32_e32 v8, 0 12898; GFX6-NEXT: v_mov_b32_e32 v7, 0 12899; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 12900; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 12901; GFX6-NEXT: v_trunc_f32_e32 v1, v1 12902; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 12903; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 12904; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 12905; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 12906; GFX6-NEXT: s_mov_b32 s7, 0xf000 12907; GFX6-NEXT: v_mul_lo_u32 v2, v1, s4 12908; GFX6-NEXT: v_mul_hi_u32 v3, v0, s4 12909; GFX6-NEXT: v_mul_lo_u32 v4, v0, s4 12910; GFX6-NEXT: s_waitcnt lgkmcnt(0) 12911; GFX6-NEXT: s_ashr_i32 s8, s3, 31 12912; GFX6-NEXT: s_add_u32 s2, s2, s8 12913; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 12914; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 12915; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 12916; GFX6-NEXT: v_mul_hi_u32 v6, v0, v4 12917; GFX6-NEXT: v_mul_hi_u32 v3, v0, v2 12918; GFX6-NEXT: v_mul_hi_u32 v9, v1, v2 12919; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 12920; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 12921; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 12922; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 12923; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v3, vcc 12924; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 12925; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc 12926; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc 12927; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 12928; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 12929; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 12930; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 12931; GFX6-NEXT: v_mul_lo_u32 v2, v1, s4 12932; GFX6-NEXT: v_mul_hi_u32 v3, v0, s4 12933; GFX6-NEXT: s_mov_b32 s9, s8 12934; GFX6-NEXT: s_addc_u32 s3, s3, s8 12935; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[8:9] 12936; GFX6-NEXT: 
v_add_i32_e32 v2, vcc, v3, v2 12937; GFX6-NEXT: v_mul_lo_u32 v3, v0, s4 12938; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 12939; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 12940; GFX6-NEXT: v_mul_hi_u32 v9, v0, v3 12941; GFX6-NEXT: v_mul_hi_u32 v10, v0, v2 12942; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 12943; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 12944; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 12945; GFX6-NEXT: v_add_i32_e32 v6, vcc, v9, v6 12946; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc 12947; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 12948; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 12949; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v9, v5, vcc 12950; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v7, vcc 12951; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 12952; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 12953; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 12954; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 12955; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 12956; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 12957; GFX6-NEXT: v_mul_hi_u32 v4, s2, v1 12958; GFX6-NEXT: v_mul_hi_u32 v5, s3, v1 12959; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1 12960; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 12961; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 12962; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 12963; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 12964; GFX6-NEXT: s_mov_b32 s4, s0 12965; GFX6-NEXT: s_mov_b32 s0, 0x12d8fb 12966; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 12967; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 12968; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc 12969; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 12970; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc 12971; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 12972; GFX6-NEXT: v_mul_hi_u32 v2, v0, s0 12973; GFX6-NEXT: v_mul_lo_u32 v0, v0, s0 12974; GFX6-NEXT: s_mov_b32 s5, s1 12975; GFX6-NEXT: s_mov_b32 s6, -1 12976; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 12977; GFX6-NEXT: v_mov_b32_e32 v2, s3 12978; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 12979; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 
12980; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s0, v0 12981; GFX6-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v1, vcc 12982; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s0, v2 12983; GFX6-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v3, vcc 12984; GFX6-NEXT: s_mov_b32 s0, 0x12d8fa 12985; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s0, v2 12986; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 12987; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 12988; GFX6-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc 12989; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 12990; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v0 12991; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 12992; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 12993; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 12994; GFX6-NEXT: v_cndmask_b32_e64 v5, -1, v5, s[0:1] 12995; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 12996; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 12997; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 12998; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 12999; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 13000; GFX6-NEXT: v_xor_b32_e32 v1, s8, v1 13001; GFX6-NEXT: v_mov_b32_e32 v2, s8 13002; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 13003; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 13004; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 13005; GFX6-NEXT: s_endpgm 13006; 13007; GFX9-LABEL: srem_i64_oddk_denom: 13008; GFX9: ; %bb.0: 13009; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f800000 13010; GFX9-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 13011; GFX9-NEXT: v_rcp_f32_e32 v0, v0 13012; GFX9-NEXT: s_mov_b32 s4, 0xffed2705 13013; GFX9-NEXT: v_mov_b32_e32 v7, 0 13014; GFX9-NEXT: v_mov_b32_e32 v5, 0 13015; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 13016; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 13017; GFX9-NEXT: v_trunc_f32_e32 v1, v1 13018; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 13019; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 13020; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 13021; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 13022; GFX9-NEXT: v_mul_lo_u32 v2, v1, s4 13023; 
GFX9-NEXT: v_mul_hi_u32 v3, v0, s4 13024; GFX9-NEXT: v_mul_lo_u32 v4, v0, s4 13025; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 13026; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 13027; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 13028; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 13029; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 13030; GFX9-NEXT: v_mul_hi_u32 v9, v1, v2 13031; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 13032; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 13033; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v8, vcc 13034; GFX9-NEXT: v_mul_lo_u32 v8, v1, v4 13035; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 13036; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8 13037; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc 13038; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc 13039; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 13040; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v4, vcc 13041; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 13042; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 13043; GFX9-NEXT: v_mul_lo_u32 v2, v1, s4 13044; GFX9-NEXT: v_mul_hi_u32 v3, v0, s4 13045; GFX9-NEXT: v_mul_lo_u32 v4, v0, s4 13046; GFX9-NEXT: s_waitcnt lgkmcnt(0) 13047; GFX9-NEXT: s_ashr_i32 s4, s3, 31 13048; GFX9-NEXT: s_add_u32 s2, s2, s4 13049; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 13050; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 13051; GFX9-NEXT: v_mul_lo_u32 v8, v0, v2 13052; GFX9-NEXT: v_mul_hi_u32 v9, v0, v4 13053; GFX9-NEXT: v_mul_hi_u32 v10, v0, v2 13054; GFX9-NEXT: v_mul_hi_u32 v6, v1, v4 13055; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 13056; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 13057; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 13058; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v7, v10, vcc 13059; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 13060; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 13061; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v6, vcc 13062; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc 13063; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 13064; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v3, vcc 13065; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 
v0, v2 13066; GFX9-NEXT: s_mov_b32 s5, s4 13067; GFX9-NEXT: s_addc_u32 s3, s3, s4 13068; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 13069; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] 13070; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 13071; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 13072; GFX9-NEXT: v_mul_hi_u32 v4, s2, v1 13073; GFX9-NEXT: v_mul_hi_u32 v6, s3, v1 13074; GFX9-NEXT: v_mul_lo_u32 v1, s3, v1 13075; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 13076; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v4, vcc 13077; GFX9-NEXT: v_mul_lo_u32 v4, s3, v0 13078; GFX9-NEXT: v_mul_hi_u32 v0, s3, v0 13079; GFX9-NEXT: s_mov_b32 s5, 0x12d8fb 13080; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 13081; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 13082; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v5, vcc 13083; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 13084; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v2, vcc 13085; GFX9-NEXT: v_mul_lo_u32 v1, v1, s5 13086; GFX9-NEXT: v_mul_hi_u32 v2, v0, s5 13087; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 13088; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 13089; GFX9-NEXT: v_mov_b32_e32 v2, s3 13090; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 13091; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 13092; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s5, v0 13093; GFX9-NEXT: v_subbrev_co_u32_e32 v3, vcc, 0, v1, vcc 13094; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s5, v2 13095; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v3, vcc 13096; GFX9-NEXT: s_mov_b32 s2, 0x12d8fa 13097; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s2, v2 13098; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 13099; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 13100; GFX9-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc 13101; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 13102; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 13103; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 13104; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s2, v0 13105; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc 13106; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 13107; GFX9-NEXT: 
v_cndmask_b32_e32 v4, -1, v4, vcc 13108; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 13109; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 13110; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 13111; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 13112; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 13113; GFX9-NEXT: v_mov_b32_e32 v2, s4 13114; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s4, v0 13115; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc 13116; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] 13117; GFX9-NEXT: s_endpgm 13118; 13119; GFX90A-LABEL: srem_i64_oddk_denom: 13120; GFX90A: ; %bb.0: 13121; GFX90A-NEXT: v_mov_b32_e32 v0, 0x4f800000 13122; GFX90A-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 13123; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 13124; GFX90A-NEXT: s_mov_b32 s4, 0xffed2705 13125; GFX90A-NEXT: v_mov_b32_e32 v8, 0 13126; GFX90A-NEXT: v_mov_b32_e32 v2, 0 13127; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 13128; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 13129; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 13130; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 13131; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 13132; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 13133; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 13134; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s4 13135; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s4 13136; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 13137; GFX90A-NEXT: v_sub_u32_e32 v3, v3, v0 13138; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s4 13139; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 13140; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 13141; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 13142; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 13143; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v4, vcc 13144; GFX90A-NEXT: v_mul_hi_u32 v9, v1, v6 13145; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 13146; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 13147; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 13148; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v9, vcc 13149; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc 13150; GFX90A-NEXT: v_mul_lo_u32 
v3, v1, v3 13151; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 13152; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v5, vcc 13153; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 13154; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc 13155; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s4 13156; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s4 13157; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 13158; GFX90A-NEXT: v_sub_u32_e32 v3, v3, v0 13159; GFX90A-NEXT: v_mul_lo_u32 v5, v0, s4 13160; GFX90A-NEXT: v_mul_hi_u32 v6, v1, v5 13161; GFX90A-NEXT: v_mul_lo_u32 v7, v1, v5 13162; GFX90A-NEXT: v_mul_lo_u32 v10, v0, v3 13163; GFX90A-NEXT: v_mul_hi_u32 v5, v0, v5 13164; GFX90A-NEXT: v_mul_hi_u32 v9, v0, v3 13165; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v10 13166; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, v8, v9, vcc 13167; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 13168; GFX90A-NEXT: v_mul_hi_u32 v4, v1, v3 13169; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v6, vcc 13170; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v2, vcc 13171; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 13172; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 13173; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 13174; GFX90A-NEXT: s_ashr_i32 s4, s3, 31 13175; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v4, vcc 13176; GFX90A-NEXT: s_add_u32 s2, s2, s4 13177; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 13178; GFX90A-NEXT: s_mov_b32 s5, s4 13179; GFX90A-NEXT: s_addc_u32 s3, s3, s4 13180; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc 13181; GFX90A-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] 13182; GFX90A-NEXT: v_mul_lo_u32 v4, s2, v1 13183; GFX90A-NEXT: v_mul_hi_u32 v5, s2, v0 13184; GFX90A-NEXT: v_mul_hi_u32 v3, s2, v1 13185; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 13186; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc 13187; GFX90A-NEXT: v_mul_hi_u32 v6, s3, v0 13188; GFX90A-NEXT: v_mul_lo_u32 v0, s3, v0 13189; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 13190; GFX90A-NEXT: v_mul_hi_u32 v5, s3, v1 13191; GFX90A-NEXT: v_addc_co_u32_e32 
v0, vcc, v3, v6, vcc 13192; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v2, vcc 13193; GFX90A-NEXT: v_mul_lo_u32 v1, s3, v1 13194; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 13195; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v3, vcc 13196; GFX90A-NEXT: s_mov_b32 s5, 0x12d8fb 13197; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s5 13198; GFX90A-NEXT: v_mul_hi_u32 v3, v0, s5 13199; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s5 13200; GFX90A-NEXT: v_add_u32_e32 v1, v3, v1 13201; GFX90A-NEXT: v_mov_b32_e32 v3, s3 13202; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 13203; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc 13204; GFX90A-NEXT: v_subrev_co_u32_e32 v3, vcc, s5, v0 13205; GFX90A-NEXT: v_subbrev_co_u32_e32 v4, vcc, 0, v1, vcc 13206; GFX90A-NEXT: v_subrev_co_u32_e32 v5, vcc, s5, v3 13207; GFX90A-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v4, vcc 13208; GFX90A-NEXT: s_mov_b32 s2, 0x12d8fa 13209; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s2, v3 13210; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 13211; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 13212; GFX90A-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc 13213; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 13214; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 13215; GFX90A-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc 13216; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s2, v0 13217; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 13218; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 13219; GFX90A-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc 13220; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 13221; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 13222; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 13223; GFX90A-NEXT: v_xor_b32_e32 v0, s4, v0 13224; GFX90A-NEXT: v_xor_b32_e32 v1, s4, v1 13225; GFX90A-NEXT: v_mov_b32_e32 v3, s4 13226; GFX90A-NEXT: v_subrev_co_u32_e32 v0, vcc, s4, v0 13227; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc 13228; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 13229; GFX90A-NEXT: s_endpgm 13230 %r = srem i64 %x, 1235195 13231 store i64 %r, i64 
addrspace(1)* %out 13232 ret void 13233} 13234 13235define amdgpu_kernel void @srem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 13236; CHECK-LABEL: @srem_i64_pow2k_denom( 13237; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], 4096 13238; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 13239; CHECK-NEXT: ret void 13240; 13241; GFX6-LABEL: srem_i64_pow2k_denom: 13242; GFX6: ; %bb.0: 13243; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 13244; GFX6-NEXT: s_mov_b32 s3, 0xf000 13245; GFX6-NEXT: s_mov_b32 s2, -1 13246; GFX6-NEXT: s_waitcnt lgkmcnt(0) 13247; GFX6-NEXT: s_mov_b32 s0, s4 13248; GFX6-NEXT: s_ashr_i32 s4, s7, 31 13249; GFX6-NEXT: s_lshr_b32 s4, s4, 20 13250; GFX6-NEXT: s_add_u32 s4, s6, s4 13251; GFX6-NEXT: s_mov_b32 s1, s5 13252; GFX6-NEXT: s_addc_u32 s5, s7, 0 13253; GFX6-NEXT: s_and_b32 s4, s4, 0xfffff000 13254; GFX6-NEXT: s_sub_u32 s4, s6, s4 13255; GFX6-NEXT: s_subb_u32 s5, s7, s5 13256; GFX6-NEXT: v_mov_b32_e32 v0, s4 13257; GFX6-NEXT: v_mov_b32_e32 v1, s5 13258; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 13259; GFX6-NEXT: s_endpgm 13260; 13261; GFX9-LABEL: srem_i64_pow2k_denom: 13262; GFX9: ; %bb.0: 13263; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 13264; GFX9-NEXT: v_mov_b32_e32 v2, 0 13265; GFX9-NEXT: s_waitcnt lgkmcnt(0) 13266; GFX9-NEXT: s_ashr_i32 s4, s3, 31 13267; GFX9-NEXT: s_lshr_b32 s4, s4, 20 13268; GFX9-NEXT: s_add_u32 s4, s2, s4 13269; GFX9-NEXT: s_addc_u32 s5, s3, 0 13270; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000 13271; GFX9-NEXT: s_sub_u32 s2, s2, s4 13272; GFX9-NEXT: s_subb_u32 s3, s3, s5 13273; GFX9-NEXT: v_mov_b32_e32 v0, s2 13274; GFX9-NEXT: v_mov_b32_e32 v1, s3 13275; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 13276; GFX9-NEXT: s_endpgm 13277; 13278; GFX90A-LABEL: srem_i64_pow2k_denom: 13279; GFX90A: ; %bb.0: 13280; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 13281; GFX90A-NEXT: v_mov_b32_e32 v2, 0 13282; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 13283; GFX90A-NEXT: s_ashr_i32 s4, s3, 31 
13284; GFX90A-NEXT: s_lshr_b32 s4, s4, 20 13285; GFX90A-NEXT: s_add_u32 s4, s2, s4 13286; GFX90A-NEXT: s_addc_u32 s5, s3, 0 13287; GFX90A-NEXT: s_and_b32 s4, s4, 0xfffff000 13288; GFX90A-NEXT: s_sub_u32 s2, s2, s4 13289; GFX90A-NEXT: s_subb_u32 s3, s3, s5 13290; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] 13291; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 13292; GFX90A-NEXT: s_endpgm 13293 %r = srem i64 %x, 4096 13294 store i64 %r, i64 addrspace(1)* %out 13295 ret void 13296} 13297 13298define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 13299; CHECK-LABEL: @srem_i64_pow2_shl_denom( 13300; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 13301; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], [[SHL_Y]] 13302; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 13303; CHECK-NEXT: ret void 13304; 13305; GFX6-LABEL: srem_i64_pow2_shl_denom: 13306; GFX6: ; %bb.0: 13307; GFX6-NEXT: s_load_dword s4, s[0:1], 0xd 13308; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000 13309; GFX6-NEXT: s_mov_b32 s7, 0xf000 13310; GFX6-NEXT: s_mov_b32 s6, -1 13311; GFX6-NEXT: s_waitcnt lgkmcnt(0) 13312; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 13313; GFX6-NEXT: s_ashr_i32 s4, s3, 31 13314; GFX6-NEXT: s_add_u32 s2, s2, s4 13315; GFX6-NEXT: s_mov_b32 s5, s4 13316; GFX6-NEXT: s_addc_u32 s3, s3, s4 13317; GFX6-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5] 13318; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 13319; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 13320; GFX6-NEXT: s_sub_u32 s4, 0, s8 13321; GFX6-NEXT: s_subb_u32 s5, 0, s9 13322; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 13323; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 13324; GFX6-NEXT: v_rcp_f32_e32 v0, v0 13325; GFX6-NEXT: s_waitcnt lgkmcnt(0) 13326; GFX6-NEXT: s_ashr_i32 s10, s3, 31 13327; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 13328; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 13329; GFX6-NEXT: v_trunc_f32_e32 v1, v1 13330; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 13331; 
GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 13332; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 13333; GFX6-NEXT: s_add_u32 s2, s2, s10 13334; GFX6-NEXT: s_mov_b32 s11, s10 13335; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 13336; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 13337; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0 13338; GFX6-NEXT: v_mul_lo_u32 v4, s4, v0 13339; GFX6-NEXT: s_addc_u32 s3, s3, s10 13340; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 13341; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 13342; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 13343; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 13344; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 13345; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 13346; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 13347; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 13348; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 13349; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 13350; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 13351; GFX6-NEXT: s_xor_b64 s[12:13], s[2:3], s[10:11] 13352; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 13353; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc 13354; GFX6-NEXT: v_mov_b32_e32 v4, 0 13355; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc 13356; GFX6-NEXT: v_mov_b32_e32 v6, 0 13357; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 13358; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc 13359; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 13360; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 13361; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 13362; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 13363; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0 13364; GFX6-NEXT: s_mov_b32 s5, s1 13365; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 13366; GFX6-NEXT: v_mul_lo_u32 v3, s4, v0 13367; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 13368; GFX6-NEXT: v_mul_lo_u32 v8, v0, v2 13369; GFX6-NEXT: v_mul_hi_u32 v9, v0, v3 13370; GFX6-NEXT: v_mul_hi_u32 v10, v0, v2 13371; GFX6-NEXT: v_mul_hi_u32 v7, v1, v3 13372; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 13373; GFX6-NEXT: v_mul_hi_u32 v5, v1, v2 13374; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 13375; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, 
vcc 13376; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 13377; GFX6-NEXT: v_add_i32_e32 v3, vcc, v8, v3 13378; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v9, v7, vcc 13379; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v4, vcc 13380; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 13381; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc 13382; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 13383; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 13384; GFX6-NEXT: v_mul_lo_u32 v2, s12, v1 13385; GFX6-NEXT: v_mul_hi_u32 v3, s12, v0 13386; GFX6-NEXT: v_mul_hi_u32 v5, s12, v1 13387; GFX6-NEXT: v_mul_hi_u32 v7, s13, v1 13388; GFX6-NEXT: v_mul_lo_u32 v1, s13, v1 13389; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 13390; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 13391; GFX6-NEXT: v_mul_lo_u32 v5, s13, v0 13392; GFX6-NEXT: v_mul_hi_u32 v0, s13, v0 13393; GFX6-NEXT: s_mov_b32 s4, s0 13394; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 13395; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 13396; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc 13397; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 13398; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc 13399; GFX6-NEXT: v_mul_lo_u32 v1, s8, v1 13400; GFX6-NEXT: v_mul_hi_u32 v2, s8, v0 13401; GFX6-NEXT: v_mul_lo_u32 v3, s9, v0 13402; GFX6-NEXT: v_mul_lo_u32 v0, s8, v0 13403; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 13404; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 13405; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s13, v1 13406; GFX6-NEXT: v_mov_b32_e32 v3, s9 13407; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s12, v0 13408; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc 13409; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s8, v0 13410; GFX6-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] 13411; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v5 13412; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] 13413; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v4 13414; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] 13415; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 13416; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v5 13417; 
GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s8, v4 13418; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] 13419; GFX6-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] 13420; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 13421; GFX6-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] 13422; GFX6-NEXT: v_mov_b32_e32 v5, s13 13423; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc 13424; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 13425; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 13426; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 13427; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 13428; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s9, v1 13429; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 13430; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 13431; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 13432; GFX6-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] 13433; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 13434; GFX6-NEXT: v_xor_b32_e32 v0, s10, v0 13435; GFX6-NEXT: v_xor_b32_e32 v1, s10, v1 13436; GFX6-NEXT: v_mov_b32_e32 v2, s10 13437; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s10, v0 13438; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 13439; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 13440; GFX6-NEXT: s_endpgm 13441; 13442; GFX9-LABEL: srem_i64_pow2_shl_denom: 13443; GFX9: ; %bb.0: 13444; GFX9-NEXT: s_load_dword s4, s[0:1], 0x34 13445; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 13446; GFX9-NEXT: v_mov_b32_e32 v2, 0 13447; GFX9-NEXT: s_waitcnt lgkmcnt(0) 13448; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 13449; GFX9-NEXT: s_ashr_i32 s4, s3, 31 13450; GFX9-NEXT: s_add_u32 s2, s2, s4 13451; GFX9-NEXT: s_mov_b32 s5, s4 13452; GFX9-NEXT: s_addc_u32 s3, s3, s4 13453; GFX9-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5] 13454; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 13455; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 13456; GFX9-NEXT: s_sub_u32 s2, 0, s8 13457; GFX9-NEXT: s_subb_u32 s3, 0, s9 13458; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 13459; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 13460; GFX9-NEXT: v_rcp_f32_e32 v0, v0 13461; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) 13462; GFX9-NEXT: s_ashr_i32 s10, s7, 31 13463; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 13464; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 13465; GFX9-NEXT: v_trunc_f32_e32 v1, v1 13466; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 13467; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 13468; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 13469; GFX9-NEXT: s_add_u32 s0, s6, s10 13470; GFX9-NEXT: s_mov_b32 s11, s10 13471; GFX9-NEXT: v_mul_lo_u32 v3, s2, v1 13472; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 13473; GFX9-NEXT: v_mul_lo_u32 v6, s3, v0 13474; GFX9-NEXT: v_mul_lo_u32 v5, s2, v0 13475; GFX9-NEXT: s_addc_u32 s1, s7, s10 13476; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 13477; GFX9-NEXT: v_add_u32_e32 v3, v3, v6 13478; GFX9-NEXT: v_mul_hi_u32 v4, v0, v5 13479; GFX9-NEXT: v_mul_lo_u32 v6, v0, v3 13480; GFX9-NEXT: v_mul_hi_u32 v8, v0, v3 13481; GFX9-NEXT: v_mul_hi_u32 v7, v1, v5 13482; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 13483; GFX9-NEXT: v_mul_hi_u32 v9, v1, v3 13484; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 13485; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc 13486; GFX9-NEXT: v_mul_lo_u32 v3, v1, v3 13487; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 13488; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v7, vcc 13489; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v2, vcc 13490; GFX9-NEXT: v_mov_b32_e32 v6, 0 13491; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 13492; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v5, vcc 13493; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 13494; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc 13495; GFX9-NEXT: v_mul_lo_u32 v3, s2, v1 13496; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 13497; GFX9-NEXT: v_mul_lo_u32 v5, s3, v0 13498; GFX9-NEXT: v_mul_lo_u32 v7, s2, v0 13499; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] 13500; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 13501; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 13502; GFX9-NEXT: v_mul_lo_u32 v8, v0, v3 13503; GFX9-NEXT: v_mul_hi_u32 v9, v0, v7 13504; GFX9-NEXT: v_mul_hi_u32 v10, v0, v3 13505; GFX9-NEXT: 
v_mul_hi_u32 v5, v1, v7 13506; GFX9-NEXT: v_mul_lo_u32 v7, v1, v7 13507; GFX9-NEXT: v_mul_hi_u32 v4, v1, v3 13508; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 13509; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v10, vcc 13510; GFX9-NEXT: v_mul_lo_u32 v3, v1, v3 13511; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 13512; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc 13513; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v2, vcc 13514; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 13515; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v4, vcc 13516; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 13517; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc 13518; GFX9-NEXT: v_mul_lo_u32 v3, s6, v1 13519; GFX9-NEXT: v_mul_hi_u32 v4, s6, v0 13520; GFX9-NEXT: v_mul_hi_u32 v5, s6, v1 13521; GFX9-NEXT: v_mul_hi_u32 v7, s7, v1 13522; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 13523; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 13524; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc 13525; GFX9-NEXT: v_mul_lo_u32 v5, s7, v0 13526; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 13527; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 13528; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v4, v0, vcc 13529; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v2, vcc 13530; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 13531; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v3, vcc 13532; GFX9-NEXT: v_mul_lo_u32 v1, s8, v1 13533; GFX9-NEXT: v_mul_hi_u32 v3, s8, v0 13534; GFX9-NEXT: v_mul_lo_u32 v4, s9, v0 13535; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 13536; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 13537; GFX9-NEXT: v_add_u32_e32 v1, v1, v4 13538; GFX9-NEXT: v_sub_u32_e32 v3, s7, v1 13539; GFX9-NEXT: v_mov_b32_e32 v4, s9 13540; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 13541; GFX9-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, vcc 13542; GFX9-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s8, v0 13543; GFX9-NEXT: v_subbrev_co_u32_e64 v6, s[2:3], 0, v3, s[0:1] 13544; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v6 13545; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 13546; 
GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v5 13547; GFX9-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1] 13548; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] 13549; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v6 13550; GFX9-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s8, v5 13551; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] 13552; GFX9-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] 13553; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 13554; GFX9-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[0:1] 13555; GFX9-NEXT: v_mov_b32_e32 v5, s7 13556; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v5, v1, vcc 13557; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 13558; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 13559; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 13560; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[0:1] 13561; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 13562; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s9, v1 13563; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 13564; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 13565; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 13566; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 13567; GFX9-NEXT: v_xor_b32_e32 v0, s10, v0 13568; GFX9-NEXT: v_xor_b32_e32 v1, s10, v1 13569; GFX9-NEXT: v_mov_b32_e32 v3, s10 13570; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s10, v0 13571; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc 13572; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 13573; GFX9-NEXT: s_endpgm 13574; 13575; GFX90A-LABEL: srem_i64_pow2_shl_denom: 13576; GFX90A: ; %bb.0: 13577; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x34 13578; GFX90A-NEXT: s_mov_b64 s[2:3], 0x1000 13579; GFX90A-NEXT: v_mov_b32_e32 v2, 0 13580; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 13581; GFX90A-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 13582; GFX90A-NEXT: s_ashr_i32 s4, s3, 31 13583; GFX90A-NEXT: s_add_u32 s2, s2, s4 13584; GFX90A-NEXT: s_mov_b32 s5, s4 13585; GFX90A-NEXT: s_addc_u32 s3, s3, s4 13586; GFX90A-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5] 13587; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s8 13588; GFX90A-NEXT: 
v_cvt_f32_u32_e32 v1, s9 13589; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 13590; GFX90A-NEXT: s_sub_u32 s0, 0, s8 13591; GFX90A-NEXT: s_subb_u32 s1, 0, s9 13592; GFX90A-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 13593; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 13594; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 13595; GFX90A-NEXT: s_ashr_i32 s10, s7, 31 13596; GFX90A-NEXT: s_mov_b32 s11, s10 13597; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 13598; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 13599; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 13600; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 13601; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 13602; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 13603; GFX90A-NEXT: v_mul_lo_u32 v3, s0, v1 13604; GFX90A-NEXT: v_mul_hi_u32 v5, s0, v0 13605; GFX90A-NEXT: v_mul_lo_u32 v4, s1, v0 13606; GFX90A-NEXT: v_add_u32_e32 v3, v5, v3 13607; GFX90A-NEXT: v_mul_lo_u32 v6, s0, v0 13608; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 13609; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 13610; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 13611; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 13612; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 13613; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc 13614; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 13615; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 13616; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 13617; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 13618; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v8, vcc 13619; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc 13620; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 13621; GFX90A-NEXT: v_mov_b32_e32 v6, 0 13622; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 13623; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v5, vcc 13624; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 13625; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc 13626; GFX90A-NEXT: v_mul_lo_u32 v3, s0, v1 13627; GFX90A-NEXT: v_mul_hi_u32 v4, s0, v0 13628; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 13629; GFX90A-NEXT: v_mul_lo_u32 v4, s1, v0 13630; GFX90A-NEXT: 
v_add_u32_e32 v3, v3, v4 13631; GFX90A-NEXT: v_mul_lo_u32 v5, s0, v0 13632; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v5 13633; GFX90A-NEXT: v_mul_lo_u32 v8, v1, v5 13634; GFX90A-NEXT: v_mul_lo_u32 v10, v0, v3 13635; GFX90A-NEXT: v_mul_hi_u32 v5, v0, v5 13636; GFX90A-NEXT: v_mul_hi_u32 v9, v0, v3 13637; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v10 13638; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc 13639; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 13640; GFX90A-NEXT: v_mul_hi_u32 v4, v1, v3 13641; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v7, vcc 13642; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v2, vcc 13643; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 13644; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 13645; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v4, vcc 13646; GFX90A-NEXT: s_add_u32 s0, s6, s10 13647; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 13648; GFX90A-NEXT: s_addc_u32 s1, s7, s10 13649; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc 13650; GFX90A-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] 13651; GFX90A-NEXT: v_mul_lo_u32 v4, s6, v1 13652; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v0 13653; GFX90A-NEXT: v_mul_hi_u32 v3, s6, v1 13654; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 13655; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 13656; GFX90A-NEXT: v_mul_hi_u32 v7, s7, v0 13657; GFX90A-NEXT: v_mul_lo_u32 v0, s7, v0 13658; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 13659; GFX90A-NEXT: v_mul_hi_u32 v5, s7, v1 13660; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v7, vcc 13661; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v2, vcc 13662; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 13663; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 13664; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v3, vcc 13665; GFX90A-NEXT: v_mul_lo_u32 v1, s8, v1 13666; GFX90A-NEXT: v_mul_hi_u32 v3, s8, v0 13667; GFX90A-NEXT: v_add_u32_e32 v1, v3, v1 13668; GFX90A-NEXT: v_mul_lo_u32 v3, s9, v0 13669; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 13670; GFX90A-NEXT: v_mul_lo_u32 v0, s8, v0 13671; 
GFX90A-NEXT: v_sub_u32_e32 v3, s7, v1 13672; GFX90A-NEXT: v_mov_b32_e32 v4, s9 13673; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 13674; GFX90A-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, vcc 13675; GFX90A-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s8, v0 13676; GFX90A-NEXT: v_subbrev_co_u32_e64 v6, s[2:3], 0, v3, s[0:1] 13677; GFX90A-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v6 13678; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 13679; GFX90A-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v5 13680; GFX90A-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1] 13681; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] 13682; GFX90A-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v6 13683; GFX90A-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s8, v5 13684; GFX90A-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] 13685; GFX90A-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] 13686; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 13687; GFX90A-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[0:1] 13688; GFX90A-NEXT: v_mov_b32_e32 v5, s7 13689; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v5, v1, vcc 13690; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 13691; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 13692; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 13693; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[0:1] 13694; GFX90A-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 13695; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s9, v1 13696; GFX90A-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 13697; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 13698; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 13699; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 13700; GFX90A-NEXT: v_xor_b32_e32 v0, s10, v0 13701; GFX90A-NEXT: v_xor_b32_e32 v1, s10, v1 13702; GFX90A-NEXT: v_mov_b32_e32 v3, s10 13703; GFX90A-NEXT: v_subrev_co_u32_e32 v0, vcc, s10, v0 13704; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc 13705; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 13706; GFX90A-NEXT: s_endpgm 13707 %shl.y = shl i64 4096, %y 13708 %r = srem i64 %x, %shl.y 13709 store i64 %r, i64 
addrspace(1)* %out 13710 ret void 13711} 13712
; srem_v2i64_pow2k_denom: signed remainder of a <2 x i64> kernel argument by the
; constant splat <4096, 4096>; the result is stored to %out.
;
; Expected opt output (first RUN line): the vector srem is split per lane into
; extractelement / "srem i64 ..., 4096" / insertelement, followed by a single
; <2 x i64> store with align 16.
;
; Expected llc output (tahiti, gfx900, gfx90a runs): the remainder is computed
; with scalar ALU instructions only. Per lane: s_ashr_i32 by 31 on the lane's
; upper dword register, s_lshr_b32 of that by 20 (4095 for a negative dividend,
; 0 otherwise), s_add_u32/s_addc_u32 of that bias onto the dividend, s_and_b32
; of the low dword with the constant set up by "s_movk_i32 ..., 0xf000", then
; s_sub_u32/s_subb_u32 of the masked value from the original dividend.
;
; The assertion lines below are autogenerated (see the header of this file);
; regenerate them with the update scripts instead of editing them by hand.
13713define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 13714; CHECK-LABEL: @srem_v2i64_pow2k_denom( 13715; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 13716; CHECK-NEXT: [[TMP2:%.*]] = srem i64 [[TMP1]], 4096 13717; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 13718; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 13719; CHECK-NEXT: [[TMP5:%.*]] = srem i64 [[TMP4]], 4096 13720; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 13721; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 13722; CHECK-NEXT: ret void 13723; 13724; GFX6-LABEL: srem_v2i64_pow2k_denom: 13725; GFX6: ; %bb.0: 13726; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 13727; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 13728; GFX6-NEXT: s_movk_i32 s8, 0xf000 13729; GFX6-NEXT: s_mov_b32 s7, 0xf000 13730; GFX6-NEXT: s_mov_b32 s6, -1 13731; GFX6-NEXT: s_waitcnt lgkmcnt(0) 13732; GFX6-NEXT: s_ashr_i32 s9, s1, 31 13733; GFX6-NEXT: s_lshr_b32 s9, s9, 20 13734; GFX6-NEXT: s_add_u32 s9, s0, s9 13735; GFX6-NEXT: s_addc_u32 s10, s1, 0 13736; GFX6-NEXT: s_and_b32 s9, s9, s8 13737; GFX6-NEXT: s_sub_u32 s0, s0, s9 13738; GFX6-NEXT: s_subb_u32 s1, s1, s10 13739; GFX6-NEXT: s_ashr_i32 s9, s3, 31 13740; GFX6-NEXT: s_lshr_b32 s9, s9, 20 13741; GFX6-NEXT: s_add_u32 s9, s2, s9 13742; GFX6-NEXT: s_addc_u32 s10, s3, 0 13743; GFX6-NEXT: s_and_b32 s8, s9, s8 13744; GFX6-NEXT: s_sub_u32 s2, s2, s8 13745; GFX6-NEXT: s_subb_u32 s3, s3, s10 13746; GFX6-NEXT: v_mov_b32_e32 v0, s0 13747; GFX6-NEXT: v_mov_b32_e32 v1, s1 13748; GFX6-NEXT: v_mov_b32_e32 v2, s2 13749; GFX6-NEXT: v_mov_b32_e32 v3, s3 13750; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 13751; GFX6-NEXT: s_endpgm 13752; 13753; GFX9-LABEL: srem_v2i64_pow2k_denom: 13754; GFX9: ; %bb.0: 13755; GFX9-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0x24 13756; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 13757; GFX9-NEXT: s_movk_i32 s8, 0xf000 13758; GFX9-NEXT: v_mov_b32_e32 v4, 0 13759; GFX9-NEXT: s_waitcnt lgkmcnt(0) 13760; GFX9-NEXT: s_ashr_i32 s0, s5, 31 13761; GFX9-NEXT: s_lshr_b32 s0, s0, 20 13762; GFX9-NEXT: s_add_u32 s0, s4, s0 13763; GFX9-NEXT: s_addc_u32 s1, s5, 0 13764; GFX9-NEXT: s_and_b32 s0, s0, s8 13765; GFX9-NEXT: s_sub_u32 s0, s4, s0 13766; GFX9-NEXT: s_subb_u32 s1, s5, s1 13767; GFX9-NEXT: s_ashr_i32 s4, s7, 31 13768; GFX9-NEXT: s_lshr_b32 s4, s4, 20 13769; GFX9-NEXT: s_add_u32 s4, s6, s4 13770; GFX9-NEXT: s_addc_u32 s5, s7, 0 13771; GFX9-NEXT: s_and_b32 s4, s4, s8 13772; GFX9-NEXT: s_sub_u32 s4, s6, s4 13773; GFX9-NEXT: s_subb_u32 s5, s7, s5 13774; GFX9-NEXT: v_mov_b32_e32 v0, s0 13775; GFX9-NEXT: v_mov_b32_e32 v1, s1 13776; GFX9-NEXT: v_mov_b32_e32 v2, s4 13777; GFX9-NEXT: v_mov_b32_e32 v3, s5 13778; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 13779; GFX9-NEXT: s_endpgm 13780; 13781; GFX90A-LABEL: srem_v2i64_pow2k_denom: 13782; GFX90A: ; %bb.0: 13783; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 13784; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 13785; GFX90A-NEXT: s_movk_i32 s8, 0xf000 13786; GFX90A-NEXT: v_mov_b32_e32 v4, 0 13787; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 13788; GFX90A-NEXT: s_ashr_i32 s0, s5, 31 13789; GFX90A-NEXT: s_lshr_b32 s0, s0, 20 13790; GFX90A-NEXT: s_add_u32 s0, s4, s0 13791; GFX90A-NEXT: s_addc_u32 s1, s5, 0 13792; GFX90A-NEXT: s_and_b32 s0, s0, s8 13793; GFX90A-NEXT: s_sub_u32 s0, s4, s0 13794; GFX90A-NEXT: s_subb_u32 s1, s5, s1 13795; GFX90A-NEXT: s_ashr_i32 s4, s7, 31 13796; GFX90A-NEXT: s_lshr_b32 s4, s4, 20 13797; GFX90A-NEXT: s_add_u32 s4, s6, s4 13798; GFX90A-NEXT: s_addc_u32 s5, s7, 0 13799; GFX90A-NEXT: s_and_b32 s4, s4, s8 13800; GFX90A-NEXT: s_sub_u32 s4, s6, s4 13801; GFX90A-NEXT: s_subb_u32 s5, s7, s5 13802; GFX90A-NEXT: v_mov_b32_e32 v0, s0 13803; GFX90A-NEXT: v_mov_b32_e32 v1, s1 13804; GFX90A-NEXT: v_mov_b32_e32 v2, s4 13805; 
GFX90A-NEXT: v_mov_b32_e32 v3, s5 13806; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 13807; GFX90A-NEXT: s_endpgm 13808 %r = srem <2 x i64> %x, <i64 4096, i64 4096> 13809 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 13810 ret void 13811} 13812 13813define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 13814; CHECK-LABEL: @srem_v2i64_pow2_shl_denom( 13815; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 13816; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 13817; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 13818; CHECK-NEXT: [[TMP3:%.*]] = srem i64 [[TMP1]], [[TMP2]] 13819; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 13820; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 13821; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 13822; CHECK-NEXT: [[TMP7:%.*]] = srem i64 [[TMP5]], [[TMP6]] 13823; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 13824; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 13825; CHECK-NEXT: ret void 13826; 13827; GFX6-LABEL: srem_v2i64_pow2_shl_denom: 13828; GFX6: ; %bb.0: 13829; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11 13830; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000 13831; GFX6-NEXT: s_mov_b32 s18, 0x4f800000 13832; GFX6-NEXT: s_mov_b32 s19, 0x5f7ffffc 13833; GFX6-NEXT: s_mov_b32 s20, 0x2f800000 13834; GFX6-NEXT: s_waitcnt lgkmcnt(0) 13835; GFX6-NEXT: s_lshl_b64 s[14:15], s[2:3], s6 13836; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 13837; GFX6-NEXT: s_ashr_i32 s4, s3, 31 13838; GFX6-NEXT: s_add_u32 s2, s2, s4 13839; GFX6-NEXT: s_mov_b32 s5, s4 13840; GFX6-NEXT: s_addc_u32 s3, s3, s4 13841; GFX6-NEXT: s_xor_b64 s[16:17], s[2:3], s[4:5] 13842; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s16 13843; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s17 13844; GFX6-NEXT: s_mov_b32 s21, 0xcf800000 
13845; GFX6-NEXT: s_sub_u32 s2, 0, s16 13846; GFX6-NEXT: s_subb_u32 s3, 0, s17 13847; GFX6-NEXT: v_mac_f32_e32 v0, s18, v1 13848; GFX6-NEXT: v_rcp_f32_e32 v0, v0 13849; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 13850; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 13851; GFX6-NEXT: s_mov_b32 s7, 0xf000 13852; GFX6-NEXT: s_mov_b32 s6, -1 13853; GFX6-NEXT: v_mul_f32_e32 v0, s19, v0 13854; GFX6-NEXT: v_mul_f32_e32 v1, s20, v0 13855; GFX6-NEXT: v_trunc_f32_e32 v1, v1 13856; GFX6-NEXT: v_mac_f32_e32 v0, s21, v1 13857; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v1 13858; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v0 13859; GFX6-NEXT: s_waitcnt lgkmcnt(0) 13860; GFX6-NEXT: s_ashr_i32 s12, s9, 31 13861; GFX6-NEXT: s_add_u32 s0, s8, s12 13862; GFX6-NEXT: v_mul_lo_u32 v0, s2, v2 13863; GFX6-NEXT: v_mul_hi_u32 v1, s2, v3 13864; GFX6-NEXT: v_mul_lo_u32 v4, s3, v3 13865; GFX6-NEXT: v_mul_lo_u32 v5, s2, v3 13866; GFX6-NEXT: s_mov_b32 s13, s12 13867; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 13868; GFX6-NEXT: v_add_i32_e32 v1, vcc, v0, v4 13869; GFX6-NEXT: v_mul_lo_u32 v0, v3, v1 13870; GFX6-NEXT: v_mul_hi_u32 v4, v3, v5 13871; GFX6-NEXT: v_mul_hi_u32 v6, v3, v1 13872; GFX6-NEXT: v_mul_hi_u32 v7, v2, v1 13873; GFX6-NEXT: s_addc_u32 s1, s9, s12 13874; GFX6-NEXT: v_add_i32_e32 v0, vcc, v4, v0 13875; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc 13876; GFX6-NEXT: v_mul_lo_u32 v6, v2, v5 13877; GFX6-NEXT: v_mul_hi_u32 v5, v2, v5 13878; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[12:13] 13879; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v6 13880; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc 13881; GFX6-NEXT: v_mul_lo_u32 v5, v2, v1 13882; GFX6-NEXT: v_mov_b32_e32 v0, 0 13883; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v7, v0, vcc 13884; GFX6-NEXT: v_mov_b32_e32 v1, 0 13885; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 13886; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v1, v6, vcc 13887; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 13888; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v2, v5, vcc 13889; GFX6-NEXT: v_mul_lo_u32 v4, s2, v2 13890; 
GFX6-NEXT: v_mul_hi_u32 v5, s2, v3 13891; GFX6-NEXT: v_mul_lo_u32 v6, s3, v3 13892; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 13893; GFX6-NEXT: v_mul_lo_u32 v5, s2, v3 13894; GFX6-NEXT: v_add_i32_e32 v4, vcc, v6, v4 13895; GFX6-NEXT: v_mul_lo_u32 v8, v3, v4 13896; GFX6-NEXT: v_mul_hi_u32 v9, v3, v5 13897; GFX6-NEXT: v_mul_hi_u32 v10, v3, v4 13898; GFX6-NEXT: v_mul_hi_u32 v7, v2, v5 13899; GFX6-NEXT: v_mul_lo_u32 v5, v2, v5 13900; GFX6-NEXT: v_mul_hi_u32 v6, v2, v4 13901; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 13902; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc 13903; GFX6-NEXT: v_mul_lo_u32 v4, v2, v4 13904; GFX6-NEXT: v_add_i32_e32 v5, vcc, v8, v5 13905; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v7, vcc 13906; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v6, v0, vcc 13907; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 13908; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v1, v6, vcc 13909; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 13910; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v2, v5, vcc 13911; GFX6-NEXT: v_mul_lo_u32 v4, s8, v2 13912; GFX6-NEXT: v_mul_hi_u32 v5, s8, v3 13913; GFX6-NEXT: v_mul_hi_u32 v6, s8, v2 13914; GFX6-NEXT: v_mul_hi_u32 v7, s9, v2 13915; GFX6-NEXT: v_mul_lo_u32 v2, s9, v2 13916; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 13917; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 13918; GFX6-NEXT: v_mul_lo_u32 v6, s9, v3 13919; GFX6-NEXT: v_mul_hi_u32 v3, s9, v3 13920; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 13921; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc 13922; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v7, v0, vcc 13923; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 13924; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v1, v4, vcc 13925; GFX6-NEXT: v_mul_lo_u32 v3, s16, v3 13926; GFX6-NEXT: v_mul_hi_u32 v4, s16, v2 13927; GFX6-NEXT: v_mul_lo_u32 v5, s17, v2 13928; GFX6-NEXT: v_mul_lo_u32 v2, s16, v2 13929; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 13930; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 13931; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s9, v3 13932; GFX6-NEXT: v_mov_b32_e32 v5, s17 13933; GFX6-NEXT: 
v_sub_i32_e32 v2, vcc, s8, v2 13934; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc 13935; GFX6-NEXT: v_subrev_i32_e64 v6, s[0:1], s16, v2 13936; GFX6-NEXT: v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1] 13937; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s17, v7 13938; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] 13939; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s16, v6 13940; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, s[0:1] 13941; GFX6-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] 13942; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s17, v7 13943; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s16, v6 13944; GFX6-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[2:3] 13945; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] 13946; GFX6-NEXT: s_ashr_i32 s2, s15, 31 13947; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 13948; GFX6-NEXT: s_add_u32 s8, s14, s2 13949; GFX6-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[0:1] 13950; GFX6-NEXT: v_mov_b32_e32 v7, s9 13951; GFX6-NEXT: s_mov_b32 s3, s2 13952; GFX6-NEXT: s_addc_u32 s9, s15, s2 13953; GFX6-NEXT: s_xor_b64 s[8:9], s[8:9], s[2:3] 13954; GFX6-NEXT: v_cvt_f32_u32_e32 v8, s8 13955; GFX6-NEXT: v_cvt_f32_u32_e32 v9, s9 13956; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc 13957; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s17, v3 13958; GFX6-NEXT: v_mac_f32_e32 v8, s18, v9 13959; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 13960; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s16, v2 13961; GFX6-NEXT: v_rcp_f32_e32 v8, v8 13962; GFX6-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc 13963; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s17, v3 13964; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc 13965; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 13966; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 13967; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] 13968; GFX6-NEXT: v_mul_f32_e32 v5, s19, v8 13969; GFX6-NEXT: v_mul_f32_e32 v6, s20, v5 13970; GFX6-NEXT: v_trunc_f32_e32 v6, v6 13971; GFX6-NEXT: v_mac_f32_e32 v5, s21, v6 13972; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v5 13973; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v6 13974; 
GFX6-NEXT: s_sub_u32 s0, 0, s8 13975; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 13976; GFX6-NEXT: v_mul_hi_u32 v4, s0, v5 13977; GFX6-NEXT: v_mul_lo_u32 v7, s0, v6 13978; GFX6-NEXT: s_subb_u32 s1, 0, s9 13979; GFX6-NEXT: v_mul_lo_u32 v8, s1, v5 13980; GFX6-NEXT: s_ashr_i32 s14, s11, 31 13981; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v7 13982; GFX6-NEXT: v_mul_lo_u32 v7, s0, v5 13983; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v8 13984; GFX6-NEXT: v_mul_lo_u32 v8, v5, v4 13985; GFX6-NEXT: v_mul_hi_u32 v9, v5, v7 13986; GFX6-NEXT: v_mul_hi_u32 v10, v5, v4 13987; GFX6-NEXT: v_mul_hi_u32 v11, v6, v4 13988; GFX6-NEXT: v_mul_lo_u32 v4, v6, v4 13989; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 13990; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc 13991; GFX6-NEXT: v_mul_lo_u32 v10, v6, v7 13992; GFX6-NEXT: v_mul_hi_u32 v7, v6, v7 13993; GFX6-NEXT: s_mov_b32 s15, s14 13994; GFX6-NEXT: v_xor_b32_e32 v2, s12, v2 13995; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v10 13996; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc 13997; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v11, v0, vcc 13998; GFX6-NEXT: v_add_i32_e32 v4, vcc, v7, v4 13999; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v1, v8, vcc 14000; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 14001; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v6, v7, vcc 14002; GFX6-NEXT: v_mul_lo_u32 v6, s0, v5 14003; GFX6-NEXT: v_mul_hi_u32 v7, s0, v4 14004; GFX6-NEXT: v_mul_lo_u32 v8, s1, v4 14005; GFX6-NEXT: v_xor_b32_e32 v3, s12, v3 14006; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 14007; GFX6-NEXT: v_mul_lo_u32 v7, s0, v4 14008; GFX6-NEXT: v_add_i32_e32 v6, vcc, v8, v6 14009; GFX6-NEXT: v_mul_lo_u32 v10, v4, v6 14010; GFX6-NEXT: v_mul_hi_u32 v11, v4, v7 14011; GFX6-NEXT: v_mul_hi_u32 v12, v4, v6 14012; GFX6-NEXT: v_mul_hi_u32 v9, v5, v7 14013; GFX6-NEXT: v_mul_lo_u32 v7, v5, v7 14014; GFX6-NEXT: v_mul_hi_u32 v8, v5, v6 14015; GFX6-NEXT: v_add_i32_e32 v10, vcc, v11, v10 14016; GFX6-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc 14017; GFX6-NEXT: v_mul_lo_u32 v6, v5, v6 14018; GFX6-NEXT: 
v_add_i32_e32 v7, vcc, v10, v7 14019; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc 14020; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc 14021; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 14022; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v1, v8, vcc 14023; GFX6-NEXT: s_add_u32 s0, s10, s14 14024; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 14025; GFX6-NEXT: s_addc_u32 s1, s11, s14 14026; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc 14027; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] 14028; GFX6-NEXT: v_mul_lo_u32 v6, s10, v5 14029; GFX6-NEXT: v_mul_hi_u32 v7, s10, v4 14030; GFX6-NEXT: v_mul_hi_u32 v9, s10, v5 14031; GFX6-NEXT: v_mul_hi_u32 v10, s11, v5 14032; GFX6-NEXT: v_mul_lo_u32 v5, s11, v5 14033; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 14034; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc 14035; GFX6-NEXT: v_mul_lo_u32 v9, s11, v4 14036; GFX6-NEXT: v_mul_hi_u32 v4, s11, v4 14037; GFX6-NEXT: v_mov_b32_e32 v8, s12 14038; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v9 14039; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v7, v4, vcc 14040; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v10, v0, vcc 14041; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 14042; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v1, v0, vcc 14043; GFX6-NEXT: v_mul_lo_u32 v5, s8, v0 14044; GFX6-NEXT: v_mul_hi_u32 v6, s8, v4 14045; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s12, v2 14046; GFX6-NEXT: v_mul_lo_u32 v2, s9, v4 14047; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v3, v8, vcc 14048; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v5 14049; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 14050; GFX6-NEXT: v_mul_lo_u32 v3, s8, v4 14051; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s11, v2 14052; GFX6-NEXT: v_mov_b32_e32 v5, s9 14053; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s10, v3 14054; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc 14055; GFX6-NEXT: v_subrev_i32_e64 v6, s[0:1], s8, v3 14056; GFX6-NEXT: v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1] 14057; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v7 14058; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] 14059; GFX6-NEXT: 
v_cmp_le_u32_e64 s[2:3], s8, v6 14060; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, s[0:1] 14061; GFX6-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] 14062; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v7 14063; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s8, v6 14064; GFX6-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[2:3] 14065; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] 14066; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 14067; GFX6-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[0:1] 14068; GFX6-NEXT: v_mov_b32_e32 v7, s11 14069; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v7, v2, vcc 14070; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 14071; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 14072; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 14073; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 14074; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s9, v2 14075; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc 14076; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 14077; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 14078; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] 14079; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 14080; GFX6-NEXT: v_xor_b32_e32 v3, s14, v3 14081; GFX6-NEXT: v_xor_b32_e32 v4, s14, v2 14082; GFX6-NEXT: v_mov_b32_e32 v5, s14 14083; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s14, v3 14084; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v4, v5, vcc 14085; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 14086; GFX6-NEXT: s_endpgm 14087; 14088; GFX9-LABEL: srem_v2i64_pow2_shl_denom: 14089; GFX9: ; %bb.0: 14090; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 14091; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 14092; GFX9-NEXT: s_mov_b32 s16, 0x4f800000 14093; GFX9-NEXT: s_mov_b32 s17, 0x5f7ffffc 14094; GFX9-NEXT: s_mov_b32 s18, 0x2f800000 14095; GFX9-NEXT: s_waitcnt lgkmcnt(0) 14096; GFX9-NEXT: s_lshl_b64 s[10:11], s[2:3], s6 14097; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 14098; GFX9-NEXT: s_ashr_i32 s4, s3, 31 14099; GFX9-NEXT: s_add_u32 s2, s2, s4 14100; GFX9-NEXT: s_mov_b32 s5, s4 14101; GFX9-NEXT: s_addc_u32 s3, s3, s4 14102; 
GFX9-NEXT: s_xor_b64 s[12:13], s[2:3], s[4:5] 14103; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12 14104; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 14105; GFX9-NEXT: s_mov_b32 s19, 0xcf800000 14106; GFX9-NEXT: s_sub_u32 s2, 0, s12 14107; GFX9-NEXT: s_subb_u32 s3, 0, s13 14108; GFX9-NEXT: v_mac_f32_e32 v0, s16, v1 14109; GFX9-NEXT: v_rcp_f32_e32 v0, v0 14110; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 14111; GFX9-NEXT: v_mul_f32_e32 v0, s17, v0 14112; GFX9-NEXT: v_mul_f32_e32 v1, s18, v0 14113; GFX9-NEXT: v_trunc_f32_e32 v1, v1 14114; GFX9-NEXT: v_mac_f32_e32 v0, s19, v1 14115; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v1 14116; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 14117; GFX9-NEXT: s_waitcnt lgkmcnt(0) 14118; GFX9-NEXT: s_ashr_i32 s8, s5, 31 14119; GFX9-NEXT: s_mov_b32 s9, s8 14120; GFX9-NEXT: v_mul_lo_u32 v0, s2, v2 14121; GFX9-NEXT: v_mul_hi_u32 v1, s2, v3 14122; GFX9-NEXT: v_mul_lo_u32 v5, s3, v3 14123; GFX9-NEXT: v_mul_lo_u32 v4, s2, v3 14124; GFX9-NEXT: v_add_u32_e32 v0, v1, v0 14125; GFX9-NEXT: v_add_u32_e32 v5, v0, v5 14126; GFX9-NEXT: v_mul_hi_u32 v1, v3, v4 14127; GFX9-NEXT: v_mul_lo_u32 v6, v3, v5 14128; GFX9-NEXT: v_mul_hi_u32 v7, v3, v5 14129; GFX9-NEXT: v_mul_hi_u32 v8, v2, v5 14130; GFX9-NEXT: v_mul_lo_u32 v5, v2, v5 14131; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v6 14132; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc 14133; GFX9-NEXT: v_mul_lo_u32 v7, v2, v4 14134; GFX9-NEXT: v_mul_hi_u32 v4, v2, v4 14135; GFX9-NEXT: v_mov_b32_e32 v0, 0 14136; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v7 14137; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v4, vcc 14138; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v8, v0, vcc 14139; GFX9-NEXT: v_mov_b32_e32 v1, 0 14140; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 14141; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v6, vcc 14142; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 14143; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v5, vcc 14144; GFX9-NEXT: v_mul_lo_u32 v4, s2, v2 14145; GFX9-NEXT: v_mul_hi_u32 v5, s2, v3 14146; GFX9-NEXT: v_mul_lo_u32 v6, 
s3, v3 14147; GFX9-NEXT: v_mul_lo_u32 v7, s2, v3 14148; GFX9-NEXT: s_add_u32 s2, s4, s8 14149; GFX9-NEXT: v_add_u32_e32 v4, v5, v4 14150; GFX9-NEXT: v_add_u32_e32 v4, v4, v6 14151; GFX9-NEXT: v_mul_lo_u32 v8, v3, v4 14152; GFX9-NEXT: v_mul_hi_u32 v9, v3, v7 14153; GFX9-NEXT: v_mul_hi_u32 v10, v3, v4 14154; GFX9-NEXT: v_mul_hi_u32 v6, v2, v7 14155; GFX9-NEXT: v_mul_lo_u32 v7, v2, v7 14156; GFX9-NEXT: v_mul_hi_u32 v5, v2, v4 14157; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 14158; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v10, vcc 14159; GFX9-NEXT: v_mul_lo_u32 v4, v2, v4 14160; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 14161; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v9, v6, vcc 14162; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v0, vcc 14163; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v6, v4 14164; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v5, vcc 14165; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 14166; GFX9-NEXT: s_addc_u32 s3, s5, s8 14167; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v5, vcc 14168; GFX9-NEXT: s_xor_b64 s[14:15], s[2:3], s[8:9] 14169; GFX9-NEXT: v_mul_lo_u32 v4, s14, v2 14170; GFX9-NEXT: v_mul_hi_u32 v5, s14, v3 14171; GFX9-NEXT: v_mul_hi_u32 v6, s14, v2 14172; GFX9-NEXT: v_mul_hi_u32 v7, s15, v2 14173; GFX9-NEXT: v_mul_lo_u32 v2, s15, v2 14174; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 14175; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc 14176; GFX9-NEXT: v_mul_lo_u32 v6, s15, v3 14177; GFX9-NEXT: v_mul_hi_u32 v3, s15, v3 14178; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 14179; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 14180; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc 14181; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v0, vcc 14182; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 14183; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v4, vcc 14184; GFX9-NEXT: v_mul_lo_u32 v3, s12, v3 14185; GFX9-NEXT: v_mul_hi_u32 v4, s12, v2 14186; GFX9-NEXT: v_mul_lo_u32 v5, s13, v2 14187; GFX9-NEXT: v_mul_lo_u32 v2, s12, v2 14188; GFX9-NEXT: v_add_u32_e32 v3, v4, 
v3 14189; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 14190; GFX9-NEXT: v_sub_u32_e32 v4, s15, v3 14191; GFX9-NEXT: v_mov_b32_e32 v5, s13 14192; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s14, v2 14193; GFX9-NEXT: v_subb_co_u32_e64 v4, s[0:1], v4, v5, vcc 14194; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s12, v2 14195; GFX9-NEXT: v_subbrev_co_u32_e64 v7, s[2:3], 0, v4, s[0:1] 14196; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s13, v7 14197; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] 14198; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v6 14199; GFX9-NEXT: v_subb_co_u32_e64 v4, s[0:1], v4, v5, s[0:1] 14200; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] 14201; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s13, v7 14202; GFX9-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s12, v6 14203; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[2:3] 14204; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1] 14205; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 14206; GFX9-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[0:1] 14207; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[0:1] 14208; GFX9-NEXT: s_ashr_i32 s0, s11, 31 14209; GFX9-NEXT: s_add_u32 s2, s10, s0 14210; GFX9-NEXT: s_mov_b32 s1, s0 14211; GFX9-NEXT: s_addc_u32 s3, s11, s0 14212; GFX9-NEXT: v_mov_b32_e32 v6, s15 14213; GFX9-NEXT: s_xor_b64 s[10:11], s[2:3], s[0:1] 14214; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v6, v3, vcc 14215; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s10 14216; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s11 14217; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s13, v3 14218; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 14219; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v2 14220; GFX9-NEXT: v_mac_f32_e32 v6, s16, v7 14221; GFX9-NEXT: v_rcp_f32_e32 v6, v6 14222; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc 14223; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s13, v3 14224; GFX9-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc 14225; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 14226; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 14227; GFX9-NEXT: v_mul_f32_e32 v5, s17, v6 14228; GFX9-NEXT: v_mul_f32_e32 v6, s18, v5 14229; 
GFX9-NEXT: v_trunc_f32_e32 v6, v6 14230; GFX9-NEXT: v_mac_f32_e32 v5, s19, v6 14231; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 14232; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6 14233; GFX9-NEXT: s_sub_u32 s0, 0, s10 14234; GFX9-NEXT: s_subb_u32 s1, 0, s11 14235; GFX9-NEXT: v_mul_hi_u32 v7, s0, v5 14236; GFX9-NEXT: v_mul_lo_u32 v8, s0, v6 14237; GFX9-NEXT: v_mul_lo_u32 v9, s1, v5 14238; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 14239; GFX9-NEXT: v_mul_lo_u32 v4, s0, v5 14240; GFX9-NEXT: v_add_u32_e32 v7, v7, v8 14241; GFX9-NEXT: v_add_u32_e32 v7, v7, v9 14242; GFX9-NEXT: v_mul_lo_u32 v8, v5, v7 14243; GFX9-NEXT: v_mul_hi_u32 v9, v5, v4 14244; GFX9-NEXT: v_mul_hi_u32 v10, v5, v7 14245; GFX9-NEXT: v_mul_hi_u32 v11, v6, v7 14246; GFX9-NEXT: v_mul_lo_u32 v7, v6, v7 14247; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 14248; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v10, vcc 14249; GFX9-NEXT: v_mul_lo_u32 v10, v6, v4 14250; GFX9-NEXT: v_mul_hi_u32 v4, v6, v4 14251; GFX9-NEXT: s_ashr_i32 s12, s7, 31 14252; GFX9-NEXT: s_mov_b32 s13, s12 14253; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10 14254; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v4, vcc 14255; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v0, vcc 14256; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 14257; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v1, v8, vcc 14258; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 14259; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v7, vcc 14260; GFX9-NEXT: v_mul_lo_u32 v6, s0, v5 14261; GFX9-NEXT: v_mul_hi_u32 v7, s0, v4 14262; GFX9-NEXT: v_mul_lo_u32 v8, s1, v4 14263; GFX9-NEXT: v_mul_lo_u32 v9, s0, v4 14264; GFX9-NEXT: s_add_u32 s0, s6, s12 14265; GFX9-NEXT: v_add_u32_e32 v6, v7, v6 14266; GFX9-NEXT: v_add_u32_e32 v6, v6, v8 14267; GFX9-NEXT: v_mul_lo_u32 v10, v4, v6 14268; GFX9-NEXT: v_mul_hi_u32 v11, v4, v9 14269; GFX9-NEXT: v_mul_hi_u32 v12, v4, v6 14270; GFX9-NEXT: v_mul_hi_u32 v8, v5, v9 14271; GFX9-NEXT: v_mul_lo_u32 v9, v5, v9 14272; GFX9-NEXT: v_mul_hi_u32 v7, v5, v6 14273; GFX9-NEXT: v_add_co_u32_e32 
v10, vcc, v11, v10 14274; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v12, vcc 14275; GFX9-NEXT: v_mul_lo_u32 v6, v5, v6 14276; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 14277; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v8, vcc 14278; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v0, vcc 14279; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v8, v6 14280; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v1, v7, vcc 14281; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 14282; GFX9-NEXT: s_addc_u32 s1, s7, s12 14283; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v7, vcc 14284; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13] 14285; GFX9-NEXT: v_mul_lo_u32 v6, s6, v5 14286; GFX9-NEXT: v_mul_hi_u32 v7, s6, v4 14287; GFX9-NEXT: v_mul_hi_u32 v9, s6, v5 14288; GFX9-NEXT: v_mul_hi_u32 v10, s7, v5 14289; GFX9-NEXT: v_mul_lo_u32 v5, s7, v5 14290; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 14291; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v9, vcc 14292; GFX9-NEXT: v_mul_lo_u32 v9, s7, v4 14293; GFX9-NEXT: v_mul_hi_u32 v4, s7, v4 14294; GFX9-NEXT: v_xor_b32_e32 v2, s8, v2 14295; GFX9-NEXT: v_xor_b32_e32 v3, s8, v3 14296; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v9 14297; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc 14298; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v10, v0, vcc 14299; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 14300; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc 14301; GFX9-NEXT: v_mul_lo_u32 v5, s10, v1 14302; GFX9-NEXT: v_mul_hi_u32 v6, s10, v4 14303; GFX9-NEXT: v_mul_lo_u32 v7, s11, v4 14304; GFX9-NEXT: v_mul_lo_u32 v4, s10, v4 14305; GFX9-NEXT: v_mov_b32_e32 v8, s8 14306; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s8, v2 14307; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v3, v8, vcc 14308; GFX9-NEXT: v_add_u32_e32 v3, v6, v5 14309; GFX9-NEXT: v_add_u32_e32 v3, v3, v7 14310; GFX9-NEXT: v_sub_u32_e32 v5, s7, v3 14311; GFX9-NEXT: v_mov_b32_e32 v6, s11 14312; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, s6, v4 14313; GFX9-NEXT: v_subb_co_u32_e64 v5, s[0:1], v5, v6, vcc 14314; GFX9-NEXT: v_subrev_co_u32_e64 
v7, s[0:1], s10, v4 14315; GFX9-NEXT: v_subbrev_co_u32_e64 v8, s[2:3], 0, v5, s[0:1] 14316; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s11, v8 14317; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] 14318; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s10, v7 14319; GFX9-NEXT: v_subb_co_u32_e64 v5, s[0:1], v5, v6, s[0:1] 14320; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[2:3] 14321; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s11, v8 14322; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s10, v7 14323; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[2:3] 14324; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1] 14325; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 14326; GFX9-NEXT: v_cndmask_b32_e64 v6, v7, v6, s[0:1] 14327; GFX9-NEXT: v_mov_b32_e32 v7, s7 14328; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc 14329; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 14330; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 14331; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v4 14332; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[0:1] 14333; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 14334; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s11, v3 14335; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc 14336; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 14337; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc 14338; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 14339; GFX9-NEXT: v_xor_b32_e32 v4, s12, v4 14340; GFX9-NEXT: v_xor_b32_e32 v5, s12, v3 14341; GFX9-NEXT: v_mov_b32_e32 v6, s12 14342; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s12, v4 14343; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v5, v6, vcc 14344; GFX9-NEXT: s_waitcnt lgkmcnt(0) 14345; GFX9-NEXT: global_store_dwordx4 v0, v[1:4], s[4:5] 14346; GFX9-NEXT: s_endpgm 14347; 14348; GFX90A-LABEL: srem_v2i64_pow2_shl_denom: 14349; GFX90A: ; %bb.0: 14350; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 14351; GFX90A-NEXT: s_mov_b64 s[2:3], 0x1000 14352; GFX90A-NEXT: s_mov_b32 s16, 0x4f800000 14353; GFX90A-NEXT: s_mov_b32 s17, 0x5f7ffffc 14354; GFX90A-NEXT: s_mov_b32 s18, 0x2f800000 14355; GFX90A-NEXT: 
s_waitcnt lgkmcnt(0) 14356; GFX90A-NEXT: s_lshl_b64 s[10:11], s[2:3], s6 14357; GFX90A-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 14358; GFX90A-NEXT: s_ashr_i32 s4, s3, 31 14359; GFX90A-NEXT: s_add_u32 s2, s2, s4 14360; GFX90A-NEXT: s_mov_b32 s5, s4 14361; GFX90A-NEXT: s_addc_u32 s3, s3, s4 14362; GFX90A-NEXT: s_xor_b64 s[12:13], s[2:3], s[4:5] 14363; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s12 14364; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s13 14365; GFX90A-NEXT: s_mov_b32 s19, 0xcf800000 14366; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 14367; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 14368; GFX90A-NEXT: s_sub_u32 s0, 0, s12 14369; GFX90A-NEXT: v_mac_f32_e32 v0, s16, v1 14370; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 14371; GFX90A-NEXT: s_subb_u32 s1, 0, s13 14372; GFX90A-NEXT: v_mov_b32_e32 v4, 0 14373; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 14374; GFX90A-NEXT: s_ashr_i32 s14, s5, 31 14375; GFX90A-NEXT: v_mul_f32_e32 v0, s17, v0 14376; GFX90A-NEXT: v_mul_f32_e32 v1, s18, v0 14377; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 14378; GFX90A-NEXT: v_mac_f32_e32 v0, s19, v1 14379; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 14380; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 14381; GFX90A-NEXT: s_mov_b32 s15, s14 14382; GFX90A-NEXT: v_mul_hi_u32 v3, s0, v0 14383; GFX90A-NEXT: v_mul_lo_u32 v5, s0, v1 14384; GFX90A-NEXT: v_mul_lo_u32 v2, s1, v0 14385; GFX90A-NEXT: v_add_u32_e32 v3, v3, v5 14386; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 14387; GFX90A-NEXT: v_mul_lo_u32 v6, s0, v0 14388; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v2 14389; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 14390; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v2 14391; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 14392; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 14393; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 14394; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 14395; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 14396; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v2 14397; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v8, vcc 14398; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, 
v4, vcc 14399; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 14400; GFX90A-NEXT: v_mov_b32_e32 v6, 0 14401; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 14402; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc 14403; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 14404; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 14405; GFX90A-NEXT: v_mul_lo_u32 v2, s0, v1 14406; GFX90A-NEXT: v_mul_hi_u32 v3, s0, v0 14407; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 14408; GFX90A-NEXT: v_mul_lo_u32 v3, s1, v0 14409; GFX90A-NEXT: v_add_u32_e32 v2, v2, v3 14410; GFX90A-NEXT: v_mul_lo_u32 v5, s0, v0 14411; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v5 14412; GFX90A-NEXT: v_mul_lo_u32 v8, v1, v5 14413; GFX90A-NEXT: v_mul_lo_u32 v10, v0, v2 14414; GFX90A-NEXT: v_mul_hi_u32 v5, v0, v5 14415; GFX90A-NEXT: v_mul_hi_u32 v9, v0, v2 14416; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v10 14417; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v9, vcc 14418; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 14419; GFX90A-NEXT: v_mul_hi_u32 v3, v1, v2 14420; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v7, vcc 14421; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc 14422; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 14423; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v5, v2 14424; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v3, vcc 14425; GFX90A-NEXT: s_add_u32 s0, s4, s14 14426; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 14427; GFX90A-NEXT: s_addc_u32 s1, s5, s14 14428; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 14429; GFX90A-NEXT: s_xor_b64 s[4:5], s[0:1], s[14:15] 14430; GFX90A-NEXT: v_mul_lo_u32 v3, s4, v1 14431; GFX90A-NEXT: v_mul_hi_u32 v5, s4, v0 14432; GFX90A-NEXT: v_mul_hi_u32 v2, s4, v1 14433; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 14434; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc 14435; GFX90A-NEXT: v_mul_hi_u32 v7, s5, v0 14436; GFX90A-NEXT: v_mul_lo_u32 v0, s5, v0 14437; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 14438; GFX90A-NEXT: v_mul_hi_u32 v5, s5, v1 14439; GFX90A-NEXT: 
v_addc_co_u32_e32 v0, vcc, v2, v7, vcc 14440; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v4, vcc 14441; GFX90A-NEXT: v_mul_lo_u32 v1, s5, v1 14442; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 14443; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v2, vcc 14444; GFX90A-NEXT: v_mul_lo_u32 v1, s12, v1 14445; GFX90A-NEXT: v_mul_hi_u32 v2, s12, v0 14446; GFX90A-NEXT: v_add_u32_e32 v1, v2, v1 14447; GFX90A-NEXT: v_mul_lo_u32 v2, s13, v0 14448; GFX90A-NEXT: v_add_u32_e32 v1, v1, v2 14449; GFX90A-NEXT: v_mul_lo_u32 v0, s12, v0 14450; GFX90A-NEXT: v_sub_u32_e32 v2, s5, v1 14451; GFX90A-NEXT: v_mov_b32_e32 v3, s13 14452; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0 14453; GFX90A-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, vcc 14454; GFX90A-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s12, v0 14455; GFX90A-NEXT: v_subbrev_co_u32_e64 v7, s[2:3], 0, v2, s[0:1] 14456; GFX90A-NEXT: v_cmp_le_u32_e64 s[2:3], s13, v7 14457; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] 14458; GFX90A-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v5 14459; GFX90A-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1] 14460; GFX90A-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] 14461; GFX90A-NEXT: v_cmp_eq_u32_e64 s[2:3], s13, v7 14462; GFX90A-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s12, v5 14463; GFX90A-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[2:3] 14464; GFX90A-NEXT: v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1] 14465; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 14466; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[0:1] 14467; GFX90A-NEXT: v_mov_b32_e32 v5, s5 14468; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v5, v1, vcc 14469; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s13, v1 14470; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[0:1] 14471; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 14472; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 14473; GFX90A-NEXT: s_ashr_i32 s0, s11, 31 14474; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 14475; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s13, v1 14476; GFX90A-NEXT: s_add_u32 s2, s10, s0 14477; GFX90A-NEXT: 
v_cndmask_b32_e32 v5, v5, v7, vcc 14478; GFX90A-NEXT: s_mov_b32 s1, s0 14479; GFX90A-NEXT: s_addc_u32 s3, s11, s0 14480; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 14481; GFX90A-NEXT: s_xor_b64 s[4:5], s[2:3], s[0:1] 14482; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 14483; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 14484; GFX90A-NEXT: v_cvt_f32_u32_e32 v2, s4 14485; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, s5 14486; GFX90A-NEXT: v_xor_b32_e32 v0, s14, v0 14487; GFX90A-NEXT: s_sub_u32 s0, 0, s4 14488; GFX90A-NEXT: v_xor_b32_e32 v1, s14, v1 14489; GFX90A-NEXT: v_mac_f32_e32 v2, s16, v3 14490; GFX90A-NEXT: v_rcp_f32_e32 v2, v2 14491; GFX90A-NEXT: v_mov_b32_e32 v5, s14 14492; GFX90A-NEXT: v_subrev_co_u32_e32 v0, vcc, s14, v0 14493; GFX90A-NEXT: v_mul_f32_e32 v2, s17, v2 14494; GFX90A-NEXT: v_mul_f32_e32 v3, s18, v2 14495; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 14496; GFX90A-NEXT: v_mac_f32_e32 v2, s19, v3 14497; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 14498; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 14499; GFX90A-NEXT: s_subb_u32 s1, 0, s5 14500; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc 14501; GFX90A-NEXT: v_mul_hi_u32 v7, s0, v2 14502; GFX90A-NEXT: v_mul_lo_u32 v8, s0, v3 14503; GFX90A-NEXT: v_mul_lo_u32 v5, s1, v2 14504; GFX90A-NEXT: v_add_u32_e32 v7, v7, v8 14505; GFX90A-NEXT: v_add_u32_e32 v5, v7, v5 14506; GFX90A-NEXT: v_mul_lo_u32 v9, s0, v2 14507; GFX90A-NEXT: v_mul_lo_u32 v8, v2, v5 14508; GFX90A-NEXT: v_mul_hi_u32 v10, v2, v9 14509; GFX90A-NEXT: v_mul_hi_u32 v7, v2, v5 14510; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v10, v8 14511; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc 14512; GFX90A-NEXT: v_mul_hi_u32 v11, v3, v9 14513; GFX90A-NEXT: v_mul_lo_u32 v9, v3, v9 14514; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v9 14515; GFX90A-NEXT: v_mul_hi_u32 v10, v3, v5 14516; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v11, vcc 14517; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v10, v4, vcc 14518; GFX90A-NEXT: v_mul_lo_u32 v5, v3, v5 14519; GFX90A-NEXT: 
v_add_co_u32_e32 v5, vcc, v7, v5 14520; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v6, v8, vcc 14521; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 14522; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc 14523; GFX90A-NEXT: v_mul_lo_u32 v5, s0, v3 14524; GFX90A-NEXT: v_mul_hi_u32 v7, s0, v2 14525; GFX90A-NEXT: v_add_u32_e32 v5, v7, v5 14526; GFX90A-NEXT: v_mul_lo_u32 v7, s1, v2 14527; GFX90A-NEXT: v_add_u32_e32 v5, v5, v7 14528; GFX90A-NEXT: v_mul_lo_u32 v8, s0, v2 14529; GFX90A-NEXT: v_mul_hi_u32 v9, v3, v8 14530; GFX90A-NEXT: v_mul_lo_u32 v10, v3, v8 14531; GFX90A-NEXT: v_mul_lo_u32 v12, v2, v5 14532; GFX90A-NEXT: v_mul_hi_u32 v8, v2, v8 14533; GFX90A-NEXT: v_mul_hi_u32 v11, v2, v5 14534; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12 14535; GFX90A-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc 14536; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10 14537; GFX90A-NEXT: v_mul_hi_u32 v7, v3, v5 14538; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v9, vcc 14539; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v4, vcc 14540; GFX90A-NEXT: v_mul_lo_u32 v5, v3, v5 14541; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v8, v5 14542; GFX90A-NEXT: s_ashr_i32 s10, s7, 31 14543; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v6, v7, vcc 14544; GFX90A-NEXT: s_add_u32 s0, s6, s10 14545; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 14546; GFX90A-NEXT: s_mov_b32 s11, s10 14547; GFX90A-NEXT: s_addc_u32 s1, s7, s10 14548; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc 14549; GFX90A-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] 14550; GFX90A-NEXT: v_mul_lo_u32 v7, s6, v3 14551; GFX90A-NEXT: v_mul_hi_u32 v8, s6, v2 14552; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v3 14553; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 14554; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc 14555; GFX90A-NEXT: v_mul_hi_u32 v9, s7, v2 14556; GFX90A-NEXT: v_mul_lo_u32 v2, s7, v2 14557; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v7, v2 14558; GFX90A-NEXT: v_mul_hi_u32 v8, s7, v3 14559; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v9, 
vcc 14560; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v4, vcc 14561; GFX90A-NEXT: v_mul_lo_u32 v3, s7, v3 14562; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 14563; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc 14564; GFX90A-NEXT: v_mul_lo_u32 v3, s4, v3 14565; GFX90A-NEXT: v_mul_hi_u32 v5, s4, v2 14566; GFX90A-NEXT: v_add_u32_e32 v3, v5, v3 14567; GFX90A-NEXT: v_mul_lo_u32 v5, s5, v2 14568; GFX90A-NEXT: v_add_u32_e32 v3, v3, v5 14569; GFX90A-NEXT: v_mul_lo_u32 v2, s4, v2 14570; GFX90A-NEXT: v_sub_u32_e32 v5, s7, v3 14571; GFX90A-NEXT: v_mov_b32_e32 v6, s5 14572; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, s6, v2 14573; GFX90A-NEXT: v_subb_co_u32_e64 v5, s[0:1], v5, v6, vcc 14574; GFX90A-NEXT: v_subrev_co_u32_e64 v7, s[0:1], s4, v2 14575; GFX90A-NEXT: v_subbrev_co_u32_e64 v8, s[2:3], 0, v5, s[0:1] 14576; GFX90A-NEXT: v_cmp_le_u32_e64 s[2:3], s5, v8 14577; GFX90A-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] 14578; GFX90A-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v7 14579; GFX90A-NEXT: v_subb_co_u32_e64 v5, s[0:1], v5, v6, s[0:1] 14580; GFX90A-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[2:3] 14581; GFX90A-NEXT: v_cmp_eq_u32_e64 s[2:3], s5, v8 14582; GFX90A-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s4, v7 14583; GFX90A-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[2:3] 14584; GFX90A-NEXT: v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1] 14585; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 14586; GFX90A-NEXT: v_cndmask_b32_e64 v6, v7, v6, s[0:1] 14587; GFX90A-NEXT: v_mov_b32_e32 v7, s7 14588; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc 14589; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 14590; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 14591; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 14592; GFX90A-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[0:1] 14593; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 14594; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v3 14595; GFX90A-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc 14596; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 14597; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v6, 
vcc 14598; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 14599; GFX90A-NEXT: v_xor_b32_e32 v2, s10, v2 14600; GFX90A-NEXT: v_xor_b32_e32 v3, s10, v3 14601; GFX90A-NEXT: v_mov_b32_e32 v5, s10 14602; GFX90A-NEXT: v_subrev_co_u32_e32 v2, vcc, s10, v2 14603; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v5, vcc 14604; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] 14605; GFX90A-NEXT: s_endpgm 14606 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 14607 %r = srem <2 x i64> %x, %shl.y 14608 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 14609 ret void 14610} 14611