1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py 2; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update 3; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare -amdgpu-bypass-slow-div=0 %s | FileCheck %s 4; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX6 %s 5; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX9 %s 6; RUN: llc -mtriple=amdgcn-- -mcpu=gfx90a -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX90A %s 7 8define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) { 9; CHECK-LABEL: @udiv_i32( 10; CHECK-NEXT: [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float 11; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]]) 12; CHECK-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000 13; CHECK-NEXT: [[TMP4:%.*]] = fptoui float [[TMP3]] to i32 14; CHECK-NEXT: [[TMP5:%.*]] = sub i32 0, [[Y]] 15; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]] 16; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP4]] to i64 17; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP6]] to i64 18; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]] 19; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 20; CHECK-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP9]], 32 21; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 22; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]] 23; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64 24; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP13]] to i64 25; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]] 26; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32 27; CHECK-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP16]], 32 28; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 29; CHECK-NEXT: [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]] 30; CHECK-NEXT: [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]] 31; CHECK-NEXT: 
[[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]] 32; CHECK-NEXT: [[TMP23:%.*]] = add i32 [[TMP19]], 1 33; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP19]] 34; CHECK-NEXT: [[TMP25:%.*]] = sub i32 [[TMP21]], [[Y]] 35; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP22]], i32 [[TMP25]], i32 [[TMP21]] 36; CHECK-NEXT: [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[Y]] 37; CHECK-NEXT: [[TMP28:%.*]] = add i32 [[TMP24]], 1 38; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP24]] 39; CHECK-NEXT: store i32 [[TMP29]], i32 addrspace(1)* [[OUT:%.*]], align 4 40; CHECK-NEXT: ret void 41; 42; GFX6-LABEL: udiv_i32: 43; GFX6: ; %bb.0: 44; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 45; GFX6-NEXT: s_mov_b32 s7, 0xf000 46; GFX6-NEXT: s_mov_b32 s6, -1 47; GFX6-NEXT: s_waitcnt lgkmcnt(0) 48; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 49; GFX6-NEXT: s_sub_i32 s4, 0, s3 50; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 51; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 52; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 53; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 54; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 55; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 56; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 57; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 58; GFX6-NEXT: v_mul_lo_u32 v1, v0, s3 59; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 60; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s2, v1 61; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1 62; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 63; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 64; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] 65; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 66; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 67; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 68; GFX6-NEXT: s_waitcnt lgkmcnt(0) 69; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 70; GFX6-NEXT: s_endpgm 71; 72; GFX9-LABEL: udiv_i32: 73; GFX9: ; %bb.0: 74; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 75; GFX9-NEXT: v_mov_b32_e32 v2, 0 76; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 
0x24 77; GFX9-NEXT: s_waitcnt lgkmcnt(0) 78; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 79; GFX9-NEXT: s_sub_i32 s4, 0, s3 80; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 81; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 82; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 83; GFX9-NEXT: v_mul_lo_u32 v1, s4, v0 84; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 85; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 86; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 87; GFX9-NEXT: v_mul_lo_u32 v1, v0, s3 88; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 89; GFX9-NEXT: v_sub_u32_e32 v1, s2, v1 90; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 91; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 92; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v1 93; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 94; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 95; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 96; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 97; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 98; GFX9-NEXT: s_endpgm 99; 100; GFX90A-LABEL: udiv_i32: 101; GFX90A: ; %bb.0: 102; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 103; GFX90A-NEXT: v_mov_b32_e32 v1, 0 104; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 105; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 106; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 107; GFX90A-NEXT: s_sub_i32 s4, 0, s3 108; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 109; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 110; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 111; GFX90A-NEXT: v_mul_lo_u32 v2, s4, v0 112; GFX90A-NEXT: v_mul_hi_u32 v2, v0, v2 113; GFX90A-NEXT: v_add_u32_e32 v0, v0, v2 114; GFX90A-NEXT: v_mul_hi_u32 v0, s2, v0 115; GFX90A-NEXT: v_mul_lo_u32 v2, v0, s3 116; GFX90A-NEXT: v_sub_u32_e32 v2, s2, v2 117; GFX90A-NEXT: v_add_u32_e32 v3, 1, v0 118; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 119; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 120; GFX90A-NEXT: v_subrev_u32_e32 v3, s3, v2 121; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc 122; GFX90A-NEXT: v_add_u32_e32 v3, 1, v0 123; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 124; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, 
v3, vcc 125; GFX90A-NEXT: global_store_dword v1, v0, s[0:1] 126; GFX90A-NEXT: s_endpgm 127 %r = udiv i32 %x, %y 128 store i32 %r, i32 addrspace(1)* %out 129 ret void 130} 131 132define amdgpu_kernel void @urem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) { 133; CHECK-LABEL: @urem_i32( 134; CHECK-NEXT: [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float 135; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]]) 136; CHECK-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000 137; CHECK-NEXT: [[TMP4:%.*]] = fptoui float [[TMP3]] to i32 138; CHECK-NEXT: [[TMP5:%.*]] = sub i32 0, [[Y]] 139; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]] 140; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP4]] to i64 141; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP6]] to i64 142; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]] 143; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 144; CHECK-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP9]], 32 145; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 146; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]] 147; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64 148; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP13]] to i64 149; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]] 150; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32 151; CHECK-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP16]], 32 152; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 153; CHECK-NEXT: [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]] 154; CHECK-NEXT: [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]] 155; CHECK-NEXT: [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]] 156; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP21]], [[Y]] 157; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP21]] 158; CHECK-NEXT: [[TMP25:%.*]] = icmp uge i32 [[TMP24]], [[Y]] 159; CHECK-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], [[Y]] 160; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP25]], i32 [[TMP26]], i32 [[TMP24]] 161; CHECK-NEXT: store i32 
[[TMP27]], i32 addrspace(1)* [[OUT:%.*]], align 4 162; CHECK-NEXT: ret void 163; 164; GFX6-LABEL: urem_i32: 165; GFX6: ; %bb.0: 166; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 167; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 168; GFX6-NEXT: s_mov_b32 s3, 0xf000 169; GFX6-NEXT: s_waitcnt lgkmcnt(0) 170; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 171; GFX6-NEXT: s_sub_i32 s2, 0, s5 172; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 173; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 174; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 175; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 176; GFX6-NEXT: s_mov_b32 s2, -1 177; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 178; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 179; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 180; GFX6-NEXT: v_mul_lo_u32 v0, v0, s5 181; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 182; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s5, v0 183; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v0 184; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 185; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s5, v0 186; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v0 187; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 188; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 189; GFX6-NEXT: s_endpgm 190; 191; GFX9-LABEL: urem_i32: 192; GFX9: ; %bb.0: 193; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 194; GFX9-NEXT: s_waitcnt lgkmcnt(0) 195; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 196; GFX9-NEXT: s_sub_i32 s4, 0, s3 197; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 198; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 199; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 200; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 201; GFX9-NEXT: v_mul_lo_u32 v1, s4, v0 202; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 203; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 204; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 205; GFX9-NEXT: v_mov_b32_e32 v1, 0 206; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 207; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 208; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v0 209; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 210; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 211; 
GFX9-NEXT: v_subrev_u32_e32 v2, s3, v0 212; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 213; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 214; GFX9-NEXT: s_waitcnt lgkmcnt(0) 215; GFX9-NEXT: global_store_dword v1, v0, s[0:1] 216; GFX9-NEXT: s_endpgm 217; 218; GFX90A-LABEL: urem_i32: 219; GFX90A: ; %bb.0: 220; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 221; GFX90A-NEXT: v_mov_b32_e32 v1, 0 222; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 223; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 224; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 225; GFX90A-NEXT: s_sub_i32 s4, 0, s3 226; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 227; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 228; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 229; GFX90A-NEXT: v_mul_lo_u32 v2, s4, v0 230; GFX90A-NEXT: v_mul_hi_u32 v2, v0, v2 231; GFX90A-NEXT: v_add_u32_e32 v0, v0, v2 232; GFX90A-NEXT: v_mul_hi_u32 v0, s2, v0 233; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s3 234; GFX90A-NEXT: v_sub_u32_e32 v0, s2, v0 235; GFX90A-NEXT: v_subrev_u32_e32 v2, s3, v0 236; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 237; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 238; GFX90A-NEXT: v_subrev_u32_e32 v2, s3, v0 239; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 240; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 241; GFX90A-NEXT: global_store_dword v1, v0, s[0:1] 242; GFX90A-NEXT: s_endpgm 243 %r = urem i32 %x, %y 244 store i32 %r, i32 addrspace(1)* %out 245 ret void 246} 247 248define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) { 249; CHECK-LABEL: @sdiv_i32( 250; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31 251; CHECK-NEXT: [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31 252; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 253; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[X]], [[TMP1]] 254; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[Y]], [[TMP2]] 255; CHECK-NEXT: [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP1]] 256; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP2]] 257; CHECK-NEXT: [[TMP8:%.*]] = uitofp i32 [[TMP7]] to float 258; 
CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP8]]) 259; CHECK-NEXT: [[TMP10:%.*]] = fmul fast float [[TMP9]], 0x41EFFFFFC0000000 260; CHECK-NEXT: [[TMP11:%.*]] = fptoui float [[TMP10]] to i32 261; CHECK-NEXT: [[TMP12:%.*]] = sub i32 0, [[TMP7]] 262; CHECK-NEXT: [[TMP13:%.*]] = mul i32 [[TMP12]], [[TMP11]] 263; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP11]] to i64 264; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP13]] to i64 265; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]] 266; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32 267; CHECK-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP16]], 32 268; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 269; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP11]], [[TMP19]] 270; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP6]] to i64 271; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP20]] to i64 272; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP21]], [[TMP22]] 273; CHECK-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32 274; CHECK-NEXT: [[TMP25:%.*]] = lshr i64 [[TMP23]], 32 275; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32 276; CHECK-NEXT: [[TMP27:%.*]] = mul i32 [[TMP26]], [[TMP7]] 277; CHECK-NEXT: [[TMP28:%.*]] = sub i32 [[TMP6]], [[TMP27]] 278; CHECK-NEXT: [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP7]] 279; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP26]], 1 280; CHECK-NEXT: [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]] 281; CHECK-NEXT: [[TMP32:%.*]] = sub i32 [[TMP28]], [[TMP7]] 282; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP29]], i32 [[TMP32]], i32 [[TMP28]] 283; CHECK-NEXT: [[TMP34:%.*]] = icmp uge i32 [[TMP33]], [[TMP7]] 284; CHECK-NEXT: [[TMP35:%.*]] = add i32 [[TMP31]], 1 285; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP34]], i32 [[TMP35]], i32 [[TMP31]] 286; CHECK-NEXT: [[TMP37:%.*]] = xor i32 [[TMP36]], [[TMP3]] 287; CHECK-NEXT: [[TMP38:%.*]] = sub i32 [[TMP37]], [[TMP3]] 288; CHECK-NEXT: store i32 [[TMP38]], i32 addrspace(1)* [[OUT:%.*]], align 4 289; CHECK-NEXT: 
ret void 290; 291; GFX6-LABEL: sdiv_i32: 292; GFX6: ; %bb.0: 293; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 294; GFX6-NEXT: s_mov_b32 s7, 0xf000 295; GFX6-NEXT: s_mov_b32 s6, -1 296; GFX6-NEXT: s_waitcnt lgkmcnt(0) 297; GFX6-NEXT: s_ashr_i32 s8, s3, 31 298; GFX6-NEXT: s_add_i32 s3, s3, s8 299; GFX6-NEXT: s_xor_b32 s3, s3, s8 300; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 301; GFX6-NEXT: s_sub_i32 s4, 0, s3 302; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 303; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 304; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 305; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 306; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 307; GFX6-NEXT: s_ashr_i32 s0, s2, 31 308; GFX6-NEXT: s_add_i32 s1, s2, s0 309; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 310; GFX6-NEXT: s_xor_b32 s1, s1, s0 311; GFX6-NEXT: s_xor_b32 s2, s0, s8 312; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 313; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 314; GFX6-NEXT: v_mul_lo_u32 v1, v0, s3 315; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 316; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 317; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1 318; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 319; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 320; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] 321; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 322; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 323; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 324; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 325; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 326; GFX6-NEXT: s_waitcnt lgkmcnt(0) 327; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 328; GFX6-NEXT: s_endpgm 329; 330; GFX9-LABEL: sdiv_i32: 331; GFX9: ; %bb.0: 332; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 333; GFX9-NEXT: v_mov_b32_e32 v2, 0 334; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 335; GFX9-NEXT: s_waitcnt lgkmcnt(0) 336; GFX9-NEXT: s_ashr_i32 s4, s3, 31 337; GFX9-NEXT: s_add_i32 s3, s3, s4 338; GFX9-NEXT: s_xor_b32 s3, s3, s4 339; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 340; GFX9-NEXT: s_sub_i32 s5, 
0, s3 341; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 342; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 343; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 344; GFX9-NEXT: v_mul_lo_u32 v1, s5, v0 345; GFX9-NEXT: s_ashr_i32 s5, s2, 31 346; GFX9-NEXT: s_add_i32 s2, s2, s5 347; GFX9-NEXT: s_xor_b32 s2, s2, s5 348; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 349; GFX9-NEXT: s_xor_b32 s4, s5, s4 350; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 351; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 352; GFX9-NEXT: v_mul_lo_u32 v1, v0, s3 353; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 354; GFX9-NEXT: v_sub_u32_e32 v1, s2, v1 355; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 356; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 357; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v1 358; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 359; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 360; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 361; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 362; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 363; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 364; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 365; GFX9-NEXT: s_endpgm 366; 367; GFX90A-LABEL: sdiv_i32: 368; GFX90A: ; %bb.0: 369; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 370; GFX90A-NEXT: v_mov_b32_e32 v1, 0 371; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 372; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 373; GFX90A-NEXT: s_ashr_i32 s4, s3, 31 374; GFX90A-NEXT: s_add_i32 s3, s3, s4 375; GFX90A-NEXT: s_xor_b32 s3, s3, s4 376; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 377; GFX90A-NEXT: s_ashr_i32 s5, s2, 31 378; GFX90A-NEXT: s_add_i32 s2, s2, s5 379; GFX90A-NEXT: s_xor_b32 s4, s5, s4 380; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 381; GFX90A-NEXT: s_xor_b32 s2, s2, s5 382; GFX90A-NEXT: s_sub_i32 s5, 0, s3 383; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 384; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 385; GFX90A-NEXT: v_mul_lo_u32 v2, s5, v0 386; GFX90A-NEXT: v_mul_hi_u32 v2, v0, v2 387; GFX90A-NEXT: v_add_u32_e32 v0, v0, v2 388; GFX90A-NEXT: v_mul_hi_u32 v0, s2, v0 389; GFX90A-NEXT: v_mul_lo_u32 v2, v0, 
s3 390; GFX90A-NEXT: v_sub_u32_e32 v2, s2, v2 391; GFX90A-NEXT: v_add_u32_e32 v3, 1, v0 392; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 393; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 394; GFX90A-NEXT: v_subrev_u32_e32 v3, s3, v2 395; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc 396; GFX90A-NEXT: v_add_u32_e32 v3, 1, v0 397; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 398; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 399; GFX90A-NEXT: v_xor_b32_e32 v0, s4, v0 400; GFX90A-NEXT: v_subrev_u32_e32 v0, s4, v0 401; GFX90A-NEXT: global_store_dword v1, v0, s[0:1] 402; GFX90A-NEXT: s_endpgm 403 %r = sdiv i32 %x, %y 404 store i32 %r, i32 addrspace(1)* %out 405 ret void 406} 407 408define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) { 409; CHECK-LABEL: @srem_i32( 410; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31 411; CHECK-NEXT: [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31 412; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[X]], [[TMP1]] 413; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[Y]], [[TMP2]] 414; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP1]] 415; CHECK-NEXT: [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP2]] 416; CHECK-NEXT: [[TMP7:%.*]] = uitofp i32 [[TMP6]] to float 417; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 418; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP8]], 0x41EFFFFFC0000000 419; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP9]] to i32 420; CHECK-NEXT: [[TMP11:%.*]] = sub i32 0, [[TMP6]] 421; CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], [[TMP10]] 422; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP10]] to i64 423; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP12]] to i64 424; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP13]], [[TMP14]] 425; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32 426; CHECK-NEXT: [[TMP17:%.*]] = lshr i64 [[TMP15]], 32 427; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 428; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP10]], [[TMP18]] 429; CHECK-NEXT: [[TMP20:%.*]] = zext i32 
[[TMP5]] to i64 430; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP19]] to i64 431; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP20]], [[TMP21]] 432; CHECK-NEXT: [[TMP23:%.*]] = trunc i64 [[TMP22]] to i32 433; CHECK-NEXT: [[TMP24:%.*]] = lshr i64 [[TMP22]], 32 434; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32 435; CHECK-NEXT: [[TMP26:%.*]] = mul i32 [[TMP25]], [[TMP6]] 436; CHECK-NEXT: [[TMP27:%.*]] = sub i32 [[TMP5]], [[TMP26]] 437; CHECK-NEXT: [[TMP28:%.*]] = icmp uge i32 [[TMP27]], [[TMP6]] 438; CHECK-NEXT: [[TMP29:%.*]] = sub i32 [[TMP27]], [[TMP6]] 439; CHECK-NEXT: [[TMP30:%.*]] = select i1 [[TMP28]], i32 [[TMP29]], i32 [[TMP27]] 440; CHECK-NEXT: [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP6]] 441; CHECK-NEXT: [[TMP32:%.*]] = sub i32 [[TMP30]], [[TMP6]] 442; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP30]] 443; CHECK-NEXT: [[TMP34:%.*]] = xor i32 [[TMP33]], [[TMP1]] 444; CHECK-NEXT: [[TMP35:%.*]] = sub i32 [[TMP34]], [[TMP1]] 445; CHECK-NEXT: store i32 [[TMP35]], i32 addrspace(1)* [[OUT:%.*]], align 4 446; CHECK-NEXT: ret void 447; 448; GFX6-LABEL: srem_i32: 449; GFX6: ; %bb.0: 450; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 451; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 452; GFX6-NEXT: s_waitcnt lgkmcnt(0) 453; GFX6-NEXT: s_ashr_i32 s4, s3, 31 454; GFX6-NEXT: s_add_i32 s3, s3, s4 455; GFX6-NEXT: s_xor_b32 s4, s3, s4 456; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s4 457; GFX6-NEXT: s_sub_i32 s3, 0, s4 458; GFX6-NEXT: s_ashr_i32 s5, s2, 31 459; GFX6-NEXT: s_add_i32 s2, s2, s5 460; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 461; GFX6-NEXT: s_xor_b32 s6, s2, s5 462; GFX6-NEXT: s_mov_b32 s2, -1 463; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 464; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 465; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 466; GFX6-NEXT: s_mov_b32 s3, 0xf000 467; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 468; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 469; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 470; GFX6-NEXT: v_mul_lo_u32 v0, v0, s4 471; GFX6-NEXT: 
v_sub_i32_e32 v0, vcc, s6, v0 472; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0 473; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 474; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 475; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0 476; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 477; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 478; GFX6-NEXT: v_xor_b32_e32 v0, s5, v0 479; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s5, v0 480; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 481; GFX6-NEXT: s_endpgm 482; 483; GFX9-LABEL: srem_i32: 484; GFX9: ; %bb.0: 485; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 486; GFX9-NEXT: s_waitcnt lgkmcnt(0) 487; GFX9-NEXT: s_ashr_i32 s4, s3, 31 488; GFX9-NEXT: s_add_i32 s3, s3, s4 489; GFX9-NEXT: s_xor_b32 s3, s3, s4 490; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 491; GFX9-NEXT: s_sub_i32 s4, 0, s3 492; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 493; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 494; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 495; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 496; GFX9-NEXT: v_mul_lo_u32 v1, s4, v0 497; GFX9-NEXT: s_ashr_i32 s4, s2, 31 498; GFX9-NEXT: s_add_i32 s2, s2, s4 499; GFX9-NEXT: s_xor_b32 s2, s2, s4 500; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 501; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 502; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 503; GFX9-NEXT: v_mov_b32_e32 v1, 0 504; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 505; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 506; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v0 507; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 508; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 509; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v0 510; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 511; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 512; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 513; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 514; GFX9-NEXT: s_waitcnt lgkmcnt(0) 515; GFX9-NEXT: global_store_dword v1, v0, s[0:1] 516; GFX9-NEXT: s_endpgm 517; 518; GFX90A-LABEL: srem_i32: 519; GFX90A: ; %bb.0: 520; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 521; GFX90A-NEXT: 
v_mov_b32_e32 v1, 0 522; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 523; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 524; GFX90A-NEXT: s_ashr_i32 s4, s3, 31 525; GFX90A-NEXT: s_add_i32 s3, s3, s4 526; GFX90A-NEXT: s_xor_b32 s3, s3, s4 527; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 528; GFX90A-NEXT: s_sub_i32 s5, 0, s3 529; GFX90A-NEXT: s_ashr_i32 s4, s2, 31 530; GFX90A-NEXT: s_add_i32 s2, s2, s4 531; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 532; GFX90A-NEXT: s_xor_b32 s2, s2, s4 533; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 534; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 535; GFX90A-NEXT: v_mul_lo_u32 v2, s5, v0 536; GFX90A-NEXT: v_mul_hi_u32 v2, v0, v2 537; GFX90A-NEXT: v_add_u32_e32 v0, v0, v2 538; GFX90A-NEXT: v_mul_hi_u32 v0, s2, v0 539; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s3 540; GFX90A-NEXT: v_sub_u32_e32 v0, s2, v0 541; GFX90A-NEXT: v_subrev_u32_e32 v2, s3, v0 542; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 543; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 544; GFX90A-NEXT: v_subrev_u32_e32 v2, s3, v0 545; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 546; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 547; GFX90A-NEXT: v_xor_b32_e32 v0, s4, v0 548; GFX90A-NEXT: v_subrev_u32_e32 v0, s4, v0 549; GFX90A-NEXT: global_store_dword v1, v0, s[0:1] 550; GFX90A-NEXT: s_endpgm 551 %r = srem i32 %x, %y 552 store i32 %r, i32 addrspace(1)* %out 553 ret void 554} 555 556define amdgpu_kernel void @udiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) { 557; CHECK-LABEL: @udiv_i16( 558; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32 559; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32 560; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 561; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 562; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 563; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 564; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 565; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float 
[[TMP7]] 566; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 567; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 568; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 569; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 570; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 571; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 572; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 573; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 65535 574; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16 575; CHECK-NEXT: store i16 [[TMP17]], i16 addrspace(1)* [[OUT:%.*]], align 2 576; CHECK-NEXT: ret void 577; 578; GFX6-LABEL: udiv_i16: 579; GFX6: ; %bb.0: 580; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb 581; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 582; GFX6-NEXT: s_waitcnt lgkmcnt(0) 583; GFX6-NEXT: s_lshr_b32 s3, s2, 16 584; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 585; GFX6-NEXT: s_and_b32 s2, s2, 0xffff 586; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s2 587; GFX6-NEXT: s_mov_b32 s3, 0xf000 588; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 589; GFX6-NEXT: s_mov_b32 s2, -1 590; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 591; GFX6-NEXT: v_trunc_f32_e32 v2, v2 592; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 593; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 594; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 595; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 596; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 597; GFX6-NEXT: s_endpgm 598; 599; GFX9-LABEL: udiv_i16: 600; GFX9: ; %bb.0: 601; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 602; GFX9-NEXT: v_mov_b32_e32 v3, 0 603; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 604; GFX9-NEXT: s_waitcnt lgkmcnt(0) 605; GFX9-NEXT: s_lshr_b32 s3, s2, 16 606; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 607; GFX9-NEXT: s_and_b32 s2, s2, 0xffff 608; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 609; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, 
v0 610; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 611; GFX9-NEXT: v_trunc_f32_e32 v2, v2 612; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 613; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 614; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 615; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc 616; GFX9-NEXT: global_store_short v3, v0, s[0:1] 617; GFX9-NEXT: s_endpgm 618; 619; GFX90A-LABEL: udiv_i16: 620; GFX90A: ; %bb.0: 621; GFX90A-NEXT: s_load_dword s2, s[0:1], 0x2c 622; GFX90A-NEXT: v_mov_b32_e32 v3, 0 623; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 624; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 625; GFX90A-NEXT: s_lshr_b32 s3, s2, 16 626; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 627; GFX90A-NEXT: s_and_b32 s2, s2, 0xffff 628; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s2 629; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v0 630; GFX90A-NEXT: v_mul_f32_e32 v2, v1, v2 631; GFX90A-NEXT: v_trunc_f32_e32 v2, v2 632; GFX90A-NEXT: v_cvt_u32_f32_e32 v4, v2 633; GFX90A-NEXT: v_mad_f32 v1, -v2, v0, v1 634; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 635; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc 636; GFX90A-NEXT: global_store_short v3, v0, s[0:1] 637; GFX90A-NEXT: s_endpgm 638 %r = udiv i16 %x, %y 639 store i16 %r, i16 addrspace(1)* %out 640 ret void 641} 642 643define amdgpu_kernel void @urem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) { 644; CHECK-LABEL: @urem_i16( 645; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32 646; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32 647; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 648; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 649; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 650; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 651; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 652; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 653; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 654; 
CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 655; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 656; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 657; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 658; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 659; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 660; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]] 661; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]] 662; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 65535 663; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16 664; CHECK-NEXT: store i16 [[TMP19]], i16 addrspace(1)* [[OUT:%.*]], align 2 665; CHECK-NEXT: ret void 666; 667; GFX6-LABEL: urem_i16: 668; GFX6: ; %bb.0: 669; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 670; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 671; GFX6-NEXT: s_waitcnt lgkmcnt(0) 672; GFX6-NEXT: s_lshr_b32 s2, s4, 16 673; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 674; GFX6-NEXT: s_and_b32 s3, s4, 0xffff 675; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 676; GFX6-NEXT: s_mov_b32 s3, 0xf000 677; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 678; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 679; GFX6-NEXT: v_trunc_f32_e32 v2, v2 680; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 681; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 682; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 683; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 684; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 685; GFX6-NEXT: s_mov_b32 s2, -1 686; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 687; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 688; GFX6-NEXT: s_endpgm 689; 690; GFX9-LABEL: urem_i16: 691; GFX9: ; %bb.0: 692; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 693; GFX9-NEXT: s_waitcnt lgkmcnt(0) 694; GFX9-NEXT: s_lshr_b32 s3, s2, 16 695; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 696; GFX9-NEXT: s_and_b32 s4, s2, 0xffff 697; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s4 698; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 
699; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 700; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 701; GFX9-NEXT: v_trunc_f32_e32 v2, v2 702; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v2 703; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 704; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 705; GFX9-NEXT: v_mov_b32_e32 v1, 0 706; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 707; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 708; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 709; GFX9-NEXT: s_waitcnt lgkmcnt(0) 710; GFX9-NEXT: global_store_short v1, v0, s[0:1] 711; GFX9-NEXT: s_endpgm 712; 713; GFX90A-LABEL: urem_i16: 714; GFX90A: ; %bb.0: 715; GFX90A-NEXT: s_load_dword s2, s[0:1], 0x2c 716; GFX90A-NEXT: v_mov_b32_e32 v3, 0 717; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 718; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 719; GFX90A-NEXT: s_lshr_b32 s3, s2, 16 720; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 721; GFX90A-NEXT: s_and_b32 s4, s2, 0xffff 722; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s4 723; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v0 724; GFX90A-NEXT: v_mul_f32_e32 v2, v1, v2 725; GFX90A-NEXT: v_trunc_f32_e32 v2, v2 726; GFX90A-NEXT: v_cvt_u32_f32_e32 v4, v2 727; GFX90A-NEXT: v_mad_f32 v1, -v2, v0, v1 728; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 729; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc 730; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s3 731; GFX90A-NEXT: v_sub_u32_e32 v0, s2, v0 732; GFX90A-NEXT: global_store_short v3, v0, s[0:1] 733; GFX90A-NEXT: s_endpgm 734 %r = urem i16 %x, %y 735 store i16 %r, i16 addrspace(1)* %out 736 ret void 737} 738 739define amdgpu_kernel void @sdiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) { 740; CHECK-LABEL: @sdiv_i16( 741; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32 742; CHECK-NEXT: [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32 743; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 744; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 745; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 746; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 747; CHECK-NEXT: [[TMP7:%.*]] = 
sitofp i32 [[TMP2]] to float 748; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 749; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 750; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 751; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 752; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 753; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 754; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 755; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 756; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 757; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 758; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 759; CHECK-NEXT: [[TMP19:%.*]] = shl i32 [[TMP18]], 16 760; CHECK-NEXT: [[TMP20:%.*]] = ashr i32 [[TMP19]], 16 761; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16 762; CHECK-NEXT: store i16 [[TMP21]], i16 addrspace(1)* [[OUT:%.*]], align 2 763; CHECK-NEXT: ret void 764; 765; GFX6-LABEL: sdiv_i16: 766; GFX6: ; %bb.0: 767; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 768; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 769; GFX6-NEXT: s_mov_b32 s7, 0xf000 770; GFX6-NEXT: s_mov_b32 s6, -1 771; GFX6-NEXT: s_waitcnt lgkmcnt(0) 772; GFX6-NEXT: s_ashr_i32 s1, s0, 16 773; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s1 774; GFX6-NEXT: s_sext_i32_i16 s0, s0 775; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s0 776; GFX6-NEXT: s_xor_b32 s0, s0, s1 777; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 778; GFX6-NEXT: s_ashr_i32 s0, s0, 30 779; GFX6-NEXT: s_or_b32 s0, s0, 1 780; GFX6-NEXT: v_mov_b32_e32 v3, s0 781; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 782; GFX6-NEXT: v_trunc_f32_e32 v2, v2 783; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 784; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 785; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 786; GFX6-NEXT: 
v_cndmask_b32_e32 v0, 0, v3, vcc 787; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 788; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 789; GFX6-NEXT: s_endpgm 790; 791; GFX9-LABEL: sdiv_i16: 792; GFX9: ; %bb.0: 793; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 794; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 795; GFX9-NEXT: v_mov_b32_e32 v1, 0 796; GFX9-NEXT: s_waitcnt lgkmcnt(0) 797; GFX9-NEXT: s_ashr_i32 s0, s4, 16 798; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 799; GFX9-NEXT: s_sext_i32_i16 s1, s4 800; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 801; GFX9-NEXT: s_xor_b32 s0, s1, s0 802; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 803; GFX9-NEXT: s_ashr_i32 s0, s0, 30 804; GFX9-NEXT: s_or_b32 s4, s0, 1 805; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 806; GFX9-NEXT: v_trunc_f32_e32 v3, v3 807; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 808; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 809; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 810; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 811; GFX9-NEXT: s_cselect_b32 s0, s4, 0 812; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 813; GFX9-NEXT: global_store_short v1, v0, s[2:3] 814; GFX9-NEXT: s_endpgm 815; 816; GFX90A-LABEL: sdiv_i16: 817; GFX90A: ; %bb.0: 818; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 819; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 820; GFX90A-NEXT: v_mov_b32_e32 v1, 0 821; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 822; GFX90A-NEXT: s_ashr_i32 s0, s4, 16 823; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s0 824; GFX90A-NEXT: s_sext_i32_i16 s1, s4 825; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s1 826; GFX90A-NEXT: s_xor_b32 s0, s1, s0 827; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 828; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 829; GFX90A-NEXT: s_or_b32 s4, s0, 1 830; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 831; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 832; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 833; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 834; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 835; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 836; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 837; 
GFX90A-NEXT: v_add_u32_e32 v0, s0, v3 838; GFX90A-NEXT: global_store_short v1, v0, s[2:3] 839; GFX90A-NEXT: s_endpgm 840 %r = sdiv i16 %x, %y 841 store i16 %r, i16 addrspace(1)* %out 842 ret void 843} 844 845define amdgpu_kernel void @srem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) { 846; CHECK-LABEL: @srem_i16( 847; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32 848; CHECK-NEXT: [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32 849; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 850; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 851; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 852; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 853; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 854; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 855; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 856; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 857; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 858; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 859; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 860; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 861; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 862; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 863; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 864; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 865; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]] 866; CHECK-NEXT: [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]] 867; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 16 868; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 16 869; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16 870; CHECK-NEXT: store i16 [[TMP23]], i16 addrspace(1)* [[OUT:%.*]], align 2 871; CHECK-NEXT: ret void 872; 873; GFX6-LABEL: srem_i16: 874; 
GFX6: ; %bb.0: 875; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 876; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 877; GFX6-NEXT: s_waitcnt lgkmcnt(0) 878; GFX6-NEXT: s_ashr_i32 s2, s4, 16 879; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s2 880; GFX6-NEXT: s_sext_i32_i16 s3, s4 881; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s3 882; GFX6-NEXT: s_xor_b32 s3, s3, s2 883; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 884; GFX6-NEXT: s_ashr_i32 s3, s3, 30 885; GFX6-NEXT: s_or_b32 s3, s3, 1 886; GFX6-NEXT: v_mov_b32_e32 v3, s3 887; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 888; GFX6-NEXT: v_trunc_f32_e32 v2, v2 889; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 890; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 891; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 892; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 893; GFX6-NEXT: s_mov_b32 s3, 0xf000 894; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 895; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 896; GFX6-NEXT: s_mov_b32 s2, -1 897; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 898; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 899; GFX6-NEXT: s_endpgm 900; 901; GFX9-LABEL: srem_i16: 902; GFX9: ; %bb.0: 903; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 904; GFX9-NEXT: s_waitcnt lgkmcnt(0) 905; GFX9-NEXT: s_ashr_i32 s5, s4, 16 906; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s5 907; GFX9-NEXT: s_sext_i32_i16 s2, s4 908; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s2 909; GFX9-NEXT: s_xor_b32 s2, s2, s5 910; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 911; GFX9-NEXT: s_ashr_i32 s2, s2, 30 912; GFX9-NEXT: s_or_b32 s6, s2, 1 913; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 914; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 915; GFX9-NEXT: v_trunc_f32_e32 v2, v2 916; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 917; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 918; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| 919; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec 920; GFX9-NEXT: s_cselect_b32 s2, s6, 0 921; GFX9-NEXT: v_add_u32_e32 v0, s2, v2 922; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 923; GFX9-NEXT: v_mov_b32_e32 v1, 0 924; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 
925; GFX9-NEXT: s_waitcnt lgkmcnt(0) 926; GFX9-NEXT: global_store_short v1, v0, s[0:1] 927; GFX9-NEXT: s_endpgm 928; 929; GFX90A-LABEL: srem_i16: 930; GFX90A: ; %bb.0: 931; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 932; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 933; GFX90A-NEXT: v_mov_b32_e32 v1, 0 934; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 935; GFX90A-NEXT: s_ashr_i32 s5, s4, 16 936; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s5 937; GFX90A-NEXT: s_sext_i32_i16 s0, s4 938; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s0 939; GFX90A-NEXT: s_xor_b32 s0, s0, s5 940; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 941; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 942; GFX90A-NEXT: s_or_b32 s6, s0, 1 943; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 944; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 945; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 946; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 947; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 948; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 949; GFX90A-NEXT: s_cselect_b32 s0, s6, 0 950; GFX90A-NEXT: v_add_u32_e32 v0, s0, v3 951; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s5 952; GFX90A-NEXT: v_sub_u32_e32 v0, s4, v0 953; GFX90A-NEXT: global_store_short v1, v0, s[2:3] 954; GFX90A-NEXT: s_endpgm 955 %r = srem i16 %x, %y 956 store i16 %r, i16 addrspace(1)* %out 957 ret void 958} 959 960define amdgpu_kernel void @udiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) { 961; CHECK-LABEL: @udiv_i8( 962; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32 963; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32 964; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 965; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 966; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 967; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 968; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 969; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 970; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float 
[[TMP8]], float [[TMP4]], float [[TMP3]]) 971; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 972; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 973; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 974; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 975; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 976; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 977; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 255 978; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 979; CHECK-NEXT: store i8 [[TMP17]], i8 addrspace(1)* [[OUT:%.*]], align 1 980; CHECK-NEXT: ret void 981; 982; GFX6-LABEL: udiv_i8: 983; GFX6: ; %bb.0: 984; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 985; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 986; GFX6-NEXT: s_mov_b32 s7, 0xf000 987; GFX6-NEXT: s_mov_b32 s6, -1 988; GFX6-NEXT: s_waitcnt lgkmcnt(0) 989; GFX6-NEXT: v_cvt_f32_ubyte1_e32 v0, s0 990; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 991; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 992; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 993; GFX6-NEXT: v_trunc_f32_e32 v1, v1 994; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 995; GFX6-NEXT: v_mad_f32 v1, -v1, v0, v2 996; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 997; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 998; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 999; GFX6-NEXT: s_endpgm 1000; 1001; GFX9-LABEL: udiv_i8: 1002; GFX9: ; %bb.0: 1003; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 1004; GFX9-NEXT: v_mov_b32_e32 v2, 0 1005; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1006; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1007; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s2 1008; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 1009; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 1010; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 1011; GFX9-NEXT: v_trunc_f32_e32 v1, v1 1012; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v1 1013; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v3 1014; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 1015; 
GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc 1016; GFX9-NEXT: global_store_byte v2, v0, s[0:1] 1017; GFX9-NEXT: s_endpgm 1018; 1019; GFX90A-LABEL: udiv_i8: 1020; GFX90A: ; %bb.0: 1021; GFX90A-NEXT: s_load_dword s2, s[0:1], 0x2c 1022; GFX90A-NEXT: v_mov_b32_e32 v2, 0 1023; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1024; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 1025; GFX90A-NEXT: v_cvt_f32_ubyte1_e32 v0, s2 1026; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v0 1027; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 1028; GFX90A-NEXT: v_mul_f32_e32 v1, v3, v1 1029; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 1030; GFX90A-NEXT: v_cvt_u32_f32_e32 v4, v1 1031; GFX90A-NEXT: v_mad_f32 v1, -v1, v0, v3 1032; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 1033; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc 1034; GFX90A-NEXT: global_store_byte v2, v0, s[0:1] 1035; GFX90A-NEXT: s_endpgm 1036 %r = udiv i8 %x, %y 1037 store i8 %r, i8 addrspace(1)* %out 1038 ret void 1039} 1040 1041define amdgpu_kernel void @urem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) { 1042; CHECK-LABEL: @urem_i8( 1043; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32 1044; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32 1045; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 1046; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 1047; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 1048; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 1049; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 1050; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 1051; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 1052; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 1053; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 1054; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 1055; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float 
[[TMP11]], [[TMP12]] 1056; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 1057; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 1058; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]] 1059; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]] 1060; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 255 1061; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i8 1062; CHECK-NEXT: store i8 [[TMP19]], i8 addrspace(1)* [[OUT:%.*]], align 1 1063; CHECK-NEXT: ret void 1064; 1065; GFX6-LABEL: urem_i8: 1066; GFX6: ; %bb.0: 1067; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 1068; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1069; GFX6-NEXT: s_mov_b32 s3, 0xf000 1070; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1071; GFX6-NEXT: v_cvt_f32_ubyte1_e32 v0, s4 1072; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 1073; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 1074; GFX6-NEXT: s_lshr_b32 s2, s4, 8 1075; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 1076; GFX6-NEXT: v_trunc_f32_e32 v1, v1 1077; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 1078; GFX6-NEXT: v_mad_f32 v1, -v1, v0, v2 1079; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 1080; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 1081; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 1082; GFX6-NEXT: s_mov_b32 s2, -1 1083; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 1084; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 1085; GFX6-NEXT: s_endpgm 1086; 1087; GFX9-LABEL: urem_i8: 1088; GFX9: ; %bb.0: 1089; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 1090; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1091; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s2 1092; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 1093; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 1094; GFX9-NEXT: s_lshr_b32 s3, s2, 8 1095; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1096; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1 1097; GFX9-NEXT: v_trunc_f32_e32 v1, v1 1098; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v1 1099; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v2 1100; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 1101; GFX9-NEXT: v_mov_b32_e32 v1, 0 1102; GFX9-NEXT: 
v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 1103; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 1104; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 1105; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1106; GFX9-NEXT: global_store_byte v1, v0, s[0:1] 1107; GFX9-NEXT: s_endpgm 1108; 1109; GFX90A-LABEL: urem_i8: 1110; GFX90A: ; %bb.0: 1111; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1112; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 1113; GFX90A-NEXT: v_mov_b32_e32 v2, 0 1114; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 1115; GFX90A-NEXT: v_cvt_f32_ubyte1_e32 v0, s4 1116; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v0 1117; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v3, s4 1118; GFX90A-NEXT: s_lshr_b32 s0, s4, 8 1119; GFX90A-NEXT: v_mul_f32_e32 v1, v3, v1 1120; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 1121; GFX90A-NEXT: v_cvt_u32_f32_e32 v4, v1 1122; GFX90A-NEXT: v_mad_f32 v1, -v1, v0, v3 1123; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 1124; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc 1125; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s0 1126; GFX90A-NEXT: v_sub_u32_e32 v0, s4, v0 1127; GFX90A-NEXT: global_store_byte v2, v0, s[2:3] 1128; GFX90A-NEXT: s_endpgm 1129 %r = urem i8 %x, %y 1130 store i8 %r, i8 addrspace(1)* %out 1131 ret void 1132} 1133 1134define amdgpu_kernel void @sdiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) { 1135; CHECK-LABEL: @sdiv_i8( 1136; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32 1137; CHECK-NEXT: [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32 1138; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 1139; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 1140; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 1141; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 1142; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 1143; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 1144; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 1145; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 1146; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float 
[[TMP10]] 1147; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 1148; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 1149; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 1150; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 1151; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 1152; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 1153; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 1154; CHECK-NEXT: [[TMP19:%.*]] = shl i32 [[TMP18]], 24 1155; CHECK-NEXT: [[TMP20:%.*]] = ashr i32 [[TMP19]], 24 1156; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i8 1157; CHECK-NEXT: store i8 [[TMP21]], i8 addrspace(1)* [[OUT:%.*]], align 1 1158; CHECK-NEXT: ret void 1159; 1160; GFX6-LABEL: sdiv_i8: 1161; GFX6: ; %bb.0: 1162; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1163; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 1164; GFX6-NEXT: s_mov_b32 s7, 0xf000 1165; GFX6-NEXT: s_mov_b32 s6, -1 1166; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1167; GFX6-NEXT: s_bfe_i32 s1, s0, 0x80008 1168; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s1 1169; GFX6-NEXT: s_sext_i32_i8 s0, s0 1170; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s0 1171; GFX6-NEXT: s_xor_b32 s0, s0, s1 1172; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 1173; GFX6-NEXT: s_ashr_i32 s0, s0, 30 1174; GFX6-NEXT: s_or_b32 s0, s0, 1 1175; GFX6-NEXT: v_mov_b32_e32 v3, s0 1176; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 1177; GFX6-NEXT: v_trunc_f32_e32 v2, v2 1178; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 1179; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 1180; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 1181; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 1182; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 1183; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 1184; GFX6-NEXT: s_endpgm 1185; 1186; GFX9-LABEL: sdiv_i8: 1187; GFX9: ; %bb.0: 1188; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1189; GFX9-NEXT: 
s_load_dword s4, s[0:1], 0x2c 1190; GFX9-NEXT: v_mov_b32_e32 v1, 0 1191; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1192; GFX9-NEXT: s_bfe_i32 s0, s4, 0x80008 1193; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 1194; GFX9-NEXT: s_sext_i32_i8 s1, s4 1195; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 1196; GFX9-NEXT: s_xor_b32 s0, s1, s0 1197; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 1198; GFX9-NEXT: s_ashr_i32 s0, s0, 30 1199; GFX9-NEXT: s_or_b32 s4, s0, 1 1200; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 1201; GFX9-NEXT: v_trunc_f32_e32 v3, v3 1202; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 1203; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 1204; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 1205; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 1206; GFX9-NEXT: s_cselect_b32 s0, s4, 0 1207; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 1208; GFX9-NEXT: global_store_byte v1, v0, s[2:3] 1209; GFX9-NEXT: s_endpgm 1210; 1211; GFX90A-LABEL: sdiv_i8: 1212; GFX90A: ; %bb.0: 1213; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1214; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 1215; GFX90A-NEXT: v_mov_b32_e32 v1, 0 1216; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 1217; GFX90A-NEXT: s_bfe_i32 s0, s4, 0x80008 1218; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s0 1219; GFX90A-NEXT: s_sext_i32_i8 s1, s4 1220; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s1 1221; GFX90A-NEXT: s_xor_b32 s0, s1, s0 1222; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 1223; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 1224; GFX90A-NEXT: s_or_b32 s4, s0, 1 1225; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 1226; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 1227; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 1228; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 1229; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 1230; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 1231; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 1232; GFX90A-NEXT: v_add_u32_e32 v0, s0, v3 1233; GFX90A-NEXT: global_store_byte v1, v0, s[2:3] 1234; GFX90A-NEXT: s_endpgm 1235 %r = sdiv i8 %x, %y 1236 store i8 %r, i8 addrspace(1)* %out 1237 ret void 1238} 1239 1240define amdgpu_kernel void 
@srem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) { 1241; CHECK-LABEL: @srem_i8( 1242; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32 1243; CHECK-NEXT: [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32 1244; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 1245; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 1246; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 1247; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 1248; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 1249; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 1250; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 1251; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 1252; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 1253; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 1254; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 1255; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 1256; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 1257; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 1258; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 1259; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 1260; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]] 1261; CHECK-NEXT: [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]] 1262; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 24 1263; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 24 1264; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i8 1265; CHECK-NEXT: store i8 [[TMP23]], i8 addrspace(1)* [[OUT:%.*]], align 1 1266; CHECK-NEXT: ret void 1267; 1268; GFX6-LABEL: srem_i8: 1269; GFX6: ; %bb.0: 1270; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 1271; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 1272; GFX6-NEXT: s_mov_b32 s7, 0xf000 1273; GFX6-NEXT: s_mov_b32 s6, -1 1274; GFX6-NEXT: s_waitcnt 
lgkmcnt(0) 1275; GFX6-NEXT: s_bfe_i32 s1, s0, 0x80008 1276; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s1 1277; GFX6-NEXT: s_sext_i32_i8 s3, s0 1278; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s3 1279; GFX6-NEXT: s_xor_b32 s1, s3, s1 1280; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 1281; GFX6-NEXT: s_ashr_i32 s1, s1, 30 1282; GFX6-NEXT: s_or_b32 s1, s1, 1 1283; GFX6-NEXT: v_mov_b32_e32 v3, s1 1284; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 1285; GFX6-NEXT: v_trunc_f32_e32 v2, v2 1286; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 1287; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 1288; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 1289; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 1290; GFX6-NEXT: s_lshr_b32 s2, s0, 8 1291; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 1292; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 1293; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 1294; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 1295; GFX6-NEXT: s_endpgm 1296; 1297; GFX9-LABEL: srem_i8: 1298; GFX9: ; %bb.0: 1299; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 1300; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1301; GFX9-NEXT: s_bfe_i32 s2, s4, 0x80008 1302; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 1303; GFX9-NEXT: s_sext_i32_i8 s3, s4 1304; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s3 1305; GFX9-NEXT: s_xor_b32 s2, s3, s2 1306; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 1307; GFX9-NEXT: s_ashr_i32 s2, s2, 30 1308; GFX9-NEXT: s_lshr_b32 s5, s4, 8 1309; GFX9-NEXT: s_or_b32 s6, s2, 1 1310; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 1311; GFX9-NEXT: v_trunc_f32_e32 v2, v2 1312; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 1313; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 1314; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| 1315; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec 1316; GFX9-NEXT: s_cselect_b32 s2, s6, 0 1317; GFX9-NEXT: v_add_u32_e32 v0, s2, v2 1318; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 1319; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1320; GFX9-NEXT: v_mov_b32_e32 v1, 0 1321; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 1322; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1323; GFX9-NEXT: global_store_byte v1, v0, 
s[0:1] 1324; GFX9-NEXT: s_endpgm 1325; 1326; GFX90A-LABEL: srem_i8: 1327; GFX90A: ; %bb.0: 1328; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1329; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 1330; GFX90A-NEXT: v_mov_b32_e32 v0, 0 1331; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 1332; GFX90A-NEXT: s_bfe_i32 s0, s4, 0x80008 1333; GFX90A-NEXT: v_cvt_f32_i32_e32 v1, s0 1334; GFX90A-NEXT: s_sext_i32_i8 s1, s4 1335; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s1 1336; GFX90A-NEXT: s_xor_b32 s0, s1, s0 1337; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v1 1338; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 1339; GFX90A-NEXT: s_lshr_b32 s5, s4, 8 1340; GFX90A-NEXT: s_or_b32 s6, s0, 1 1341; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 1342; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 1343; GFX90A-NEXT: v_mad_f32 v2, -v3, v1, v2 1344; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 1345; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v1| 1346; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 1347; GFX90A-NEXT: s_cselect_b32 s0, s6, 0 1348; GFX90A-NEXT: v_add_u32_e32 v1, s0, v3 1349; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s5 1350; GFX90A-NEXT: v_sub_u32_e32 v1, s4, v1 1351; GFX90A-NEXT: global_store_byte v0, v1, s[2:3] 1352; GFX90A-NEXT: s_endpgm 1353 %r = srem i8 %x, %y 1354 store i8 %r, i8 addrspace(1)* %out 1355 ret void 1356} 1357 1358define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 1359; CHECK-LABEL: @udiv_v4i32( 1360; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 1361; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 1362; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 1363; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 1364; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 1365; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 1366; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 1367; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 1368; CHECK-NEXT: [[TMP9:%.*]] = zext 
i32 [[TMP6]] to i64 1369; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 1370; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 1371; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 1372; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 1373; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 1374; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 1375; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 1376; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 1377; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 1378; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 1379; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 1380; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 1381; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 1382; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 1383; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 1384; CHECK-NEXT: [[TMP25:%.*]] = add i32 [[TMP21]], 1 1385; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]] 1386; CHECK-NEXT: [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]] 1387; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]] 1388; CHECK-NEXT: [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]] 1389; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP26]], 1 1390; CHECK-NEXT: [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]] 1391; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> undef, i32 [[TMP31]], i64 0 1392; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i32> [[X]], i64 1 1393; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[Y]], i64 1 1394; CHECK-NEXT: [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float 1395; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 1396; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000 1397; CHECK-NEXT: [[TMP38:%.*]] = fptoui float [[TMP37]] to i32 1398; CHECK-NEXT: [[TMP39:%.*]] = sub i32 0, 
[[TMP34]] 1399; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]] 1400; CHECK-NEXT: [[TMP41:%.*]] = zext i32 [[TMP38]] to i64 1401; CHECK-NEXT: [[TMP42:%.*]] = zext i32 [[TMP40]] to i64 1402; CHECK-NEXT: [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]] 1403; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 1404; CHECK-NEXT: [[TMP45:%.*]] = lshr i64 [[TMP43]], 32 1405; CHECK-NEXT: [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32 1406; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]] 1407; CHECK-NEXT: [[TMP48:%.*]] = zext i32 [[TMP33]] to i64 1408; CHECK-NEXT: [[TMP49:%.*]] = zext i32 [[TMP47]] to i64 1409; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]] 1410; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 1411; CHECK-NEXT: [[TMP52:%.*]] = lshr i64 [[TMP50]], 32 1412; CHECK-NEXT: [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32 1413; CHECK-NEXT: [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]] 1414; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]] 1415; CHECK-NEXT: [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]] 1416; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP53]], 1 1417; CHECK-NEXT: [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]] 1418; CHECK-NEXT: [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]] 1419; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]] 1420; CHECK-NEXT: [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]] 1421; CHECK-NEXT: [[TMP62:%.*]] = add i32 [[TMP58]], 1 1422; CHECK-NEXT: [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]] 1423; CHECK-NEXT: [[TMP64:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP63]], i64 1 1424; CHECK-NEXT: [[TMP65:%.*]] = extractelement <4 x i32> [[X]], i64 2 1425; CHECK-NEXT: [[TMP66:%.*]] = extractelement <4 x i32> [[Y]], i64 2 1426; CHECK-NEXT: [[TMP67:%.*]] = uitofp i32 [[TMP66]] to float 1427; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP67]]) 1428; CHECK-NEXT: [[TMP69:%.*]] = fmul fast float [[TMP68]], 
0x41EFFFFFC0000000 1429; CHECK-NEXT: [[TMP70:%.*]] = fptoui float [[TMP69]] to i32 1430; CHECK-NEXT: [[TMP71:%.*]] = sub i32 0, [[TMP66]] 1431; CHECK-NEXT: [[TMP72:%.*]] = mul i32 [[TMP71]], [[TMP70]] 1432; CHECK-NEXT: [[TMP73:%.*]] = zext i32 [[TMP70]] to i64 1433; CHECK-NEXT: [[TMP74:%.*]] = zext i32 [[TMP72]] to i64 1434; CHECK-NEXT: [[TMP75:%.*]] = mul i64 [[TMP73]], [[TMP74]] 1435; CHECK-NEXT: [[TMP76:%.*]] = trunc i64 [[TMP75]] to i32 1436; CHECK-NEXT: [[TMP77:%.*]] = lshr i64 [[TMP75]], 32 1437; CHECK-NEXT: [[TMP78:%.*]] = trunc i64 [[TMP77]] to i32 1438; CHECK-NEXT: [[TMP79:%.*]] = add i32 [[TMP70]], [[TMP78]] 1439; CHECK-NEXT: [[TMP80:%.*]] = zext i32 [[TMP65]] to i64 1440; CHECK-NEXT: [[TMP81:%.*]] = zext i32 [[TMP79]] to i64 1441; CHECK-NEXT: [[TMP82:%.*]] = mul i64 [[TMP80]], [[TMP81]] 1442; CHECK-NEXT: [[TMP83:%.*]] = trunc i64 [[TMP82]] to i32 1443; CHECK-NEXT: [[TMP84:%.*]] = lshr i64 [[TMP82]], 32 1444; CHECK-NEXT: [[TMP85:%.*]] = trunc i64 [[TMP84]] to i32 1445; CHECK-NEXT: [[TMP86:%.*]] = mul i32 [[TMP85]], [[TMP66]] 1446; CHECK-NEXT: [[TMP87:%.*]] = sub i32 [[TMP65]], [[TMP86]] 1447; CHECK-NEXT: [[TMP88:%.*]] = icmp uge i32 [[TMP87]], [[TMP66]] 1448; CHECK-NEXT: [[TMP89:%.*]] = add i32 [[TMP85]], 1 1449; CHECK-NEXT: [[TMP90:%.*]] = select i1 [[TMP88]], i32 [[TMP89]], i32 [[TMP85]] 1450; CHECK-NEXT: [[TMP91:%.*]] = sub i32 [[TMP87]], [[TMP66]] 1451; CHECK-NEXT: [[TMP92:%.*]] = select i1 [[TMP88]], i32 [[TMP91]], i32 [[TMP87]] 1452; CHECK-NEXT: [[TMP93:%.*]] = icmp uge i32 [[TMP92]], [[TMP66]] 1453; CHECK-NEXT: [[TMP94:%.*]] = add i32 [[TMP90]], 1 1454; CHECK-NEXT: [[TMP95:%.*]] = select i1 [[TMP93]], i32 [[TMP94]], i32 [[TMP90]] 1455; CHECK-NEXT: [[TMP96:%.*]] = insertelement <4 x i32> [[TMP64]], i32 [[TMP95]], i64 2 1456; CHECK-NEXT: [[TMP97:%.*]] = extractelement <4 x i32> [[X]], i64 3 1457; CHECK-NEXT: [[TMP98:%.*]] = extractelement <4 x i32> [[Y]], i64 3 1458; CHECK-NEXT: [[TMP99:%.*]] = uitofp i32 [[TMP98]] to float 1459; CHECK-NEXT: 
[[TMP100:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP99]]) 1460; CHECK-NEXT: [[TMP101:%.*]] = fmul fast float [[TMP100]], 0x41EFFFFFC0000000 1461; CHECK-NEXT: [[TMP102:%.*]] = fptoui float [[TMP101]] to i32 1462; CHECK-NEXT: [[TMP103:%.*]] = sub i32 0, [[TMP98]] 1463; CHECK-NEXT: [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP102]] 1464; CHECK-NEXT: [[TMP105:%.*]] = zext i32 [[TMP102]] to i64 1465; CHECK-NEXT: [[TMP106:%.*]] = zext i32 [[TMP104]] to i64 1466; CHECK-NEXT: [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]] 1467; CHECK-NEXT: [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32 1468; CHECK-NEXT: [[TMP109:%.*]] = lshr i64 [[TMP107]], 32 1469; CHECK-NEXT: [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32 1470; CHECK-NEXT: [[TMP111:%.*]] = add i32 [[TMP102]], [[TMP110]] 1471; CHECK-NEXT: [[TMP112:%.*]] = zext i32 [[TMP97]] to i64 1472; CHECK-NEXT: [[TMP113:%.*]] = zext i32 [[TMP111]] to i64 1473; CHECK-NEXT: [[TMP114:%.*]] = mul i64 [[TMP112]], [[TMP113]] 1474; CHECK-NEXT: [[TMP115:%.*]] = trunc i64 [[TMP114]] to i32 1475; CHECK-NEXT: [[TMP116:%.*]] = lshr i64 [[TMP114]], 32 1476; CHECK-NEXT: [[TMP117:%.*]] = trunc i64 [[TMP116]] to i32 1477; CHECK-NEXT: [[TMP118:%.*]] = mul i32 [[TMP117]], [[TMP98]] 1478; CHECK-NEXT: [[TMP119:%.*]] = sub i32 [[TMP97]], [[TMP118]] 1479; CHECK-NEXT: [[TMP120:%.*]] = icmp uge i32 [[TMP119]], [[TMP98]] 1480; CHECK-NEXT: [[TMP121:%.*]] = add i32 [[TMP117]], 1 1481; CHECK-NEXT: [[TMP122:%.*]] = select i1 [[TMP120]], i32 [[TMP121]], i32 [[TMP117]] 1482; CHECK-NEXT: [[TMP123:%.*]] = sub i32 [[TMP119]], [[TMP98]] 1483; CHECK-NEXT: [[TMP124:%.*]] = select i1 [[TMP120]], i32 [[TMP123]], i32 [[TMP119]] 1484; CHECK-NEXT: [[TMP125:%.*]] = icmp uge i32 [[TMP124]], [[TMP98]] 1485; CHECK-NEXT: [[TMP126:%.*]] = add i32 [[TMP122]], 1 1486; CHECK-NEXT: [[TMP127:%.*]] = select i1 [[TMP125]], i32 [[TMP126]], i32 [[TMP122]] 1487; CHECK-NEXT: [[TMP128:%.*]] = insertelement <4 x i32> [[TMP96]], i32 [[TMP127]], i64 3 1488; CHECK-NEXT: store <4 x i32> 
[[TMP128]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 1489; CHECK-NEXT: ret void 1490; 1491; GFX6-LABEL: udiv_v4i32: 1492; GFX6: ; %bb.0: 1493; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 1494; GFX6-NEXT: s_mov_b32 s3, 0x4f7ffffe 1495; GFX6-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 1496; GFX6-NEXT: s_mov_b32 s15, 0xf000 1497; GFX6-NEXT: s_mov_b32 s14, -1 1498; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1499; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 1500; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 1501; GFX6-NEXT: s_sub_i32 s2, 0, s8 1502; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s10 1503; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 1504; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 1505; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s11 1506; GFX6-NEXT: v_mul_f32_e32 v0, s3, v0 1507; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 1508; GFX6-NEXT: v_mul_f32_e32 v1, s3, v1 1509; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 1510; GFX6-NEXT: v_mul_lo_u32 v2, s2, v0 1511; GFX6-NEXT: s_sub_i32 s2, 0, s9 1512; GFX6-NEXT: v_mul_lo_u32 v3, s2, v1 1513; GFX6-NEXT: s_sub_i32 s2, 0, s10 1514; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 1515; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 1516; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 1517; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 1518; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 1519; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 1520; GFX6-NEXT: v_mul_lo_u32 v2, v0, s8 1521; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 1522; GFX6-NEXT: v_mul_lo_u32 v5, v1, s9 1523; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s4, v2 1524; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v2 1525; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] 1526; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v2 1527; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 1528; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 1529; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 1530; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v4 1531; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1532; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s5, v5 1533; GFX6-NEXT: v_mul_f32_e32 v2, s3, v2 1534; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 1535; GFX6-NEXT: 
v_add_i32_e32 v4, vcc, 1, v1 1536; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v3 1537; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] 1538; GFX6-NEXT: v_mul_lo_u32 v4, s2, v2 1539; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s9, v3 1540; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 1541; GFX6-NEXT: v_mul_hi_u32 v4, v2, v4 1542; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v1 1543; GFX6-NEXT: s_sub_i32 s0, 0, s11 1544; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 1545; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v6 1546; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 1547; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 1548; GFX6-NEXT: v_mul_hi_u32 v2, s6, v2 1549; GFX6-NEXT: v_mul_f32_e32 v4, s3, v4 1550; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 1551; GFX6-NEXT: v_mul_lo_u32 v3, v2, s10 1552; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v2 1553; GFX6-NEXT: v_mul_lo_u32 v5, s0, v4 1554; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s6, v3 1555; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v3 1556; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5 1557; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] 1558; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, s10, v3 1559; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 1560; GFX6-NEXT: v_mul_hi_u32 v4, s7, v4 1561; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] 1562; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v2 1563; GFX6-NEXT: v_mul_lo_u32 v6, v4, s11 1564; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 1565; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 1566; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 1567; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s7, v6 1568; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v3 1569; GFX6-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] 1570; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s11, v3 1571; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 1572; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 1573; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1574; GFX6-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 1575; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 1576; GFX6-NEXT: s_endpgm 1577; 1578; GFX9-LABEL: udiv_v4i32: 1579; GFX9: ; 
%bb.0: 1580; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 1581; GFX9-NEXT: s_mov_b32 s12, 0x4f7ffffe 1582; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1583; GFX9-NEXT: v_mov_b32_e32 v4, 0 1584; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1585; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 1586; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 1587; GFX9-NEXT: s_sub_i32 s2, 0, s8 1588; GFX9-NEXT: s_sub_i32 s3, 0, s9 1589; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 1590; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 1591; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s10 1592; GFX9-NEXT: v_mul_f32_e32 v0, s12, v0 1593; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 1594; GFX9-NEXT: v_mul_f32_e32 v1, s12, v1 1595; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 1596; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v5 1597; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 1598; GFX9-NEXT: s_sub_i32 s2, 0, s10 1599; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1 1600; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 1601; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 1602; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 1603; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 1604; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 1605; GFX9-NEXT: v_mul_f32_e32 v3, s12, v5 1606; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 1607; GFX9-NEXT: v_mul_lo_u32 v5, v0, s8 1608; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s11 1609; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 1610; GFX9-NEXT: v_add_u32_e32 v7, 1, v0 1611; GFX9-NEXT: v_sub_u32_e32 v5, s4, v5 1612; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v5 1613; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc 1614; GFX9-NEXT: v_subrev_u32_e32 v7, s8, v5 1615; GFX9-NEXT: v_mul_lo_u32 v6, v1, s9 1616; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc 1617; GFX9-NEXT: v_add_u32_e32 v7, 1, v0 1618; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v5 1619; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 1620; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc 1621; GFX9-NEXT: v_mul_lo_u32 v7, s2, v3 1622; GFX9-NEXT: v_sub_u32_e32 v6, s5, v6 1623; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 1624; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v6 1625; GFX9-NEXT: v_mul_f32_e32 v2, s12, v2 1626; 
GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 1627; GFX9-NEXT: v_mul_hi_u32 v5, v3, v7 1628; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 1629; GFX9-NEXT: s_sub_i32 s2, 0, s11 1630; GFX9-NEXT: v_subrev_u32_e32 v7, s9, v6 1631; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 1632; GFX9-NEXT: v_mul_lo_u32 v5, s2, v2 1633; GFX9-NEXT: v_mul_hi_u32 v3, s6, v3 1634; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc 1635; GFX9-NEXT: v_add_u32_e32 v7, 1, v1 1636; GFX9-NEXT: v_mul_hi_u32 v5, v2, v5 1637; GFX9-NEXT: v_mul_lo_u32 v8, v3, s10 1638; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v6 1639; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc 1640; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 1641; GFX9-NEXT: v_mul_hi_u32 v5, s7, v2 1642; GFX9-NEXT: v_sub_u32_e32 v6, s6, v8 1643; GFX9-NEXT: v_add_u32_e32 v7, 1, v3 1644; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v6 1645; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc 1646; GFX9-NEXT: v_subrev_u32_e32 v3, s10, v6 1647; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc 1648; GFX9-NEXT: v_mul_lo_u32 v6, v5, s11 1649; GFX9-NEXT: v_add_u32_e32 v7, 1, v2 1650; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 1651; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc 1652; GFX9-NEXT: v_sub_u32_e32 v3, s7, v6 1653; GFX9-NEXT: v_add_u32_e32 v6, 1, v5 1654; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1655; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 1656; GFX9-NEXT: v_subrev_u32_e32 v6, s11, v3 1657; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 1658; GFX9-NEXT: v_add_u32_e32 v6, 1, v5 1659; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1660; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc 1661; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 1662; GFX9-NEXT: s_endpgm 1663; 1664; GFX90A-LABEL: udiv_v4i32: 1665; GFX90A: ; %bb.0: 1666; GFX90A-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 1667; GFX90A-NEXT: s_mov_b32 s3, 0x4f7ffffe 1668; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1669; GFX90A-NEXT: v_mov_b32_e32 v4, 0 1670; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 1671; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s8 1672; 
GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s9 1673; GFX90A-NEXT: s_sub_i32 s2, 0, s8 1674; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 1675; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v1 1676; GFX90A-NEXT: v_mul_f32_e32 v0, s3, v0 1677; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 1678; GFX90A-NEXT: v_mul_f32_e32 v1, s3, v1 1679; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 1680; GFX90A-NEXT: v_mul_lo_u32 v2, s2, v0 1681; GFX90A-NEXT: v_mul_hi_u32 v2, v0, v2 1682; GFX90A-NEXT: v_add_u32_e32 v0, v0, v2 1683; GFX90A-NEXT: v_mul_hi_u32 v0, s4, v0 1684; GFX90A-NEXT: v_mul_lo_u32 v2, v0, s8 1685; GFX90A-NEXT: v_sub_u32_e32 v2, s4, v2 1686; GFX90A-NEXT: v_add_u32_e32 v3, 1, v0 1687; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 1688; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1689; GFX90A-NEXT: v_subrev_u32_e32 v3, s8, v2 1690; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc 1691; GFX90A-NEXT: s_sub_i32 s2, 0, s9 1692; GFX90A-NEXT: v_add_u32_e32 v3, 1, v0 1693; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 1694; GFX90A-NEXT: v_mul_lo_u32 v2, s2, v1 1695; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1696; GFX90A-NEXT: v_mul_hi_u32 v2, v1, v2 1697; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, s10 1698; GFX90A-NEXT: v_add_u32_e32 v1, v1, v2 1699; GFX90A-NEXT: v_mul_hi_u32 v1, s5, v1 1700; GFX90A-NEXT: v_mul_lo_u32 v2, v1, s9 1701; GFX90A-NEXT: v_sub_u32_e32 v2, s5, v2 1702; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v3 1703; GFX90A-NEXT: v_add_u32_e32 v5, 1, v1 1704; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 1705; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 1706; GFX90A-NEXT: v_subrev_u32_e32 v5, s9, v2 1707; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 1708; GFX90A-NEXT: v_add_u32_e32 v5, 1, v1 1709; GFX90A-NEXT: v_mul_f32_e32 v3, s3, v3 1710; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 1711; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 1712; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 1713; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s11 1714; GFX90A-NEXT: s_sub_i32 s2, 0, s10 1715; GFX90A-NEXT: v_mul_lo_u32 v2, s2, v3 1716; 
GFX90A-NEXT: v_mul_hi_u32 v2, v3, v2 1717; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v5 1718; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 1719; GFX90A-NEXT: v_mul_hi_u32 v2, s6, v2 1720; GFX90A-NEXT: v_mul_lo_u32 v3, v2, s10 1721; GFX90A-NEXT: v_mul_f32_e32 v5, s3, v5 1722; GFX90A-NEXT: v_sub_u32_e32 v3, s6, v3 1723; GFX90A-NEXT: v_cvt_u32_f32_e32 v5, v5 1724; GFX90A-NEXT: v_add_u32_e32 v6, 1, v2 1725; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 1726; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 1727; GFX90A-NEXT: v_subrev_u32_e32 v6, s10, v3 1728; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 1729; GFX90A-NEXT: s_sub_i32 s2, 0, s11 1730; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 1731; GFX90A-NEXT: v_mul_lo_u32 v3, s2, v5 1732; GFX90A-NEXT: v_mul_hi_u32 v3, v5, v3 1733; GFX90A-NEXT: v_add_u32_e32 v3, v5, v3 1734; GFX90A-NEXT: v_mul_hi_u32 v3, s7, v3 1735; GFX90A-NEXT: v_mul_lo_u32 v5, v3, s11 1736; GFX90A-NEXT: v_add_u32_e32 v6, 1, v2 1737; GFX90A-NEXT: v_sub_u32_e32 v5, s7, v5 1738; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 1739; GFX90A-NEXT: v_add_u32_e32 v6, 1, v3 1740; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s11, v5 1741; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 1742; GFX90A-NEXT: v_subrev_u32_e32 v6, s11, v5 1743; GFX90A-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 1744; GFX90A-NEXT: v_add_u32_e32 v6, 1, v3 1745; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s11, v5 1746; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 1747; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 1748; GFX90A-NEXT: s_endpgm 1749 %r = udiv <4 x i32> %x, %y 1750 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 1751 ret void 1752} 1753 1754define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 1755; CHECK-LABEL: @urem_v4i32( 1756; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 1757; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 1758; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 1759; CHECK-NEXT: 
[[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 1760; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 1761; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 1762; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 1763; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 1764; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 1765; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 1766; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 1767; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 1768; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 1769; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 1770; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 1771; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 1772; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 1773; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 1774; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 1775; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 1776; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 1777; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 1778; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 1779; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 1780; CHECK-NEXT: [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]] 1781; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]] 1782; CHECK-NEXT: [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]] 1783; CHECK-NEXT: [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]] 1784; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]] 1785; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x i32> undef, i32 [[TMP29]], i64 0 1786; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[X]], i64 1 1787; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[Y]], i64 1 1788; CHECK-NEXT: [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float 1789; CHECK-NEXT: [[TMP34:%.*]] = call fast float 
@llvm.amdgcn.rcp.f32(float [[TMP33]]) 1790; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000 1791; CHECK-NEXT: [[TMP36:%.*]] = fptoui float [[TMP35]] to i32 1792; CHECK-NEXT: [[TMP37:%.*]] = sub i32 0, [[TMP32]] 1793; CHECK-NEXT: [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]] 1794; CHECK-NEXT: [[TMP39:%.*]] = zext i32 [[TMP36]] to i64 1795; CHECK-NEXT: [[TMP40:%.*]] = zext i32 [[TMP38]] to i64 1796; CHECK-NEXT: [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]] 1797; CHECK-NEXT: [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32 1798; CHECK-NEXT: [[TMP43:%.*]] = lshr i64 [[TMP41]], 32 1799; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 1800; CHECK-NEXT: [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]] 1801; CHECK-NEXT: [[TMP46:%.*]] = zext i32 [[TMP31]] to i64 1802; CHECK-NEXT: [[TMP47:%.*]] = zext i32 [[TMP45]] to i64 1803; CHECK-NEXT: [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]] 1804; CHECK-NEXT: [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32 1805; CHECK-NEXT: [[TMP50:%.*]] = lshr i64 [[TMP48]], 32 1806; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 1807; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]] 1808; CHECK-NEXT: [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]] 1809; CHECK-NEXT: [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]] 1810; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]] 1811; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]] 1812; CHECK-NEXT: [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]] 1813; CHECK-NEXT: [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]] 1814; CHECK-NEXT: [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]] 1815; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP59]], i64 1 1816; CHECK-NEXT: [[TMP61:%.*]] = extractelement <4 x i32> [[X]], i64 2 1817; CHECK-NEXT: [[TMP62:%.*]] = extractelement <4 x i32> [[Y]], i64 2 1818; CHECK-NEXT: [[TMP63:%.*]] = uitofp i32 [[TMP62]] to float 1819; CHECK-NEXT: [[TMP64:%.*]] = call fast float 
@llvm.amdgcn.rcp.f32(float [[TMP63]]) 1820; CHECK-NEXT: [[TMP65:%.*]] = fmul fast float [[TMP64]], 0x41EFFFFFC0000000 1821; CHECK-NEXT: [[TMP66:%.*]] = fptoui float [[TMP65]] to i32 1822; CHECK-NEXT: [[TMP67:%.*]] = sub i32 0, [[TMP62]] 1823; CHECK-NEXT: [[TMP68:%.*]] = mul i32 [[TMP67]], [[TMP66]] 1824; CHECK-NEXT: [[TMP69:%.*]] = zext i32 [[TMP66]] to i64 1825; CHECK-NEXT: [[TMP70:%.*]] = zext i32 [[TMP68]] to i64 1826; CHECK-NEXT: [[TMP71:%.*]] = mul i64 [[TMP69]], [[TMP70]] 1827; CHECK-NEXT: [[TMP72:%.*]] = trunc i64 [[TMP71]] to i32 1828; CHECK-NEXT: [[TMP73:%.*]] = lshr i64 [[TMP71]], 32 1829; CHECK-NEXT: [[TMP74:%.*]] = trunc i64 [[TMP73]] to i32 1830; CHECK-NEXT: [[TMP75:%.*]] = add i32 [[TMP66]], [[TMP74]] 1831; CHECK-NEXT: [[TMP76:%.*]] = zext i32 [[TMP61]] to i64 1832; CHECK-NEXT: [[TMP77:%.*]] = zext i32 [[TMP75]] to i64 1833; CHECK-NEXT: [[TMP78:%.*]] = mul i64 [[TMP76]], [[TMP77]] 1834; CHECK-NEXT: [[TMP79:%.*]] = trunc i64 [[TMP78]] to i32 1835; CHECK-NEXT: [[TMP80:%.*]] = lshr i64 [[TMP78]], 32 1836; CHECK-NEXT: [[TMP81:%.*]] = trunc i64 [[TMP80]] to i32 1837; CHECK-NEXT: [[TMP82:%.*]] = mul i32 [[TMP81]], [[TMP62]] 1838; CHECK-NEXT: [[TMP83:%.*]] = sub i32 [[TMP61]], [[TMP82]] 1839; CHECK-NEXT: [[TMP84:%.*]] = icmp uge i32 [[TMP83]], [[TMP62]] 1840; CHECK-NEXT: [[TMP85:%.*]] = sub i32 [[TMP83]], [[TMP62]] 1841; CHECK-NEXT: [[TMP86:%.*]] = select i1 [[TMP84]], i32 [[TMP85]], i32 [[TMP83]] 1842; CHECK-NEXT: [[TMP87:%.*]] = icmp uge i32 [[TMP86]], [[TMP62]] 1843; CHECK-NEXT: [[TMP88:%.*]] = sub i32 [[TMP86]], [[TMP62]] 1844; CHECK-NEXT: [[TMP89:%.*]] = select i1 [[TMP87]], i32 [[TMP88]], i32 [[TMP86]] 1845; CHECK-NEXT: [[TMP90:%.*]] = insertelement <4 x i32> [[TMP60]], i32 [[TMP89]], i64 2 1846; CHECK-NEXT: [[TMP91:%.*]] = extractelement <4 x i32> [[X]], i64 3 1847; CHECK-NEXT: [[TMP92:%.*]] = extractelement <4 x i32> [[Y]], i64 3 1848; CHECK-NEXT: [[TMP93:%.*]] = uitofp i32 [[TMP92]] to float 1849; CHECK-NEXT: [[TMP94:%.*]] = call fast float 
@llvm.amdgcn.rcp.f32(float [[TMP93]]) 1850; CHECK-NEXT: [[TMP95:%.*]] = fmul fast float [[TMP94]], 0x41EFFFFFC0000000 1851; CHECK-NEXT: [[TMP96:%.*]] = fptoui float [[TMP95]] to i32 1852; CHECK-NEXT: [[TMP97:%.*]] = sub i32 0, [[TMP92]] 1853; CHECK-NEXT: [[TMP98:%.*]] = mul i32 [[TMP97]], [[TMP96]] 1854; CHECK-NEXT: [[TMP99:%.*]] = zext i32 [[TMP96]] to i64 1855; CHECK-NEXT: [[TMP100:%.*]] = zext i32 [[TMP98]] to i64 1856; CHECK-NEXT: [[TMP101:%.*]] = mul i64 [[TMP99]], [[TMP100]] 1857; CHECK-NEXT: [[TMP102:%.*]] = trunc i64 [[TMP101]] to i32 1858; CHECK-NEXT: [[TMP103:%.*]] = lshr i64 [[TMP101]], 32 1859; CHECK-NEXT: [[TMP104:%.*]] = trunc i64 [[TMP103]] to i32 1860; CHECK-NEXT: [[TMP105:%.*]] = add i32 [[TMP96]], [[TMP104]] 1861; CHECK-NEXT: [[TMP106:%.*]] = zext i32 [[TMP91]] to i64 1862; CHECK-NEXT: [[TMP107:%.*]] = zext i32 [[TMP105]] to i64 1863; CHECK-NEXT: [[TMP108:%.*]] = mul i64 [[TMP106]], [[TMP107]] 1864; CHECK-NEXT: [[TMP109:%.*]] = trunc i64 [[TMP108]] to i32 1865; CHECK-NEXT: [[TMP110:%.*]] = lshr i64 [[TMP108]], 32 1866; CHECK-NEXT: [[TMP111:%.*]] = trunc i64 [[TMP110]] to i32 1867; CHECK-NEXT: [[TMP112:%.*]] = mul i32 [[TMP111]], [[TMP92]] 1868; CHECK-NEXT: [[TMP113:%.*]] = sub i32 [[TMP91]], [[TMP112]] 1869; CHECK-NEXT: [[TMP114:%.*]] = icmp uge i32 [[TMP113]], [[TMP92]] 1870; CHECK-NEXT: [[TMP115:%.*]] = sub i32 [[TMP113]], [[TMP92]] 1871; CHECK-NEXT: [[TMP116:%.*]] = select i1 [[TMP114]], i32 [[TMP115]], i32 [[TMP113]] 1872; CHECK-NEXT: [[TMP117:%.*]] = icmp uge i32 [[TMP116]], [[TMP92]] 1873; CHECK-NEXT: [[TMP118:%.*]] = sub i32 [[TMP116]], [[TMP92]] 1874; CHECK-NEXT: [[TMP119:%.*]] = select i1 [[TMP117]], i32 [[TMP118]], i32 [[TMP116]] 1875; CHECK-NEXT: [[TMP120:%.*]] = insertelement <4 x i32> [[TMP90]], i32 [[TMP119]], i64 3 1876; CHECK-NEXT: store <4 x i32> [[TMP120]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 1877; CHECK-NEXT: ret void 1878; 1879; GFX6-LABEL: urem_v4i32: 1880; GFX6: ; %bb.0: 1881; GFX6-NEXT: s_load_dwordx8 s[4:11], 
s[0:1], 0xd 1882; GFX6-NEXT: s_mov_b32 s13, 0x4f7ffffe 1883; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1884; GFX6-NEXT: s_mov_b32 s3, 0xf000 1885; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1886; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 1887; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 1888; GFX6-NEXT: s_sub_i32 s2, 0, s8 1889; GFX6-NEXT: s_sub_i32 s12, 0, s9 1890; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 1891; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 1892; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s10 1893; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s11 1894; GFX6-NEXT: v_mul_f32_e32 v0, s13, v0 1895; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 1896; GFX6-NEXT: v_mul_f32_e32 v1, s13, v1 1897; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 1898; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 1899; GFX6-NEXT: v_mul_lo_u32 v2, s2, v0 1900; GFX6-NEXT: s_mov_b32 s2, -1 1901; GFX6-NEXT: v_mul_lo_u32 v4, s12, v1 1902; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 1903; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 1904; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 1905; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 1906; GFX6-NEXT: v_add_i32_e32 v1, vcc, v4, v1 1907; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 1908; GFX6-NEXT: v_mul_lo_u32 v0, v0, s8 1909; GFX6-NEXT: v_mul_f32_e32 v2, s13, v3 1910; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 1911; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9 1912; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 1913; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 1914; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 1915; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1916; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 1917; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 1918; GFX6-NEXT: s_sub_i32 s4, 0, s10 1919; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1920; GFX6-NEXT: v_mul_lo_u32 v3, s4, v2 1921; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 1922; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s9, v1 1923; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 1924; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 1925; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 1926; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 1927; GFX6-NEXT: s_sub_i32 
s4, 0, s11 1928; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 1929; GFX6-NEXT: v_mul_f32_e32 v3, s13, v4 1930; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 1931; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s9, v1 1932; GFX6-NEXT: v_mul_hi_u32 v2, s6, v2 1933; GFX6-NEXT: v_mul_lo_u32 v5, s4, v3 1934; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 1935; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 1936; GFX6-NEXT: v_mul_lo_u32 v2, v2, s10 1937; GFX6-NEXT: v_mul_hi_u32 v4, v3, v5 1938; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 1939; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 1940; GFX6-NEXT: v_mul_hi_u32 v3, s7, v3 1941; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s10, v2 1942; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 1943; GFX6-NEXT: v_mul_lo_u32 v3, v3, s11 1944; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 1945; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s10, v2 1946; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 1947; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 1948; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s7, v3 1949; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s11, v3 1950; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1951; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 1952; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s11, v3 1953; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1954; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 1955; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1956; GFX6-NEXT: s_endpgm 1957; 1958; GFX9-LABEL: urem_v4i32: 1959; GFX9: ; %bb.0: 1960; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 1961; GFX9-NEXT: s_mov_b32 s12, 0x4f7ffffe 1962; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1963; GFX9-NEXT: v_mov_b32_e32 v4, 0 1964; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1965; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 1966; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 1967; GFX9-NEXT: s_sub_i32 s2, 0, s8 1968; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s10 1969; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 1970; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 1971; GFX9-NEXT: s_sub_i32 s3, 0, s9 1972; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v5 1973; GFX9-NEXT: 
v_mul_f32_e32 v0, s12, v0 1974; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 1975; GFX9-NEXT: v_mul_f32_e32 v1, s12, v1 1976; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 1977; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s11 1978; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 1979; GFX9-NEXT: s_sub_i32 s2, 0, s10 1980; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1 1981; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 1982; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 1983; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 1984; GFX9-NEXT: v_mul_f32_e32 v2, s12, v5 1985; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 1986; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 1987; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v6 1988; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 1989; GFX9-NEXT: v_mul_lo_u32 v5, s2, v2 1990; GFX9-NEXT: s_sub_i32 s2, 0, s11 1991; GFX9-NEXT: v_mul_f32_e32 v3, s12, v3 1992; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 1993; GFX9-NEXT: v_mul_hi_u32 v5, v2, v5 1994; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 1995; GFX9-NEXT: v_mul_lo_u32 v0, v0, s8 1996; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 1997; GFX9-NEXT: v_mul_lo_u32 v5, s2, v3 1998; GFX9-NEXT: v_mul_hi_u32 v2, s6, v2 1999; GFX9-NEXT: v_mul_lo_u32 v1, v1, s9 2000; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 2001; GFX9-NEXT: v_mul_hi_u32 v5, v3, v5 2002; GFX9-NEXT: v_subrev_u32_e32 v6, s8, v0 2003; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 2004; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc 2005; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 2006; GFX9-NEXT: v_mul_hi_u32 v3, s7, v3 2007; GFX9-NEXT: v_mul_lo_u32 v2, v2, s10 2008; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 2009; GFX9-NEXT: v_subrev_u32_e32 v6, s8, v0 2010; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 2011; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc 2012; GFX9-NEXT: v_subrev_u32_e32 v6, s9, v1 2013; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 2014; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 2015; GFX9-NEXT: v_mul_lo_u32 v3, v3, s11 2016; GFX9-NEXT: v_subrev_u32_e32 v6, s9, v1 2017; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 2018; GFX9-NEXT: v_sub_u32_e32 v2, s6, v2 2019; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, 
v6, vcc 2020; GFX9-NEXT: v_subrev_u32_e32 v5, s10, v2 2021; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 2022; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 2023; GFX9-NEXT: v_subrev_u32_e32 v5, s10, v2 2024; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 2025; GFX9-NEXT: v_sub_u32_e32 v3, s7, v3 2026; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 2027; GFX9-NEXT: v_subrev_u32_e32 v5, s11, v3 2028; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 2029; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 2030; GFX9-NEXT: v_subrev_u32_e32 v5, s11, v3 2031; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 2032; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 2033; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 2034; GFX9-NEXT: s_endpgm 2035; 2036; GFX90A-LABEL: urem_v4i32: 2037; GFX90A: ; %bb.0: 2038; GFX90A-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 2039; GFX90A-NEXT: s_mov_b32 s12, 0x4f7ffffe 2040; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2041; GFX90A-NEXT: v_mov_b32_e32 v4, 0 2042; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 2043; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s8 2044; GFX90A-NEXT: s_sub_i32 s2, 0, s8 2045; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s9 2046; GFX90A-NEXT: s_sub_i32 s3, 0, s9 2047; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 2048; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v1 2049; GFX90A-NEXT: v_mul_f32_e32 v0, s12, v0 2050; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 2051; GFX90A-NEXT: v_mul_f32_e32 v1, s12, v1 2052; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 2053; GFX90A-NEXT: v_mul_lo_u32 v2, s2, v0 2054; GFX90A-NEXT: v_mul_hi_u32 v2, v0, v2 2055; GFX90A-NEXT: v_add_u32_e32 v0, v0, v2 2056; GFX90A-NEXT: v_mul_hi_u32 v0, s4, v0 2057; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s8 2058; GFX90A-NEXT: v_sub_u32_e32 v0, s4, v0 2059; GFX90A-NEXT: v_subrev_u32_e32 v2, s8, v0 2060; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 2061; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 2062; GFX90A-NEXT: v_subrev_u32_e32 v2, s8, v0 2063; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 2064; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 
2065; GFX90A-NEXT: v_cvt_f32_u32_e32 v2, s10 2066; GFX90A-NEXT: v_mul_lo_u32 v3, s3, v1 2067; GFX90A-NEXT: v_mul_hi_u32 v3, v1, v3 2068; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 2069; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v2 2070; GFX90A-NEXT: v_mul_hi_u32 v1, s5, v1 2071; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s9 2072; GFX90A-NEXT: v_sub_u32_e32 v1, s5, v1 2073; GFX90A-NEXT: v_mul_f32_e32 v2, s12, v2 2074; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 2075; GFX90A-NEXT: v_subrev_u32_e32 v3, s9, v1 2076; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 2077; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 2078; GFX90A-NEXT: v_subrev_u32_e32 v3, s9, v1 2079; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 2080; GFX90A-NEXT: s_sub_i32 s2, 0, s10 2081; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 2082; GFX90A-NEXT: v_mul_lo_u32 v3, s2, v2 2083; GFX90A-NEXT: v_mul_hi_u32 v3, v2, v3 2084; GFX90A-NEXT: v_add_u32_e32 v2, v2, v3 2085; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, s11 2086; GFX90A-NEXT: v_mul_hi_u32 v2, s6, v2 2087; GFX90A-NEXT: v_mul_lo_u32 v2, v2, s10 2088; GFX90A-NEXT: v_sub_u32_e32 v2, s6, v2 2089; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v3 2090; GFX90A-NEXT: v_subrev_u32_e32 v5, s10, v2 2091; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 2092; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 2093; GFX90A-NEXT: v_mul_f32_e32 v3, s12, v3 2094; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 2095; GFX90A-NEXT: v_subrev_u32_e32 v5, s10, v2 2096; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 2097; GFX90A-NEXT: s_sub_i32 s2, 0, s11 2098; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 2099; GFX90A-NEXT: v_mul_lo_u32 v5, s2, v3 2100; GFX90A-NEXT: v_mul_hi_u32 v5, v3, v5 2101; GFX90A-NEXT: v_add_u32_e32 v3, v3, v5 2102; GFX90A-NEXT: v_mul_hi_u32 v3, s7, v3 2103; GFX90A-NEXT: v_mul_lo_u32 v3, v3, s11 2104; GFX90A-NEXT: v_sub_u32_e32 v3, s7, v3 2105; GFX90A-NEXT: v_subrev_u32_e32 v5, s11, v3 2106; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 2107; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 2108; GFX90A-NEXT: 
v_subrev_u32_e32 v5, s11, v3 2109; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 2110; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 2111; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 2112; GFX90A-NEXT: s_endpgm 2113 %r = urem <4 x i32> %x, %y 2114 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 2115 ret void 2116} 2117 2118define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 2119; CHECK-LABEL: @sdiv_v4i32( 2120; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 2121; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 2122; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 2123; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 2124; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 2125; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]] 2126; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]] 2127; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]] 2128; CHECK-NEXT: [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]] 2129; CHECK-NEXT: [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float 2130; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]]) 2131; CHECK-NEXT: [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000 2132; CHECK-NEXT: [[TMP13:%.*]] = fptoui float [[TMP12]] to i32 2133; CHECK-NEXT: [[TMP14:%.*]] = sub i32 0, [[TMP9]] 2134; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]] 2135; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP13]] to i64 2136; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 2137; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 2138; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 2139; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 2140; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 2141; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]] 2142; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP8]] to i64 2143; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[TMP22]] to i64 2144; CHECK-NEXT: 
[[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]] 2145; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32 2146; CHECK-NEXT: [[TMP27:%.*]] = lshr i64 [[TMP25]], 32 2147; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 2148; CHECK-NEXT: [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]] 2149; CHECK-NEXT: [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]] 2150; CHECK-NEXT: [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]] 2151; CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP28]], 1 2152; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]] 2153; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]] 2154; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]] 2155; CHECK-NEXT: [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]] 2156; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP33]], 1 2157; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]] 2158; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]] 2159; CHECK-NEXT: [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]] 2160; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x i32> undef, i32 [[TMP40]], i64 0 2161; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i32> [[X]], i64 1 2162; CHECK-NEXT: [[TMP43:%.*]] = extractelement <4 x i32> [[Y]], i64 1 2163; CHECK-NEXT: [[TMP44:%.*]] = ashr i32 [[TMP42]], 31 2164; CHECK-NEXT: [[TMP45:%.*]] = ashr i32 [[TMP43]], 31 2165; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]] 2166; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]] 2167; CHECK-NEXT: [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]] 2168; CHECK-NEXT: [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]] 2169; CHECK-NEXT: [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]] 2170; CHECK-NEXT: [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float 2171; CHECK-NEXT: [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]]) 2172; CHECK-NEXT: [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000 2173; CHECK-NEXT: [[TMP54:%.*]] = fptoui float [[TMP53]] to i32 2174; 
CHECK-NEXT: [[TMP55:%.*]] = sub i32 0, [[TMP50]] 2175; CHECK-NEXT: [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]] 2176; CHECK-NEXT: [[TMP57:%.*]] = zext i32 [[TMP54]] to i64 2177; CHECK-NEXT: [[TMP58:%.*]] = zext i32 [[TMP56]] to i64 2178; CHECK-NEXT: [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]] 2179; CHECK-NEXT: [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32 2180; CHECK-NEXT: [[TMP61:%.*]] = lshr i64 [[TMP59]], 32 2181; CHECK-NEXT: [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32 2182; CHECK-NEXT: [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]] 2183; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[TMP49]] to i64 2184; CHECK-NEXT: [[TMP65:%.*]] = zext i32 [[TMP63]] to i64 2185; CHECK-NEXT: [[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]] 2186; CHECK-NEXT: [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32 2187; CHECK-NEXT: [[TMP68:%.*]] = lshr i64 [[TMP66]], 32 2188; CHECK-NEXT: [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32 2189; CHECK-NEXT: [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]] 2190; CHECK-NEXT: [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]] 2191; CHECK-NEXT: [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]] 2192; CHECK-NEXT: [[TMP73:%.*]] = add i32 [[TMP69]], 1 2193; CHECK-NEXT: [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]] 2194; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]] 2195; CHECK-NEXT: [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]] 2196; CHECK-NEXT: [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]] 2197; CHECK-NEXT: [[TMP78:%.*]] = add i32 [[TMP74]], 1 2198; CHECK-NEXT: [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]] 2199; CHECK-NEXT: [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]] 2200; CHECK-NEXT: [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]] 2201; CHECK-NEXT: [[TMP82:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[TMP81]], i64 1 2202; CHECK-NEXT: [[TMP83:%.*]] = extractelement <4 x i32> [[X]], i64 2 2203; CHECK-NEXT: [[TMP84:%.*]] = extractelement <4 x i32> [[Y]], i64 2 2204; CHECK-NEXT: [[TMP85:%.*]] = ashr i32 [[TMP83]], 
31 2205; CHECK-NEXT: [[TMP86:%.*]] = ashr i32 [[TMP84]], 31 2206; CHECK-NEXT: [[TMP87:%.*]] = xor i32 [[TMP85]], [[TMP86]] 2207; CHECK-NEXT: [[TMP88:%.*]] = add i32 [[TMP83]], [[TMP85]] 2208; CHECK-NEXT: [[TMP89:%.*]] = add i32 [[TMP84]], [[TMP86]] 2209; CHECK-NEXT: [[TMP90:%.*]] = xor i32 [[TMP88]], [[TMP85]] 2210; CHECK-NEXT: [[TMP91:%.*]] = xor i32 [[TMP89]], [[TMP86]] 2211; CHECK-NEXT: [[TMP92:%.*]] = uitofp i32 [[TMP91]] to float 2212; CHECK-NEXT: [[TMP93:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP92]]) 2213; CHECK-NEXT: [[TMP94:%.*]] = fmul fast float [[TMP93]], 0x41EFFFFFC0000000 2214; CHECK-NEXT: [[TMP95:%.*]] = fptoui float [[TMP94]] to i32 2215; CHECK-NEXT: [[TMP96:%.*]] = sub i32 0, [[TMP91]] 2216; CHECK-NEXT: [[TMP97:%.*]] = mul i32 [[TMP96]], [[TMP95]] 2217; CHECK-NEXT: [[TMP98:%.*]] = zext i32 [[TMP95]] to i64 2218; CHECK-NEXT: [[TMP99:%.*]] = zext i32 [[TMP97]] to i64 2219; CHECK-NEXT: [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]] 2220; CHECK-NEXT: [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32 2221; CHECK-NEXT: [[TMP102:%.*]] = lshr i64 [[TMP100]], 32 2222; CHECK-NEXT: [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32 2223; CHECK-NEXT: [[TMP104:%.*]] = add i32 [[TMP95]], [[TMP103]] 2224; CHECK-NEXT: [[TMP105:%.*]] = zext i32 [[TMP90]] to i64 2225; CHECK-NEXT: [[TMP106:%.*]] = zext i32 [[TMP104]] to i64 2226; CHECK-NEXT: [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]] 2227; CHECK-NEXT: [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32 2228; CHECK-NEXT: [[TMP109:%.*]] = lshr i64 [[TMP107]], 32 2229; CHECK-NEXT: [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32 2230; CHECK-NEXT: [[TMP111:%.*]] = mul i32 [[TMP110]], [[TMP91]] 2231; CHECK-NEXT: [[TMP112:%.*]] = sub i32 [[TMP90]], [[TMP111]] 2232; CHECK-NEXT: [[TMP113:%.*]] = icmp uge i32 [[TMP112]], [[TMP91]] 2233; CHECK-NEXT: [[TMP114:%.*]] = add i32 [[TMP110]], 1 2234; CHECK-NEXT: [[TMP115:%.*]] = select i1 [[TMP113]], i32 [[TMP114]], i32 [[TMP110]] 2235; CHECK-NEXT: [[TMP116:%.*]] = sub i32 [[TMP112]], 
[[TMP91]] 2236; CHECK-NEXT: [[TMP117:%.*]] = select i1 [[TMP113]], i32 [[TMP116]], i32 [[TMP112]] 2237; CHECK-NEXT: [[TMP118:%.*]] = icmp uge i32 [[TMP117]], [[TMP91]] 2238; CHECK-NEXT: [[TMP119:%.*]] = add i32 [[TMP115]], 1 2239; CHECK-NEXT: [[TMP120:%.*]] = select i1 [[TMP118]], i32 [[TMP119]], i32 [[TMP115]] 2240; CHECK-NEXT: [[TMP121:%.*]] = xor i32 [[TMP120]], [[TMP87]] 2241; CHECK-NEXT: [[TMP122:%.*]] = sub i32 [[TMP121]], [[TMP87]] 2242; CHECK-NEXT: [[TMP123:%.*]] = insertelement <4 x i32> [[TMP82]], i32 [[TMP122]], i64 2 2243; CHECK-NEXT: [[TMP124:%.*]] = extractelement <4 x i32> [[X]], i64 3 2244; CHECK-NEXT: [[TMP125:%.*]] = extractelement <4 x i32> [[Y]], i64 3 2245; CHECK-NEXT: [[TMP126:%.*]] = ashr i32 [[TMP124]], 31 2246; CHECK-NEXT: [[TMP127:%.*]] = ashr i32 [[TMP125]], 31 2247; CHECK-NEXT: [[TMP128:%.*]] = xor i32 [[TMP126]], [[TMP127]] 2248; CHECK-NEXT: [[TMP129:%.*]] = add i32 [[TMP124]], [[TMP126]] 2249; CHECK-NEXT: [[TMP130:%.*]] = add i32 [[TMP125]], [[TMP127]] 2250; CHECK-NEXT: [[TMP131:%.*]] = xor i32 [[TMP129]], [[TMP126]] 2251; CHECK-NEXT: [[TMP132:%.*]] = xor i32 [[TMP130]], [[TMP127]] 2252; CHECK-NEXT: [[TMP133:%.*]] = uitofp i32 [[TMP132]] to float 2253; CHECK-NEXT: [[TMP134:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP133]]) 2254; CHECK-NEXT: [[TMP135:%.*]] = fmul fast float [[TMP134]], 0x41EFFFFFC0000000 2255; CHECK-NEXT: [[TMP136:%.*]] = fptoui float [[TMP135]] to i32 2256; CHECK-NEXT: [[TMP137:%.*]] = sub i32 0, [[TMP132]] 2257; CHECK-NEXT: [[TMP138:%.*]] = mul i32 [[TMP137]], [[TMP136]] 2258; CHECK-NEXT: [[TMP139:%.*]] = zext i32 [[TMP136]] to i64 2259; CHECK-NEXT: [[TMP140:%.*]] = zext i32 [[TMP138]] to i64 2260; CHECK-NEXT: [[TMP141:%.*]] = mul i64 [[TMP139]], [[TMP140]] 2261; CHECK-NEXT: [[TMP142:%.*]] = trunc i64 [[TMP141]] to i32 2262; CHECK-NEXT: [[TMP143:%.*]] = lshr i64 [[TMP141]], 32 2263; CHECK-NEXT: [[TMP144:%.*]] = trunc i64 [[TMP143]] to i32 2264; CHECK-NEXT: [[TMP145:%.*]] = add i32 [[TMP136]], [[TMP144]] 
2265; CHECK-NEXT: [[TMP146:%.*]] = zext i32 [[TMP131]] to i64 2266; CHECK-NEXT: [[TMP147:%.*]] = zext i32 [[TMP145]] to i64 2267; CHECK-NEXT: [[TMP148:%.*]] = mul i64 [[TMP146]], [[TMP147]] 2268; CHECK-NEXT: [[TMP149:%.*]] = trunc i64 [[TMP148]] to i32 2269; CHECK-NEXT: [[TMP150:%.*]] = lshr i64 [[TMP148]], 32 2270; CHECK-NEXT: [[TMP151:%.*]] = trunc i64 [[TMP150]] to i32 2271; CHECK-NEXT: [[TMP152:%.*]] = mul i32 [[TMP151]], [[TMP132]] 2272; CHECK-NEXT: [[TMP153:%.*]] = sub i32 [[TMP131]], [[TMP152]] 2273; CHECK-NEXT: [[TMP154:%.*]] = icmp uge i32 [[TMP153]], [[TMP132]] 2274; CHECK-NEXT: [[TMP155:%.*]] = add i32 [[TMP151]], 1 2275; CHECK-NEXT: [[TMP156:%.*]] = select i1 [[TMP154]], i32 [[TMP155]], i32 [[TMP151]] 2276; CHECK-NEXT: [[TMP157:%.*]] = sub i32 [[TMP153]], [[TMP132]] 2277; CHECK-NEXT: [[TMP158:%.*]] = select i1 [[TMP154]], i32 [[TMP157]], i32 [[TMP153]] 2278; CHECK-NEXT: [[TMP159:%.*]] = icmp uge i32 [[TMP158]], [[TMP132]] 2279; CHECK-NEXT: [[TMP160:%.*]] = add i32 [[TMP156]], 1 2280; CHECK-NEXT: [[TMP161:%.*]] = select i1 [[TMP159]], i32 [[TMP160]], i32 [[TMP156]] 2281; CHECK-NEXT: [[TMP162:%.*]] = xor i32 [[TMP161]], [[TMP128]] 2282; CHECK-NEXT: [[TMP163:%.*]] = sub i32 [[TMP162]], [[TMP128]] 2283; CHECK-NEXT: [[TMP164:%.*]] = insertelement <4 x i32> [[TMP123]], i32 [[TMP163]], i64 3 2284; CHECK-NEXT: store <4 x i32> [[TMP164]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 2285; CHECK-NEXT: ret void 2286; 2287; GFX6-LABEL: sdiv_v4i32: 2288; GFX6: ; %bb.0: 2289; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 2290; GFX6-NEXT: s_mov_b32 s16, 0x4f7ffffe 2291; GFX6-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 2292; GFX6-NEXT: s_mov_b32 s15, 0xf000 2293; GFX6-NEXT: s_mov_b32 s14, -1 2294; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2295; GFX6-NEXT: s_ashr_i32 s2, s8, 31 2296; GFX6-NEXT: s_add_i32 s3, s8, s2 2297; GFX6-NEXT: s_xor_b32 s3, s3, s2 2298; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 2299; GFX6-NEXT: s_ashr_i32 s8, s9, 31 2300; GFX6-NEXT: s_add_i32 s0, s9, s8 2301; 
GFX6-NEXT: s_xor_b32 s9, s0, s8 2302; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 2303; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 2304; GFX6-NEXT: s_sub_i32 s1, 0, s3 2305; GFX6-NEXT: s_ashr_i32 s0, s4, 31 2306; GFX6-NEXT: v_mul_f32_e32 v0, s16, v0 2307; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 2308; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 2309; GFX6-NEXT: s_xor_b32 s2, s0, s2 2310; GFX6-NEXT: v_mul_lo_u32 v2, s1, v0 2311; GFX6-NEXT: s_add_i32 s1, s4, s0 2312; GFX6-NEXT: v_mul_f32_e32 v1, s16, v1 2313; GFX6-NEXT: s_xor_b32 s1, s1, s0 2314; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 2315; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 2316; GFX6-NEXT: s_sub_i32 s0, 0, s9 2317; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 2318; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 2319; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 2320; GFX6-NEXT: v_mul_lo_u32 v3, v0, s3 2321; GFX6-NEXT: v_mul_hi_u32 v2, v1, v2 2322; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v0 2323; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s1, v3 2324; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v3 2325; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] 2326; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s3, v3 2327; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] 2328; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v0 2329; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 2330; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 2331; GFX6-NEXT: s_ashr_i32 s0, s5, 31 2332; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 2333; GFX6-NEXT: s_add_i32 s1, s5, s0 2334; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 2335; GFX6-NEXT: s_ashr_i32 s3, s10, 31 2336; GFX6-NEXT: s_xor_b32 s1, s1, s0 2337; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 2338; GFX6-NEXT: s_xor_b32 s2, s0, s8 2339; GFX6-NEXT: s_add_i32 s0, s10, s3 2340; GFX6-NEXT: s_xor_b32 s4, s0, s3 2341; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s4 2342; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 2343; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 2344; GFX6-NEXT: v_mul_lo_u32 v2, v1, s9 2345; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v1 2346; GFX6-NEXT: v_mul_f32_e32 v3, s16, v3 2347; GFX6-NEXT: v_sub_i32_e32 v2, 
vcc, s1, v2 2348; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 2349; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2 2350; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] 2351; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s9, v2 2352; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] 2353; GFX6-NEXT: s_sub_i32 s0, 0, s4 2354; GFX6-NEXT: v_mul_lo_u32 v5, s0, v3 2355; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v1 2356; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 2357; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 2358; GFX6-NEXT: v_mul_hi_u32 v2, v3, v5 2359; GFX6-NEXT: v_xor_b32_e32 v1, s2, v1 2360; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s2, v1 2361; GFX6-NEXT: s_ashr_i32 s2, s11, 31 2362; GFX6-NEXT: s_ashr_i32 s0, s6, 31 2363; GFX6-NEXT: s_add_i32 s5, s11, s2 2364; GFX6-NEXT: s_add_i32 s1, s6, s0 2365; GFX6-NEXT: s_xor_b32 s5, s5, s2 2366; GFX6-NEXT: s_xor_b32 s1, s1, s0 2367; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 2368; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s5 2369; GFX6-NEXT: v_mul_hi_u32 v2, s1, v2 2370; GFX6-NEXT: s_xor_b32 s3, s0, s3 2371; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v4 2372; GFX6-NEXT: v_mul_lo_u32 v3, v2, s4 2373; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v2 2374; GFX6-NEXT: v_mul_f32_e32 v4, s16, v4 2375; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s1, v3 2376; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 2377; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v3 2378; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] 2379; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s4, v3 2380; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 2381; GFX6-NEXT: s_sub_i32 s0, 0, s5 2382; GFX6-NEXT: v_mul_lo_u32 v5, s0, v4 2383; GFX6-NEXT: s_ashr_i32 s0, s7, 31 2384; GFX6-NEXT: s_add_i32 s1, s7, s0 2385; GFX6-NEXT: s_xor_b32 s1, s1, s0 2386; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5 2387; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v2 2388; GFX6-NEXT: s_xor_b32 s2, s0, s2 2389; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 2390; GFX6-NEXT: v_mul_hi_u32 v4, s1, v4 2391; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 2392; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 2393; 
GFX6-NEXT: v_xor_b32_e32 v2, s3, v2 2394; GFX6-NEXT: v_mul_lo_u32 v3, v4, s5 2395; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 2396; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v2 2397; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s1, v3 2398; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v3 2399; GFX6-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] 2400; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v3 2401; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 2402; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 2403; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 2404; GFX6-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 2405; GFX6-NEXT: v_xor_b32_e32 v3, s2, v3 2406; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v3 2407; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 2408; GFX6-NEXT: s_endpgm 2409; 2410; GFX9-LABEL: sdiv_v4i32: 2411; GFX9: ; %bb.0: 2412; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 2413; GFX9-NEXT: s_mov_b32 s15, 0x4f7ffffe 2414; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2415; GFX9-NEXT: v_mov_b32_e32 v4, 0 2416; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2417; GFX9-NEXT: s_ashr_i32 s2, s8, 31 2418; GFX9-NEXT: s_add_i32 s3, s8, s2 2419; GFX9-NEXT: s_xor_b32 s3, s3, s2 2420; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 2421; GFX9-NEXT: s_ashr_i32 s12, s9, 31 2422; GFX9-NEXT: s_add_i32 s9, s9, s12 2423; GFX9-NEXT: s_xor_b32 s9, s9, s12 2424; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 2425; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 2426; GFX9-NEXT: s_sub_i32 s14, 0, s3 2427; GFX9-NEXT: s_ashr_i32 s8, s4, 31 2428; GFX9-NEXT: v_mul_f32_e32 v0, s15, v0 2429; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 2430; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 2431; GFX9-NEXT: s_add_i32 s4, s4, s8 2432; GFX9-NEXT: s_xor_b32 s4, s4, s8 2433; GFX9-NEXT: v_mul_lo_u32 v2, s14, v0 2434; GFX9-NEXT: v_mul_f32_e32 v1, s15, v1 2435; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 2436; GFX9-NEXT: s_sub_i32 s14, 0, s9 2437; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 2438; GFX9-NEXT: s_ashr_i32 s13, s5, 31 2439; GFX9-NEXT: v_mul_lo_u32 v3, s14, v1 2440; GFX9-NEXT: s_add_i32 s5, s5, 
s13 2441; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 2442; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 2443; GFX9-NEXT: v_mul_hi_u32 v2, v1, v3 2444; GFX9-NEXT: s_xor_b32 s5, s5, s13 2445; GFX9-NEXT: s_xor_b32 s2, s8, s2 2446; GFX9-NEXT: v_mul_lo_u32 v3, v0, s3 2447; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 2448; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 2449; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 2450; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 2451; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 2452; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 2453; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v3 2454; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc 2455; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 2456; GFX9-NEXT: s_ashr_i32 s3, s10, 31 2457; GFX9-NEXT: s_add_i32 s4, s10, s3 2458; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 2459; GFX9-NEXT: s_xor_b32 s4, s4, s3 2460; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 2461; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 2462; GFX9-NEXT: v_mul_lo_u32 v2, v1, s9 2463; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 2464; GFX9-NEXT: s_ashr_i32 s8, s11, 31 2465; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 2466; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 2467; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 2468; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 2469; GFX9-NEXT: v_mul_f32_e32 v3, s15, v3 2470; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 2471; GFX9-NEXT: v_subrev_u32_e32 v5, s9, v2 2472; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 2473; GFX9-NEXT: s_sub_i32 s5, 0, s4 2474; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 2475; GFX9-NEXT: v_mul_lo_u32 v2, s5, v3 2476; GFX9-NEXT: s_add_i32 s9, s11, s8 2477; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 2478; GFX9-NEXT: s_xor_b32 s9, s9, s8 2479; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 2480; GFX9-NEXT: v_mul_hi_u32 v2, v3, v2 2481; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s9 2482; GFX9-NEXT: s_ashr_i32 s5, s6, 31 2483; GFX9-NEXT: s_add_i32 s6, s6, s5 2484; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 2485; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v5 2486; GFX9-NEXT: s_xor_b32 s6, s6, s5 2487; GFX9-NEXT: v_mul_hi_u32 v2, 
s6, v2 2488; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 2489; GFX9-NEXT: v_mul_f32_e32 v3, s15, v3 2490; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 2491; GFX9-NEXT: v_subrev_u32_e32 v0, s2, v0 2492; GFX9-NEXT: s_xor_b32 s2, s13, s12 2493; GFX9-NEXT: v_mul_lo_u32 v5, v2, s4 2494; GFX9-NEXT: v_xor_b32_e32 v1, s2, v1 2495; GFX9-NEXT: v_subrev_u32_e32 v1, s2, v1 2496; GFX9-NEXT: s_xor_b32 s2, s5, s3 2497; GFX9-NEXT: s_sub_i32 s3, 0, s9 2498; GFX9-NEXT: v_mul_lo_u32 v7, s3, v3 2499; GFX9-NEXT: v_sub_u32_e32 v5, s6, v5 2500; GFX9-NEXT: v_add_u32_e32 v6, 1, v2 2501; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v5 2502; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 2503; GFX9-NEXT: v_subrev_u32_e32 v6, s4, v5 2504; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 2505; GFX9-NEXT: v_mul_hi_u32 v6, v3, v7 2506; GFX9-NEXT: s_ashr_i32 s3, s7, 31 2507; GFX9-NEXT: s_add_i32 s5, s7, s3 2508; GFX9-NEXT: s_xor_b32 s5, s5, s3 2509; GFX9-NEXT: v_add_u32_e32 v3, v3, v6 2510; GFX9-NEXT: v_mul_hi_u32 v3, s5, v3 2511; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v5 2512; GFX9-NEXT: v_add_u32_e32 v6, 1, v2 2513; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 2514; GFX9-NEXT: v_mul_lo_u32 v5, v3, s9 2515; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 2516; GFX9-NEXT: v_xor_b32_e32 v2, s2, v2 2517; GFX9-NEXT: v_subrev_u32_e32 v2, s2, v2 2518; GFX9-NEXT: v_sub_u32_e32 v5, s5, v5 2519; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v5 2520; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 2521; GFX9-NEXT: v_subrev_u32_e32 v6, s9, v5 2522; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 2523; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 2524; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v5 2525; GFX9-NEXT: s_xor_b32 s2, s3, s8 2526; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 2527; GFX9-NEXT: v_xor_b32_e32 v3, s2, v3 2528; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v3 2529; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 2530; GFX9-NEXT: s_endpgm 2531; 2532; GFX90A-LABEL: sdiv_v4i32: 2533; GFX90A: ; %bb.0: 2534; GFX90A-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 2535; 
GFX90A-NEXT: s_mov_b32 s13, 0x4f7ffffe 2536; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2537; GFX90A-NEXT: v_mov_b32_e32 v4, 0 2538; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 2539; GFX90A-NEXT: s_ashr_i32 s2, s8, 31 2540; GFX90A-NEXT: s_add_i32 s3, s8, s2 2541; GFX90A-NEXT: s_xor_b32 s3, s3, s2 2542; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 2543; GFX90A-NEXT: s_ashr_i32 s8, s4, 31 2544; GFX90A-NEXT: s_add_i32 s4, s4, s8 2545; GFX90A-NEXT: s_xor_b32 s2, s8, s2 2546; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 2547; GFX90A-NEXT: s_xor_b32 s4, s4, s8 2548; GFX90A-NEXT: s_sub_i32 s8, 0, s3 2549; GFX90A-NEXT: s_ashr_i32 s12, s9, 31 2550; GFX90A-NEXT: v_mul_f32_e32 v0, s13, v0 2551; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 2552; GFX90A-NEXT: v_mul_lo_u32 v1, s8, v0 2553; GFX90A-NEXT: v_mul_hi_u32 v1, v0, v1 2554; GFX90A-NEXT: v_add_u32_e32 v0, v0, v1 2555; GFX90A-NEXT: v_mul_hi_u32 v0, s4, v0 2556; GFX90A-NEXT: v_mul_lo_u32 v1, v0, s3 2557; GFX90A-NEXT: v_sub_u32_e32 v1, s4, v1 2558; GFX90A-NEXT: s_add_i32 s4, s9, s12 2559; GFX90A-NEXT: s_xor_b32 s4, s4, s12 2560; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, s4 2561; GFX90A-NEXT: v_add_u32_e32 v2, 1, v0 2562; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 2563; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 2564; GFX90A-NEXT: v_subrev_u32_e32 v2, s3, v1 2565; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 2566; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 2567; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v3 2568; GFX90A-NEXT: v_add_u32_e32 v2, 1, v0 2569; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 2570; GFX90A-NEXT: v_xor_b32_e32 v0, s2, v0 2571; GFX90A-NEXT: v_mul_f32_e32 v1, s13, v1 2572; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 2573; GFX90A-NEXT: v_subrev_u32_e32 v0, s2, v0 2574; GFX90A-NEXT: s_ashr_i32 s2, s5, 31 2575; GFX90A-NEXT: s_add_i32 s5, s5, s2 2576; GFX90A-NEXT: s_xor_b32 s3, s2, s12 2577; GFX90A-NEXT: s_xor_b32 s2, s5, s2 2578; GFX90A-NEXT: s_sub_i32 s5, 0, s4 2579; GFX90A-NEXT: v_mul_lo_u32 v2, s5, v1 2580; GFX90A-NEXT: v_mul_hi_u32 v2, v1, 
v2 2581; GFX90A-NEXT: v_add_u32_e32 v1, v1, v2 2582; GFX90A-NEXT: v_mul_hi_u32 v1, s2, v1 2583; GFX90A-NEXT: v_mul_lo_u32 v2, v1, s4 2584; GFX90A-NEXT: v_sub_u32_e32 v2, s2, v2 2585; GFX90A-NEXT: s_ashr_i32 s2, s10, 31 2586; GFX90A-NEXT: s_add_i32 s5, s10, s2 2587; GFX90A-NEXT: s_xor_b32 s5, s5, s2 2588; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s5 2589; GFX90A-NEXT: v_add_u32_e32 v3, 1, v1 2590; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 2591; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 2592; GFX90A-NEXT: v_subrev_u32_e32 v3, s4, v2 2593; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc 2594; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 2595; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v5 2596; GFX90A-NEXT: v_add_u32_e32 v3, 1, v1 2597; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 2598; GFX90A-NEXT: v_xor_b32_e32 v1, s3, v1 2599; GFX90A-NEXT: v_mul_f32_e32 v2, s13, v2 2600; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 2601; GFX90A-NEXT: v_subrev_u32_e32 v1, s3, v1 2602; GFX90A-NEXT: s_ashr_i32 s3, s6, 31 2603; GFX90A-NEXT: s_add_i32 s4, s6, s3 2604; GFX90A-NEXT: s_xor_b32 s2, s3, s2 2605; GFX90A-NEXT: s_xor_b32 s3, s4, s3 2606; GFX90A-NEXT: s_sub_i32 s4, 0, s5 2607; GFX90A-NEXT: v_mul_lo_u32 v3, s4, v2 2608; GFX90A-NEXT: v_mul_hi_u32 v3, v2, v3 2609; GFX90A-NEXT: v_add_u32_e32 v2, v2, v3 2610; GFX90A-NEXT: v_mul_hi_u32 v2, s3, v2 2611; GFX90A-NEXT: v_mul_lo_u32 v3, v2, s5 2612; GFX90A-NEXT: v_sub_u32_e32 v3, s3, v3 2613; GFX90A-NEXT: s_ashr_i32 s3, s11, 31 2614; GFX90A-NEXT: s_add_i32 s4, s11, s3 2615; GFX90A-NEXT: s_xor_b32 s4, s4, s3 2616; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s4 2617; GFX90A-NEXT: v_add_u32_e32 v5, 1, v2 2618; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 2619; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 2620; GFX90A-NEXT: v_subrev_u32_e32 v5, s5, v3 2621; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 2622; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 2623; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v6 2624; GFX90A-NEXT: v_add_u32_e32 v5, 1, v2 2625; GFX90A-NEXT: 
v_cndmask_b32_e32 v2, v2, v5, vcc 2626; GFX90A-NEXT: v_xor_b32_e32 v2, s2, v2 2627; GFX90A-NEXT: v_mul_f32_e32 v3, s13, v3 2628; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 2629; GFX90A-NEXT: v_subrev_u32_e32 v2, s2, v2 2630; GFX90A-NEXT: s_ashr_i32 s2, s7, 31 2631; GFX90A-NEXT: s_add_i32 s5, s7, s2 2632; GFX90A-NEXT: s_xor_b32 s3, s2, s3 2633; GFX90A-NEXT: s_xor_b32 s2, s5, s2 2634; GFX90A-NEXT: s_sub_i32 s5, 0, s4 2635; GFX90A-NEXT: v_mul_lo_u32 v5, s5, v3 2636; GFX90A-NEXT: v_mul_hi_u32 v5, v3, v5 2637; GFX90A-NEXT: v_add_u32_e32 v3, v3, v5 2638; GFX90A-NEXT: v_mul_hi_u32 v3, s2, v3 2639; GFX90A-NEXT: v_mul_lo_u32 v5, v3, s4 2640; GFX90A-NEXT: v_sub_u32_e32 v5, s2, v5 2641; GFX90A-NEXT: v_add_u32_e32 v6, 1, v3 2642; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s4, v5 2643; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 2644; GFX90A-NEXT: v_subrev_u32_e32 v6, s4, v5 2645; GFX90A-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 2646; GFX90A-NEXT: v_add_u32_e32 v6, 1, v3 2647; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s4, v5 2648; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 2649; GFX90A-NEXT: v_xor_b32_e32 v3, s3, v3 2650; GFX90A-NEXT: v_subrev_u32_e32 v3, s3, v3 2651; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 2652; GFX90A-NEXT: s_endpgm 2653 %r = sdiv <4 x i32> %x, %y 2654 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 2655 ret void 2656} 2657 2658define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 2659; CHECK-LABEL: @srem_v4i32( 2660; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 2661; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 2662; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 2663; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 2664; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]] 2665; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]] 2666; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]] 2667; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]] 2668; 
CHECK-NEXT: [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float 2669; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 2670; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000 2671; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP11]] to i32 2672; CHECK-NEXT: [[TMP13:%.*]] = sub i32 0, [[TMP8]] 2673; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]] 2674; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP12]] to i64 2675; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 2676; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]] 2677; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 2678; CHECK-NEXT: [[TMP19:%.*]] = lshr i64 [[TMP17]], 32 2679; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 2680; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]] 2681; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP7]] to i64 2682; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP21]] to i64 2683; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]] 2684; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32 2685; CHECK-NEXT: [[TMP26:%.*]] = lshr i64 [[TMP24]], 32 2686; CHECK-NEXT: [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32 2687; CHECK-NEXT: [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]] 2688; CHECK-NEXT: [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]] 2689; CHECK-NEXT: [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]] 2690; CHECK-NEXT: [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]] 2691; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP30]], i32 [[TMP31]], i32 [[TMP29]] 2692; CHECK-NEXT: [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]] 2693; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]] 2694; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]] 2695; CHECK-NEXT: [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]] 2696; CHECK-NEXT: [[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]] 2697; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i32> undef, i32 [[TMP37]], i64 0 2698; CHECK-NEXT: [[TMP39:%.*]] = extractelement 
<4 x i32> [[X]], i64 1 2699; CHECK-NEXT: [[TMP40:%.*]] = extractelement <4 x i32> [[Y]], i64 1 2700; CHECK-NEXT: [[TMP41:%.*]] = ashr i32 [[TMP39]], 31 2701; CHECK-NEXT: [[TMP42:%.*]] = ashr i32 [[TMP40]], 31 2702; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]] 2703; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]] 2704; CHECK-NEXT: [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]] 2705; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]] 2706; CHECK-NEXT: [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float 2707; CHECK-NEXT: [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]]) 2708; CHECK-NEXT: [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000 2709; CHECK-NEXT: [[TMP50:%.*]] = fptoui float [[TMP49]] to i32 2710; CHECK-NEXT: [[TMP51:%.*]] = sub i32 0, [[TMP46]] 2711; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]] 2712; CHECK-NEXT: [[TMP53:%.*]] = zext i32 [[TMP50]] to i64 2713; CHECK-NEXT: [[TMP54:%.*]] = zext i32 [[TMP52]] to i64 2714; CHECK-NEXT: [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]] 2715; CHECK-NEXT: [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32 2716; CHECK-NEXT: [[TMP57:%.*]] = lshr i64 [[TMP55]], 32 2717; CHECK-NEXT: [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32 2718; CHECK-NEXT: [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]] 2719; CHECK-NEXT: [[TMP60:%.*]] = zext i32 [[TMP45]] to i64 2720; CHECK-NEXT: [[TMP61:%.*]] = zext i32 [[TMP59]] to i64 2721; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]] 2722; CHECK-NEXT: [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32 2723; CHECK-NEXT: [[TMP64:%.*]] = lshr i64 [[TMP62]], 32 2724; CHECK-NEXT: [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32 2725; CHECK-NEXT: [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]] 2726; CHECK-NEXT: [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]] 2727; CHECK-NEXT: [[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]] 2728; CHECK-NEXT: [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]] 2729; CHECK-NEXT: [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 
[[TMP67]] 2730; CHECK-NEXT: [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]] 2731; CHECK-NEXT: [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]] 2732; CHECK-NEXT: [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]] 2733; CHECK-NEXT: [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]] 2734; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]] 2735; CHECK-NEXT: [[TMP76:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP75]], i64 1 2736; CHECK-NEXT: [[TMP77:%.*]] = extractelement <4 x i32> [[X]], i64 2 2737; CHECK-NEXT: [[TMP78:%.*]] = extractelement <4 x i32> [[Y]], i64 2 2738; CHECK-NEXT: [[TMP79:%.*]] = ashr i32 [[TMP77]], 31 2739; CHECK-NEXT: [[TMP80:%.*]] = ashr i32 [[TMP78]], 31 2740; CHECK-NEXT: [[TMP81:%.*]] = add i32 [[TMP77]], [[TMP79]] 2741; CHECK-NEXT: [[TMP82:%.*]] = add i32 [[TMP78]], [[TMP80]] 2742; CHECK-NEXT: [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP79]] 2743; CHECK-NEXT: [[TMP84:%.*]] = xor i32 [[TMP82]], [[TMP80]] 2744; CHECK-NEXT: [[TMP85:%.*]] = uitofp i32 [[TMP84]] to float 2745; CHECK-NEXT: [[TMP86:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP85]]) 2746; CHECK-NEXT: [[TMP87:%.*]] = fmul fast float [[TMP86]], 0x41EFFFFFC0000000 2747; CHECK-NEXT: [[TMP88:%.*]] = fptoui float [[TMP87]] to i32 2748; CHECK-NEXT: [[TMP89:%.*]] = sub i32 0, [[TMP84]] 2749; CHECK-NEXT: [[TMP90:%.*]] = mul i32 [[TMP89]], [[TMP88]] 2750; CHECK-NEXT: [[TMP91:%.*]] = zext i32 [[TMP88]] to i64 2751; CHECK-NEXT: [[TMP92:%.*]] = zext i32 [[TMP90]] to i64 2752; CHECK-NEXT: [[TMP93:%.*]] = mul i64 [[TMP91]], [[TMP92]] 2753; CHECK-NEXT: [[TMP94:%.*]] = trunc i64 [[TMP93]] to i32 2754; CHECK-NEXT: [[TMP95:%.*]] = lshr i64 [[TMP93]], 32 2755; CHECK-NEXT: [[TMP96:%.*]] = trunc i64 [[TMP95]] to i32 2756; CHECK-NEXT: [[TMP97:%.*]] = add i32 [[TMP88]], [[TMP96]] 2757; CHECK-NEXT: [[TMP98:%.*]] = zext i32 [[TMP83]] to i64 2758; CHECK-NEXT: [[TMP99:%.*]] = zext i32 [[TMP97]] to i64 2759; CHECK-NEXT: [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]] 2760; CHECK-NEXT: 
[[TMP101:%.*]] = trunc i64 [[TMP100]] to i32 2761; CHECK-NEXT: [[TMP102:%.*]] = lshr i64 [[TMP100]], 32 2762; CHECK-NEXT: [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32 2763; CHECK-NEXT: [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP84]] 2764; CHECK-NEXT: [[TMP105:%.*]] = sub i32 [[TMP83]], [[TMP104]] 2765; CHECK-NEXT: [[TMP106:%.*]] = icmp uge i32 [[TMP105]], [[TMP84]] 2766; CHECK-NEXT: [[TMP107:%.*]] = sub i32 [[TMP105]], [[TMP84]] 2767; CHECK-NEXT: [[TMP108:%.*]] = select i1 [[TMP106]], i32 [[TMP107]], i32 [[TMP105]] 2768; CHECK-NEXT: [[TMP109:%.*]] = icmp uge i32 [[TMP108]], [[TMP84]] 2769; CHECK-NEXT: [[TMP110:%.*]] = sub i32 [[TMP108]], [[TMP84]] 2770; CHECK-NEXT: [[TMP111:%.*]] = select i1 [[TMP109]], i32 [[TMP110]], i32 [[TMP108]] 2771; CHECK-NEXT: [[TMP112:%.*]] = xor i32 [[TMP111]], [[TMP79]] 2772; CHECK-NEXT: [[TMP113:%.*]] = sub i32 [[TMP112]], [[TMP79]] 2773; CHECK-NEXT: [[TMP114:%.*]] = insertelement <4 x i32> [[TMP76]], i32 [[TMP113]], i64 2 2774; CHECK-NEXT: [[TMP115:%.*]] = extractelement <4 x i32> [[X]], i64 3 2775; CHECK-NEXT: [[TMP116:%.*]] = extractelement <4 x i32> [[Y]], i64 3 2776; CHECK-NEXT: [[TMP117:%.*]] = ashr i32 [[TMP115]], 31 2777; CHECK-NEXT: [[TMP118:%.*]] = ashr i32 [[TMP116]], 31 2778; CHECK-NEXT: [[TMP119:%.*]] = add i32 [[TMP115]], [[TMP117]] 2779; CHECK-NEXT: [[TMP120:%.*]] = add i32 [[TMP116]], [[TMP118]] 2780; CHECK-NEXT: [[TMP121:%.*]] = xor i32 [[TMP119]], [[TMP117]] 2781; CHECK-NEXT: [[TMP122:%.*]] = xor i32 [[TMP120]], [[TMP118]] 2782; CHECK-NEXT: [[TMP123:%.*]] = uitofp i32 [[TMP122]] to float 2783; CHECK-NEXT: [[TMP124:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP123]]) 2784; CHECK-NEXT: [[TMP125:%.*]] = fmul fast float [[TMP124]], 0x41EFFFFFC0000000 2785; CHECK-NEXT: [[TMP126:%.*]] = fptoui float [[TMP125]] to i32 2786; CHECK-NEXT: [[TMP127:%.*]] = sub i32 0, [[TMP122]] 2787; CHECK-NEXT: [[TMP128:%.*]] = mul i32 [[TMP127]], [[TMP126]] 2788; CHECK-NEXT: [[TMP129:%.*]] = zext i32 [[TMP126]] to i64 2789; CHECK-NEXT: 
[[TMP130:%.*]] = zext i32 [[TMP128]] to i64 2790; CHECK-NEXT: [[TMP131:%.*]] = mul i64 [[TMP129]], [[TMP130]] 2791; CHECK-NEXT: [[TMP132:%.*]] = trunc i64 [[TMP131]] to i32 2792; CHECK-NEXT: [[TMP133:%.*]] = lshr i64 [[TMP131]], 32 2793; CHECK-NEXT: [[TMP134:%.*]] = trunc i64 [[TMP133]] to i32 2794; CHECK-NEXT: [[TMP135:%.*]] = add i32 [[TMP126]], [[TMP134]] 2795; CHECK-NEXT: [[TMP136:%.*]] = zext i32 [[TMP121]] to i64 2796; CHECK-NEXT: [[TMP137:%.*]] = zext i32 [[TMP135]] to i64 2797; CHECK-NEXT: [[TMP138:%.*]] = mul i64 [[TMP136]], [[TMP137]] 2798; CHECK-NEXT: [[TMP139:%.*]] = trunc i64 [[TMP138]] to i32 2799; CHECK-NEXT: [[TMP140:%.*]] = lshr i64 [[TMP138]], 32 2800; CHECK-NEXT: [[TMP141:%.*]] = trunc i64 [[TMP140]] to i32 2801; CHECK-NEXT: [[TMP142:%.*]] = mul i32 [[TMP141]], [[TMP122]] 2802; CHECK-NEXT: [[TMP143:%.*]] = sub i32 [[TMP121]], [[TMP142]] 2803; CHECK-NEXT: [[TMP144:%.*]] = icmp uge i32 [[TMP143]], [[TMP122]] 2804; CHECK-NEXT: [[TMP145:%.*]] = sub i32 [[TMP143]], [[TMP122]] 2805; CHECK-NEXT: [[TMP146:%.*]] = select i1 [[TMP144]], i32 [[TMP145]], i32 [[TMP143]] 2806; CHECK-NEXT: [[TMP147:%.*]] = icmp uge i32 [[TMP146]], [[TMP122]] 2807; CHECK-NEXT: [[TMP148:%.*]] = sub i32 [[TMP146]], [[TMP122]] 2808; CHECK-NEXT: [[TMP149:%.*]] = select i1 [[TMP147]], i32 [[TMP148]], i32 [[TMP146]] 2809; CHECK-NEXT: [[TMP150:%.*]] = xor i32 [[TMP149]], [[TMP117]] 2810; CHECK-NEXT: [[TMP151:%.*]] = sub i32 [[TMP150]], [[TMP117]] 2811; CHECK-NEXT: [[TMP152:%.*]] = insertelement <4 x i32> [[TMP114]], i32 [[TMP151]], i64 3 2812; CHECK-NEXT: store <4 x i32> [[TMP152]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 2813; CHECK-NEXT: ret void 2814; 2815; GFX6-LABEL: srem_v4i32: 2816; GFX6: ; %bb.0: 2817; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 2818; GFX6-NEXT: s_mov_b32 s14, 0x4f7ffffe 2819; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2820; GFX6-NEXT: s_mov_b32 s3, 0xf000 2821; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2822; GFX6-NEXT: s_ashr_i32 s2, s8, 31 2823; GFX6-NEXT: 
s_add_i32 s8, s8, s2 2824; GFX6-NEXT: s_xor_b32 s8, s8, s2 2825; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 2826; GFX6-NEXT: s_ashr_i32 s12, s9, 31 2827; GFX6-NEXT: s_add_i32 s9, s9, s12 2828; GFX6-NEXT: s_xor_b32 s9, s9, s12 2829; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 2830; GFX6-NEXT: s_sub_i32 s13, 0, s8 2831; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 2832; GFX6-NEXT: s_ashr_i32 s12, s4, 31 2833; GFX6-NEXT: v_mul_f32_e32 v0, s14, v0 2834; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 2835; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 2836; GFX6-NEXT: s_add_i32 s4, s4, s12 2837; GFX6-NEXT: s_xor_b32 s4, s4, s12 2838; GFX6-NEXT: v_mul_lo_u32 v2, s13, v0 2839; GFX6-NEXT: v_mul_f32_e32 v1, s14, v1 2840; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 2841; GFX6-NEXT: s_sub_i32 s13, 0, s9 2842; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 2843; GFX6-NEXT: s_mov_b32 s2, -1 2844; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 2845; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 2846; GFX6-NEXT: v_mul_lo_u32 v2, s13, v1 2847; GFX6-NEXT: s_ashr_i32 s13, s5, 31 2848; GFX6-NEXT: s_add_i32 s5, s5, s13 2849; GFX6-NEXT: v_mul_lo_u32 v0, v0, s8 2850; GFX6-NEXT: v_mul_hi_u32 v2, v1, v2 2851; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 2852; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 2853; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 2854; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 2855; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 2856; GFX6-NEXT: s_xor_b32 s4, s5, s13 2857; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 2858; GFX6-NEXT: s_ashr_i32 s5, s10, 31 2859; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 2860; GFX6-NEXT: s_add_i32 s8, s10, s5 2861; GFX6-NEXT: s_xor_b32 s5, s8, s5 2862; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 2863; GFX6-NEXT: v_mul_hi_u32 v1, s4, v1 2864; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 2865; GFX6-NEXT: v_xor_b32_e32 v0, s12, v0 2866; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 2867; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9 2868; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 2869; GFX6-NEXT: v_mul_f32_e32 v2, s14, v2 2870; GFX6-NEXT: 
v_cvt_u32_f32_e32 v2, v2 2871; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v1 2872; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s9, v1 2873; GFX6-NEXT: s_sub_i32 s4, 0, s5 2874; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 2875; GFX6-NEXT: v_mul_lo_u32 v4, s4, v2 2876; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 2877; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s9, v1 2878; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 2879; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 2880; GFX6-NEXT: v_mul_hi_u32 v3, v2, v4 2881; GFX6-NEXT: s_ashr_i32 s8, s11, 31 2882; GFX6-NEXT: s_add_i32 s9, s11, s8 2883; GFX6-NEXT: s_ashr_i32 s4, s6, 31 2884; GFX6-NEXT: s_xor_b32 s8, s9, s8 2885; GFX6-NEXT: s_add_i32 s6, s6, s4 2886; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 2887; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s8 2888; GFX6-NEXT: s_xor_b32 s6, s6, s4 2889; GFX6-NEXT: v_mul_hi_u32 v2, s6, v2 2890; GFX6-NEXT: v_xor_b32_e32 v1, s13, v1 2891; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 2892; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s13, v1 2893; GFX6-NEXT: v_mul_lo_u32 v2, v2, s5 2894; GFX6-NEXT: v_mul_f32_e32 v3, s14, v3 2895; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 2896; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 2897; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s5, v2 2898; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 2899; GFX6-NEXT: s_sub_i32 s6, 0, s8 2900; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 2901; GFX6-NEXT: v_mul_lo_u32 v4, s6, v3 2902; GFX6-NEXT: s_ashr_i32 s6, s7, 31 2903; GFX6-NEXT: s_add_i32 s7, s7, s6 2904; GFX6-NEXT: s_xor_b32 s7, s7, s6 2905; GFX6-NEXT: v_mul_hi_u32 v4, v3, v4 2906; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v2 2907; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 2908; GFX6-NEXT: v_mul_hi_u32 v3, s7, v3 2909; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 2910; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 2911; GFX6-NEXT: v_xor_b32_e32 v2, s4, v2 2912; GFX6-NEXT: v_mul_lo_u32 v3, v3, s8 2913; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s4, v2 2914; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s7, v3 2915; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 
s8, v3 2916; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 2917; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 2918; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s8, v3 2919; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 2920; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 2921; GFX6-NEXT: v_xor_b32_e32 v3, s6, v3 2922; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s6, v3 2923; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 2924; GFX6-NEXT: s_endpgm 2925; 2926; GFX9-LABEL: srem_v4i32: 2927; GFX9: ; %bb.0: 2928; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 2929; GFX9-NEXT: s_mov_b32 s13, 0x4f7ffffe 2930; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2931; GFX9-NEXT: v_mov_b32_e32 v4, 0 2932; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2933; GFX9-NEXT: s_ashr_i32 s2, s8, 31 2934; GFX9-NEXT: s_add_i32 s8, s8, s2 2935; GFX9-NEXT: s_xor_b32 s2, s8, s2 2936; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 2937; GFX9-NEXT: s_ashr_i32 s3, s9, 31 2938; GFX9-NEXT: s_sub_i32 s12, 0, s2 2939; GFX9-NEXT: s_add_i32 s8, s9, s3 2940; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 2941; GFX9-NEXT: s_xor_b32 s3, s8, s3 2942; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 2943; GFX9-NEXT: s_ashr_i32 s8, s4, 31 2944; GFX9-NEXT: v_mul_f32_e32 v0, s13, v0 2945; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 2946; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 2947; GFX9-NEXT: s_add_i32 s4, s4, s8 2948; GFX9-NEXT: s_xor_b32 s4, s4, s8 2949; GFX9-NEXT: v_mul_lo_u32 v2, s12, v0 2950; GFX9-NEXT: v_mul_f32_e32 v1, s13, v1 2951; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 2952; GFX9-NEXT: s_sub_i32 s12, 0, s3 2953; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 2954; GFX9-NEXT: s_ashr_i32 s9, s5, 31 2955; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1 2956; GFX9-NEXT: s_add_i32 s5, s5, s9 2957; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 2958; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 2959; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 2960; GFX9-NEXT: s_xor_b32 s5, s5, s9 2961; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2 2962; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 2963; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 2964; GFX9-NEXT: v_sub_u32_e32 v0, 
s4, v0 2965; GFX9-NEXT: v_subrev_u32_e32 v2, s2, v0 2966; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 2967; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 2968; GFX9-NEXT: v_subrev_u32_e32 v2, s2, v0 2969; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 2970; GFX9-NEXT: s_ashr_i32 s2, s10, 31 2971; GFX9-NEXT: s_add_i32 s4, s10, s2 2972; GFX9-NEXT: s_xor_b32 s2, s4, s2 2973; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 2974; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 2975; GFX9-NEXT: v_mul_lo_u32 v1, v1, s3 2976; GFX9-NEXT: v_xor_b32_e32 v0, s8, v0 2977; GFX9-NEXT: v_subrev_u32_e32 v0, s8, v0 2978; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 2979; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 2980; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v1 2981; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 2982; GFX9-NEXT: v_mul_f32_e32 v2, s13, v2 2983; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 2984; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 2985; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v1 2986; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 2987; GFX9-NEXT: s_sub_i32 s3, 0, s2 2988; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 2989; GFX9-NEXT: v_mul_lo_u32 v3, s3, v2 2990; GFX9-NEXT: s_ashr_i32 s3, s11, 31 2991; GFX9-NEXT: s_add_i32 s4, s11, s3 2992; GFX9-NEXT: s_xor_b32 s3, s4, s3 2993; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s3 2994; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 2995; GFX9-NEXT: s_ashr_i32 s4, s6, 31 2996; GFX9-NEXT: s_add_i32 s5, s6, s4 2997; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v5 2998; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 2999; GFX9-NEXT: s_xor_b32 s5, s5, s4 3000; GFX9-NEXT: v_mul_hi_u32 v2, s5, v2 3001; GFX9-NEXT: v_mul_f32_e32 v3, s13, v5 3002; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 3003; GFX9-NEXT: s_sub_i32 s6, 0, s3 3004; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 3005; GFX9-NEXT: v_xor_b32_e32 v1, s9, v1 3006; GFX9-NEXT: v_mul_lo_u32 v5, s6, v3 3007; GFX9-NEXT: v_subrev_u32_e32 v1, s9, v1 3008; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 3009; GFX9-NEXT: s_ashr_i32 s5, s7, 31 3010; GFX9-NEXT: v_mul_hi_u32 v5, v3, v5 3011; GFX9-NEXT: s_add_i32 s6, 
s7, s5 3012; GFX9-NEXT: s_xor_b32 s6, s6, s5 3013; GFX9-NEXT: v_subrev_u32_e32 v6, s2, v2 3014; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 3015; GFX9-NEXT: v_mul_hi_u32 v3, s6, v3 3016; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 3017; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 3018; GFX9-NEXT: v_subrev_u32_e32 v5, s2, v2 3019; GFX9-NEXT: v_mul_lo_u32 v3, v3, s3 3020; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 3021; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 3022; GFX9-NEXT: v_xor_b32_e32 v2, s4, v2 3023; GFX9-NEXT: v_sub_u32_e32 v3, s6, v3 3024; GFX9-NEXT: v_subrev_u32_e32 v5, s3, v3 3025; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 3026; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 3027; GFX9-NEXT: v_subrev_u32_e32 v5, s3, v3 3028; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 3029; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 3030; GFX9-NEXT: v_xor_b32_e32 v3, s5, v3 3031; GFX9-NEXT: v_subrev_u32_e32 v2, s4, v2 3032; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v3 3033; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 3034; GFX9-NEXT: s_endpgm 3035; 3036; GFX90A-LABEL: srem_v4i32: 3037; GFX90A: ; %bb.0: 3038; GFX90A-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 3039; GFX90A-NEXT: s_mov_b32 s12, 0x4f7ffffe 3040; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3041; GFX90A-NEXT: v_mov_b32_e32 v4, 0 3042; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 3043; GFX90A-NEXT: s_ashr_i32 s2, s8, 31 3044; GFX90A-NEXT: s_add_i32 s3, s8, s2 3045; GFX90A-NEXT: s_xor_b32 s2, s3, s2 3046; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s2 3047; GFX90A-NEXT: s_ashr_i32 s8, s9, 31 3048; GFX90A-NEXT: s_add_i32 s9, s9, s8 3049; GFX90A-NEXT: s_xor_b32 s8, s9, s8 3050; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 3051; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s8 3052; GFX90A-NEXT: s_sub_i32 s9, 0, s2 3053; GFX90A-NEXT: s_ashr_i32 s3, s4, 31 3054; GFX90A-NEXT: v_mul_f32_e32 v0, s12, v0 3055; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 3056; GFX90A-NEXT: s_add_i32 s4, s4, s3 3057; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v1 3058; GFX90A-NEXT: s_xor_b32 s4, s4, 
s3 3059; GFX90A-NEXT: v_mul_lo_u32 v2, s9, v0 3060; GFX90A-NEXT: v_mul_hi_u32 v2, v0, v2 3061; GFX90A-NEXT: v_add_u32_e32 v0, v0, v2 3062; GFX90A-NEXT: v_mul_hi_u32 v0, s4, v0 3063; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s2 3064; GFX90A-NEXT: v_sub_u32_e32 v0, s4, v0 3065; GFX90A-NEXT: v_mul_f32_e32 v1, s12, v1 3066; GFX90A-NEXT: v_subrev_u32_e32 v2, s2, v0 3067; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 3068; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 3069; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3070; GFX90A-NEXT: v_subrev_u32_e32 v2, s2, v0 3071; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 3072; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3073; GFX90A-NEXT: s_sub_i32 s4, 0, s8 3074; GFX90A-NEXT: v_xor_b32_e32 v0, s3, v0 3075; GFX90A-NEXT: s_ashr_i32 s2, s5, 31 3076; GFX90A-NEXT: v_mul_lo_u32 v2, s4, v1 3077; GFX90A-NEXT: v_subrev_u32_e32 v0, s3, v0 3078; GFX90A-NEXT: s_add_i32 s3, s5, s2 3079; GFX90A-NEXT: v_mul_hi_u32 v2, v1, v2 3080; GFX90A-NEXT: s_xor_b32 s3, s3, s2 3081; GFX90A-NEXT: v_add_u32_e32 v1, v1, v2 3082; GFX90A-NEXT: v_mul_hi_u32 v1, s3, v1 3083; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s8 3084; GFX90A-NEXT: v_sub_u32_e32 v1, s3, v1 3085; GFX90A-NEXT: s_ashr_i32 s3, s10, 31 3086; GFX90A-NEXT: s_add_i32 s4, s10, s3 3087; GFX90A-NEXT: v_subrev_u32_e32 v2, s8, v1 3088; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s8, v1 3089; GFX90A-NEXT: s_xor_b32 s3, s4, s3 3090; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3091; GFX90A-NEXT: v_cvt_f32_u32_e32 v2, s3 3092; GFX90A-NEXT: v_subrev_u32_e32 v3, s8, v1 3093; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s8, v1 3094; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 3095; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v2 3096; GFX90A-NEXT: v_xor_b32_e32 v1, s2, v1 3097; GFX90A-NEXT: s_sub_i32 s5, 0, s3 3098; GFX90A-NEXT: v_subrev_u32_e32 v1, s2, v1 3099; GFX90A-NEXT: v_mul_f32_e32 v2, s12, v2 3100; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 3101; GFX90A-NEXT: s_ashr_i32 s2, s6, 31 3102; GFX90A-NEXT: s_add_i32 s4, s6, s2 3103; GFX90A-NEXT: s_xor_b32 
s4, s4, s2 3104; GFX90A-NEXT: v_mul_lo_u32 v3, s5, v2 3105; GFX90A-NEXT: v_mul_hi_u32 v3, v2, v3 3106; GFX90A-NEXT: v_add_u32_e32 v2, v2, v3 3107; GFX90A-NEXT: v_mul_hi_u32 v2, s4, v2 3108; GFX90A-NEXT: v_mul_lo_u32 v2, v2, s3 3109; GFX90A-NEXT: v_sub_u32_e32 v2, s4, v2 3110; GFX90A-NEXT: s_ashr_i32 s4, s11, 31 3111; GFX90A-NEXT: s_add_i32 s5, s11, s4 3112; GFX90A-NEXT: v_subrev_u32_e32 v3, s3, v2 3113; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 3114; GFX90A-NEXT: s_xor_b32 s4, s5, s4 3115; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc 3116; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, s4 3117; GFX90A-NEXT: v_subrev_u32_e32 v5, s3, v2 3118; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 3119; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 3120; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v3 3121; GFX90A-NEXT: v_xor_b32_e32 v2, s2, v2 3122; GFX90A-NEXT: s_sub_i32 s5, 0, s4 3123; GFX90A-NEXT: v_subrev_u32_e32 v2, s2, v2 3124; GFX90A-NEXT: v_mul_f32_e32 v3, s12, v3 3125; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 3126; GFX90A-NEXT: s_ashr_i32 s2, s7, 31 3127; GFX90A-NEXT: s_add_i32 s3, s7, s2 3128; GFX90A-NEXT: s_xor_b32 s3, s3, s2 3129; GFX90A-NEXT: v_mul_lo_u32 v5, s5, v3 3130; GFX90A-NEXT: v_mul_hi_u32 v5, v3, v5 3131; GFX90A-NEXT: v_add_u32_e32 v3, v3, v5 3132; GFX90A-NEXT: v_mul_hi_u32 v3, s3, v3 3133; GFX90A-NEXT: v_mul_lo_u32 v3, v3, s4 3134; GFX90A-NEXT: v_sub_u32_e32 v3, s3, v3 3135; GFX90A-NEXT: v_subrev_u32_e32 v5, s4, v3 3136; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 3137; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 3138; GFX90A-NEXT: v_subrev_u32_e32 v5, s4, v3 3139; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 3140; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 3141; GFX90A-NEXT: v_xor_b32_e32 v3, s2, v3 3142; GFX90A-NEXT: v_subrev_u32_e32 v3, s2, v3 3143; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 3144; GFX90A-NEXT: s_endpgm 3145 %r = srem <4 x i32> %x, %y 3146 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 3147 ret void 3148} 3149 3150define amdgpu_kernel void 
@udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 3151; CHECK-LABEL: @udiv_v4i16( 3152; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 3153; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 3154; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 3155; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 3156; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 3157; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 3158; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 3159; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 3160; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 3161; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 3162; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 3163; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 3164; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 3165; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 3166; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 3167; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 3168; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 3169; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 65535 3170; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16 3171; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i16> undef, i16 [[TMP19]], i64 0 3172; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i16> [[X]], i64 1 3173; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i16> [[Y]], i64 1 3174; CHECK-NEXT: [[TMP23:%.*]] = zext i16 [[TMP21]] to i32 3175; CHECK-NEXT: [[TMP24:%.*]] = zext i16 [[TMP22]] to i32 3176; CHECK-NEXT: [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float 3177; CHECK-NEXT: [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float 3178; CHECK-NEXT: [[TMP27:%.*]] = call fast float 
@llvm.amdgcn.rcp.f32(float [[TMP26]]) 3179; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]] 3180; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]]) 3181; CHECK-NEXT: [[TMP30:%.*]] = fneg fast float [[TMP29]] 3182; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]]) 3183; CHECK-NEXT: [[TMP32:%.*]] = fptoui float [[TMP29]] to i32 3184; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]]) 3185; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]]) 3186; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]] 3187; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0 3188; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]] 3189; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 65535 3190; CHECK-NEXT: [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16 3191; CHECK-NEXT: [[TMP40:%.*]] = insertelement <4 x i16> [[TMP20]], i16 [[TMP39]], i64 1 3192; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i16> [[X]], i64 2 3193; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i16> [[Y]], i64 2 3194; CHECK-NEXT: [[TMP43:%.*]] = zext i16 [[TMP41]] to i32 3195; CHECK-NEXT: [[TMP44:%.*]] = zext i16 [[TMP42]] to i32 3196; CHECK-NEXT: [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float 3197; CHECK-NEXT: [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float 3198; CHECK-NEXT: [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]]) 3199; CHECK-NEXT: [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]] 3200; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]]) 3201; CHECK-NEXT: [[TMP50:%.*]] = fneg fast float [[TMP49]] 3202; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]]) 3203; CHECK-NEXT: [[TMP52:%.*]] = fptoui float [[TMP49]] to i32 3204; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]]) 3205; 
CHECK-NEXT: [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]]) 3206; CHECK-NEXT: [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]] 3207; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0 3208; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]] 3209; CHECK-NEXT: [[TMP58:%.*]] = and i32 [[TMP57]], 65535 3210; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16 3211; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i16> [[TMP40]], i16 [[TMP59]], i64 2 3212; CHECK-NEXT: [[TMP61:%.*]] = extractelement <4 x i16> [[X]], i64 3 3213; CHECK-NEXT: [[TMP62:%.*]] = extractelement <4 x i16> [[Y]], i64 3 3214; CHECK-NEXT: [[TMP63:%.*]] = zext i16 [[TMP61]] to i32 3215; CHECK-NEXT: [[TMP64:%.*]] = zext i16 [[TMP62]] to i32 3216; CHECK-NEXT: [[TMP65:%.*]] = uitofp i32 [[TMP63]] to float 3217; CHECK-NEXT: [[TMP66:%.*]] = uitofp i32 [[TMP64]] to float 3218; CHECK-NEXT: [[TMP67:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP66]]) 3219; CHECK-NEXT: [[TMP68:%.*]] = fmul fast float [[TMP65]], [[TMP67]] 3220; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.trunc.f32(float [[TMP68]]) 3221; CHECK-NEXT: [[TMP70:%.*]] = fneg fast float [[TMP69]] 3222; CHECK-NEXT: [[TMP71:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP70]], float [[TMP66]], float [[TMP65]]) 3223; CHECK-NEXT: [[TMP72:%.*]] = fptoui float [[TMP69]] to i32 3224; CHECK-NEXT: [[TMP73:%.*]] = call fast float @llvm.fabs.f32(float [[TMP71]]) 3225; CHECK-NEXT: [[TMP74:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 3226; CHECK-NEXT: [[TMP75:%.*]] = fcmp fast oge float [[TMP73]], [[TMP74]] 3227; CHECK-NEXT: [[TMP76:%.*]] = select i1 [[TMP75]], i32 1, i32 0 3228; CHECK-NEXT: [[TMP77:%.*]] = add i32 [[TMP72]], [[TMP76]] 3229; CHECK-NEXT: [[TMP78:%.*]] = and i32 [[TMP77]], 65535 3230; CHECK-NEXT: [[TMP79:%.*]] = trunc i32 [[TMP78]] to i16 3231; CHECK-NEXT: [[TMP80:%.*]] = insertelement <4 x i16> [[TMP60]], i16 [[TMP79]], i64 3 3232; CHECK-NEXT: store <4 x i16> 
[[TMP80]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 3233; CHECK-NEXT: ret void 3234; 3235; GFX6-LABEL: udiv_v4i16: 3236; GFX6: ; %bb.0: 3237; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3238; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 3239; GFX6-NEXT: s_mov_b32 s8, 0xffff 3240; GFX6-NEXT: s_mov_b32 s7, 0xf000 3241; GFX6-NEXT: s_mov_b32 s6, -1 3242; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3243; GFX6-NEXT: s_and_b32 s9, s2, s8 3244; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 3245; GFX6-NEXT: s_lshr_b32 s9, s0, 16 3246; GFX6-NEXT: s_and_b32 s0, s0, s8 3247; GFX6-NEXT: s_lshr_b32 s2, s2, 16 3248; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s0 3249; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 3250; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s2 3251; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s9 3252; GFX6-NEXT: s_and_b32 s2, s3, s8 3253; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 3254; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 3255; GFX6-NEXT: v_trunc_f32_e32 v2, v2 3256; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 3257; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 3258; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3259; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 3260; GFX6-NEXT: v_trunc_f32_e32 v1, v1 3261; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc 3262; GFX6-NEXT: v_mad_f32 v2, -v1, v3, v4 3263; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s2 3264; GFX6-NEXT: s_lshr_b32 s0, s1, 16 3265; GFX6-NEXT: s_lshr_b32 s10, s3, 16 3266; GFX6-NEXT: s_and_b32 s1, s1, s8 3267; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 3268; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s1 3269; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 3270; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 3271; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s10 3272; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v1, vcc 3273; GFX6-NEXT: v_mul_f32_e32 v1, v5, v6 3274; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s0 3275; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v3 3276; GFX6-NEXT: v_trunc_f32_e32 v1, v1 3277; GFX6-NEXT: v_mad_f32 v5, -v1, v4, v5 3278; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 3279; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 3280; GFX6-NEXT: v_mul_f32_e32 v4, 
v6, v7 3281; GFX6-NEXT: v_trunc_f32_e32 v4, v4 3282; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v4 3283; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3284; GFX6-NEXT: v_mad_f32 v4, -v4, v3, v6 3285; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v3 3286; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 3287; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 3288; GFX6-NEXT: v_and_b32_e32 v1, s8, v1 3289; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 3290; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 3291; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 3292; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 3293; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 3294; GFX6-NEXT: s_endpgm 3295; 3296; GFX9-LABEL: udiv_v4i16: 3297; GFX9: ; %bb.0: 3298; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 3299; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 3300; GFX9-NEXT: s_mov_b32 s8, 0xffff 3301; GFX9-NEXT: v_mov_b32_e32 v2, 0 3302; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3303; GFX9-NEXT: s_and_b32 s1, s6, s8 3304; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 3305; GFX9-NEXT: s_lshr_b32 s0, s4, 16 3306; GFX9-NEXT: s_and_b32 s4, s4, s8 3307; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s4 3308; GFX9-NEXT: s_lshr_b32 s4, s6, 16 3309; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 3310; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s4 3311; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 3312; GFX9-NEXT: s_and_b32 s0, s7, s8 3313; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 3314; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 3315; GFX9-NEXT: v_trunc_f32_e32 v3, v3 3316; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 3317; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 3318; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3319; GFX9-NEXT: v_mul_f32_e32 v1, v5, v6 3320; GFX9-NEXT: v_trunc_f32_e32 v1, v1 3321; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 3322; GFX9-NEXT: v_mad_f32 v3, -v1, v4, v5 3323; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 3324; GFX9-NEXT: s_lshr_b32 s6, s7, 16 3325; GFX9-NEXT: s_and_b32 s0, s5, s8 3326; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 3327; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 3328; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 
3329; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 3330; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s6 3331; GFX9-NEXT: s_lshr_b32 s1, s5, 16 3332; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc 3333; GFX9-NEXT: v_mul_f32_e32 v1, v6, v7 3334; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 3335; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v4 3336; GFX9-NEXT: v_trunc_f32_e32 v1, v1 3337; GFX9-NEXT: v_mad_f32 v6, -v1, v5, v6 3338; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 3339; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 3340; GFX9-NEXT: v_mul_f32_e32 v5, v7, v8 3341; GFX9-NEXT: v_trunc_f32_e32 v5, v5 3342; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v5 3343; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 3344; GFX9-NEXT: v_mad_f32 v5, -v5, v4, v7 3345; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 3346; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff 3347; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc 3348; GFX9-NEXT: v_and_b32_e32 v1, v5, v1 3349; GFX9-NEXT: v_and_b32_e32 v0, v5, v0 3350; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1 3351; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0 3352; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 3353; GFX9-NEXT: s_endpgm 3354; 3355; GFX90A-LABEL: udiv_v4i16: 3356; GFX90A: ; %bb.0: 3357; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 3358; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 3359; GFX90A-NEXT: s_mov_b32 s8, 0xffff 3360; GFX90A-NEXT: v_mov_b32_e32 v2, 0 3361; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 3362; GFX90A-NEXT: s_and_b32 s1, s6, s8 3363; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s1 3364; GFX90A-NEXT: s_lshr_b32 s0, s4, 16 3365; GFX90A-NEXT: s_and_b32 s4, s4, s8 3366; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s4 3367; GFX90A-NEXT: s_lshr_b32 s4, s6, 16 3368; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 3369; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s4 3370; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s0 3371; GFX90A-NEXT: s_and_b32 s0, s7, s8 3372; GFX90A-NEXT: v_mul_f32_e32 v3, v1, v3 3373; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v4 3374; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 3375; GFX90A-NEXT: v_mad_f32 v1, -v3, v0, 
v1 3376; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 3377; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3378; GFX90A-NEXT: v_mul_f32_e32 v1, v5, v6 3379; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 3380; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 3381; GFX90A-NEXT: v_mad_f32 v3, -v1, v4, v5 3382; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s0 3383; GFX90A-NEXT: s_lshr_b32 s6, s7, 16 3384; GFX90A-NEXT: s_and_b32 s0, s5, s8 3385; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 3386; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s0 3387; GFX90A-NEXT: v_rcp_iflag_f32_e32 v7, v5 3388; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 3389; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s6 3390; GFX90A-NEXT: s_lshr_b32 s1, s5, 16 3391; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc 3392; GFX90A-NEXT: v_mul_f32_e32 v1, v6, v7 3393; GFX90A-NEXT: v_cvt_f32_u32_e32 v7, s1 3394; GFX90A-NEXT: v_rcp_iflag_f32_e32 v8, v4 3395; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 3396; GFX90A-NEXT: v_mad_f32 v6, -v1, v5, v6 3397; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 3398; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 3399; GFX90A-NEXT: v_mul_f32_e32 v5, v7, v8 3400; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 3401; GFX90A-NEXT: v_cvt_u32_f32_e32 v6, v5 3402; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 3403; GFX90A-NEXT: v_mad_f32 v5, -v5, v4, v7 3404; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 3405; GFX90A-NEXT: v_mov_b32_e32 v5, 0xffff 3406; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc 3407; GFX90A-NEXT: v_and_b32_e32 v1, v5, v1 3408; GFX90A-NEXT: v_and_b32_e32 v0, v5, v0 3409; GFX90A-NEXT: v_lshl_or_b32 v1, v4, 16, v1 3410; GFX90A-NEXT: v_lshl_or_b32 v0, v3, 16, v0 3411; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 3412; GFX90A-NEXT: s_endpgm 3413 %r = udiv <4 x i16> %x, %y 3414 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 3415 ret void 3416} 3417 3418define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 3419; CHECK-LABEL: @urem_v4i16( 3420; CHECK-NEXT: [[TMP1:%.*]] = extractelement 
<4 x i16> [[X:%.*]], i64 0 3421; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 3422; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 3423; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 3424; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 3425; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 3426; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 3427; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 3428; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 3429; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 3430; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 3431; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 3432; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 3433; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 3434; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 3435; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 3436; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 3437; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]] 3438; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]] 3439; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 65535 3440; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16 3441; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i16> undef, i16 [[TMP21]], i64 0 3442; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i16> [[X]], i64 1 3443; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i16> [[Y]], i64 1 3444; CHECK-NEXT: [[TMP25:%.*]] = zext i16 [[TMP23]] to i32 3445; CHECK-NEXT: [[TMP26:%.*]] = zext i16 [[TMP24]] to i32 3446; CHECK-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float 3447; CHECK-NEXT: [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float 3448; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float 
[[TMP28]]) 3449; CHECK-NEXT: [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]] 3450; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]]) 3451; CHECK-NEXT: [[TMP32:%.*]] = fneg fast float [[TMP31]] 3452; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]]) 3453; CHECK-NEXT: [[TMP34:%.*]] = fptoui float [[TMP31]] to i32 3454; CHECK-NEXT: [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 3455; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]]) 3456; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]] 3457; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0 3458; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]] 3459; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]] 3460; CHECK-NEXT: [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]] 3461; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], 65535 3462; CHECK-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16 3463; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i16> [[TMP22]], i16 [[TMP43]], i64 1 3464; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i16> [[X]], i64 2 3465; CHECK-NEXT: [[TMP46:%.*]] = extractelement <4 x i16> [[Y]], i64 2 3466; CHECK-NEXT: [[TMP47:%.*]] = zext i16 [[TMP45]] to i32 3467; CHECK-NEXT: [[TMP48:%.*]] = zext i16 [[TMP46]] to i32 3468; CHECK-NEXT: [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float 3469; CHECK-NEXT: [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float 3470; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]]) 3471; CHECK-NEXT: [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]] 3472; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]]) 3473; CHECK-NEXT: [[TMP54:%.*]] = fneg fast float [[TMP53]] 3474; CHECK-NEXT: [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]]) 3475; CHECK-NEXT: [[TMP56:%.*]] = fptoui float [[TMP53]] 
to i32 3476; CHECK-NEXT: [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]]) 3477; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]]) 3478; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]] 3479; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0 3480; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]] 3481; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]] 3482; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]] 3483; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 65535 3484; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16 3485; CHECK-NEXT: [[TMP66:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[TMP65]], i64 2 3486; CHECK-NEXT: [[TMP67:%.*]] = extractelement <4 x i16> [[X]], i64 3 3487; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x i16> [[Y]], i64 3 3488; CHECK-NEXT: [[TMP69:%.*]] = zext i16 [[TMP67]] to i32 3489; CHECK-NEXT: [[TMP70:%.*]] = zext i16 [[TMP68]] to i32 3490; CHECK-NEXT: [[TMP71:%.*]] = uitofp i32 [[TMP69]] to float 3491; CHECK-NEXT: [[TMP72:%.*]] = uitofp i32 [[TMP70]] to float 3492; CHECK-NEXT: [[TMP73:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP72]]) 3493; CHECK-NEXT: [[TMP74:%.*]] = fmul fast float [[TMP71]], [[TMP73]] 3494; CHECK-NEXT: [[TMP75:%.*]] = call fast float @llvm.trunc.f32(float [[TMP74]]) 3495; CHECK-NEXT: [[TMP76:%.*]] = fneg fast float [[TMP75]] 3496; CHECK-NEXT: [[TMP77:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP76]], float [[TMP72]], float [[TMP71]]) 3497; CHECK-NEXT: [[TMP78:%.*]] = fptoui float [[TMP75]] to i32 3498; CHECK-NEXT: [[TMP79:%.*]] = call fast float @llvm.fabs.f32(float [[TMP77]]) 3499; CHECK-NEXT: [[TMP80:%.*]] = call fast float @llvm.fabs.f32(float [[TMP72]]) 3500; CHECK-NEXT: [[TMP81:%.*]] = fcmp fast oge float [[TMP79]], [[TMP80]] 3501; CHECK-NEXT: [[TMP82:%.*]] = select i1 [[TMP81]], i32 1, i32 0 3502; CHECK-NEXT: [[TMP83:%.*]] = add i32 [[TMP78]], [[TMP82]] 3503; CHECK-NEXT: [[TMP84:%.*]] 
= mul i32 [[TMP83]], [[TMP70]] 3504; CHECK-NEXT: [[TMP85:%.*]] = sub i32 [[TMP69]], [[TMP84]] 3505; CHECK-NEXT: [[TMP86:%.*]] = and i32 [[TMP85]], 65535 3506; CHECK-NEXT: [[TMP87:%.*]] = trunc i32 [[TMP86]] to i16 3507; CHECK-NEXT: [[TMP88:%.*]] = insertelement <4 x i16> [[TMP66]], i16 [[TMP87]], i64 3 3508; CHECK-NEXT: store <4 x i16> [[TMP88]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 3509; CHECK-NEXT: ret void 3510; 3511; GFX6-LABEL: urem_v4i16: 3512; GFX6: ; %bb.0: 3513; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3514; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 3515; GFX6-NEXT: s_mov_b32 s8, 0xffff 3516; GFX6-NEXT: s_mov_b32 s7, 0xf000 3517; GFX6-NEXT: s_mov_b32 s6, -1 3518; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3519; GFX6-NEXT: s_and_b32 s9, s2, s8 3520; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 3521; GFX6-NEXT: s_and_b32 s10, s0, s8 3522; GFX6-NEXT: s_lshr_b32 s11, s2, 16 3523; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s10 3524; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 3525; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s11 3526; GFX6-NEXT: s_lshr_b32 s9, s0, 16 3527; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s9 3528; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 3529; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 3530; GFX6-NEXT: v_trunc_f32_e32 v2, v2 3531; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 3532; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 3533; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3534; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 3535; GFX6-NEXT: v_trunc_f32_e32 v1, v1 3536; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc 3537; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v1 3538; GFX6-NEXT: v_mad_f32 v1, -v1, v3, v4 3539; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 3540; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v3 3541; GFX6-NEXT: s_and_b32 s2, s3, s8 3542; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 3543; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s2 3544; GFX6-NEXT: s_and_b32 s2, s1, s8 3545; GFX6-NEXT: v_mul_lo_u32 v1, v1, s11 3546; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s2 3547; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 3548; GFX6-NEXT: s_lshr_b32 s12, 
s3, 16 3549; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s9, v1 3550; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 3551; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s12 3552; GFX6-NEXT: s_lshr_b32 s10, s1, 16 3553; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s10 3554; GFX6-NEXT: v_trunc_f32_e32 v1, v1 3555; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v4 3556; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 3557; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v3 3558; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 3559; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 3560; GFX6-NEXT: v_mul_f32_e32 v2, v6, v7 3561; GFX6-NEXT: v_trunc_f32_e32 v2, v2 3562; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 3563; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3564; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v6 3565; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 3566; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc 3567; GFX6-NEXT: v_mul_lo_u32 v1, v1, s3 3568; GFX6-NEXT: v_mul_lo_u32 v2, v2, s12 3569; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 3570; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 3571; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s10, v2 3572; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 3573; GFX6-NEXT: v_and_b32_e32 v1, s8, v1 3574; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 3575; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 3576; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 3577; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 3578; GFX6-NEXT: s_endpgm 3579; 3580; GFX9-LABEL: urem_v4i16: 3581; GFX9: ; %bb.0: 3582; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 3583; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 3584; GFX9-NEXT: s_mov_b32 s8, 0xffff 3585; GFX9-NEXT: v_mov_b32_e32 v2, 0 3586; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3587; GFX9-NEXT: s_and_b32 s1, s6, s8 3588; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 3589; GFX9-NEXT: s_and_b32 s9, s4, s8 3590; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 3591; GFX9-NEXT: s_lshr_b32 s9, s6, 16 3592; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 3593; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s9 3594; GFX9-NEXT: s_lshr_b32 s0, s4, 16 3595; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 3596; GFX9-NEXT: 
v_mul_f32_e32 v3, v1, v3 3597; GFX9-NEXT: v_trunc_f32_e32 v3, v3 3598; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 3599; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 3600; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 3601; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3602; GFX9-NEXT: s_lshr_b32 s10, s7, 16 3603; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 3604; GFX9-NEXT: v_mul_f32_e32 v1, v5, v6 3605; GFX9-NEXT: v_mul_lo_u32 v0, v0, s6 3606; GFX9-NEXT: v_trunc_f32_e32 v1, v1 3607; GFX9-NEXT: s_and_b32 s6, s7, s8 3608; GFX9-NEXT: v_mad_f32 v3, -v1, v4, v5 3609; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s6 3610; GFX9-NEXT: s_and_b32 s6, s5, s8 3611; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s6 3612; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 3613; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 3614; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s10 3615; GFX9-NEXT: s_lshr_b32 s1, s5, 16 3616; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 3617; GFX9-NEXT: v_mul_f32_e32 v3, v6, v7 3618; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 3619; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v4 3620; GFX9-NEXT: v_trunc_f32_e32 v3, v3 3621; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 3622; GFX9-NEXT: v_mad_f32 v6, -v3, v5, v6 3623; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 3624; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 3625; GFX9-NEXT: v_mul_f32_e32 v5, v7, v8 3626; GFX9-NEXT: v_trunc_f32_e32 v5, v5 3627; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v5 3628; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 3629; GFX9-NEXT: v_mad_f32 v5, -v5, v4, v7 3630; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 3631; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc 3632; GFX9-NEXT: v_mul_lo_u32 v1, v1, s9 3633; GFX9-NEXT: v_mul_lo_u32 v3, v3, s7 3634; GFX9-NEXT: v_mul_lo_u32 v4, v4, s10 3635; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 3636; GFX9-NEXT: v_sub_u32_e32 v5, s0, v1 3637; GFX9-NEXT: v_sub_u32_e32 v1, s5, v3 3638; GFX9-NEXT: v_sub_u32_e32 v3, s1, v4 3639; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff 3640; GFX9-NEXT: v_and_b32_e32 v1, v4, v1 3641; GFX9-NEXT: v_and_b32_e32 v0, v4, v0 3642; GFX9-NEXT: 
v_lshl_or_b32 v1, v3, 16, v1 3643; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 3644; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 3645; GFX9-NEXT: s_endpgm 3646; 3647; GFX90A-LABEL: urem_v4i16: 3648; GFX90A: ; %bb.0: 3649; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 3650; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 3651; GFX90A-NEXT: s_mov_b32 s8, 0xffff 3652; GFX90A-NEXT: v_mov_b32_e32 v2, 0 3653; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 3654; GFX90A-NEXT: s_and_b32 s1, s6, s8 3655; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s1 3656; GFX90A-NEXT: s_and_b32 s9, s4, s8 3657; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s9 3658; GFX90A-NEXT: s_lshr_b32 s9, s6, 16 3659; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 3660; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s9 3661; GFX90A-NEXT: s_lshr_b32 s0, s4, 16 3662; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s0 3663; GFX90A-NEXT: v_mul_f32_e32 v3, v1, v3 3664; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 3665; GFX90A-NEXT: v_mad_f32 v1, -v3, v0, v1 3666; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 3667; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v4 3668; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3669; GFX90A-NEXT: s_lshr_b32 s10, s7, 16 3670; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 3671; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s6 3672; GFX90A-NEXT: v_mul_f32_e32 v1, v5, v6 3673; GFX90A-NEXT: v_sub_u32_e32 v0, s4, v0 3674; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 3675; GFX90A-NEXT: s_and_b32 s4, s7, s8 3676; GFX90A-NEXT: v_mad_f32 v3, -v1, v4, v5 3677; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s4 3678; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 3679; GFX90A-NEXT: s_and_b32 s4, s5, s8 3680; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s4 3681; GFX90A-NEXT: v_rcp_iflag_f32_e32 v7, v5 3682; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 3683; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s10 3684; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 3685; GFX90A-NEXT: s_lshr_b32 s1, s5, 16 3686; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s9 3687; GFX90A-NEXT: v_sub_u32_e32 v3, s0, v1 3688; GFX90A-NEXT: v_mul_f32_e32 v1, v6, 
; NOTE(review): the kernel @sdiv_v4i16 defined within this span divides its two <4 x i16> arguments with one vector `sdiv` and stores the quotient through %out.
; Its opt-prefixed assertions pin the per-lane expansion emitted by -amdgpu-codegenprepare: each lane is sign-extended to i32, a quotient estimate is built from rcp/fmul/trunc/fmad.ftz, a correction of +1 or -1 (chosen from the sign of x xor y) is added when |remainder| >= |divisor|, and the result is narrowed back to i16.
; The GFX6 / GFX9 / GFX90A assertions pin the llc output for tahiti, gfx900 and gfx90a respectively. Every assertion here is autogenerated; regenerate with the update scripts named in the file header rather than editing by hand.
v7 3689; GFX90A-NEXT: v_cvt_f32_u32_e32 v7, s1 3690; GFX90A-NEXT: v_rcp_iflag_f32_e32 v8, v4 3691; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 3692; GFX90A-NEXT: v_mad_f32 v6, -v1, v5, v6 3693; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 3694; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 3695; GFX90A-NEXT: v_mul_f32_e32 v5, v7, v8 3696; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 3697; GFX90A-NEXT: v_cvt_u32_f32_e32 v6, v5 3698; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 3699; GFX90A-NEXT: v_mad_f32 v5, -v5, v4, v7 3700; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 3701; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s7 3702; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc 3703; GFX90A-NEXT: v_sub_u32_e32 v1, s5, v1 3704; GFX90A-NEXT: v_mul_lo_u32 v4, v4, s10 3705; GFX90A-NEXT: v_mov_b32_e32 v5, 0xffff 3706; GFX90A-NEXT: v_sub_u32_e32 v4, s1, v4 3707; GFX90A-NEXT: v_and_b32_e32 v1, v5, v1 3708; GFX90A-NEXT: v_and_b32_e32 v0, v5, v0 3709; GFX90A-NEXT: v_lshl_or_b32 v1, v4, 16, v1 3710; GFX90A-NEXT: v_lshl_or_b32 v0, v3, 16, v0 3711; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 3712; GFX90A-NEXT: s_endpgm 3713 %r = urem <4 x i16> %x, %y 3714 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 3715 ret void 3716} 3717 3718define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 3719; CHECK-LABEL: @sdiv_v4i16( 3720; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 3721; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 3722; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 3723; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 3724; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 3725; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 3726; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 3727; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 3728; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 3729; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 3730; 
CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 3731; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 3732; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 3733; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 3734; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 3735; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 3736; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 3737; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 3738; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 3739; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 3740; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 16 3741; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 16 3742; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16 3743; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x i16> undef, i16 [[TMP23]], i64 0 3744; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[X]], i64 1 3745; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[Y]], i64 1 3746; CHECK-NEXT: [[TMP27:%.*]] = sext i16 [[TMP25]] to i32 3747; CHECK-NEXT: [[TMP28:%.*]] = sext i16 [[TMP26]] to i32 3748; CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]] 3749; CHECK-NEXT: [[TMP30:%.*]] = ashr i32 [[TMP29]], 30 3750; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 1 3751; CHECK-NEXT: [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float 3752; CHECK-NEXT: [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float 3753; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 3754; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]] 3755; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]]) 3756; CHECK-NEXT: [[TMP37:%.*]] = fneg fast float [[TMP36]] 3757; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float 
[[TMP33]], float [[TMP32]]) 3758; CHECK-NEXT: [[TMP39:%.*]] = fptosi float [[TMP36]] to i32 3759; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]]) 3760; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 3761; CHECK-NEXT: [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]] 3762; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0 3763; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]] 3764; CHECK-NEXT: [[TMP45:%.*]] = shl i32 [[TMP44]], 16 3765; CHECK-NEXT: [[TMP46:%.*]] = ashr i32 [[TMP45]], 16 3766; CHECK-NEXT: [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16 3767; CHECK-NEXT: [[TMP48:%.*]] = insertelement <4 x i16> [[TMP24]], i16 [[TMP47]], i64 1 3768; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i16> [[X]], i64 2 3769; CHECK-NEXT: [[TMP50:%.*]] = extractelement <4 x i16> [[Y]], i64 2 3770; CHECK-NEXT: [[TMP51:%.*]] = sext i16 [[TMP49]] to i32 3771; CHECK-NEXT: [[TMP52:%.*]] = sext i16 [[TMP50]] to i32 3772; CHECK-NEXT: [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]] 3773; CHECK-NEXT: [[TMP54:%.*]] = ashr i32 [[TMP53]], 30 3774; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 1 3775; CHECK-NEXT: [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float 3776; CHECK-NEXT: [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float 3777; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]]) 3778; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]] 3779; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]]) 3780; CHECK-NEXT: [[TMP61:%.*]] = fneg fast float [[TMP60]] 3781; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]]) 3782; CHECK-NEXT: [[TMP63:%.*]] = fptosi float [[TMP60]] to i32 3783; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]]) 3784; CHECK-NEXT: [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]]) 3785; CHECK-NEXT: [[TMP66:%.*]] = fcmp 
fast oge float [[TMP64]], [[TMP65]] 3786; CHECK-NEXT: [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0 3787; CHECK-NEXT: [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]] 3788; CHECK-NEXT: [[TMP69:%.*]] = shl i32 [[TMP68]], 16 3789; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 16 3790; CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16 3791; CHECK-NEXT: [[TMP72:%.*]] = insertelement <4 x i16> [[TMP48]], i16 [[TMP71]], i64 2 3792; CHECK-NEXT: [[TMP73:%.*]] = extractelement <4 x i16> [[X]], i64 3 3793; CHECK-NEXT: [[TMP74:%.*]] = extractelement <4 x i16> [[Y]], i64 3 3794; CHECK-NEXT: [[TMP75:%.*]] = sext i16 [[TMP73]] to i32 3795; CHECK-NEXT: [[TMP76:%.*]] = sext i16 [[TMP74]] to i32 3796; CHECK-NEXT: [[TMP77:%.*]] = xor i32 [[TMP75]], [[TMP76]] 3797; CHECK-NEXT: [[TMP78:%.*]] = ashr i32 [[TMP77]], 30 3798; CHECK-NEXT: [[TMP79:%.*]] = or i32 [[TMP78]], 1 3799; CHECK-NEXT: [[TMP80:%.*]] = sitofp i32 [[TMP75]] to float 3800; CHECK-NEXT: [[TMP81:%.*]] = sitofp i32 [[TMP76]] to float 3801; CHECK-NEXT: [[TMP82:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP81]]) 3802; CHECK-NEXT: [[TMP83:%.*]] = fmul fast float [[TMP80]], [[TMP82]] 3803; CHECK-NEXT: [[TMP84:%.*]] = call fast float @llvm.trunc.f32(float [[TMP83]]) 3804; CHECK-NEXT: [[TMP85:%.*]] = fneg fast float [[TMP84]] 3805; CHECK-NEXT: [[TMP86:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP85]], float [[TMP81]], float [[TMP80]]) 3806; CHECK-NEXT: [[TMP87:%.*]] = fptosi float [[TMP84]] to i32 3807; CHECK-NEXT: [[TMP88:%.*]] = call fast float @llvm.fabs.f32(float [[TMP86]]) 3808; CHECK-NEXT: [[TMP89:%.*]] = call fast float @llvm.fabs.f32(float [[TMP81]]) 3809; CHECK-NEXT: [[TMP90:%.*]] = fcmp fast oge float [[TMP88]], [[TMP89]] 3810; CHECK-NEXT: [[TMP91:%.*]] = select i1 [[TMP90]], i32 [[TMP79]], i32 0 3811; CHECK-NEXT: [[TMP92:%.*]] = add i32 [[TMP87]], [[TMP91]] 3812; CHECK-NEXT: [[TMP93:%.*]] = shl i32 [[TMP92]], 16 3813; CHECK-NEXT: [[TMP94:%.*]] = ashr i32 [[TMP93]], 16 3814; 
CHECK-NEXT: [[TMP95:%.*]] = trunc i32 [[TMP94]] to i16 3815; CHECK-NEXT: [[TMP96:%.*]] = insertelement <4 x i16> [[TMP72]], i16 [[TMP95]], i64 3 3816; CHECK-NEXT: store <4 x i16> [[TMP96]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 3817; CHECK-NEXT: ret void 3818; 3819; GFX6-LABEL: sdiv_v4i16: 3820; GFX6: ; %bb.0: 3821; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 3822; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 3823; GFX6-NEXT: s_mov_b32 s7, 0xf000 3824; GFX6-NEXT: s_mov_b32 s6, -1 3825; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3826; GFX6-NEXT: s_sext_i32_i16 s8, s2 3827; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s8 3828; GFX6-NEXT: s_sext_i32_i16 s9, s0 3829; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9 3830; GFX6-NEXT: s_xor_b32 s8, s9, s8 3831; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 3832; GFX6-NEXT: s_ashr_i32 s2, s2, 16 3833; GFX6-NEXT: s_ashr_i32 s8, s8, 30 3834; GFX6-NEXT: s_or_b32 s8, s8, 1 3835; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 3836; GFX6-NEXT: v_trunc_f32_e32 v2, v2 3837; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 3838; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 3839; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 3840; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s2 3841; GFX6-NEXT: v_mov_b32_e32 v3, s8 3842; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 3843; GFX6-NEXT: s_ashr_i32 s0, s0, 16 3844; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 3845; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 3846; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 3847; GFX6-NEXT: s_xor_b32 s0, s0, s2 3848; GFX6-NEXT: s_ashr_i32 s0, s0, 30 3849; GFX6-NEXT: s_or_b32 s0, s0, 1 3850; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 3851; GFX6-NEXT: v_trunc_f32_e32 v3, v3 3852; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 3853; GFX6-NEXT: v_mov_b32_e32 v4, s0 3854; GFX6-NEXT: s_sext_i32_i16 s0, s3 3855; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 3856; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| 3857; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 3858; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc 3859; GFX6-NEXT: s_sext_i32_i16 s2, s1 3860; GFX6-NEXT: v_add_i32_e32 v3, vcc, v1, v3 
3861; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s2 3862; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 3863; GFX6-NEXT: s_xor_b32 s0, s2, s0 3864; GFX6-NEXT: s_ashr_i32 s0, s0, 30 3865; GFX6-NEXT: s_or_b32 s0, s0, 1 3866; GFX6-NEXT: v_mul_f32_e32 v4, v1, v4 3867; GFX6-NEXT: v_trunc_f32_e32 v4, v4 3868; GFX6-NEXT: v_mad_f32 v1, -v4, v2, v1 3869; GFX6-NEXT: v_mov_b32_e32 v5, s0 3870; GFX6-NEXT: s_ashr_i32 s0, s3, 16 3871; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 3872; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2| 3873; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 3874; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc 3875; GFX6-NEXT: s_ashr_i32 s1, s1, 16 3876; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 3877; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s1 3878; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 3879; GFX6-NEXT: s_xor_b32 s0, s1, s0 3880; GFX6-NEXT: s_ashr_i32 s0, s0, 30 3881; GFX6-NEXT: s_or_b32 s0, s0, 1 3882; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 3883; GFX6-NEXT: v_trunc_f32_e32 v5, v5 3884; GFX6-NEXT: v_mad_f32 v4, -v5, v2, v4 3885; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 3886; GFX6-NEXT: v_mov_b32_e32 v6, s0 3887; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| 3888; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc 3889; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 3890; GFX6-NEXT: s_mov_b32 s0, 0xffff 3891; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 3892; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 3893; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 3894; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 3895; GFX6-NEXT: v_and_b32_e32 v0, s0, v0 3896; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 3897; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 3898; GFX6-NEXT: s_endpgm 3899; 3900; GFX9-LABEL: sdiv_v4i16: 3901; GFX9: ; %bb.0: 3902; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 3903; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 3904; GFX9-NEXT: v_mov_b32_e32 v2, 0 3905; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3906; GFX9-NEXT: s_sext_i32_i16 s0, s6 3907; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 3908; GFX9-NEXT: s_sext_i32_i16 s1, s4 3909; GFX9-NEXT: v_cvt_f32_i32_e32 
v1, s1 3910; GFX9-NEXT: s_xor_b32 s0, s1, s0 3911; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 3912; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3913; GFX9-NEXT: s_or_b32 s8, s0, 1 3914; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 3915; GFX9-NEXT: v_trunc_f32_e32 v3, v3 3916; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 3917; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 3918; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 3919; GFX9-NEXT: s_cselect_b32 s0, s8, 0 3920; GFX9-NEXT: s_ashr_i32 s1, s6, 16 3921; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 3922; GFX9-NEXT: s_ashr_i32 s4, s4, 16 3923; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s4 3924; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 3925; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 3926; GFX9-NEXT: v_add_u32_e32 v3, s0, v3 3927; GFX9-NEXT: v_mul_f32_e32 v4, v1, v4 3928; GFX9-NEXT: s_xor_b32 s0, s4, s1 3929; GFX9-NEXT: v_trunc_f32_e32 v4, v4 3930; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3931; GFX9-NEXT: v_mad_f32 v1, -v4, v0, v1 3932; GFX9-NEXT: s_or_b32 s4, s0, 1 3933; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 3934; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 3935; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 3936; GFX9-NEXT: s_sext_i32_i16 s1, s7 3937; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 3938; GFX9-NEXT: s_cselect_b32 s0, s4, 0 3939; GFX9-NEXT: v_add_u32_e32 v4, s0, v4 3940; GFX9-NEXT: s_sext_i32_i16 s0, s5 3941; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s0 3942; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 3943; GFX9-NEXT: s_xor_b32 s0, s0, s1 3944; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3945; GFX9-NEXT: s_or_b32 s4, s0, 1 3946; GFX9-NEXT: v_mul_f32_e32 v5, v1, v5 3947; GFX9-NEXT: v_trunc_f32_e32 v5, v5 3948; GFX9-NEXT: v_mad_f32 v1, -v5, v0, v1 3949; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 3950; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 3951; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 3952; GFX9-NEXT: s_cselect_b32 s0, s4, 0 3953; GFX9-NEXT: s_ashr_i32 s1, s7, 16 3954; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 3955; GFX9-NEXT: v_add_u32_e32 v1, s0, v5 3956; GFX9-NEXT: s_ashr_i32 s0, s5, 16 3957; GFX9-NEXT: 
v_cvt_f32_i32_e32 v5, s0 3958; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v0 3959; GFX9-NEXT: s_xor_b32 s0, s0, s1 3960; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3961; GFX9-NEXT: s_or_b32 s4, s0, 1 3962; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 3963; GFX9-NEXT: v_trunc_f32_e32 v6, v6 3964; GFX9-NEXT: v_mad_f32 v5, -v6, v0, v5 3965; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 3966; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v0| 3967; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 3968; GFX9-NEXT: s_cselect_b32 s0, s4, 0 3969; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff 3970; GFX9-NEXT: v_add_u32_e32 v0, s0, v6 3971; GFX9-NEXT: v_and_b32_e32 v1, v5, v1 3972; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1 3973; GFX9-NEXT: v_and_b32_e32 v0, v5, v3 3974; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 3975; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 3976; GFX9-NEXT: s_endpgm 3977; 3978; GFX90A-LABEL: sdiv_v4i16: 3979; GFX90A: ; %bb.0: 3980; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 3981; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 3982; GFX90A-NEXT: v_mov_b32_e32 v2, 0 3983; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 3984; GFX90A-NEXT: s_sext_i32_i16 s0, s6 3985; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s0 3986; GFX90A-NEXT: s_sext_i32_i16 s1, s4 3987; GFX90A-NEXT: v_cvt_f32_i32_e32 v1, s1 3988; GFX90A-NEXT: s_xor_b32 s0, s1, s0 3989; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 3990; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 3991; GFX90A-NEXT: s_or_b32 s8, s0, 1 3992; GFX90A-NEXT: v_mul_f32_e32 v3, v1, v3 3993; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 3994; GFX90A-NEXT: v_mad_f32 v1, -v3, v0, v1 3995; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 3996; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 3997; GFX90A-NEXT: s_cselect_b32 s0, s8, 0 3998; GFX90A-NEXT: s_ashr_i32 s1, s6, 16 3999; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s1 4000; GFX90A-NEXT: s_ashr_i32 s4, s4, 16 4001; GFX90A-NEXT: v_cvt_f32_i32_e32 v1, s4 4002; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 4003; GFX90A-NEXT: v_rcp_iflag_f32_e32 v4, v0 4004; GFX90A-NEXT: v_add_u32_e32 v3, s0, 
v3 4005; GFX90A-NEXT: v_mul_f32_e32 v4, v1, v4 4006; GFX90A-NEXT: s_xor_b32 s0, s4, s1 4007; GFX90A-NEXT: v_trunc_f32_e32 v4, v4 4008; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 4009; GFX90A-NEXT: v_mad_f32 v1, -v4, v0, v1 4010; GFX90A-NEXT: s_or_b32 s4, s0, 1 4011; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 4012; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 4013; GFX90A-NEXT: v_cvt_i32_f32_e32 v4, v4 4014; GFX90A-NEXT: s_sext_i32_i16 s1, s7 4015; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s1 4016; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 4017; GFX90A-NEXT: v_add_u32_e32 v4, s0, v4 4018; GFX90A-NEXT: s_sext_i32_i16 s0, s5 4019; GFX90A-NEXT: v_cvt_f32_i32_e32 v1, s0 4020; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v0 4021; GFX90A-NEXT: s_xor_b32 s0, s0, s1 4022; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 4023; GFX90A-NEXT: s_or_b32 s4, s0, 1 4024; GFX90A-NEXT: v_mul_f32_e32 v5, v1, v5 4025; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 4026; GFX90A-NEXT: v_mad_f32 v1, -v5, v0, v1 4027; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 4028; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 4029; GFX90A-NEXT: v_cvt_i32_f32_e32 v5, v5 4030; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 4031; GFX90A-NEXT: s_ashr_i32 s1, s7, 16 4032; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s1 4033; GFX90A-NEXT: v_add_u32_e32 v1, s0, v5 4034; GFX90A-NEXT: s_ashr_i32 s0, s5, 16 4035; GFX90A-NEXT: v_cvt_f32_i32_e32 v5, s0 4036; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v0 4037; GFX90A-NEXT: s_xor_b32 s0, s0, s1 4038; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 4039; GFX90A-NEXT: s_or_b32 s4, s0, 1 4040; GFX90A-NEXT: v_mul_f32_e32 v6, v5, v6 4041; GFX90A-NEXT: v_trunc_f32_e32 v6, v6 4042; GFX90A-NEXT: v_mad_f32 v5, -v6, v0, v5 4043; GFX90A-NEXT: v_cvt_i32_f32_e32 v6, v6 4044; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v0| 4045; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 4046; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 4047; GFX90A-NEXT: v_mov_b32_e32 v5, 0xffff 4048; GFX90A-NEXT: v_add_u32_e32 v0, s0, v6 4049; GFX90A-NEXT: v_and_b32_e32 v1, v5, v1 4050; GFX90A-NEXT: 
v_lshl_or_b32 v1, v0, 16, v1 4051; GFX90A-NEXT: v_and_b32_e32 v0, v5, v3 4052; GFX90A-NEXT: v_lshl_or_b32 v0, v4, 16, v0 4053; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 4054; GFX90A-NEXT: s_endpgm 4055 %r = sdiv <4 x i16> %x, %y 4056 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 4057 ret void 4058} 4059 4060define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 4061; CHECK-LABEL: @srem_v4i16( 4062; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 4063; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 4064; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 4065; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 4066; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 4067; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 4068; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 4069; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 4070; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 4071; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 4072; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 4073; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 4074; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 4075; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 4076; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 4077; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 4078; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 4079; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 4080; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 4081; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 4082; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]] 4083; CHECK-NEXT: [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]] 
4084; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 16 4085; CHECK-NEXT: [[TMP24:%.*]] = ashr i32 [[TMP23]], 16 4086; CHECK-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16 4087; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i16> undef, i16 [[TMP25]], i64 0 4088; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[X]], i64 1 4089; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i16> [[Y]], i64 1 4090; CHECK-NEXT: [[TMP29:%.*]] = sext i16 [[TMP27]] to i32 4091; CHECK-NEXT: [[TMP30:%.*]] = sext i16 [[TMP28]] to i32 4092; CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]] 4093; CHECK-NEXT: [[TMP32:%.*]] = ashr i32 [[TMP31]], 30 4094; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP32]], 1 4095; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float 4096; CHECK-NEXT: [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float 4097; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 4098; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]] 4099; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]]) 4100; CHECK-NEXT: [[TMP39:%.*]] = fneg fast float [[TMP38]] 4101; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]]) 4102; CHECK-NEXT: [[TMP41:%.*]] = fptosi float [[TMP38]] to i32 4103; CHECK-NEXT: [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]]) 4104; CHECK-NEXT: [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]]) 4105; CHECK-NEXT: [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]] 4106; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0 4107; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]] 4108; CHECK-NEXT: [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]] 4109; CHECK-NEXT: [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]] 4110; CHECK-NEXT: [[TMP49:%.*]] = shl i32 [[TMP48]], 16 4111; CHECK-NEXT: [[TMP50:%.*]] = ashr i32 [[TMP49]], 16 4112; CHECK-NEXT: [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16 
4113; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x i16> [[TMP26]], i16 [[TMP51]], i64 1 4114; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i16> [[X]], i64 2 4115; CHECK-NEXT: [[TMP54:%.*]] = extractelement <4 x i16> [[Y]], i64 2 4116; CHECK-NEXT: [[TMP55:%.*]] = sext i16 [[TMP53]] to i32 4117; CHECK-NEXT: [[TMP56:%.*]] = sext i16 [[TMP54]] to i32 4118; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]] 4119; CHECK-NEXT: [[TMP58:%.*]] = ashr i32 [[TMP57]], 30 4120; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 1 4121; CHECK-NEXT: [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float 4122; CHECK-NEXT: [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float 4123; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]]) 4124; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]] 4125; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]]) 4126; CHECK-NEXT: [[TMP65:%.*]] = fneg fast float [[TMP64]] 4127; CHECK-NEXT: [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]]) 4128; CHECK-NEXT: [[TMP67:%.*]] = fptosi float [[TMP64]] to i32 4129; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 4130; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]]) 4131; CHECK-NEXT: [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]] 4132; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0 4133; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]] 4134; CHECK-NEXT: [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]] 4135; CHECK-NEXT: [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]] 4136; CHECK-NEXT: [[TMP75:%.*]] = shl i32 [[TMP74]], 16 4137; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 [[TMP75]], 16 4138; CHECK-NEXT: [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16 4139; CHECK-NEXT: [[TMP78:%.*]] = insertelement <4 x i16> [[TMP52]], i16 [[TMP77]], i64 2 4140; CHECK-NEXT: [[TMP79:%.*]] = extractelement <4 x i16> [[X]], i64 3 4141; 
CHECK-NEXT: [[TMP80:%.*]] = extractelement <4 x i16> [[Y]], i64 3 4142; CHECK-NEXT: [[TMP81:%.*]] = sext i16 [[TMP79]] to i32 4143; CHECK-NEXT: [[TMP82:%.*]] = sext i16 [[TMP80]] to i32 4144; CHECK-NEXT: [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP82]] 4145; CHECK-NEXT: [[TMP84:%.*]] = ashr i32 [[TMP83]], 30 4146; CHECK-NEXT: [[TMP85:%.*]] = or i32 [[TMP84]], 1 4147; CHECK-NEXT: [[TMP86:%.*]] = sitofp i32 [[TMP81]] to float 4148; CHECK-NEXT: [[TMP87:%.*]] = sitofp i32 [[TMP82]] to float 4149; CHECK-NEXT: [[TMP88:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP87]]) 4150; CHECK-NEXT: [[TMP89:%.*]] = fmul fast float [[TMP86]], [[TMP88]] 4151; CHECK-NEXT: [[TMP90:%.*]] = call fast float @llvm.trunc.f32(float [[TMP89]]) 4152; CHECK-NEXT: [[TMP91:%.*]] = fneg fast float [[TMP90]] 4153; CHECK-NEXT: [[TMP92:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP91]], float [[TMP87]], float [[TMP86]]) 4154; CHECK-NEXT: [[TMP93:%.*]] = fptosi float [[TMP90]] to i32 4155; CHECK-NEXT: [[TMP94:%.*]] = call fast float @llvm.fabs.f32(float [[TMP92]]) 4156; CHECK-NEXT: [[TMP95:%.*]] = call fast float @llvm.fabs.f32(float [[TMP87]]) 4157; CHECK-NEXT: [[TMP96:%.*]] = fcmp fast oge float [[TMP94]], [[TMP95]] 4158; CHECK-NEXT: [[TMP97:%.*]] = select i1 [[TMP96]], i32 [[TMP85]], i32 0 4159; CHECK-NEXT: [[TMP98:%.*]] = add i32 [[TMP93]], [[TMP97]] 4160; CHECK-NEXT: [[TMP99:%.*]] = mul i32 [[TMP98]], [[TMP82]] 4161; CHECK-NEXT: [[TMP100:%.*]] = sub i32 [[TMP81]], [[TMP99]] 4162; CHECK-NEXT: [[TMP101:%.*]] = shl i32 [[TMP100]], 16 4163; CHECK-NEXT: [[TMP102:%.*]] = ashr i32 [[TMP101]], 16 4164; CHECK-NEXT: [[TMP103:%.*]] = trunc i32 [[TMP102]] to i16 4165; CHECK-NEXT: [[TMP104:%.*]] = insertelement <4 x i16> [[TMP78]], i16 [[TMP103]], i64 3 4166; CHECK-NEXT: store <4 x i16> [[TMP104]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 4167; CHECK-NEXT: ret void 4168; 4169; GFX6-LABEL: srem_v4i16: 4170; GFX6: ; %bb.0: 4171; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4172; 
GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb 4173; GFX6-NEXT: s_mov_b32 s7, 0xf000 4174; GFX6-NEXT: s_mov_b32 s6, -1 4175; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4176; GFX6-NEXT: s_sext_i32_i16 s8, s2 4177; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s8 4178; GFX6-NEXT: s_sext_i32_i16 s9, s0 4179; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9 4180; GFX6-NEXT: s_xor_b32 s8, s9, s8 4181; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 4182; GFX6-NEXT: s_ashr_i32 s8, s8, 30 4183; GFX6-NEXT: s_or_b32 s8, s8, 1 4184; GFX6-NEXT: v_mov_b32_e32 v3, s8 4185; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 4186; GFX6-NEXT: v_trunc_f32_e32 v2, v2 4187; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 4188; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 4189; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 4190; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 4191; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 4192; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 4193; GFX6-NEXT: s_ashr_i32 s2, s2, 16 4194; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s2 4195; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 4196; GFX6-NEXT: s_ashr_i32 s0, s0, 16 4197; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 4198; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 4199; GFX6-NEXT: s_xor_b32 s8, s0, s2 4200; GFX6-NEXT: s_ashr_i32 s8, s8, 30 4201; GFX6-NEXT: s_or_b32 s8, s8, 1 4202; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 4203; GFX6-NEXT: v_trunc_f32_e32 v3, v3 4204; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 4205; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 4206; GFX6-NEXT: v_mov_b32_e32 v4, s8 4207; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| 4208; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc 4209; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 4210; GFX6-NEXT: v_mul_lo_u32 v1, v1, s2 4211; GFX6-NEXT: s_sext_i32_i16 s2, s3 4212; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s2 4213; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s0, v1 4214; GFX6-NEXT: s_sext_i32_i16 s0, s1 4215; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s0 4216; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 4217; GFX6-NEXT: s_xor_b32 s0, s0, s2 4218; GFX6-NEXT: s_ashr_i32 s0, s0, 30 4219; GFX6-NEXT: s_or_b32 s0, s0, 1 4220; 
GFX6-NEXT: v_mul_f32_e32 v4, v1, v4 4221; GFX6-NEXT: v_trunc_f32_e32 v4, v4 4222; GFX6-NEXT: v_mad_f32 v1, -v4, v2, v1 4223; GFX6-NEXT: v_mov_b32_e32 v5, s0 4224; GFX6-NEXT: s_ashr_i32 s0, s3, 16 4225; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 4226; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2| 4227; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 4228; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc 4229; GFX6-NEXT: s_ashr_i32 s2, s1, 16 4230; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 4231; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s2 4232; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 4233; GFX6-NEXT: v_mul_lo_u32 v1, v1, s3 4234; GFX6-NEXT: s_xor_b32 s3, s2, s0 4235; GFX6-NEXT: s_ashr_i32 s3, s3, 30 4236; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 4237; GFX6-NEXT: v_trunc_f32_e32 v5, v5 4238; GFX6-NEXT: v_mad_f32 v4, -v5, v2, v4 4239; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 4240; GFX6-NEXT: s_or_b32 s3, s3, 1 4241; GFX6-NEXT: v_mov_b32_e32 v6, s3 4242; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| 4243; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc 4244; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 4245; GFX6-NEXT: v_mul_lo_u32 v2, v2, s0 4246; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 4247; GFX6-NEXT: s_mov_b32 s0, 0xffff 4248; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 4249; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 4250; GFX6-NEXT: v_and_b32_e32 v1, s0, v1 4251; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 4252; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 4253; GFX6-NEXT: v_and_b32_e32 v0, s0, v0 4254; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 4255; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 4256; GFX6-NEXT: s_endpgm 4257; 4258; GFX9-LABEL: srem_v4i16: 4259; GFX9: ; %bb.0: 4260; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4261; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 4262; GFX9-NEXT: v_mov_b32_e32 v2, 0 4263; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4264; GFX9-NEXT: s_sext_i32_i16 s0, s6 4265; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 4266; GFX9-NEXT: s_sext_i32_i16 s1, s4 4267; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s1 4268; GFX9-NEXT: 
s_xor_b32 s0, s1, s0 4269; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 4270; GFX9-NEXT: s_ashr_i32 s0, s0, 30 4271; GFX9-NEXT: s_or_b32 s8, s0, 1 4272; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 4273; GFX9-NEXT: v_trunc_f32_e32 v3, v3 4274; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 4275; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 4276; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 4277; GFX9-NEXT: s_cselect_b32 s0, s8, 0 4278; GFX9-NEXT: s_ashr_i32 s9, s6, 16 4279; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 4280; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s9 4281; GFX9-NEXT: s_ashr_i32 s8, s4, 16 4282; GFX9-NEXT: v_add_u32_e32 v1, s0, v3 4283; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s8 4284; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 4285; GFX9-NEXT: s_xor_b32 s0, s8, s9 4286; GFX9-NEXT: s_ashr_i32 s0, s0, 30 4287; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 4288; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 4289; GFX9-NEXT: v_trunc_f32_e32 v4, v4 4290; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 4291; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 4292; GFX9-NEXT: s_or_b32 s6, s0, 1 4293; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v0| 4294; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 4295; GFX9-NEXT: s_cselect_b32 s0, s6, 0 4296; GFX9-NEXT: v_add_u32_e32 v0, s0, v4 4297; GFX9-NEXT: s_sext_i32_i16 s0, s7 4298; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 4299; GFX9-NEXT: s_sext_i32_i16 s1, s5 4300; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s1 4301; GFX9-NEXT: s_xor_b32 s0, s1, s0 4302; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 4303; GFX9-NEXT: s_ashr_i32 s0, s0, 30 4304; GFX9-NEXT: s_or_b32 s6, s0, 1 4305; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9 4306; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 4307; GFX9-NEXT: v_trunc_f32_e32 v5, v5 4308; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 4309; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| 4310; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 4311; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 4312; GFX9-NEXT: s_cselect_b32 s0, s6, 0 4313; GFX9-NEXT: s_ashr_i32 s6, s7, 16 4314; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s6 4315; GFX9-NEXT: v_add_u32_e32 v3, s0, v5 4316; 
GFX9-NEXT: v_mul_lo_u32 v3, v3, s7 4317; GFX9-NEXT: s_ashr_i32 s7, s5, 16 4318; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s7 4319; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 4320; GFX9-NEXT: s_xor_b32 s0, s7, s6 4321; GFX9-NEXT: s_ashr_i32 s0, s0, 30 4322; GFX9-NEXT: s_or_b32 s9, s0, 1 4323; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 4324; GFX9-NEXT: v_trunc_f32_e32 v6, v6 4325; GFX9-NEXT: v_mad_f32 v5, -v6, v4, v5 4326; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 4327; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4| 4328; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 4329; GFX9-NEXT: s_cselect_b32 s0, s9, 0 4330; GFX9-NEXT: v_add_u32_e32 v4, s0, v6 4331; GFX9-NEXT: v_mul_lo_u32 v4, v4, s6 4332; GFX9-NEXT: v_sub_u32_e32 v5, s4, v1 4333; GFX9-NEXT: v_sub_u32_e32 v1, s5, v3 4334; GFX9-NEXT: v_sub_u32_e32 v0, s8, v0 4335; GFX9-NEXT: v_sub_u32_e32 v3, s7, v4 4336; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff 4337; GFX9-NEXT: v_and_b32_e32 v1, v4, v1 4338; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 4339; GFX9-NEXT: v_and_b32_e32 v3, v4, v5 4340; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v3 4341; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 4342; GFX9-NEXT: s_endpgm 4343; 4344; GFX90A-LABEL: srem_v4i16: 4345; GFX90A: ; %bb.0: 4346; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4347; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 4348; GFX90A-NEXT: v_mov_b32_e32 v2, 0 4349; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 4350; GFX90A-NEXT: s_sext_i32_i16 s0, s6 4351; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s0 4352; GFX90A-NEXT: s_sext_i32_i16 s1, s4 4353; GFX90A-NEXT: v_cvt_f32_i32_e32 v1, s1 4354; GFX90A-NEXT: s_xor_b32 s0, s1, s0 4355; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 4356; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 4357; GFX90A-NEXT: s_or_b32 s8, s0, 1 4358; GFX90A-NEXT: v_mul_f32_e32 v3, v1, v3 4359; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 4360; GFX90A-NEXT: v_mad_f32 v1, -v3, v0, v1 4361; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 4362; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 4363; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 
4364; GFX90A-NEXT: s_cselect_b32 s0, s8, 0 4365; GFX90A-NEXT: s_ashr_i32 s8, s6, 16 4366; GFX90A-NEXT: v_cvt_f32_i32_e32 v1, s8 4367; GFX90A-NEXT: v_add_u32_e32 v0, s0, v3 4368; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s6 4369; GFX90A-NEXT: v_sub_u32_e32 v0, s4, v0 4370; GFX90A-NEXT: s_ashr_i32 s4, s4, 16 4371; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s4 4372; GFX90A-NEXT: v_rcp_iflag_f32_e32 v4, v1 4373; GFX90A-NEXT: s_xor_b32 s0, s4, s8 4374; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 4375; GFX90A-NEXT: s_or_b32 s6, s0, 1 4376; GFX90A-NEXT: v_mul_f32_e32 v4, v3, v4 4377; GFX90A-NEXT: v_trunc_f32_e32 v4, v4 4378; GFX90A-NEXT: v_mad_f32 v3, -v4, v1, v3 4379; GFX90A-NEXT: v_cvt_i32_f32_e32 v4, v4 4380; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v1| 4381; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 4382; GFX90A-NEXT: s_cselect_b32 s0, s6, 0 4383; GFX90A-NEXT: v_add_u32_e32 v1, s0, v4 4384; GFX90A-NEXT: s_sext_i32_i16 s0, s7 4385; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s0 4386; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s8 4387; GFX90A-NEXT: s_sext_i32_i16 s1, s5 4388; GFX90A-NEXT: v_sub_u32_e32 v4, s4, v1 4389; GFX90A-NEXT: v_cvt_f32_i32_e32 v1, s1 4390; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v3 4391; GFX90A-NEXT: s_xor_b32 s0, s1, s0 4392; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 4393; GFX90A-NEXT: s_or_b32 s4, s0, 1 4394; GFX90A-NEXT: v_mul_f32_e32 v5, v1, v5 4395; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 4396; GFX90A-NEXT: v_mad_f32 v1, -v5, v3, v1 4397; GFX90A-NEXT: v_cvt_i32_f32_e32 v5, v5 4398; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v3| 4399; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 4400; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 4401; GFX90A-NEXT: s_ashr_i32 s4, s7, 16 4402; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s4 4403; GFX90A-NEXT: v_add_u32_e32 v1, s0, v5 4404; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s7 4405; GFX90A-NEXT: v_sub_u32_e32 v1, s5, v1 4406; GFX90A-NEXT: s_ashr_i32 s5, s5, 16 4407; GFX90A-NEXT: v_cvt_f32_i32_e32 v5, s5 4408; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v3 4409; GFX90A-NEXT: s_xor_b32 
s0, s5, s4 4410; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 4411; GFX90A-NEXT: s_or_b32 s6, s0, 1 4412; GFX90A-NEXT: v_mul_f32_e32 v6, v5, v6 4413; GFX90A-NEXT: v_trunc_f32_e32 v6, v6 4414; GFX90A-NEXT: v_mad_f32 v5, -v6, v3, v5 4415; GFX90A-NEXT: v_cvt_i32_f32_e32 v6, v6 4416; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v3| 4417; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 4418; GFX90A-NEXT: s_cselect_b32 s0, s6, 0 4419; GFX90A-NEXT: v_add_u32_e32 v3, s0, v6 4420; GFX90A-NEXT: v_mul_lo_u32 v3, v3, s4 4421; GFX90A-NEXT: v_mov_b32_e32 v5, 0xffff 4422; GFX90A-NEXT: v_sub_u32_e32 v3, s5, v3 4423; GFX90A-NEXT: v_and_b32_e32 v1, v5, v1 4424; GFX90A-NEXT: v_and_b32_e32 v0, v5, v0 4425; GFX90A-NEXT: v_lshl_or_b32 v1, v3, 16, v1 4426; GFX90A-NEXT: v_lshl_or_b32 v0, v4, 16, v0 4427; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 4428; GFX90A-NEXT: s_endpgm 4429 %r = srem <4 x i16> %x, %y 4430 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 4431 ret void 4432} 4433 4434define amdgpu_kernel void @udiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 4435; CHECK-LABEL: @udiv_i3( 4436; CHECK-NEXT: [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32 4437; CHECK-NEXT: [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32 4438; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 4439; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 4440; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 4441; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 4442; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 4443; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 4444; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 4445; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 4446; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 4447; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 4448; CHECK-NEXT: [[TMP13:%.*]] = fcmp 
fast oge float [[TMP11]], [[TMP12]] 4449; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 4450; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 4451; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 7 4452; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i3 4453; CHECK-NEXT: store i3 [[TMP17]], i3 addrspace(1)* [[OUT:%.*]], align 1 4454; CHECK-NEXT: ret void 4455; 4456; GFX6-LABEL: udiv_i3: 4457; GFX6: ; %bb.0: 4458; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4459; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 4460; GFX6-NEXT: s_mov_b32 s7, 0xf000 4461; GFX6-NEXT: s_mov_b32 s6, -1 4462; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4463; GFX6-NEXT: s_bfe_u32 s1, s0, 0x30008 4464; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, s1 4465; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 4466; GFX6-NEXT: s_and_b32 s0, s0, 7 4467; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s0 4468; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 4469; GFX6-NEXT: v_trunc_f32_e32 v1, v1 4470; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 4471; GFX6-NEXT: v_mad_f32 v1, -v1, v0, v2 4472; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 4473; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 4474; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 4475; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 4476; GFX6-NEXT: s_endpgm 4477; 4478; GFX9-LABEL: udiv_i3: 4479; GFX9: ; %bb.0: 4480; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4481; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 4482; GFX9-NEXT: v_mov_b32_e32 v2, 0 4483; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4484; GFX9-NEXT: s_bfe_u32 s0, s4, 0x30008 4485; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 4486; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 4487; GFX9-NEXT: s_and_b32 s0, s4, 7 4488; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 4489; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 4490; GFX9-NEXT: v_trunc_f32_e32 v1, v1 4491; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v1 4492; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v3 4493; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 4494; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc 4495; GFX9-NEXT: v_and_b32_e32 v0, 7, 
v0 4496; GFX9-NEXT: global_store_byte v2, v0, s[2:3] 4497; GFX9-NEXT: s_endpgm 4498; 4499; GFX90A-LABEL: udiv_i3: 4500; GFX90A: ; %bb.0: 4501; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4502; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 4503; GFX90A-NEXT: v_mov_b32_e32 v2, 0 4504; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 4505; GFX90A-NEXT: s_bfe_u32 s0, s4, 0x30008 4506; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 4507; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v0 4508; GFX90A-NEXT: s_and_b32 s0, s4, 7 4509; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 4510; GFX90A-NEXT: v_mul_f32_e32 v1, v3, v1 4511; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 4512; GFX90A-NEXT: v_cvt_u32_f32_e32 v4, v1 4513; GFX90A-NEXT: v_mad_f32 v1, -v1, v0, v3 4514; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 4515; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc 4516; GFX90A-NEXT: v_and_b32_e32 v0, 7, v0 4517; GFX90A-NEXT: global_store_byte v2, v0, s[2:3] 4518; GFX90A-NEXT: s_endpgm 4519 %r = udiv i3 %x, %y 4520 store i3 %r, i3 addrspace(1)* %out 4521 ret void 4522} 4523 4524define amdgpu_kernel void @urem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 4525; CHECK-LABEL: @urem_i3( 4526; CHECK-NEXT: [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32 4527; CHECK-NEXT: [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32 4528; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 4529; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 4530; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 4531; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 4532; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 4533; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 4534; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 4535; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 4536; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 4537; CHECK-NEXT: [[TMP12:%.*]] = call fast float 
@llvm.fabs.f32(float [[TMP4]]) 4538; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 4539; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 4540; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 4541; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]] 4542; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]] 4543; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 7 4544; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i3 4545; CHECK-NEXT: store i3 [[TMP19]], i3 addrspace(1)* [[OUT:%.*]], align 1 4546; CHECK-NEXT: ret void 4547; 4548; GFX6-LABEL: urem_i3: 4549; GFX6: ; %bb.0: 4550; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4551; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 4552; GFX6-NEXT: s_mov_b32 s7, 0xf000 4553; GFX6-NEXT: s_mov_b32 s6, -1 4554; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4555; GFX6-NEXT: s_bfe_u32 s1, s0, 0x30008 4556; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, s1 4557; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 4558; GFX6-NEXT: s_and_b32 s2, s0, 7 4559; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 4560; GFX6-NEXT: s_lshr_b32 s1, s0, 8 4561; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 4562; GFX6-NEXT: v_trunc_f32_e32 v1, v1 4563; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 4564; GFX6-NEXT: v_mad_f32 v1, -v1, v0, v2 4565; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 4566; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 4567; GFX6-NEXT: v_mul_lo_u32 v0, v0, s1 4568; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 4569; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 4570; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 4571; GFX6-NEXT: s_endpgm 4572; 4573; GFX9-LABEL: urem_i3: 4574; GFX9: ; %bb.0: 4575; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 4576; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4577; GFX9-NEXT: s_bfe_u32 s3, s2, 0x30008 4578; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 4579; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 4580; GFX9-NEXT: s_and_b32 s4, s2, 7 4581; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 4582; GFX9-NEXT: s_lshr_b32 s3, s2, 8 4583; GFX9-NEXT: v_mul_f32_e32 v1, v2, 
v1 4584; GFX9-NEXT: v_trunc_f32_e32 v1, v1 4585; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v1 4586; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v2 4587; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 4588; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4589; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 4590; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 4591; GFX9-NEXT: v_mov_b32_e32 v1, 0 4592; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 4593; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 4594; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4595; GFX9-NEXT: global_store_byte v1, v0, s[0:1] 4596; GFX9-NEXT: s_endpgm 4597; 4598; GFX90A-LABEL: urem_i3: 4599; GFX90A: ; %bb.0: 4600; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4601; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 4602; GFX90A-NEXT: v_mov_b32_e32 v0, 0 4603; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 4604; GFX90A-NEXT: s_bfe_u32 s0, s4, 0x30008 4605; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 4606; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v1 4607; GFX90A-NEXT: s_and_b32 s1, s4, 7 4608; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v3, s1 4609; GFX90A-NEXT: s_lshr_b32 s0, s4, 8 4610; GFX90A-NEXT: v_mul_f32_e32 v2, v3, v2 4611; GFX90A-NEXT: v_trunc_f32_e32 v2, v2 4612; GFX90A-NEXT: v_cvt_u32_f32_e32 v4, v2 4613; GFX90A-NEXT: v_mad_f32 v2, -v2, v1, v3 4614; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v1 4615; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc 4616; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s0 4617; GFX90A-NEXT: v_sub_u32_e32 v1, s4, v1 4618; GFX90A-NEXT: v_and_b32_e32 v1, 7, v1 4619; GFX90A-NEXT: global_store_byte v0, v1, s[2:3] 4620; GFX90A-NEXT: s_endpgm 4621 %r = urem i3 %x, %y 4622 store i3 %r, i3 addrspace(1)* %out 4623 ret void 4624} 4625 4626define amdgpu_kernel void @sdiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 4627; CHECK-LABEL: @sdiv_i3( 4628; CHECK-NEXT: [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32 4629; CHECK-NEXT: [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32 4630; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 4631; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 4632; 
CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 4633; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 4634; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 4635; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 4636; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 4637; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 4638; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 4639; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 4640; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 4641; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 4642; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 4643; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 4644; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 4645; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 4646; CHECK-NEXT: [[TMP19:%.*]] = shl i32 [[TMP18]], 29 4647; CHECK-NEXT: [[TMP20:%.*]] = ashr i32 [[TMP19]], 29 4648; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i3 4649; CHECK-NEXT: store i3 [[TMP21]], i3 addrspace(1)* [[OUT:%.*]], align 1 4650; CHECK-NEXT: ret void 4651; 4652; GFX6-LABEL: sdiv_i3: 4653; GFX6: ; %bb.0: 4654; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4655; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 4656; GFX6-NEXT: s_mov_b32 s7, 0xf000 4657; GFX6-NEXT: s_mov_b32 s6, -1 4658; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4659; GFX6-NEXT: s_bfe_i32 s1, s0, 0x30008 4660; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s1 4661; GFX6-NEXT: s_bfe_i32 s0, s0, 0x30000 4662; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s0 4663; GFX6-NEXT: s_xor_b32 s0, s0, s1 4664; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 4665; GFX6-NEXT: s_ashr_i32 s0, s0, 30 4666; GFX6-NEXT: s_or_b32 s0, s0, 1 4667; GFX6-NEXT: v_mov_b32_e32 v3, s0 4668; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 4669; GFX6-NEXT: 
v_trunc_f32_e32 v2, v2 4670; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 4671; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 4672; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 4673; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 4674; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 4675; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 4676; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 4677; GFX6-NEXT: s_endpgm 4678; 4679; GFX9-LABEL: sdiv_i3: 4680; GFX9: ; %bb.0: 4681; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4682; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 4683; GFX9-NEXT: v_mov_b32_e32 v1, 0 4684; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4685; GFX9-NEXT: s_bfe_i32 s0, s4, 0x30008 4686; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 4687; GFX9-NEXT: s_bfe_i32 s1, s4, 0x30000 4688; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 4689; GFX9-NEXT: s_xor_b32 s0, s1, s0 4690; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 4691; GFX9-NEXT: s_ashr_i32 s0, s0, 30 4692; GFX9-NEXT: s_or_b32 s4, s0, 1 4693; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 4694; GFX9-NEXT: v_trunc_f32_e32 v3, v3 4695; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 4696; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 4697; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 4698; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 4699; GFX9-NEXT: s_cselect_b32 s0, s4, 0 4700; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 4701; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 4702; GFX9-NEXT: global_store_byte v1, v0, s[2:3] 4703; GFX9-NEXT: s_endpgm 4704; 4705; GFX90A-LABEL: sdiv_i3: 4706; GFX90A: ; %bb.0: 4707; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4708; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 4709; GFX90A-NEXT: v_mov_b32_e32 v1, 0 4710; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 4711; GFX90A-NEXT: s_bfe_i32 s0, s4, 0x30008 4712; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s0 4713; GFX90A-NEXT: s_bfe_i32 s1, s4, 0x30000 4714; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s1 4715; GFX90A-NEXT: s_xor_b32 s0, s1, s0 4716; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 4717; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 4718; GFX90A-NEXT: s_or_b32 s4, s0, 1 4719; 
GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 4720; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 4721; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 4722; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 4723; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 4724; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 4725; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 4726; GFX90A-NEXT: v_add_u32_e32 v0, s0, v3 4727; GFX90A-NEXT: v_and_b32_e32 v0, 7, v0 4728; GFX90A-NEXT: global_store_byte v1, v0, s[2:3] 4729; GFX90A-NEXT: s_endpgm 4730 %r = sdiv i3 %x, %y 4731 store i3 %r, i3 addrspace(1)* %out 4732 ret void 4733} 4734 4735define amdgpu_kernel void @srem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 4736; CHECK-LABEL: @srem_i3( 4737; CHECK-NEXT: [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32 4738; CHECK-NEXT: [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32 4739; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 4740; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 4741; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 4742; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 4743; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 4744; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 4745; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 4746; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 4747; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 4748; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 4749; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 4750; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 4751; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 4752; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 4753; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 4754; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 4755; CHECK-NEXT: [[TMP19:%.*]] = mul i32 
[[TMP18]], [[TMP2]] 4756; CHECK-NEXT: [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]] 4757; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 29 4758; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 29 4759; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i3 4760; CHECK-NEXT: store i3 [[TMP23]], i3 addrspace(1)* [[OUT:%.*]], align 1 4761; CHECK-NEXT: ret void 4762; 4763; GFX6-LABEL: srem_i3: 4764; GFX6: ; %bb.0: 4765; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4766; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 4767; GFX6-NEXT: s_mov_b32 s7, 0xf000 4768; GFX6-NEXT: s_mov_b32 s6, -1 4769; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4770; GFX6-NEXT: s_bfe_i32 s1, s0, 0x30008 4771; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s1 4772; GFX6-NEXT: s_bfe_i32 s3, s0, 0x30000 4773; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s3 4774; GFX6-NEXT: s_xor_b32 s1, s3, s1 4775; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 4776; GFX6-NEXT: s_ashr_i32 s1, s1, 30 4777; GFX6-NEXT: s_or_b32 s1, s1, 1 4778; GFX6-NEXT: v_mov_b32_e32 v3, s1 4779; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 4780; GFX6-NEXT: v_trunc_f32_e32 v2, v2 4781; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 4782; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 4783; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 4784; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 4785; GFX6-NEXT: s_lshr_b32 s2, s0, 8 4786; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 4787; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 4788; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 4789; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 4790; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 4791; GFX6-NEXT: s_endpgm 4792; 4793; GFX9-LABEL: srem_i3: 4794; GFX9: ; %bb.0: 4795; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 4796; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4797; GFX9-NEXT: s_bfe_i32 s2, s4, 0x30008 4798; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 4799; GFX9-NEXT: s_bfe_i32 s3, s4, 0x30000 4800; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s3 4801; GFX9-NEXT: s_xor_b32 s2, s3, s2 4802; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 4803; GFX9-NEXT: s_ashr_i32 s2, s2, 30 4804; GFX9-NEXT: 
s_lshr_b32 s5, s4, 8 4805; GFX9-NEXT: s_or_b32 s6, s2, 1 4806; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 4807; GFX9-NEXT: v_trunc_f32_e32 v2, v2 4808; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 4809; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 4810; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| 4811; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec 4812; GFX9-NEXT: s_cselect_b32 s2, s6, 0 4813; GFX9-NEXT: v_add_u32_e32 v0, s2, v2 4814; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 4815; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4816; GFX9-NEXT: v_mov_b32_e32 v1, 0 4817; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 4818; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 4819; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4820; GFX9-NEXT: global_store_byte v1, v0, s[0:1] 4821; GFX9-NEXT: s_endpgm 4822; 4823; GFX90A-LABEL: srem_i3: 4824; GFX90A: ; %bb.0: 4825; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4826; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 4827; GFX90A-NEXT: v_mov_b32_e32 v0, 0 4828; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 4829; GFX90A-NEXT: s_bfe_i32 s0, s4, 0x30008 4830; GFX90A-NEXT: v_cvt_f32_i32_e32 v1, s0 4831; GFX90A-NEXT: s_bfe_i32 s1, s4, 0x30000 4832; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s1 4833; GFX90A-NEXT: s_xor_b32 s0, s1, s0 4834; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v1 4835; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 4836; GFX90A-NEXT: s_lshr_b32 s5, s4, 8 4837; GFX90A-NEXT: s_or_b32 s6, s0, 1 4838; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 4839; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 4840; GFX90A-NEXT: v_mad_f32 v2, -v3, v1, v2 4841; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 4842; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v1| 4843; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 4844; GFX90A-NEXT: s_cselect_b32 s0, s6, 0 4845; GFX90A-NEXT: v_add_u32_e32 v1, s0, v3 4846; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s5 4847; GFX90A-NEXT: v_sub_u32_e32 v1, s4, v1 4848; GFX90A-NEXT: v_and_b32_e32 v1, 7, v1 4849; GFX90A-NEXT: global_store_byte v0, v1, s[2:3] 4850; GFX90A-NEXT: s_endpgm 4851 %r = srem i3 %x, %y 4852 store i3 %r, i3 addrspace(1)* 
%out 4853 ret void 4854} 4855 4856define amdgpu_kernel void @udiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 4857; CHECK-LABEL: @udiv_v3i16( 4858; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 4859; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 4860; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 4861; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 4862; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 4863; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 4864; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 4865; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 4866; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 4867; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 4868; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 4869; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 4870; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 4871; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 4872; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 4873; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 4874; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 4875; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 65535 4876; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16 4877; CHECK-NEXT: [[TMP20:%.*]] = insertelement <3 x i16> undef, i16 [[TMP19]], i64 0 4878; CHECK-NEXT: [[TMP21:%.*]] = extractelement <3 x i16> [[X]], i64 1 4879; CHECK-NEXT: [[TMP22:%.*]] = extractelement <3 x i16> [[Y]], i64 1 4880; CHECK-NEXT: [[TMP23:%.*]] = zext i16 [[TMP21]] to i32 4881; CHECK-NEXT: [[TMP24:%.*]] = zext i16 [[TMP22]] to i32 4882; CHECK-NEXT: [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float 4883; CHECK-NEXT: [[TMP26:%.*]] = uitofp i32 [[TMP24]] to 
float 4884; CHECK-NEXT: [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]]) 4885; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]] 4886; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]]) 4887; CHECK-NEXT: [[TMP30:%.*]] = fneg fast float [[TMP29]] 4888; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]]) 4889; CHECK-NEXT: [[TMP32:%.*]] = fptoui float [[TMP29]] to i32 4890; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]]) 4891; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]]) 4892; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]] 4893; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0 4894; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]] 4895; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 65535 4896; CHECK-NEXT: [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16 4897; CHECK-NEXT: [[TMP40:%.*]] = insertelement <3 x i16> [[TMP20]], i16 [[TMP39]], i64 1 4898; CHECK-NEXT: [[TMP41:%.*]] = extractelement <3 x i16> [[X]], i64 2 4899; CHECK-NEXT: [[TMP42:%.*]] = extractelement <3 x i16> [[Y]], i64 2 4900; CHECK-NEXT: [[TMP43:%.*]] = zext i16 [[TMP41]] to i32 4901; CHECK-NEXT: [[TMP44:%.*]] = zext i16 [[TMP42]] to i32 4902; CHECK-NEXT: [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float 4903; CHECK-NEXT: [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float 4904; CHECK-NEXT: [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]]) 4905; CHECK-NEXT: [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]] 4906; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]]) 4907; CHECK-NEXT: [[TMP50:%.*]] = fneg fast float [[TMP49]] 4908; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]]) 4909; CHECK-NEXT: [[TMP52:%.*]] = fptoui float [[TMP49]] to i32 4910; CHECK-NEXT: [[TMP53:%.*]] = 
call fast float @llvm.fabs.f32(float [[TMP51]]) 4911; CHECK-NEXT: [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]]) 4912; CHECK-NEXT: [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]] 4913; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0 4914; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]] 4915; CHECK-NEXT: [[TMP58:%.*]] = and i32 [[TMP57]], 65535 4916; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16 4917; CHECK-NEXT: [[TMP60:%.*]] = insertelement <3 x i16> [[TMP40]], i16 [[TMP59]], i64 2 4918; CHECK-NEXT: store <3 x i16> [[TMP60]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 4919; CHECK-NEXT: ret void 4920; 4921; GFX6-LABEL: udiv_v3i16: 4922; GFX6: ; %bb.0: 4923; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4924; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 4925; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 4926; GFX6-NEXT: s_mov_b32 s8, 0xffff 4927; GFX6-NEXT: s_mov_b32 s7, 0xf000 4928; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4929; GFX6-NEXT: s_and_b32 s6, s0, s8 4930; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 4931; GFX6-NEXT: s_and_b32 s6, s2, s8 4932; GFX6-NEXT: s_lshr_b32 s0, s0, 16 4933; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s6 4934; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 4935; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s0 4936; GFX6-NEXT: s_lshr_b32 s0, s2, 16 4937; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s0 4938; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 4939; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 4940; GFX6-NEXT: v_trunc_f32_e32 v2, v2 4941; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 4942; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 4943; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 4944; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 4945; GFX6-NEXT: v_trunc_f32_e32 v1, v1 4946; GFX6-NEXT: s_and_b32 s0, s1, s8 4947; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc 4948; GFX6-NEXT: v_mad_f32 v2, -v1, v3, v4 4949; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s0 4950; GFX6-NEXT: s_and_b32 s0, s3, s8 4951; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s0 4952; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 4953; 
GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 4954; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 4955; GFX6-NEXT: s_mov_b32 s6, -1 4956; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 4957; GFX6-NEXT: v_mul_f32_e32 v2, v5, v6 4958; GFX6-NEXT: v_trunc_f32_e32 v2, v2 4959; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 4960; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v5 4961; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 4962; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 4963; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc 4964; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 4965; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 4966; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 4967; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 4968; GFX6-NEXT: s_endpgm 4969; 4970; GFX9-LABEL: udiv_v3i16: 4971; GFX9: ; %bb.0: 4972; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4973; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 4974; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 4975; GFX9-NEXT: s_mov_b32 s8, 0xffff 4976; GFX9-NEXT: v_mov_b32_e32 v1, 0 4977; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4978; GFX9-NEXT: s_and_b32 s0, s6, s8 4979; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 4980; GFX9-NEXT: s_and_b32 s0, s4, s8 4981; GFX9-NEXT: s_lshr_b32 s1, s6, 16 4982; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s0 4983; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 4984; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s1 4985; GFX9-NEXT: s_lshr_b32 s0, s4, 16 4986; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 4987; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 4988; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 4989; GFX9-NEXT: v_trunc_f32_e32 v3, v3 4990; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 4991; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 4992; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 4993; GFX9-NEXT: v_mul_f32_e32 v2, v5, v6 4994; GFX9-NEXT: v_trunc_f32_e32 v2, v2 4995; GFX9-NEXT: s_and_b32 s0, s7, s8 4996; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 4997; GFX9-NEXT: v_mad_f32 v3, -v2, v4, v5 4998; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 4999; GFX9-NEXT: s_and_b32 s0, s5, s8 5000; GFX9-NEXT: v_cvt_f32_u32_e32 
v6, s0 5001; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 5002; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 5003; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 5004; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 5005; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc 5006; GFX9-NEXT: v_mul_f32_e32 v3, v6, v7 5007; GFX9-NEXT: v_trunc_f32_e32 v3, v3 5008; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3 5009; GFX9-NEXT: v_mad_f32 v3, -v3, v5, v6 5010; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 5011; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 5012; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 5013; GFX9-NEXT: global_store_short v1, v3, s[2:3] offset:4 5014; GFX9-NEXT: global_store_dword v1, v0, s[2:3] 5015; GFX9-NEXT: s_endpgm 5016; 5017; GFX90A-LABEL: udiv_v3i16: 5018; GFX90A: ; %bb.0: 5019; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5020; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 5021; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 5022; GFX90A-NEXT: s_mov_b32 s8, 0xffff 5023; GFX90A-NEXT: v_mov_b32_e32 v1, 0 5024; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 5025; GFX90A-NEXT: s_and_b32 s0, s6, s8 5026; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s0 5027; GFX90A-NEXT: s_and_b32 s0, s4, s8 5028; GFX90A-NEXT: s_lshr_b32 s1, s6, 16 5029; GFX90A-NEXT: v_cvt_f32_u32_e32 v2, s0 5030; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 5031; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s1 5032; GFX90A-NEXT: s_lshr_b32 s0, s4, 16 5033; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s0 5034; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 5035; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v4 5036; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 5037; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 5038; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 5039; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 5040; GFX90A-NEXT: v_mul_f32_e32 v2, v5, v6 5041; GFX90A-NEXT: v_trunc_f32_e32 v2, v2 5042; GFX90A-NEXT: s_and_b32 s0, s7, s8 5043; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 5044; GFX90A-NEXT: v_mad_f32 v3, -v2, v4, v5 5045; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s0 5046; GFX90A-NEXT: s_and_b32 s0, 
s5, s8 5047; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s0 5048; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 5049; GFX90A-NEXT: v_rcp_iflag_f32_e32 v7, v5 5050; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 5051; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v0 5052; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc 5053; GFX90A-NEXT: v_mul_f32_e32 v3, v6, v7 5054; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 5055; GFX90A-NEXT: v_cvt_u32_f32_e32 v4, v3 5056; GFX90A-NEXT: v_mad_f32 v3, -v3, v5, v6 5057; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 5058; GFX90A-NEXT: v_lshl_or_b32 v0, v2, 16, v0 5059; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 5060; GFX90A-NEXT: global_store_short v1, v3, s[2:3] offset:4 5061; GFX90A-NEXT: global_store_dword v1, v0, s[2:3] 5062; GFX90A-NEXT: s_endpgm 5063 %r = udiv <3 x i16> %x, %y 5064 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 5065 ret void 5066} 5067 5068define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 5069; CHECK-LABEL: @urem_v3i16( 5070; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 5071; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 5072; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 5073; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 5074; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 5075; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 5076; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 5077; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 5078; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 5079; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 5080; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 5081; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 5082; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 5083; CHECK-NEXT: [[TMP14:%.*]] 
= call fast float @llvm.fabs.f32(float [[TMP6]]) 5084; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 5085; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 5086; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 5087; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]] 5088; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]] 5089; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 65535 5090; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16 5091; CHECK-NEXT: [[TMP22:%.*]] = insertelement <3 x i16> undef, i16 [[TMP21]], i64 0 5092; CHECK-NEXT: [[TMP23:%.*]] = extractelement <3 x i16> [[X]], i64 1 5093; CHECK-NEXT: [[TMP24:%.*]] = extractelement <3 x i16> [[Y]], i64 1 5094; CHECK-NEXT: [[TMP25:%.*]] = zext i16 [[TMP23]] to i32 5095; CHECK-NEXT: [[TMP26:%.*]] = zext i16 [[TMP24]] to i32 5096; CHECK-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float 5097; CHECK-NEXT: [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float 5098; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]]) 5099; CHECK-NEXT: [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]] 5100; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]]) 5101; CHECK-NEXT: [[TMP32:%.*]] = fneg fast float [[TMP31]] 5102; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]]) 5103; CHECK-NEXT: [[TMP34:%.*]] = fptoui float [[TMP31]] to i32 5104; CHECK-NEXT: [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 5105; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]]) 5106; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]] 5107; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0 5108; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]] 5109; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]] 5110; CHECK-NEXT: [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]] 5111; CHECK-NEXT: [[TMP42:%.*]] = 
and i32 [[TMP41]], 65535 5112; CHECK-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16 5113; CHECK-NEXT: [[TMP44:%.*]] = insertelement <3 x i16> [[TMP22]], i16 [[TMP43]], i64 1 5114; CHECK-NEXT: [[TMP45:%.*]] = extractelement <3 x i16> [[X]], i64 2 5115; CHECK-NEXT: [[TMP46:%.*]] = extractelement <3 x i16> [[Y]], i64 2 5116; CHECK-NEXT: [[TMP47:%.*]] = zext i16 [[TMP45]] to i32 5117; CHECK-NEXT: [[TMP48:%.*]] = zext i16 [[TMP46]] to i32 5118; CHECK-NEXT: [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float 5119; CHECK-NEXT: [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float 5120; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]]) 5121; CHECK-NEXT: [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]] 5122; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]]) 5123; CHECK-NEXT: [[TMP54:%.*]] = fneg fast float [[TMP53]] 5124; CHECK-NEXT: [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]]) 5125; CHECK-NEXT: [[TMP56:%.*]] = fptoui float [[TMP53]] to i32 5126; CHECK-NEXT: [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]]) 5127; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]]) 5128; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]] 5129; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0 5130; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]] 5131; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]] 5132; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]] 5133; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 65535 5134; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16 5135; CHECK-NEXT: [[TMP66:%.*]] = insertelement <3 x i16> [[TMP44]], i16 [[TMP65]], i64 2 5136; CHECK-NEXT: store <3 x i16> [[TMP66]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 5137; CHECK-NEXT: ret void 5138; 5139; GFX6-LABEL: urem_v3i16: 5140; GFX6: ; %bb.0: 5141; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5142; 
GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 5143; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 5144; GFX6-NEXT: s_mov_b32 s8, 0xffff 5145; GFX6-NEXT: s_mov_b32 s7, 0xf000 5146; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5147; GFX6-NEXT: v_mov_b32_e32 v1, s2 5148; GFX6-NEXT: s_and_b32 s6, s0, s8 5149; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 5150; GFX6-NEXT: s_and_b32 s6, s2, s8 5151; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 5152; GFX6-NEXT: v_mov_b32_e32 v4, s0 5153; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 5154; GFX6-NEXT: v_alignbit_b32 v4, s1, v4, 16 5155; GFX6-NEXT: v_and_b32_e32 v5, s8, v4 5156; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16 5157; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 5158; GFX6-NEXT: v_trunc_f32_e32 v3, v3 5159; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 5160; GFX6-NEXT: v_mad_f32 v2, -v3, v0, v2 5161; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 5162; GFX6-NEXT: v_cvt_f32_u32_e32 v2, v5 5163; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc 5164; GFX6-NEXT: v_and_b32_e32 v3, s8, v1 5165; GFX6-NEXT: v_mul_lo_u32 v0, v0, s0 5166; GFX6-NEXT: v_cvt_f32_u32_e32 v3, v3 5167; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 5168; GFX6-NEXT: s_and_b32 s0, s1, s8 5169; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s0 5170; GFX6-NEXT: s_and_b32 s0, s3, s8 5171; GFX6-NEXT: v_mul_f32_e32 v5, v3, v5 5172; GFX6-NEXT: v_trunc_f32_e32 v5, v5 5173; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s0 5174; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v6 5175; GFX6-NEXT: v_mad_f32 v3, -v5, v2, v3 5176; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v5 5177; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 5178; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 5179; GFX6-NEXT: v_mul_f32_e32 v3, v7, v8 5180; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc 5181; GFX6-NEXT: v_trunc_f32_e32 v3, v3 5182; GFX6-NEXT: v_mul_lo_u32 v2, v2, v4 5183; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v3 5184; GFX6-NEXT: v_mad_f32 v3, -v3, v6, v7 5185; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v6 5186; GFX6-NEXT: s_mov_b32 s6, -1 5187; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 5188; GFX6-NEXT: v_mul_lo_u32 v3, 
v3, s1 5189; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 5190; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 5191; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s3, v3 5192; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 5193; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 5194; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 5195; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 5196; GFX6-NEXT: s_endpgm 5197; 5198; GFX9-LABEL: urem_v3i16: 5199; GFX9: ; %bb.0: 5200; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5201; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 5202; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 5203; GFX9-NEXT: s_mov_b32 s8, 0xffff 5204; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5205; GFX9-NEXT: s_and_b32 s0, s4, s8 5206; GFX9-NEXT: s_and_b32 s1, s6, s8 5207; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 5208; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s0 5209; GFX9-NEXT: s_lshr_b32 s6, s6, 16 5210; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 5211; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 5212; GFX9-NEXT: s_lshr_b32 s4, s4, 16 5213; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s4 5214; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v2 5215; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 5216; GFX9-NEXT: v_trunc_f32_e32 v3, v3 5217; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v3 5218; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 5219; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 5220; GFX9-NEXT: v_mul_f32_e32 v1, v4, v5 5221; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc 5222; GFX9-NEXT: v_mul_lo_u32 v0, v0, s1 5223; GFX9-NEXT: v_trunc_f32_e32 v1, v1 5224; GFX9-NEXT: s_and_b32 s1, s7, s8 5225; GFX9-NEXT: v_mad_f32 v3, -v1, v2, v4 5226; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s1 5227; GFX9-NEXT: s_and_b32 s5, s5, s8 5228; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s5 5229; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 5230; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 5231; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 5232; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 5233; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 5234; GFX9-NEXT: v_mul_f32_e32 v2, v5, v6 5235; GFX9-NEXT: v_trunc_f32_e32 v2, v2 5236; 
GFX9-NEXT: v_cvt_u32_f32_e32 v3, v2 5237; GFX9-NEXT: v_mad_f32 v2, -v2, v4, v5 5238; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 5239; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 5240; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc 5241; GFX9-NEXT: v_mul_lo_u32 v2, v2, s1 5242; GFX9-NEXT: v_mov_b32_e32 v3, 0 5243; GFX9-NEXT: v_sub_u32_e32 v1, s4, v1 5244; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 5245; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 5246; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 5247; GFX9-NEXT: global_store_short v3, v2, s[2:3] offset:4 5248; GFX9-NEXT: global_store_dword v3, v0, s[2:3] 5249; GFX9-NEXT: s_endpgm 5250; 5251; GFX90A-LABEL: urem_v3i16: 5252; GFX90A: ; %bb.0: 5253; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5254; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 5255; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 5256; GFX90A-NEXT: s_mov_b32 s8, 0xffff 5257; GFX90A-NEXT: v_mov_b32_e32 v1, 0 5258; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 5259; GFX90A-NEXT: s_and_b32 s1, s4, s8 5260; GFX90A-NEXT: s_and_b32 s0, s6, s8 5261; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s0 5262; GFX90A-NEXT: v_cvt_f32_u32_e32 v2, s1 5263; GFX90A-NEXT: s_lshr_b32 s6, s6, 16 5264; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s6 5265; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 5266; GFX90A-NEXT: s_lshr_b32 s4, s4, 16 5267; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s4 5268; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v4 5269; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 5270; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 5271; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 5272; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 5273; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 5274; GFX90A-NEXT: v_mul_f32_e32 v2, v5, v6 5275; GFX90A-NEXT: v_trunc_f32_e32 v2, v2 5276; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 5277; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s0 5278; GFX90A-NEXT: s_and_b32 s0, s7, s8 5279; GFX90A-NEXT: v_mad_f32 v3, -v2, v4, v5 5280; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s0 5281; GFX90A-NEXT: v_sub_u32_e32 v0, s1, v0 5282; GFX90A-NEXT: 
s_and_b32 s1, s5, s8 5283; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s1 5284; GFX90A-NEXT: v_rcp_iflag_f32_e32 v7, v5 5285; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 5286; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 5287; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v0 5288; GFX90A-NEXT: v_mul_f32_e32 v3, v6, v7 5289; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 5290; GFX90A-NEXT: v_cvt_u32_f32_e32 v4, v3 5291; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc 5292; GFX90A-NEXT: v_mad_f32 v3, -v3, v5, v6 5293; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 5294; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 5295; GFX90A-NEXT: v_mul_lo_u32 v2, v2, s6 5296; GFX90A-NEXT: v_mul_lo_u32 v3, v3, s0 5297; GFX90A-NEXT: v_sub_u32_e32 v2, s4, v2 5298; GFX90A-NEXT: v_sub_u32_e32 v3, s1, v3 5299; GFX90A-NEXT: v_lshl_or_b32 v0, v2, 16, v0 5300; GFX90A-NEXT: global_store_short v1, v3, s[2:3] offset:4 5301; GFX90A-NEXT: global_store_dword v1, v0, s[2:3] 5302; GFX90A-NEXT: s_endpgm 5303 %r = urem <3 x i16> %x, %y 5304 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 5305 ret void 5306} 5307 5308define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 5309; CHECK-LABEL: @sdiv_v3i16( 5310; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 5311; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 5312; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 5313; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 5314; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 5315; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 5316; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 5317; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 5318; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 5319; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 5320; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 5321; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 
5322; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 5323; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 5324; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 5325; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 5326; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 5327; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 5328; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 5329; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 5330; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 16 5331; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 16 5332; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16 5333; CHECK-NEXT: [[TMP24:%.*]] = insertelement <3 x i16> undef, i16 [[TMP23]], i64 0 5334; CHECK-NEXT: [[TMP25:%.*]] = extractelement <3 x i16> [[X]], i64 1 5335; CHECK-NEXT: [[TMP26:%.*]] = extractelement <3 x i16> [[Y]], i64 1 5336; CHECK-NEXT: [[TMP27:%.*]] = sext i16 [[TMP25]] to i32 5337; CHECK-NEXT: [[TMP28:%.*]] = sext i16 [[TMP26]] to i32 5338; CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]] 5339; CHECK-NEXT: [[TMP30:%.*]] = ashr i32 [[TMP29]], 30 5340; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 1 5341; CHECK-NEXT: [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float 5342; CHECK-NEXT: [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float 5343; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 5344; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]] 5345; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]]) 5346; CHECK-NEXT: [[TMP37:%.*]] = fneg fast float [[TMP36]] 5347; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]]) 5348; CHECK-NEXT: [[TMP39:%.*]] = fptosi float [[TMP36]] to i32 5349; CHECK-NEXT: [[TMP40:%.*]] = call fast float 
@llvm.fabs.f32(float [[TMP38]]) 5350; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 5351; CHECK-NEXT: [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]] 5352; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0 5353; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]] 5354; CHECK-NEXT: [[TMP45:%.*]] = shl i32 [[TMP44]], 16 5355; CHECK-NEXT: [[TMP46:%.*]] = ashr i32 [[TMP45]], 16 5356; CHECK-NEXT: [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16 5357; CHECK-NEXT: [[TMP48:%.*]] = insertelement <3 x i16> [[TMP24]], i16 [[TMP47]], i64 1 5358; CHECK-NEXT: [[TMP49:%.*]] = extractelement <3 x i16> [[X]], i64 2 5359; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x i16> [[Y]], i64 2 5360; CHECK-NEXT: [[TMP51:%.*]] = sext i16 [[TMP49]] to i32 5361; CHECK-NEXT: [[TMP52:%.*]] = sext i16 [[TMP50]] to i32 5362; CHECK-NEXT: [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]] 5363; CHECK-NEXT: [[TMP54:%.*]] = ashr i32 [[TMP53]], 30 5364; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 1 5365; CHECK-NEXT: [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float 5366; CHECK-NEXT: [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float 5367; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]]) 5368; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]] 5369; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]]) 5370; CHECK-NEXT: [[TMP61:%.*]] = fneg fast float [[TMP60]] 5371; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]]) 5372; CHECK-NEXT: [[TMP63:%.*]] = fptosi float [[TMP60]] to i32 5373; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]]) 5374; CHECK-NEXT: [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]]) 5375; CHECK-NEXT: [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]] 5376; CHECK-NEXT: [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0 5377; CHECK-NEXT: [[TMP68:%.*]] = 
add i32 [[TMP63]], [[TMP67]] 5378; CHECK-NEXT: [[TMP69:%.*]] = shl i32 [[TMP68]], 16 5379; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 16 5380; CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16 5381; CHECK-NEXT: [[TMP72:%.*]] = insertelement <3 x i16> [[TMP48]], i16 [[TMP71]], i64 2 5382; CHECK-NEXT: store <3 x i16> [[TMP72]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 5383; CHECK-NEXT: ret void 5384; 5385; GFX6-LABEL: sdiv_v3i16: 5386; GFX6: ; %bb.0: 5387; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5388; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 5389; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 5390; GFX6-NEXT: s_mov_b32 s7, 0xf000 5391; GFX6-NEXT: s_mov_b32 s6, -1 5392; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5393; GFX6-NEXT: s_sext_i32_i16 s9, s2 5394; GFX6-NEXT: s_sext_i32_i16 s8, s0 5395; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s8 5396; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9 5397; GFX6-NEXT: s_xor_b32 s8, s9, s8 5398; GFX6-NEXT: s_ashr_i32 s0, s0, 16 5399; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 5400; GFX6-NEXT: s_ashr_i32 s8, s8, 30 5401; GFX6-NEXT: s_or_b32 s8, s8, 1 5402; GFX6-NEXT: v_mov_b32_e32 v3, s8 5403; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 5404; GFX6-NEXT: v_trunc_f32_e32 v2, v2 5405; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 5406; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 5407; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 5408; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s0 5409; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 5410; GFX6-NEXT: s_ashr_i32 s2, s2, 16 5411; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 5412; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s2 5413; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 5414; GFX6-NEXT: s_xor_b32 s0, s2, s0 5415; GFX6-NEXT: s_ashr_i32 s0, s0, 30 5416; GFX6-NEXT: s_or_b32 s0, s0, 1 5417; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 5418; GFX6-NEXT: v_trunc_f32_e32 v3, v3 5419; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 5420; GFX6-NEXT: v_mov_b32_e32 v4, s0 5421; GFX6-NEXT: s_sext_i32_i16 s0, s1 5422; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 5423; GFX6-NEXT: 
v_cmp_ge_f32_e64 vcc, |v2|, |v1| 5424; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 5425; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc 5426; GFX6-NEXT: s_sext_i32_i16 s1, s3 5427; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 5428; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s1 5429; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 5430; GFX6-NEXT: s_xor_b32 s0, s1, s0 5431; GFX6-NEXT: s_ashr_i32 s0, s0, 30 5432; GFX6-NEXT: s_or_b32 s0, s0, 1 5433; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 5434; GFX6-NEXT: v_trunc_f32_e32 v4, v4 5435; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 5436; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 5437; GFX6-NEXT: v_mov_b32_e32 v5, s0 5438; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| 5439; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc 5440; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 5441; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 5442; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 5443; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 5444; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 5445; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 5446; GFX6-NEXT: s_endpgm 5447; 5448; GFX9-LABEL: sdiv_v3i16: 5449; GFX9: ; %bb.0: 5450; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5451; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 5452; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 5453; GFX9-NEXT: v_mov_b32_e32 v1, 0 5454; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5455; GFX9-NEXT: s_sext_i32_i16 s1, s4 5456; GFX9-NEXT: s_sext_i32_i16 s0, s6 5457; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 5458; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 5459; GFX9-NEXT: s_xor_b32 s0, s1, s0 5460; GFX9-NEXT: s_ashr_i32 s0, s0, 30 5461; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 5462; GFX9-NEXT: s_or_b32 s8, s0, 1 5463; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 5464; GFX9-NEXT: v_trunc_f32_e32 v3, v3 5465; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 5466; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 5467; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 5468; GFX9-NEXT: s_cselect_b32 s0, s8, 0 5469; GFX9-NEXT: s_ashr_i32 s1, s6, 16 5470; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 
5471; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 5472; GFX9-NEXT: s_ashr_i32 s4, s4, 16 5473; GFX9-NEXT: v_add_u32_e32 v2, s0, v3 5474; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s4 5475; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 5476; GFX9-NEXT: s_xor_b32 s0, s4, s1 5477; GFX9-NEXT: s_ashr_i32 s0, s0, 30 5478; GFX9-NEXT: s_or_b32 s4, s0, 1 5479; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 5480; GFX9-NEXT: v_trunc_f32_e32 v4, v4 5481; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 5482; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v0| 5483; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 5484; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 5485; GFX9-NEXT: s_sext_i32_i16 s1, s7 5486; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 5487; GFX9-NEXT: s_cselect_b32 s0, s4, 0 5488; GFX9-NEXT: v_add_u32_e32 v3, s0, v4 5489; GFX9-NEXT: s_sext_i32_i16 s0, s5 5490; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s0 5491; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 5492; GFX9-NEXT: s_xor_b32 s0, s0, s1 5493; GFX9-NEXT: s_ashr_i32 s0, s0, 30 5494; GFX9-NEXT: s_or_b32 s4, s0, 1 5495; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 5496; GFX9-NEXT: v_trunc_f32_e32 v5, v5 5497; GFX9-NEXT: v_mad_f32 v4, -v5, v0, v4 5498; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 5499; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v0| 5500; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 5501; GFX9-NEXT: s_cselect_b32 s0, s4, 0 5502; GFX9-NEXT: v_add_u32_e32 v0, s0, v5 5503; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 5504; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 5505; GFX9-NEXT: global_store_short v1, v0, s[2:3] offset:4 5506; GFX9-NEXT: global_store_dword v1, v2, s[2:3] 5507; GFX9-NEXT: s_endpgm 5508; 5509; GFX90A-LABEL: sdiv_v3i16: 5510; GFX90A: ; %bb.0: 5511; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5512; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 5513; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 5514; GFX90A-NEXT: v_mov_b32_e32 v1, 0 5515; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 5516; GFX90A-NEXT: s_sext_i32_i16 s1, s4 5517; GFX90A-NEXT: s_sext_i32_i16 s0, s6 5518; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, 
s0 5519; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s1 5520; GFX90A-NEXT: s_xor_b32 s0, s1, s0 5521; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 5522; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 5523; GFX90A-NEXT: s_or_b32 s8, s0, 1 5524; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 5525; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 5526; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 5527; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 5528; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 5529; GFX90A-NEXT: s_cselect_b32 s0, s8, 0 5530; GFX90A-NEXT: s_ashr_i32 s1, s6, 16 5531; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 5532; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s1 5533; GFX90A-NEXT: s_ashr_i32 s4, s4, 16 5534; GFX90A-NEXT: v_add_u32_e32 v2, s0, v3 5535; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s4 5536; GFX90A-NEXT: v_rcp_iflag_f32_e32 v4, v0 5537; GFX90A-NEXT: s_xor_b32 s0, s4, s1 5538; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 5539; GFX90A-NEXT: s_or_b32 s4, s0, 1 5540; GFX90A-NEXT: v_mul_f32_e32 v4, v3, v4 5541; GFX90A-NEXT: v_trunc_f32_e32 v4, v4 5542; GFX90A-NEXT: v_mad_f32 v3, -v4, v0, v3 5543; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v0| 5544; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 5545; GFX90A-NEXT: v_cvt_i32_f32_e32 v4, v4 5546; GFX90A-NEXT: s_sext_i32_i16 s1, s7 5547; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s1 5548; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 5549; GFX90A-NEXT: v_add_u32_e32 v3, s0, v4 5550; GFX90A-NEXT: s_sext_i32_i16 s0, s5 5551; GFX90A-NEXT: v_cvt_f32_i32_e32 v4, s0 5552; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v0 5553; GFX90A-NEXT: s_xor_b32 s0, s0, s1 5554; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 5555; GFX90A-NEXT: s_or_b32 s4, s0, 1 5556; GFX90A-NEXT: v_mul_f32_e32 v5, v4, v5 5557; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 5558; GFX90A-NEXT: v_mad_f32 v4, -v5, v0, v4 5559; GFX90A-NEXT: v_cvt_i32_f32_e32 v5, v5 5560; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v0| 5561; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 5562; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 5563; GFX90A-NEXT: v_add_u32_e32 v0, s0, v5 5564; 
GFX90A-NEXT: v_and_b32_e32 v2, 0xffff, v2 5565; GFX90A-NEXT: v_lshl_or_b32 v2, v3, 16, v2 5566; GFX90A-NEXT: global_store_short v1, v0, s[2:3] offset:4 5567; GFX90A-NEXT: global_store_dword v1, v2, s[2:3] 5568; GFX90A-NEXT: s_endpgm 5569 %r = sdiv <3 x i16> %x, %y 5570 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 5571 ret void 5572} 5573 5574define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 5575; CHECK-LABEL: @srem_v3i16( 5576; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 5577; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 5578; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 5579; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 5580; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 5581; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 5582; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 5583; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 5584; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 5585; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 5586; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 5587; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 5588; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 5589; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 5590; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 5591; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 5592; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 5593; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 5594; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 5595; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 5596; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]] 5597; CHECK-NEXT: [[TMP22:%.*]] = sub 
i32 [[TMP3]], [[TMP21]] 5598; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 16 5599; CHECK-NEXT: [[TMP24:%.*]] = ashr i32 [[TMP23]], 16 5600; CHECK-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16 5601; CHECK-NEXT: [[TMP26:%.*]] = insertelement <3 x i16> undef, i16 [[TMP25]], i64 0 5602; CHECK-NEXT: [[TMP27:%.*]] = extractelement <3 x i16> [[X]], i64 1 5603; CHECK-NEXT: [[TMP28:%.*]] = extractelement <3 x i16> [[Y]], i64 1 5604; CHECK-NEXT: [[TMP29:%.*]] = sext i16 [[TMP27]] to i32 5605; CHECK-NEXT: [[TMP30:%.*]] = sext i16 [[TMP28]] to i32 5606; CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]] 5607; CHECK-NEXT: [[TMP32:%.*]] = ashr i32 [[TMP31]], 30 5608; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP32]], 1 5609; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float 5610; CHECK-NEXT: [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float 5611; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 5612; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]] 5613; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]]) 5614; CHECK-NEXT: [[TMP39:%.*]] = fneg fast float [[TMP38]] 5615; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]]) 5616; CHECK-NEXT: [[TMP41:%.*]] = fptosi float [[TMP38]] to i32 5617; CHECK-NEXT: [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]]) 5618; CHECK-NEXT: [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]]) 5619; CHECK-NEXT: [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]] 5620; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0 5621; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]] 5622; CHECK-NEXT: [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]] 5623; CHECK-NEXT: [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]] 5624; CHECK-NEXT: [[TMP49:%.*]] = shl i32 [[TMP48]], 16 5625; CHECK-NEXT: [[TMP50:%.*]] = ashr i32 [[TMP49]], 16 5626; CHECK-NEXT: [[TMP51:%.*]] = trunc 
i32 [[TMP50]] to i16 5627; CHECK-NEXT: [[TMP52:%.*]] = insertelement <3 x i16> [[TMP26]], i16 [[TMP51]], i64 1 5628; CHECK-NEXT: [[TMP53:%.*]] = extractelement <3 x i16> [[X]], i64 2 5629; CHECK-NEXT: [[TMP54:%.*]] = extractelement <3 x i16> [[Y]], i64 2 5630; CHECK-NEXT: [[TMP55:%.*]] = sext i16 [[TMP53]] to i32 5631; CHECK-NEXT: [[TMP56:%.*]] = sext i16 [[TMP54]] to i32 5632; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]] 5633; CHECK-NEXT: [[TMP58:%.*]] = ashr i32 [[TMP57]], 30 5634; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 1 5635; CHECK-NEXT: [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float 5636; CHECK-NEXT: [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float 5637; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]]) 5638; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]] 5639; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]]) 5640; CHECK-NEXT: [[TMP65:%.*]] = fneg fast float [[TMP64]] 5641; CHECK-NEXT: [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]]) 5642; CHECK-NEXT: [[TMP67:%.*]] = fptosi float [[TMP64]] to i32 5643; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 5644; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]]) 5645; CHECK-NEXT: [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]] 5646; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0 5647; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]] 5648; CHECK-NEXT: [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]] 5649; CHECK-NEXT: [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]] 5650; CHECK-NEXT: [[TMP75:%.*]] = shl i32 [[TMP74]], 16 5651; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 [[TMP75]], 16 5652; CHECK-NEXT: [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16 5653; CHECK-NEXT: [[TMP78:%.*]] = insertelement <3 x i16> [[TMP52]], i16 [[TMP77]], i64 2 5654; CHECK-NEXT: store <3 x i16> [[TMP78]], <3 x i16> 
addrspace(1)* [[OUT:%.*]], align 8 5655; CHECK-NEXT: ret void 5656; 5657; GFX6-LABEL: srem_v3i16: 5658; GFX6: ; %bb.0: 5659; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5660; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 5661; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 5662; GFX6-NEXT: s_mov_b32 s7, 0xf000 5663; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5664; GFX6-NEXT: s_sext_i32_i16 s8, s2 5665; GFX6-NEXT: s_sext_i32_i16 s6, s0 5666; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s6 5667; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s8 5668; GFX6-NEXT: s_xor_b32 s6, s8, s6 5669; GFX6-NEXT: s_ashr_i32 s6, s6, 30 5670; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 5671; GFX6-NEXT: s_or_b32 s6, s6, 1 5672; GFX6-NEXT: v_mov_b32_e32 v3, s6 5673; GFX6-NEXT: s_mov_b32 s6, -1 5674; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 5675; GFX6-NEXT: v_trunc_f32_e32 v2, v2 5676; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 5677; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 5678; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 5679; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 5680; GFX6-NEXT: v_mov_b32_e32 v1, s2 5681; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 5682; GFX6-NEXT: v_mov_b32_e32 v2, s0 5683; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 16 5684; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16 5685; GFX6-NEXT: v_cvt_f32_i32_e32 v4, v3 5686; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16 5687; GFX6-NEXT: v_bfe_i32 v5, v1, 0, 16 5688; GFX6-NEXT: v_cvt_f32_i32_e32 v6, v5 5689; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v4 5690; GFX6-NEXT: v_mul_lo_u32 v0, v0, s0 5691; GFX6-NEXT: v_xor_b32_e32 v3, v5, v3 5692; GFX6-NEXT: s_sext_i32_i16 s0, s1 5693; GFX6-NEXT: v_mul_f32_e32 v5, v6, v7 5694; GFX6-NEXT: v_trunc_f32_e32 v5, v5 5695; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 5696; GFX6-NEXT: v_mad_f32 v6, -v5, v4, v6 5697; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 5698; GFX6-NEXT: v_ashrrev_i32_e32 v3, 30, v3 5699; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, |v4| 5700; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s0 5701; GFX6-NEXT: v_or_b32_e32 v3, 1, v3 5702; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc 
5703; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 5704; GFX6-NEXT: s_sext_i32_i16 s2, s3 5705; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 5706; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s2 5707; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v4 5708; GFX6-NEXT: s_xor_b32 s0, s2, s0 5709; GFX6-NEXT: s_ashr_i32 s0, s0, 30 5710; GFX6-NEXT: s_or_b32 s0, s0, 1 5711; GFX6-NEXT: v_mul_f32_e32 v5, v3, v5 5712; GFX6-NEXT: v_trunc_f32_e32 v5, v5 5713; GFX6-NEXT: v_mad_f32 v3, -v5, v4, v3 5714; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 5715; GFX6-NEXT: v_mov_b32_e32 v6, s0 5716; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v4| 5717; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc 5718; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 5719; GFX6-NEXT: v_mul_lo_u32 v3, v3, s1 5720; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 5721; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 5722; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s3, v3 5723; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 5724; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 5725; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 5726; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 5727; GFX6-NEXT: s_endpgm 5728; 5729; GFX9-LABEL: srem_v3i16: 5730; GFX9: ; %bb.0: 5731; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 5732; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 5733; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c 5734; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5735; GFX9-NEXT: s_sext_i32_i16 s8, s2 5736; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s8 5737; GFX9-NEXT: s_sext_i32_i16 s9, s6 5738; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s9 5739; GFX9-NEXT: s_xor_b32 s0, s9, s8 5740; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 5741; GFX9-NEXT: s_ashr_i32 s0, s0, 30 5742; GFX9-NEXT: s_or_b32 s10, s0, 1 5743; GFX9-NEXT: s_sext_i32_i16 s3, s3 5744; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 5745; GFX9-NEXT: v_trunc_f32_e32 v2, v2 5746; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 5747; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 5748; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 5749; GFX9-NEXT: s_cselect_b32 s0, s10, 0 5750; GFX9-NEXT: s_ashr_i32 
s2, s2, 16 5751; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 5752; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 5753; GFX9-NEXT: s_ashr_i32 s6, s6, 16 5754; GFX9-NEXT: v_add_u32_e32 v1, s0, v2 5755; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s6 5756; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 5757; GFX9-NEXT: s_xor_b32 s0, s6, s2 5758; GFX9-NEXT: s_ashr_i32 s0, s0, 30 5759; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8 5760; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 5761; GFX9-NEXT: v_trunc_f32_e32 v3, v3 5762; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 5763; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 5764; GFX9-NEXT: s_or_b32 s8, s0, 1 5765; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 5766; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 5767; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s3 5768; GFX9-NEXT: s_cselect_b32 s0, s8, 0 5769; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 5770; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2 5771; GFX9-NEXT: s_sext_i32_i16 s2, s7 5772; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s2 5773; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2 5774; GFX9-NEXT: s_xor_b32 s0, s2, s3 5775; GFX9-NEXT: s_ashr_i32 s0, s0, 30 5776; GFX9-NEXT: s_or_b32 s7, s0, 1 5777; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 5778; GFX9-NEXT: v_trunc_f32_e32 v4, v4 5779; GFX9-NEXT: v_mad_f32 v3, -v4, v2, v3 5780; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 5781; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v2| 5782; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 5783; GFX9-NEXT: s_cselect_b32 s0, s7, 0 5784; GFX9-NEXT: v_add_u32_e32 v2, s0, v4 5785; GFX9-NEXT: v_mul_lo_u32 v2, v2, s3 5786; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 5787; GFX9-NEXT: v_mov_b32_e32 v3, 0 5788; GFX9-NEXT: v_sub_u32_e32 v0, s6, v0 5789; GFX9-NEXT: v_sub_u32_e32 v2, s2, v2 5790; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 5791; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 5792; GFX9-NEXT: global_store_short v3, v2, s[4:5] offset:4 5793; GFX9-NEXT: global_store_dword v3, v0, s[4:5] 5794; GFX9-NEXT: s_endpgm 5795; 5796; GFX90A-LABEL: srem_v3i16: 5797; GFX90A: ; %bb.0: 5798; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5799; 
GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 5800; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 5801; GFX90A-NEXT: v_mov_b32_e32 v1, 0 5802; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 5803; GFX90A-NEXT: s_sext_i32_i16 s9, s4 5804; GFX90A-NEXT: s_sext_i32_i16 s8, s6 5805; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s8 5806; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s9 5807; GFX90A-NEXT: s_xor_b32 s0, s9, s8 5808; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 5809; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 5810; GFX90A-NEXT: s_or_b32 s10, s0, 1 5811; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 5812; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 5813; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 5814; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 5815; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 5816; GFX90A-NEXT: s_cselect_b32 s0, s10, 0 5817; GFX90A-NEXT: s_ashr_i32 s6, s6, 16 5818; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 5819; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s6 5820; GFX90A-NEXT: s_ashr_i32 s4, s4, 16 5821; GFX90A-NEXT: v_add_u32_e32 v0, s0, v3 5822; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s4 5823; GFX90A-NEXT: v_rcp_iflag_f32_e32 v4, v2 5824; GFX90A-NEXT: s_xor_b32 s0, s4, s6 5825; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 5826; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s8 5827; GFX90A-NEXT: v_mul_f32_e32 v4, v3, v4 5828; GFX90A-NEXT: v_trunc_f32_e32 v4, v4 5829; GFX90A-NEXT: v_mad_f32 v3, -v4, v2, v3 5830; GFX90A-NEXT: v_cvt_i32_f32_e32 v4, v4 5831; GFX90A-NEXT: s_or_b32 s8, s0, 1 5832; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v2| 5833; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 5834; GFX90A-NEXT: s_cselect_b32 s0, s8, 0 5835; GFX90A-NEXT: v_add_u32_e32 v2, s0, v4 5836; GFX90A-NEXT: v_mul_lo_u32 v2, v2, s6 5837; GFX90A-NEXT: s_sext_i32_i16 s6, s7 5838; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s6 5839; GFX90A-NEXT: v_sub_u32_e32 v2, s4, v2 5840; GFX90A-NEXT: s_sext_i32_i16 s4, s5 5841; GFX90A-NEXT: v_cvt_f32_i32_e32 v4, s4 5842; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v3 5843; GFX90A-NEXT: s_xor_b32 s0, s4, s6 5844; GFX90A-NEXT: 
s_ashr_i32 s0, s0, 30 5845; GFX90A-NEXT: s_or_b32 s5, s0, 1 5846; GFX90A-NEXT: v_mul_f32_e32 v5, v4, v5 5847; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 5848; GFX90A-NEXT: v_mad_f32 v4, -v5, v3, v4 5849; GFX90A-NEXT: v_cvt_i32_f32_e32 v5, v5 5850; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| 5851; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 5852; GFX90A-NEXT: s_cselect_b32 s0, s5, 0 5853; GFX90A-NEXT: v_add_u32_e32 v3, s0, v5 5854; GFX90A-NEXT: v_sub_u32_e32 v0, s9, v0 5855; GFX90A-NEXT: v_mul_lo_u32 v3, v3, s6 5856; GFX90A-NEXT: v_sub_u32_e32 v3, s4, v3 5857; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v0 5858; GFX90A-NEXT: v_lshl_or_b32 v0, v2, 16, v0 5859; GFX90A-NEXT: global_store_short v1, v3, s[2:3] offset:4 5860; GFX90A-NEXT: global_store_dword v1, v0, s[2:3] 5861; GFX90A-NEXT: s_endpgm 5862 %r = srem <3 x i16> %x, %y 5863 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 5864 ret void 5865} 5866 5867define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 5868; CHECK-LABEL: @udiv_v3i15( 5869; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 5870; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 5871; CHECK-NEXT: [[TMP3:%.*]] = zext i15 [[TMP1]] to i32 5872; CHECK-NEXT: [[TMP4:%.*]] = zext i15 [[TMP2]] to i32 5873; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 5874; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 5875; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 5876; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 5877; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 5878; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 5879; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 5880; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 5881; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 
5882; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 5883; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 5884; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 5885; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 5886; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 32767 5887; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i15 5888; CHECK-NEXT: [[TMP20:%.*]] = insertelement <3 x i15> undef, i15 [[TMP19]], i64 0 5889; CHECK-NEXT: [[TMP21:%.*]] = extractelement <3 x i15> [[X]], i64 1 5890; CHECK-NEXT: [[TMP22:%.*]] = extractelement <3 x i15> [[Y]], i64 1 5891; CHECK-NEXT: [[TMP23:%.*]] = zext i15 [[TMP21]] to i32 5892; CHECK-NEXT: [[TMP24:%.*]] = zext i15 [[TMP22]] to i32 5893; CHECK-NEXT: [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float 5894; CHECK-NEXT: [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float 5895; CHECK-NEXT: [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]]) 5896; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]] 5897; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]]) 5898; CHECK-NEXT: [[TMP30:%.*]] = fneg fast float [[TMP29]] 5899; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]]) 5900; CHECK-NEXT: [[TMP32:%.*]] = fptoui float [[TMP29]] to i32 5901; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]]) 5902; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]]) 5903; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]] 5904; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0 5905; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]] 5906; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 32767 5907; CHECK-NEXT: [[TMP39:%.*]] = trunc i32 [[TMP38]] to i15 5908; CHECK-NEXT: [[TMP40:%.*]] = insertelement <3 x i15> [[TMP20]], i15 [[TMP39]], i64 1 5909; CHECK-NEXT: [[TMP41:%.*]] = 
extractelement <3 x i15> [[X]], i64 2 5910; CHECK-NEXT: [[TMP42:%.*]] = extractelement <3 x i15> [[Y]], i64 2 5911; CHECK-NEXT: [[TMP43:%.*]] = zext i15 [[TMP41]] to i32 5912; CHECK-NEXT: [[TMP44:%.*]] = zext i15 [[TMP42]] to i32 5913; CHECK-NEXT: [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float 5914; CHECK-NEXT: [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float 5915; CHECK-NEXT: [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]]) 5916; CHECK-NEXT: [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]] 5917; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]]) 5918; CHECK-NEXT: [[TMP50:%.*]] = fneg fast float [[TMP49]] 5919; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]]) 5920; CHECK-NEXT: [[TMP52:%.*]] = fptoui float [[TMP49]] to i32 5921; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]]) 5922; CHECK-NEXT: [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]]) 5923; CHECK-NEXT: [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]] 5924; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0 5925; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]] 5926; CHECK-NEXT: [[TMP58:%.*]] = and i32 [[TMP57]], 32767 5927; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i15 5928; CHECK-NEXT: [[TMP60:%.*]] = insertelement <3 x i15> [[TMP40]], i15 [[TMP59]], i64 2 5929; CHECK-NEXT: store <3 x i15> [[TMP60]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 5930; CHECK-NEXT: ret void 5931; 5932; GFX6-LABEL: udiv_v3i15: 5933; GFX6: ; %bb.0: 5934; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5935; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 5936; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 5937; GFX6-NEXT: s_mov_b32 s7, 0xf000 5938; GFX6-NEXT: s_mov_b32 s6, -1 5939; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5940; GFX6-NEXT: v_mov_b32_e32 v0, s2 5941; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 5942; GFX6-NEXT: s_movk_i32 s3, 0x7fff 
5943; GFX6-NEXT: s_and_b32 s9, s0, s3 5944; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 5945; GFX6-NEXT: s_and_b32 s8, s2, s3 5946; GFX6-NEXT: v_mov_b32_e32 v2, s0 5947; GFX6-NEXT: s_bfe_u32 s0, s0, 0xf000f 5948; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s8 5949; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 5950; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s0 5951; GFX6-NEXT: s_bfe_u32 s2, s2, 0xf000f 5952; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 5953; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 5954; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s2 5955; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 5956; GFX6-NEXT: v_and_b32_e32 v2, s3, v2 5957; GFX6-NEXT: v_trunc_f32_e32 v4, v4 5958; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 5959; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 5960; GFX6-NEXT: v_cvt_f32_u32_e32 v2, v2 5961; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 5962; GFX6-NEXT: v_mul_f32_e32 v1, v6, v7 5963; GFX6-NEXT: v_and_b32_e32 v0, s3, v0 5964; GFX6-NEXT: v_trunc_f32_e32 v1, v1 5965; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 5966; GFX6-NEXT: v_mad_f32 v4, -v1, v5, v6 5967; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 5968; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0 5969; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v2 5970; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v5 5971; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 5972; GFX6-NEXT: v_mul_f32_e32 v1, v0, v6 5973; GFX6-NEXT: v_trunc_f32_e32 v1, v1 5974; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v1 5975; GFX6-NEXT: v_mad_f32 v0, -v1, v2, v0 5976; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v2 5977; GFX6-NEXT: v_and_b32_e32 v2, s3, v3 5978; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc 5979; GFX6-NEXT: v_and_b32_e32 v3, s3, v4 5980; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 5981; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 5982; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 5983; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 5984; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 5985; GFX6-NEXT: s_waitcnt expcnt(0) 5986; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 5987; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 5988; GFX6-NEXT: 
s_endpgm 5989; 5990; GFX9-LABEL: udiv_v3i15: 5991; GFX9: ; %bb.0: 5992; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 5993; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 5994; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 5995; GFX9-NEXT: s_movk_i32 s8, 0x7fff 5996; GFX9-NEXT: v_mov_b32_e32 v2, 0 5997; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5998; GFX9-NEXT: s_and_b32 s0, s4, s8 5999; GFX9-NEXT: s_and_b32 s1, s6, s8 6000; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s1 6001; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0 6002; GFX9-NEXT: s_bfe_u32 s0, s6, 0xf000f 6003; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 6004; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 6005; GFX9-NEXT: s_bfe_u32 s1, s4, 0xf000f 6006; GFX9-NEXT: v_mov_b32_e32 v3, s6 6007; GFX9-NEXT: v_alignbit_b32 v3, s7, v3, 30 6008; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 6009; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 6010; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 6011; GFX9-NEXT: v_and_b32_e32 v3, s8, v3 6012; GFX9-NEXT: v_trunc_f32_e32 v5, v5 6013; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4 6014; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 6015; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v3 6016; GFX9-NEXT: v_mov_b32_e32 v0, s4 6017; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 30 6018; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 6019; GFX9-NEXT: v_mul_f32_e32 v1, v7, v8 6020; GFX9-NEXT: v_and_b32_e32 v0, s8, v0 6021; GFX9-NEXT: v_trunc_f32_e32 v1, v1 6022; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc 6023; GFX9-NEXT: v_mad_f32 v5, -v1, v6, v7 6024; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 6025; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 6026; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v3 6027; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v6 6028; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc 6029; GFX9-NEXT: v_mul_f32_e32 v1, v0, v7 6030; GFX9-NEXT: v_trunc_f32_e32 v1, v1 6031; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v1 6032; GFX9-NEXT: v_mad_f32 v0, -v1, v3, v0 6033; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v3 6034; GFX9-NEXT: v_and_b32_e32 v3, s8, v4 6035; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc 6036; 
GFX9-NEXT: v_and_b32_e32 v4, s8, v5 6037; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 6038; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 6039; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 6040; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 6041; GFX9-NEXT: global_store_dword v2, v0, s[2:3] 6042; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 6043; GFX9-NEXT: global_store_short v2, v0, s[2:3] offset:4 6044; GFX9-NEXT: s_endpgm 6045; 6046; GFX90A-LABEL: udiv_v3i15: 6047; GFX90A: ; %bb.0: 6048; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6049; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 6050; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 6051; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 6052; GFX90A-NEXT: v_mov_b32_e32 v2, 0 6053; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 6054; GFX90A-NEXT: s_and_b32 s0, s4, s8 6055; GFX90A-NEXT: s_and_b32 s1, s6, s8 6056; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s1 6057; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s0 6058; GFX90A-NEXT: s_bfe_u32 s0, s6, 0xf000f 6059; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s0 6060; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v1 6061; GFX90A-NEXT: s_bfe_u32 s1, s4, 0xf000f 6062; GFX90A-NEXT: v_mov_b32_e32 v3, s6 6063; GFX90A-NEXT: v_alignbit_b32 v3, s7, v3, 30 6064; GFX90A-NEXT: v_mul_f32_e32 v5, v4, v5 6065; GFX90A-NEXT: v_cvt_f32_u32_e32 v7, s1 6066; GFX90A-NEXT: v_rcp_iflag_f32_e32 v8, v6 6067; GFX90A-NEXT: v_and_b32_e32 v3, s8, v3 6068; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 6069; GFX90A-NEXT: v_mad_f32 v4, -v5, v1, v4 6070; GFX90A-NEXT: v_cvt_u32_f32_e32 v5, v5 6071; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, v3 6072; GFX90A-NEXT: v_mov_b32_e32 v0, s4 6073; GFX90A-NEXT: v_alignbit_b32 v0, s5, v0, 30 6074; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 6075; GFX90A-NEXT: v_mul_f32_e32 v1, v7, v8 6076; GFX90A-NEXT: v_and_b32_e32 v0, s8, v0 6077; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 6078; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc 6079; GFX90A-NEXT: v_mad_f32 v5, -v1, v6, v7 6080; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 6081; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, v0 6082; 
GFX90A-NEXT: v_rcp_iflag_f32_e32 v7, v3 6083; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v6 6084; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc 6085; GFX90A-NEXT: v_mul_f32_e32 v1, v0, v7 6086; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 6087; GFX90A-NEXT: v_cvt_u32_f32_e32 v6, v1 6088; GFX90A-NEXT: v_mad_f32 v0, -v1, v3, v0 6089; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v3 6090; GFX90A-NEXT: v_and_b32_e32 v3, s8, v4 6091; GFX90A-NEXT: v_and_b32_e32 v4, s8, v5 6092; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc 6093; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 15, v4 6094; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 6095; GFX90A-NEXT: v_or_b32_e32 v3, v3, v4 6096; GFX90A-NEXT: v_or_b32_e32 v0, v3, v0 6097; GFX90A-NEXT: global_store_dword v2, v0, s[2:3] 6098; GFX90A-NEXT: v_and_b32_e32 v0, 0x1fff, v1 6099; GFX90A-NEXT: global_store_short v2, v0, s[2:3] offset:4 6100; GFX90A-NEXT: s_endpgm 6101 %r = udiv <3 x i15> %x, %y 6102 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 6103 ret void 6104} 6105 6106define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 6107; CHECK-LABEL: @urem_v3i15( 6108; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 6109; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 6110; CHECK-NEXT: [[TMP3:%.*]] = zext i15 [[TMP1]] to i32 6111; CHECK-NEXT: [[TMP4:%.*]] = zext i15 [[TMP2]] to i32 6112; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 6113; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 6114; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 6115; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 6116; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 6117; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 6118; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 6119; CHECK-NEXT: [[TMP12:%.*]] = fptoui float 
[[TMP9]] to i32 6120; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 6121; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 6122; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 6123; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 6124; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 6125; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]] 6126; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]] 6127; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 32767 6128; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i15 6129; CHECK-NEXT: [[TMP22:%.*]] = insertelement <3 x i15> undef, i15 [[TMP21]], i64 0 6130; CHECK-NEXT: [[TMP23:%.*]] = extractelement <3 x i15> [[X]], i64 1 6131; CHECK-NEXT: [[TMP24:%.*]] = extractelement <3 x i15> [[Y]], i64 1 6132; CHECK-NEXT: [[TMP25:%.*]] = zext i15 [[TMP23]] to i32 6133; CHECK-NEXT: [[TMP26:%.*]] = zext i15 [[TMP24]] to i32 6134; CHECK-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float 6135; CHECK-NEXT: [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float 6136; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]]) 6137; CHECK-NEXT: [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]] 6138; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]]) 6139; CHECK-NEXT: [[TMP32:%.*]] = fneg fast float [[TMP31]] 6140; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]]) 6141; CHECK-NEXT: [[TMP34:%.*]] = fptoui float [[TMP31]] to i32 6142; CHECK-NEXT: [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 6143; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]]) 6144; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]] 6145; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0 6146; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]] 6147; CHECK-NEXT: 
[[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]] 6148; CHECK-NEXT: [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]] 6149; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], 32767 6150; CHECK-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i15 6151; CHECK-NEXT: [[TMP44:%.*]] = insertelement <3 x i15> [[TMP22]], i15 [[TMP43]], i64 1 6152; CHECK-NEXT: [[TMP45:%.*]] = extractelement <3 x i15> [[X]], i64 2 6153; CHECK-NEXT: [[TMP46:%.*]] = extractelement <3 x i15> [[Y]], i64 2 6154; CHECK-NEXT: [[TMP47:%.*]] = zext i15 [[TMP45]] to i32 6155; CHECK-NEXT: [[TMP48:%.*]] = zext i15 [[TMP46]] to i32 6156; CHECK-NEXT: [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float 6157; CHECK-NEXT: [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float 6158; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]]) 6159; CHECK-NEXT: [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]] 6160; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]]) 6161; CHECK-NEXT: [[TMP54:%.*]] = fneg fast float [[TMP53]] 6162; CHECK-NEXT: [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]]) 6163; CHECK-NEXT: [[TMP56:%.*]] = fptoui float [[TMP53]] to i32 6164; CHECK-NEXT: [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]]) 6165; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]]) 6166; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]] 6167; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0 6168; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]] 6169; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]] 6170; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]] 6171; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 32767 6172; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i15 6173; CHECK-NEXT: [[TMP66:%.*]] = insertelement <3 x i15> [[TMP44]], i15 [[TMP65]], i64 2 6174; CHECK-NEXT: store <3 x i15> [[TMP66]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 
6175; CHECK-NEXT: ret void 6176; 6177; GFX6-LABEL: urem_v3i15: 6178; GFX6: ; %bb.0: 6179; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 6180; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 6181; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 6182; GFX6-NEXT: s_mov_b32 s7, 0xf000 6183; GFX6-NEXT: s_mov_b32 s6, -1 6184; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6185; GFX6-NEXT: v_mov_b32_e32 v0, s2 6186; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 6187; GFX6-NEXT: s_movk_i32 s3, 0x7fff 6188; GFX6-NEXT: s_and_b32 s10, s0, s3 6189; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s10 6190; GFX6-NEXT: s_and_b32 s9, s2, s3 6191; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s9 6192; GFX6-NEXT: v_mov_b32_e32 v2, s0 6193; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 6194; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 6195; GFX6-NEXT: s_bfe_u32 s1, s0, 0xf000f 6196; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s1 6197; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 6198; GFX6-NEXT: v_trunc_f32_e32 v4, v4 6199; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 6200; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 6201; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 6202; GFX6-NEXT: s_bfe_u32 s10, s2, 0xf000f 6203; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s10 6204; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc 6205; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 6206; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 6207; GFX6-NEXT: v_and_b32_e32 v2, s3, v2 6208; GFX6-NEXT: v_and_b32_e32 v0, s3, v0 6209; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s2, v1 6210; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 6211; GFX6-NEXT: v_cvt_f32_u32_e32 v4, v2 6212; GFX6-NEXT: v_cvt_f32_u32_e32 v7, v0 6213; GFX6-NEXT: v_trunc_f32_e32 v1, v1 6214; GFX6-NEXT: v_mad_f32 v3, -v1, v5, v3 6215; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v4 6216; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 6217; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 6218; GFX6-NEXT: s_lshr_b32 s0, s0, 15 6219; GFX6-NEXT: v_mul_f32_e32 v3, v7, v8 6220; GFX6-NEXT: v_trunc_f32_e32 v3, v3 6221; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v3 6222; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 6223; GFX6-NEXT: 
v_mad_f32 v3, -v3, v4, v7 6224; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 6225; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 6226; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 6227; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 6228; GFX6-NEXT: s_lshr_b32 s8, s2, 15 6229; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s8, v1 6230; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, v2, v0 6231; GFX6-NEXT: v_and_b32_e32 v3, s3, v3 6232; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 6233; GFX6-NEXT: v_and_b32_e32 v2, s3, v6 6234; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 6235; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 6236; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 6237; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 6238; GFX6-NEXT: s_waitcnt expcnt(0) 6239; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 6240; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 6241; GFX6-NEXT: s_endpgm 6242; 6243; GFX9-LABEL: urem_v3i15: 6244; GFX9: ; %bb.0: 6245; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6246; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 6247; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 6248; GFX9-NEXT: s_movk_i32 s8, 0x7fff 6249; GFX9-NEXT: v_mov_b32_e32 v2, 0 6250; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6251; GFX9-NEXT: v_mov_b32_e32 v0, s4 6252; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 30 6253; GFX9-NEXT: s_and_b32 s5, s6, s8 6254; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 6255; GFX9-NEXT: s_and_b32 s0, s4, s8 6256; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0 6257; GFX9-NEXT: s_bfe_u32 s5, s6, 0xf000f 6258; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 6259; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s5 6260; GFX9-NEXT: v_mov_b32_e32 v3, s6 6261; GFX9-NEXT: v_alignbit_b32 v3, s7, v3, 30 6262; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 6263; GFX9-NEXT: v_trunc_f32_e32 v5, v5 6264; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4 6265; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 6266; GFX9-NEXT: s_bfe_u32 s1, s4, 0xf000f 6267; GFX9-NEXT: v_and_b32_e32 v3, s8, v3 6268; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 6269; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 6270; GFX9-NEXT: v_rcp_iflag_f32_e32 
v8, v6 6271; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc 6272; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v3 6273; GFX9-NEXT: v_and_b32_e32 v0, s8, v0 6274; GFX9-NEXT: v_mul_f32_e32 v4, v7, v8 6275; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v0 6276; GFX9-NEXT: v_rcp_iflag_f32_e32 v9, v5 6277; GFX9-NEXT: v_trunc_f32_e32 v4, v4 6278; GFX9-NEXT: v_mad_f32 v7, -v4, v6, v7 6279; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 6280; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, v6 6281; GFX9-NEXT: v_mul_f32_e32 v6, v8, v9 6282; GFX9-NEXT: v_trunc_f32_e32 v6, v6 6283; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v6 6284; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc 6285; GFX9-NEXT: v_mad_f32 v6, -v6, v5, v8 6286; GFX9-NEXT: s_lshr_b32 s0, s6, 15 6287; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 6288; GFX9-NEXT: v_mul_lo_u32 v4, v4, s0 6289; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc 6290; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 6291; GFX9-NEXT: v_mul_lo_u32 v3, v5, v3 6292; GFX9-NEXT: s_lshr_b32 s0, s4, 15 6293; GFX9-NEXT: v_sub_u32_e32 v4, s0, v4 6294; GFX9-NEXT: v_sub_u32_e32 v5, s4, v1 6295; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 6296; GFX9-NEXT: v_and_b32_e32 v4, s8, v4 6297; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 6298; GFX9-NEXT: v_and_b32_e32 v3, s8, v5 6299; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 6300; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 6301; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 6302; GFX9-NEXT: global_store_dword v2, v0, s[2:3] 6303; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 6304; GFX9-NEXT: global_store_short v2, v0, s[2:3] offset:4 6305; GFX9-NEXT: s_endpgm 6306; 6307; GFX90A-LABEL: urem_v3i15: 6308; GFX90A: ; %bb.0: 6309; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6310; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 6311; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 6312; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 6313; GFX90A-NEXT: v_mov_b32_e32 v2, 0 6314; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 6315; GFX90A-NEXT: s_and_b32 s1, s4, s8 6316; GFX90A-NEXT: s_and_b32 s9, s6, s8 6317; GFX90A-NEXT: 
v_cvt_f32_u32_e32 v1, s9 6318; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s1 6319; GFX90A-NEXT: v_mov_b32_e32 v3, s6 6320; GFX90A-NEXT: v_alignbit_b32 v3, s7, v3, 30 6321; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v1 6322; GFX90A-NEXT: s_bfe_u32 s7, s6, 0xf000f 6323; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s7 6324; GFX90A-NEXT: v_mov_b32_e32 v0, s4 6325; GFX90A-NEXT: v_mul_f32_e32 v5, v4, v5 6326; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 6327; GFX90A-NEXT: v_mad_f32 v4, -v5, v1, v4 6328; GFX90A-NEXT: v_cvt_u32_f32_e32 v5, v5 6329; GFX90A-NEXT: v_alignbit_b32 v0, s5, v0, 30 6330; GFX90A-NEXT: s_bfe_u32 s5, s4, 0xf000f 6331; GFX90A-NEXT: v_cvt_f32_u32_e32 v7, s5 6332; GFX90A-NEXT: v_rcp_iflag_f32_e32 v8, v6 6333; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 6334; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc 6335; GFX90A-NEXT: v_and_b32_e32 v3, s8, v3 6336; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s6 6337; GFX90A-NEXT: v_sub_u32_e32 v4, s4, v1 6338; GFX90A-NEXT: v_mul_f32_e32 v1, v7, v8 6339; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, v3 6340; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 6341; GFX90A-NEXT: v_mad_f32 v7, -v1, v6, v7 6342; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 6343; GFX90A-NEXT: v_and_b32_e32 v0, s8, v0 6344; GFX90A-NEXT: v_cvt_f32_u32_e32 v8, v0 6345; GFX90A-NEXT: v_rcp_iflag_f32_e32 v9, v5 6346; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, v6 6347; GFX90A-NEXT: s_lshr_b32 s1, s6, 15 6348; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 6349; GFX90A-NEXT: s_lshr_b32 s0, s4, 15 6350; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s1 6351; GFX90A-NEXT: v_sub_u32_e32 v6, s0, v1 6352; GFX90A-NEXT: v_mul_f32_e32 v1, v8, v9 6353; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 6354; GFX90A-NEXT: v_cvt_u32_f32_e32 v7, v1 6355; GFX90A-NEXT: v_mad_f32 v1, -v1, v5, v8 6356; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v5 6357; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc 6358; GFX90A-NEXT: v_mul_lo_u32 v1, v1, v3 6359; GFX90A-NEXT: v_and_b32_e32 v3, s8, v4 6360; GFX90A-NEXT: v_and_b32_e32 v4, s8, v6 6361; GFX90A-NEXT: 
v_sub_u32_e32 v0, v0, v1 6362; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 15, v4 6363; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 6364; GFX90A-NEXT: v_or_b32_e32 v3, v3, v4 6365; GFX90A-NEXT: v_or_b32_e32 v0, v3, v0 6366; GFX90A-NEXT: global_store_dword v2, v0, s[2:3] 6367; GFX90A-NEXT: v_and_b32_e32 v0, 0x1fff, v1 6368; GFX90A-NEXT: global_store_short v2, v0, s[2:3] offset:4 6369; GFX90A-NEXT: s_endpgm 6370 %r = urem <3 x i15> %x, %y 6371 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 6372 ret void 6373} 6374 6375define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 6376; CHECK-LABEL: @sdiv_v3i15( 6377; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 6378; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 6379; CHECK-NEXT: [[TMP3:%.*]] = sext i15 [[TMP1]] to i32 6380; CHECK-NEXT: [[TMP4:%.*]] = sext i15 [[TMP2]] to i32 6381; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 6382; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 6383; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 6384; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 6385; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 6386; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 6387; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 6388; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 6389; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 6390; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 6391; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 6392; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 6393; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 6394; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 6395; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], 
i32 0 6396; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 6397; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 17 6398; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 17 6399; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i15 6400; CHECK-NEXT: [[TMP24:%.*]] = insertelement <3 x i15> undef, i15 [[TMP23]], i64 0 6401; CHECK-NEXT: [[TMP25:%.*]] = extractelement <3 x i15> [[X]], i64 1 6402; CHECK-NEXT: [[TMP26:%.*]] = extractelement <3 x i15> [[Y]], i64 1 6403; CHECK-NEXT: [[TMP27:%.*]] = sext i15 [[TMP25]] to i32 6404; CHECK-NEXT: [[TMP28:%.*]] = sext i15 [[TMP26]] to i32 6405; CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]] 6406; CHECK-NEXT: [[TMP30:%.*]] = ashr i32 [[TMP29]], 30 6407; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 1 6408; CHECK-NEXT: [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float 6409; CHECK-NEXT: [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float 6410; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 6411; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]] 6412; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]]) 6413; CHECK-NEXT: [[TMP37:%.*]] = fneg fast float [[TMP36]] 6414; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]]) 6415; CHECK-NEXT: [[TMP39:%.*]] = fptosi float [[TMP36]] to i32 6416; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]]) 6417; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 6418; CHECK-NEXT: [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]] 6419; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0 6420; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]] 6421; CHECK-NEXT: [[TMP45:%.*]] = shl i32 [[TMP44]], 17 6422; CHECK-NEXT: [[TMP46:%.*]] = ashr i32 [[TMP45]], 17 6423; CHECK-NEXT: [[TMP47:%.*]] = trunc i32 [[TMP46]] to i15 6424; CHECK-NEXT: [[TMP48:%.*]] = insertelement <3 x i15> 
[[TMP24]], i15 [[TMP47]], i64 1 6425; CHECK-NEXT: [[TMP49:%.*]] = extractelement <3 x i15> [[X]], i64 2 6426; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x i15> [[Y]], i64 2 6427; CHECK-NEXT: [[TMP51:%.*]] = sext i15 [[TMP49]] to i32 6428; CHECK-NEXT: [[TMP52:%.*]] = sext i15 [[TMP50]] to i32 6429; CHECK-NEXT: [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]] 6430; CHECK-NEXT: [[TMP54:%.*]] = ashr i32 [[TMP53]], 30 6431; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 1 6432; CHECK-NEXT: [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float 6433; CHECK-NEXT: [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float 6434; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]]) 6435; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]] 6436; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]]) 6437; CHECK-NEXT: [[TMP61:%.*]] = fneg fast float [[TMP60]] 6438; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]]) 6439; CHECK-NEXT: [[TMP63:%.*]] = fptosi float [[TMP60]] to i32 6440; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]]) 6441; CHECK-NEXT: [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]]) 6442; CHECK-NEXT: [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]] 6443; CHECK-NEXT: [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0 6444; CHECK-NEXT: [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]] 6445; CHECK-NEXT: [[TMP69:%.*]] = shl i32 [[TMP68]], 17 6446; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 17 6447; CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i15 6448; CHECK-NEXT: [[TMP72:%.*]] = insertelement <3 x i15> [[TMP48]], i15 [[TMP71]], i64 2 6449; CHECK-NEXT: store <3 x i15> [[TMP72]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 6450; CHECK-NEXT: ret void 6451; 6452; GFX6-LABEL: sdiv_v3i15: 6453; GFX6: ; %bb.0: 6454; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 6455; GFX6-NEXT: s_load_dwordx2 s[2:3], 
s[0:1], 0xb 6456; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 6457; GFX6-NEXT: s_mov_b32 s7, 0xf000 6458; GFX6-NEXT: s_mov_b32 s6, -1 6459; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6460; GFX6-NEXT: v_mov_b32_e32 v0, s2 6461; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 6462; GFX6-NEXT: s_bfe_i32 s3, s0, 0xf0000 6463; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s3 6464; GFX6-NEXT: v_mov_b32_e32 v1, s0 6465; GFX6-NEXT: v_alignbit_b32 v1, s1, v1, 30 6466; GFX6-NEXT: s_bfe_i32 s1, s2, 0xf0000 6467; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s1 6468; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 6469; GFX6-NEXT: s_xor_b32 s1, s1, s3 6470; GFX6-NEXT: s_bfe_i32 s0, s0, 0xf000f 6471; GFX6-NEXT: s_ashr_i32 s1, s1, 30 6472; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 6473; GFX6-NEXT: v_trunc_f32_e32 v4, v4 6474; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 6475; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 6476; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| 6477; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s0 6478; GFX6-NEXT: s_or_b32 s1, s1, 1 6479; GFX6-NEXT: v_mov_b32_e32 v5, s1 6480; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc 6481; GFX6-NEXT: s_bfe_i32 s1, s2, 0xf000f 6482; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 6483; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s1 6484; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 6485; GFX6-NEXT: s_xor_b32 s0, s1, s0 6486; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 15 6487; GFX6-NEXT: s_ashr_i32 s0, s0, 30 6488; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 6489; GFX6-NEXT: v_trunc_f32_e32 v5, v5 6490; GFX6-NEXT: v_mad_f32 v4, -v5, v3, v4 6491; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 6492; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v3| 6493; GFX6-NEXT: v_cvt_f32_i32_e32 v4, v1 6494; GFX6-NEXT: s_or_b32 s0, s0, 1 6495; GFX6-NEXT: v_mov_b32_e32 v6, s0 6496; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc 6497; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15 6498; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 6499; GFX6-NEXT: v_cvt_f32_i32_e32 v5, v0 6500; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 6501; GFX6-NEXT: v_xor_b32_e32 v0, v0, v1 6502; GFX6-NEXT: v_ashrrev_i32_e32 v0, 30, v0 
6503; GFX6-NEXT: v_or_b32_e32 v0, 1, v0 6504; GFX6-NEXT: v_mul_f32_e32 v1, v5, v6 6505; GFX6-NEXT: v_trunc_f32_e32 v1, v1 6506; GFX6-NEXT: v_mad_f32 v5, -v1, v4, v5 6507; GFX6-NEXT: v_cvt_i32_f32_e32 v1, v1 6508; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v4| 6509; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 6510; GFX6-NEXT: s_movk_i32 s0, 0x7fff 6511; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 6512; GFX6-NEXT: v_and_b32_e32 v3, s0, v3 6513; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 6514; GFX6-NEXT: v_and_b32_e32 v2, s0, v2 6515; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 6516; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 6517; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 6518; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 6519; GFX6-NEXT: s_waitcnt expcnt(0) 6520; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 6521; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 6522; GFX6-NEXT: s_endpgm 6523; 6524; GFX9-LABEL: sdiv_v3i15: 6525; GFX9: ; %bb.0: 6526; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6527; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 6528; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 6529; GFX9-NEXT: v_mov_b32_e32 v2, 0 6530; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6531; GFX9-NEXT: s_bfe_i32 s1, s4, 0xf0000 6532; GFX9-NEXT: s_bfe_i32 s0, s6, 0xf0000 6533; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 6534; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s1 6535; GFX9-NEXT: s_xor_b32 s0, s1, s0 6536; GFX9-NEXT: v_mov_b32_e32 v0, s4 6537; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 6538; GFX9-NEXT: s_ashr_i32 s0, s0, 30 6539; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 30 6540; GFX9-NEXT: s_or_b32 s5, s0, 1 6541; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 6542; GFX9-NEXT: v_trunc_f32_e32 v5, v5 6543; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 6544; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| 6545; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 6546; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 6547; GFX9-NEXT: s_cselect_b32 s0, s5, 0 6548; GFX9-NEXT: s_bfe_i32 s1, s6, 0xf000f 6549; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s1 6550; GFX9-NEXT: 
v_add_u32_e32 v4, s0, v5 6551; GFX9-NEXT: s_bfe_i32 s0, s4, 0xf000f 6552; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 6553; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 6554; GFX9-NEXT: v_mov_b32_e32 v1, s6 6555; GFX9-NEXT: v_alignbit_b32 v1, s7, v1, 30 6556; GFX9-NEXT: s_xor_b32 s0, s0, s1 6557; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 6558; GFX9-NEXT: v_trunc_f32_e32 v6, v6 6559; GFX9-NEXT: s_ashr_i32 s0, s0, 30 6560; GFX9-NEXT: v_mad_f32 v5, -v6, v3, v5 6561; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 6562; GFX9-NEXT: s_or_b32 s4, s0, 1 6563; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 6564; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v3| 6565; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v1 6566; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 6567; GFX9-NEXT: s_cselect_b32 s0, s4, 0 6568; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 15 6569; GFX9-NEXT: v_add_u32_e32 v5, s0, v6 6570; GFX9-NEXT: v_cvt_f32_i32_e32 v6, v0 6571; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v3 6572; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1 6573; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v0 6574; GFX9-NEXT: v_or_b32_e32 v0, 1, v0 6575; GFX9-NEXT: v_mul_f32_e32 v1, v6, v7 6576; GFX9-NEXT: v_trunc_f32_e32 v1, v1 6577; GFX9-NEXT: v_cvt_i32_f32_e32 v7, v1 6578; GFX9-NEXT: v_mad_f32 v1, -v1, v3, v6 6579; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v3| 6580; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 6581; GFX9-NEXT: s_movk_i32 s0, 0x7fff 6582; GFX9-NEXT: v_add_u32_e32 v0, v7, v0 6583; GFX9-NEXT: v_and_b32_e32 v3, s0, v4 6584; GFX9-NEXT: v_and_b32_e32 v4, s0, v5 6585; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 6586; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 6587; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 6588; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 6589; GFX9-NEXT: global_store_dword v2, v0, s[2:3] 6590; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 6591; GFX9-NEXT: global_store_short v2, v0, s[2:3] offset:4 6592; GFX9-NEXT: s_endpgm 6593; 6594; GFX90A-LABEL: sdiv_v3i15: 6595; GFX90A: ; %bb.0: 6596; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6597; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 
0x2c 6598; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 6599; GFX90A-NEXT: v_mov_b32_e32 v2, 0 6600; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 6601; GFX90A-NEXT: s_bfe_i32 s1, s4, 0xf0000 6602; GFX90A-NEXT: s_bfe_i32 s0, s6, 0xf0000 6603; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s0 6604; GFX90A-NEXT: v_cvt_f32_i32_e32 v4, s1 6605; GFX90A-NEXT: s_xor_b32 s0, s1, s0 6606; GFX90A-NEXT: v_mov_b32_e32 v0, s4 6607; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v3 6608; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 6609; GFX90A-NEXT: v_alignbit_b32 v0, s5, v0, 30 6610; GFX90A-NEXT: s_or_b32 s5, s0, 1 6611; GFX90A-NEXT: v_mul_f32_e32 v5, v4, v5 6612; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 6613; GFX90A-NEXT: v_mad_f32 v4, -v5, v3, v4 6614; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| 6615; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 6616; GFX90A-NEXT: v_cvt_i32_f32_e32 v5, v5 6617; GFX90A-NEXT: s_cselect_b32 s0, s5, 0 6618; GFX90A-NEXT: s_bfe_i32 s1, s6, 0xf000f 6619; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s1 6620; GFX90A-NEXT: v_add_u32_e32 v4, s0, v5 6621; GFX90A-NEXT: s_bfe_i32 s0, s4, 0xf000f 6622; GFX90A-NEXT: v_cvt_f32_i32_e32 v5, s0 6623; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v3 6624; GFX90A-NEXT: v_mov_b32_e32 v1, s6 6625; GFX90A-NEXT: v_alignbit_b32 v1, s7, v1, 30 6626; GFX90A-NEXT: s_xor_b32 s0, s0, s1 6627; GFX90A-NEXT: v_mul_f32_e32 v6, v5, v6 6628; GFX90A-NEXT: v_trunc_f32_e32 v6, v6 6629; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 6630; GFX90A-NEXT: v_mad_f32 v5, -v6, v3, v5 6631; GFX90A-NEXT: v_bfe_i32 v1, v1, 0, 15 6632; GFX90A-NEXT: s_or_b32 s4, s0, 1 6633; GFX90A-NEXT: v_cvt_i32_f32_e32 v6, v6 6634; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v3| 6635; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, v1 6636; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 6637; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 6638; GFX90A-NEXT: v_bfe_i32 v0, v0, 0, 15 6639; GFX90A-NEXT: v_add_u32_e32 v5, s0, v6 6640; GFX90A-NEXT: v_cvt_f32_i32_e32 v6, v0 6641; GFX90A-NEXT: v_rcp_iflag_f32_e32 v7, v3 6642; GFX90A-NEXT: v_xor_b32_e32 v0, v0, 
v1 6643; GFX90A-NEXT: v_ashrrev_i32_e32 v0, 30, v0 6644; GFX90A-NEXT: v_or_b32_e32 v0, 1, v0 6645; GFX90A-NEXT: v_mul_f32_e32 v1, v6, v7 6646; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 6647; GFX90A-NEXT: v_cvt_i32_f32_e32 v7, v1 6648; GFX90A-NEXT: v_mad_f32 v1, -v1, v3, v6 6649; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v3| 6650; GFX90A-NEXT: s_movk_i32 s0, 0x7fff 6651; GFX90A-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 6652; GFX90A-NEXT: v_and_b32_e32 v3, s0, v4 6653; GFX90A-NEXT: v_and_b32_e32 v4, s0, v5 6654; GFX90A-NEXT: v_add_u32_e32 v0, v7, v0 6655; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 15, v4 6656; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 6657; GFX90A-NEXT: v_or_b32_e32 v3, v3, v4 6658; GFX90A-NEXT: v_or_b32_e32 v0, v3, v0 6659; GFX90A-NEXT: global_store_dword v2, v0, s[2:3] 6660; GFX90A-NEXT: v_and_b32_e32 v0, 0x1fff, v1 6661; GFX90A-NEXT: global_store_short v2, v0, s[2:3] offset:4 6662; GFX90A-NEXT: s_endpgm 6663 %r = sdiv <3 x i15> %x, %y 6664 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 6665 ret void 6666} 6667 6668define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 6669; CHECK-LABEL: @srem_v3i15( 6670; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 6671; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 6672; CHECK-NEXT: [[TMP3:%.*]] = sext i15 [[TMP1]] to i32 6673; CHECK-NEXT: [[TMP4:%.*]] = sext i15 [[TMP2]] to i32 6674; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 6675; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 6676; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 6677; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 6678; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 6679; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 6680; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 6681; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 6682; CHECK-NEXT: [[TMP13:%.*]] = 
fneg fast float [[TMP12]] 6683; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 6684; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 6685; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 6686; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 6687; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 6688; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 6689; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 6690; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]] 6691; CHECK-NEXT: [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]] 6692; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 17 6693; CHECK-NEXT: [[TMP24:%.*]] = ashr i32 [[TMP23]], 17 6694; CHECK-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i15 6695; CHECK-NEXT: [[TMP26:%.*]] = insertelement <3 x i15> undef, i15 [[TMP25]], i64 0 6696; CHECK-NEXT: [[TMP27:%.*]] = extractelement <3 x i15> [[X]], i64 1 6697; CHECK-NEXT: [[TMP28:%.*]] = extractelement <3 x i15> [[Y]], i64 1 6698; CHECK-NEXT: [[TMP29:%.*]] = sext i15 [[TMP27]] to i32 6699; CHECK-NEXT: [[TMP30:%.*]] = sext i15 [[TMP28]] to i32 6700; CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]] 6701; CHECK-NEXT: [[TMP32:%.*]] = ashr i32 [[TMP31]], 30 6702; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP32]], 1 6703; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float 6704; CHECK-NEXT: [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float 6705; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 6706; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]] 6707; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]]) 6708; CHECK-NEXT: [[TMP39:%.*]] = fneg fast float [[TMP38]] 6709; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]]) 6710; CHECK-NEXT: 
[[TMP41:%.*]] = fptosi float [[TMP38]] to i32 6711; CHECK-NEXT: [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]]) 6712; CHECK-NEXT: [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]]) 6713; CHECK-NEXT: [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]] 6714; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0 6715; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]] 6716; CHECK-NEXT: [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]] 6717; CHECK-NEXT: [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]] 6718; CHECK-NEXT: [[TMP49:%.*]] = shl i32 [[TMP48]], 17 6719; CHECK-NEXT: [[TMP50:%.*]] = ashr i32 [[TMP49]], 17 6720; CHECK-NEXT: [[TMP51:%.*]] = trunc i32 [[TMP50]] to i15 6721; CHECK-NEXT: [[TMP52:%.*]] = insertelement <3 x i15> [[TMP26]], i15 [[TMP51]], i64 1 6722; CHECK-NEXT: [[TMP53:%.*]] = extractelement <3 x i15> [[X]], i64 2 6723; CHECK-NEXT: [[TMP54:%.*]] = extractelement <3 x i15> [[Y]], i64 2 6724; CHECK-NEXT: [[TMP55:%.*]] = sext i15 [[TMP53]] to i32 6725; CHECK-NEXT: [[TMP56:%.*]] = sext i15 [[TMP54]] to i32 6726; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]] 6727; CHECK-NEXT: [[TMP58:%.*]] = ashr i32 [[TMP57]], 30 6728; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 1 6729; CHECK-NEXT: [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float 6730; CHECK-NEXT: [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float 6731; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]]) 6732; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]] 6733; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]]) 6734; CHECK-NEXT: [[TMP65:%.*]] = fneg fast float [[TMP64]] 6735; CHECK-NEXT: [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]]) 6736; CHECK-NEXT: [[TMP67:%.*]] = fptosi float [[TMP64]] to i32 6737; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 6738; CHECK-NEXT: [[TMP69:%.*]] = call fast 
float @llvm.fabs.f32(float [[TMP61]]) 6739; CHECK-NEXT: [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]] 6740; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0 6741; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]] 6742; CHECK-NEXT: [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]] 6743; CHECK-NEXT: [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]] 6744; CHECK-NEXT: [[TMP75:%.*]] = shl i32 [[TMP74]], 17 6745; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 [[TMP75]], 17 6746; CHECK-NEXT: [[TMP77:%.*]] = trunc i32 [[TMP76]] to i15 6747; CHECK-NEXT: [[TMP78:%.*]] = insertelement <3 x i15> [[TMP52]], i15 [[TMP77]], i64 2 6748; CHECK-NEXT: store <3 x i15> [[TMP78]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 6749; CHECK-NEXT: ret void 6750; 6751; GFX6-LABEL: srem_v3i15: 6752; GFX6: ; %bb.0: 6753; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 6754; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 6755; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 6756; GFX6-NEXT: s_mov_b32 s7, 0xf000 6757; GFX6-NEXT: s_mov_b32 s6, -1 6758; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6759; GFX6-NEXT: v_mov_b32_e32 v0, s2 6760; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 6761; GFX6-NEXT: s_movk_i32 s3, 0x7fff 6762; GFX6-NEXT: s_and_b32 s11, s0, s3 6763; GFX6-NEXT: s_bfe_i32 s11, s11, 0xf0000 6764; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s11 6765; GFX6-NEXT: s_and_b32 s9, s2, s3 6766; GFX6-NEXT: s_bfe_i32 s9, s9, 0xf0000 6767; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s9 6768; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 6769; GFX6-NEXT: s_xor_b32 s9, s9, s11 6770; GFX6-NEXT: s_ashr_i32 s9, s9, 30 6771; GFX6-NEXT: s_or_b32 s9, s9, 1 6772; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 6773; GFX6-NEXT: v_trunc_f32_e32 v4, v4 6774; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 6775; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 6776; GFX6-NEXT: v_mov_b32_e32 v5, s9 6777; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| 6778; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc 6779; GFX6-NEXT: v_mov_b32_e32 v1, s0 6780; GFX6-NEXT: s_bfe_u32 s12, s0, 
0xf000f 6781; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 6782; GFX6-NEXT: v_alignbit_b32 v1, s1, v1, 30 6783; GFX6-NEXT: s_lshr_b32 s1, s0, 15 6784; GFX6-NEXT: v_mul_lo_u32 v2, v2, s0 6785; GFX6-NEXT: s_bfe_i32 s0, s12, 0xf0000 6786; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s0 6787; GFX6-NEXT: s_bfe_u32 s10, s2, 0xf000f 6788; GFX6-NEXT: s_lshr_b32 s8, s2, 15 6789; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 6790; GFX6-NEXT: s_bfe_i32 s2, s10, 0xf0000 6791; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s2 6792; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 6793; GFX6-NEXT: s_xor_b32 s0, s2, s0 6794; GFX6-NEXT: s_ashr_i32 s0, s0, 30 6795; GFX6-NEXT: s_or_b32 s0, s0, 1 6796; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 6797; GFX6-NEXT: v_trunc_f32_e32 v5, v5 6798; GFX6-NEXT: v_mad_f32 v4, -v5, v3, v4 6799; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 6800; GFX6-NEXT: v_and_b32_e32 v1, s3, v1 6801; GFX6-NEXT: v_mov_b32_e32 v6, s0 6802; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v3| 6803; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc 6804; GFX6-NEXT: v_bfe_i32 v4, v1, 0, 15 6805; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 6806; GFX6-NEXT: v_cvt_f32_i32_e32 v5, v4 6807; GFX6-NEXT: v_and_b32_e32 v0, s3, v0 6808; GFX6-NEXT: v_bfe_i32 v6, v0, 0, 15 6809; GFX6-NEXT: v_cvt_f32_i32_e32 v7, v6 6810; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v5 6811; GFX6-NEXT: v_xor_b32_e32 v4, v6, v4 6812; GFX6-NEXT: v_ashrrev_i32_e32 v4, 30, v4 6813; GFX6-NEXT: v_or_b32_e32 v4, 1, v4 6814; GFX6-NEXT: v_mul_f32_e32 v6, v7, v8 6815; GFX6-NEXT: v_trunc_f32_e32 v6, v6 6816; GFX6-NEXT: v_mad_f32 v7, -v6, v5, v7 6817; GFX6-NEXT: v_cvt_i32_f32_e32 v6, v6 6818; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v5| 6819; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc 6820; GFX6-NEXT: v_mul_lo_u32 v3, v3, s1 6821; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 6822; GFX6-NEXT: v_mul_lo_u32 v1, v4, v1 6823; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s8, v3 6824; GFX6-NEXT: v_and_b32_e32 v3, s3, v3 6825; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 6826; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 
6827; GFX6-NEXT: v_and_b32_e32 v2, s3, v2 6828; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 6829; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 6830; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 6831; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 6832; GFX6-NEXT: s_waitcnt expcnt(0) 6833; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 6834; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 6835; GFX6-NEXT: s_endpgm 6836; 6837; GFX9-LABEL: srem_v3i15: 6838; GFX9: ; %bb.0: 6839; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6840; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 6841; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 6842; GFX9-NEXT: s_movk_i32 s8, 0x7fff 6843; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6844; GFX9-NEXT: s_and_b32 s0, s4, s8 6845; GFX9-NEXT: s_and_b32 s1, s6, s8 6846; GFX9-NEXT: s_bfe_i32 s1, s1, 0xf0000 6847; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 6848; GFX9-NEXT: s_bfe_i32 s0, s0, 0xf0000 6849; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 6850; GFX9-NEXT: s_xor_b32 s0, s0, s1 6851; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2 6852; GFX9-NEXT: v_mov_b32_e32 v0, s4 6853; GFX9-NEXT: v_mov_b32_e32 v1, s6 6854; GFX9-NEXT: s_ashr_i32 s0, s0, 30 6855; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 6856; GFX9-NEXT: v_trunc_f32_e32 v4, v4 6857; GFX9-NEXT: v_mad_f32 v3, -v4, v2, v3 6858; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 6859; GFX9-NEXT: s_lshr_b32 s9, s4, 15 6860; GFX9-NEXT: v_alignbit_b32 v0, s5, v0, 30 6861; GFX9-NEXT: s_bfe_u32 s5, s4, 0xf000f 6862; GFX9-NEXT: v_alignbit_b32 v1, s7, v1, 30 6863; GFX9-NEXT: s_lshr_b32 s7, s6, 15 6864; GFX9-NEXT: s_bfe_u32 s10, s6, 0xf000f 6865; GFX9-NEXT: s_or_b32 s11, s0, 1 6866; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v2| 6867; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 6868; GFX9-NEXT: s_cselect_b32 s0, s11, 0 6869; GFX9-NEXT: v_add_u32_e32 v2, s0, v4 6870; GFX9-NEXT: s_bfe_i32 s0, s10, 0xf0000 6871; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 6872; GFX9-NEXT: s_bfe_i32 s1, s5, 0xf0000 6873; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s1 6874; GFX9-NEXT: s_xor_b32 s0, s1, s0 6875; 
GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 6876; GFX9-NEXT: s_ashr_i32 s0, s0, 30 6877; GFX9-NEXT: s_or_b32 s5, s0, 1 6878; GFX9-NEXT: v_and_b32_e32 v1, s8, v1 6879; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 6880; GFX9-NEXT: v_trunc_f32_e32 v5, v5 6881; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 6882; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 6883; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| 6884; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 6885; GFX9-NEXT: s_cselect_b32 s0, s5, 0 6886; GFX9-NEXT: v_bfe_i32 v4, v1, 0, 15 6887; GFX9-NEXT: v_add_u32_e32 v3, s0, v5 6888; GFX9-NEXT: v_cvt_f32_i32_e32 v5, v4 6889; GFX9-NEXT: v_and_b32_e32 v0, s8, v0 6890; GFX9-NEXT: v_bfe_i32 v6, v0, 0, 15 6891; GFX9-NEXT: v_cvt_f32_i32_e32 v7, v6 6892; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5 6893; GFX9-NEXT: v_xor_b32_e32 v4, v6, v4 6894; GFX9-NEXT: v_ashrrev_i32_e32 v4, 30, v4 6895; GFX9-NEXT: v_or_b32_e32 v4, 1, v4 6896; GFX9-NEXT: v_mul_f32_e32 v6, v7, v8 6897; GFX9-NEXT: v_trunc_f32_e32 v6, v6 6898; GFX9-NEXT: v_cvt_i32_f32_e32 v8, v6 6899; GFX9-NEXT: v_mad_f32 v6, -v6, v5, v7 6900; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, |v5| 6901; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc 6902; GFX9-NEXT: v_mul_lo_u32 v3, v3, s7 6903; GFX9-NEXT: v_add_u32_e32 v4, v8, v4 6904; GFX9-NEXT: v_mul_lo_u32 v2, v2, s6 6905; GFX9-NEXT: v_mul_lo_u32 v1, v4, v1 6906; GFX9-NEXT: v_sub_u32_e32 v3, s9, v3 6907; GFX9-NEXT: v_and_b32_e32 v3, s8, v3 6908; GFX9-NEXT: v_sub_u32_e32 v2, s4, v2 6909; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1 6910; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 6911; GFX9-NEXT: v_and_b32_e32 v2, s8, v2 6912; GFX9-NEXT: v_lshlrev_b32_e32 v3, 15, v3 6913; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 6914; GFX9-NEXT: v_mov_b32_e32 v4, 0 6915; GFX9-NEXT: v_or_b32_e32 v0, v2, v0 6916; GFX9-NEXT: global_store_dword v4, v0, s[2:3] 6917; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 6918; GFX9-NEXT: global_store_short v4, v0, s[2:3] offset:4 6919; GFX9-NEXT: s_endpgm 6920; 6921; GFX90A-LABEL: srem_v3i15: 6922; GFX90A: ; %bb.0: 6923; 
GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 6924; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 6925; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 6926; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 6927; GFX90A-NEXT: v_mov_b32_e32 v2, 0 6928; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 6929; GFX90A-NEXT: s_and_b32 s0, s4, s8 6930; GFX90A-NEXT: s_and_b32 s1, s6, s8 6931; GFX90A-NEXT: s_bfe_i32 s1, s1, 0xf0000 6932; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s1 6933; GFX90A-NEXT: s_bfe_i32 s0, s0, 0xf0000 6934; GFX90A-NEXT: v_cvt_f32_i32_e32 v4, s0 6935; GFX90A-NEXT: s_xor_b32 s0, s0, s1 6936; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v3 6937; GFX90A-NEXT: v_mov_b32_e32 v0, s4 6938; GFX90A-NEXT: v_mov_b32_e32 v1, s6 6939; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 6940; GFX90A-NEXT: v_mul_f32_e32 v5, v4, v5 6941; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 6942; GFX90A-NEXT: v_mad_f32 v4, -v5, v3, v4 6943; GFX90A-NEXT: v_cvt_i32_f32_e32 v5, v5 6944; GFX90A-NEXT: v_alignbit_b32 v0, s5, v0, 30 6945; GFX90A-NEXT: s_lshr_b32 s5, s4, 15 6946; GFX90A-NEXT: s_bfe_u32 s9, s4, 0xf000f 6947; GFX90A-NEXT: v_alignbit_b32 v1, s7, v1, 30 6948; GFX90A-NEXT: s_lshr_b32 s7, s6, 15 6949; GFX90A-NEXT: s_bfe_u32 s10, s6, 0xf000f 6950; GFX90A-NEXT: s_or_b32 s11, s0, 1 6951; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| 6952; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 6953; GFX90A-NEXT: s_cselect_b32 s0, s11, 0 6954; GFX90A-NEXT: v_add_u32_e32 v3, s0, v5 6955; GFX90A-NEXT: s_bfe_i32 s0, s10, 0xf0000 6956; GFX90A-NEXT: v_cvt_f32_i32_e32 v4, s0 6957; GFX90A-NEXT: s_bfe_i32 s1, s9, 0xf0000 6958; GFX90A-NEXT: v_cvt_f32_i32_e32 v5, s1 6959; GFX90A-NEXT: s_xor_b32 s0, s1, s0 6960; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v4 6961; GFX90A-NEXT: v_mul_lo_u32 v3, v3, s6 6962; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 6963; GFX90A-NEXT: v_sub_u32_e32 v3, s4, v3 6964; GFX90A-NEXT: v_mul_f32_e32 v6, v5, v6 6965; GFX90A-NEXT: v_trunc_f32_e32 v6, v6 6966; GFX90A-NEXT: v_mad_f32 v5, -v6, v4, v5 6967; GFX90A-NEXT: v_cvt_i32_f32_e32 v6, v6 6968; 
GFX90A-NEXT: s_or_b32 s4, s0, 1 6969; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4| 6970; GFX90A-NEXT: v_and_b32_e32 v1, s8, v1 6971; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 6972; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 6973; GFX90A-NEXT: v_bfe_i32 v5, v1, 0, 15 6974; GFX90A-NEXT: v_add_u32_e32 v4, s0, v6 6975; GFX90A-NEXT: v_cvt_f32_i32_e32 v6, v5 6976; GFX90A-NEXT: v_and_b32_e32 v0, s8, v0 6977; GFX90A-NEXT: v_bfe_i32 v7, v0, 0, 15 6978; GFX90A-NEXT: v_cvt_f32_i32_e32 v8, v7 6979; GFX90A-NEXT: v_rcp_iflag_f32_e32 v9, v6 6980; GFX90A-NEXT: v_xor_b32_e32 v5, v7, v5 6981; GFX90A-NEXT: v_ashrrev_i32_e32 v5, 30, v5 6982; GFX90A-NEXT: v_or_b32_e32 v5, 1, v5 6983; GFX90A-NEXT: v_mul_f32_e32 v7, v8, v9 6984; GFX90A-NEXT: v_trunc_f32_e32 v7, v7 6985; GFX90A-NEXT: v_cvt_i32_f32_e32 v9, v7 6986; GFX90A-NEXT: v_mad_f32 v7, -v7, v6, v8 6987; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6| 6988; GFX90A-NEXT: v_mul_lo_u32 v4, v4, s7 6989; GFX90A-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc 6990; GFX90A-NEXT: v_sub_u32_e32 v4, s5, v4 6991; GFX90A-NEXT: v_add_u32_e32 v5, v9, v5 6992; GFX90A-NEXT: v_mul_lo_u32 v1, v5, v1 6993; GFX90A-NEXT: v_and_b32_e32 v4, s8, v4 6994; GFX90A-NEXT: v_sub_u32_e32 v0, v0, v1 6995; GFX90A-NEXT: v_and_b32_e32 v3, s8, v3 6996; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 15, v4 6997; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 6998; GFX90A-NEXT: v_or_b32_e32 v3, v3, v4 6999; GFX90A-NEXT: v_or_b32_e32 v0, v3, v0 7000; GFX90A-NEXT: global_store_dword v2, v0, s[2:3] 7001; GFX90A-NEXT: v_and_b32_e32 v0, 0x1fff, v1 7002; GFX90A-NEXT: global_store_short v2, v0, s[2:3] offset:4 7003; GFX90A-NEXT: s_endpgm 7004 %r = srem <3 x i15> %x, %y 7005 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 7006 ret void 7007} 7008 7009define amdgpu_kernel void @udiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 7010; CHECK-LABEL: @udiv_i32_oddk_denom( 7011; CHECK-NEXT: [[R:%.*]] = udiv i32 [[X:%.*]], 1235195 7012; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 
7013; CHECK-NEXT: ret void 7014; 7015; GFX6-LABEL: udiv_i32_oddk_denom: 7016; GFX6: ; %bb.0: 7017; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 7018; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 7019; GFX6-NEXT: v_mov_b32_e32 v0, 0xb2a50881 7020; GFX6-NEXT: s_mov_b32 s7, 0xf000 7021; GFX6-NEXT: s_mov_b32 s6, -1 7022; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7023; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 7024; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s0, v0 7025; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 7026; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 7027; GFX6-NEXT: v_lshrrev_b32_e32 v0, 20, v0 7028; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 7029; GFX6-NEXT: s_endpgm 7030; 7031; GFX9-LABEL: udiv_i32_oddk_denom: 7032; GFX9: ; %bb.0: 7033; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7034; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 7035; GFX9-NEXT: v_mov_b32_e32 v0, 0 7036; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7037; GFX9-NEXT: s_mul_hi_u32 s0, s4, 0xb2a50881 7038; GFX9-NEXT: s_sub_i32 s1, s4, s0 7039; GFX9-NEXT: s_lshr_b32 s1, s1, 1 7040; GFX9-NEXT: s_add_i32 s1, s1, s0 7041; GFX9-NEXT: s_lshr_b32 s0, s1, 20 7042; GFX9-NEXT: v_mov_b32_e32 v1, s0 7043; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 7044; GFX9-NEXT: s_endpgm 7045; 7046; GFX90A-LABEL: udiv_i32_oddk_denom: 7047; GFX90A: ; %bb.0: 7048; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7049; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 7050; GFX90A-NEXT: v_mov_b32_e32 v0, 0 7051; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 7052; GFX90A-NEXT: s_mul_hi_u32 s0, s4, 0xb2a50881 7053; GFX90A-NEXT: s_sub_i32 s1, s4, s0 7054; GFX90A-NEXT: s_lshr_b32 s1, s1, 1 7055; GFX90A-NEXT: s_add_i32 s1, s1, s0 7056; GFX90A-NEXT: s_lshr_b32 s0, s1, 20 7057; GFX90A-NEXT: v_mov_b32_e32 v1, s0 7058; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] 7059; GFX90A-NEXT: s_endpgm 7060 %r = udiv i32 %x, 1235195 7061 store i32 %r, i32 addrspace(1)* %out 7062 ret void 7063} 7064 7065define amdgpu_kernel void @udiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 7066; 
CHECK-LABEL: @udiv_i32_pow2k_denom( 7067; CHECK-NEXT: [[R:%.*]] = udiv i32 [[X:%.*]], 4096 7068; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 7069; CHECK-NEXT: ret void 7070; 7071; GFX6-LABEL: udiv_i32_pow2k_denom: 7072; GFX6: ; %bb.0: 7073; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 7074; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 7075; GFX6-NEXT: s_mov_b32 s7, 0xf000 7076; GFX6-NEXT: s_mov_b32 s6, -1 7077; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7078; GFX6-NEXT: s_lshr_b32 s0, s0, 12 7079; GFX6-NEXT: v_mov_b32_e32 v0, s0 7080; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 7081; GFX6-NEXT: s_endpgm 7082; 7083; GFX9-LABEL: udiv_i32_pow2k_denom: 7084; GFX9: ; %bb.0: 7085; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7086; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 7087; GFX9-NEXT: v_mov_b32_e32 v0, 0 7088; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7089; GFX9-NEXT: s_lshr_b32 s0, s4, 12 7090; GFX9-NEXT: v_mov_b32_e32 v1, s0 7091; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 7092; GFX9-NEXT: s_endpgm 7093; 7094; GFX90A-LABEL: udiv_i32_pow2k_denom: 7095; GFX90A: ; %bb.0: 7096; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7097; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 7098; GFX90A-NEXT: v_mov_b32_e32 v0, 0 7099; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 7100; GFX90A-NEXT: s_lshr_b32 s0, s4, 12 7101; GFX90A-NEXT: v_mov_b32_e32 v1, s0 7102; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] 7103; GFX90A-NEXT: s_endpgm 7104 %r = udiv i32 %x, 4096 7105 store i32 %r, i32 addrspace(1)* %out 7106 ret void 7107} 7108 7109define amdgpu_kernel void @udiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 7110; CHECK-LABEL: @udiv_i32_pow2_shl_denom( 7111; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 7112; CHECK-NEXT: [[R:%.*]] = udiv i32 [[X:%.*]], [[SHL_Y]] 7113; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 7114; CHECK-NEXT: ret void 7115; 7116; GFX6-LABEL: udiv_i32_pow2_shl_denom: 7117; GFX6: ; %bb.0: 7118; GFX6-NEXT: s_load_dwordx2 
s[4:5], s[0:1], 0x9 7119; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 7120; GFX6-NEXT: s_mov_b32 s7, 0xf000 7121; GFX6-NEXT: s_mov_b32 s6, -1 7122; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7123; GFX6-NEXT: s_add_i32 s1, s1, 12 7124; GFX6-NEXT: s_lshr_b32 s0, s0, s1 7125; GFX6-NEXT: v_mov_b32_e32 v0, s0 7126; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 7127; GFX6-NEXT: s_endpgm 7128; 7129; GFX9-LABEL: udiv_i32_pow2_shl_denom: 7130; GFX9: ; %bb.0: 7131; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7132; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 7133; GFX9-NEXT: v_mov_b32_e32 v0, 0 7134; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7135; GFX9-NEXT: s_add_i32 s0, s5, 12 7136; GFX9-NEXT: s_lshr_b32 s0, s4, s0 7137; GFX9-NEXT: v_mov_b32_e32 v1, s0 7138; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 7139; GFX9-NEXT: s_endpgm 7140; 7141; GFX90A-LABEL: udiv_i32_pow2_shl_denom: 7142; GFX90A: ; %bb.0: 7143; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7144; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 7145; GFX90A-NEXT: v_mov_b32_e32 v0, 0 7146; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 7147; GFX90A-NEXT: s_add_i32 s0, s5, 12 7148; GFX90A-NEXT: s_lshr_b32 s0, s4, s0 7149; GFX90A-NEXT: v_mov_b32_e32 v1, s0 7150; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] 7151; GFX90A-NEXT: s_endpgm 7152 %shl.y = shl i32 4096, %y 7153 %r = udiv i32 %x, %shl.y 7154 store i32 %r, i32 addrspace(1)* %out 7155 ret void 7156} 7157 7158define amdgpu_kernel void @udiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 7159; CHECK-LABEL: @udiv_v2i32_pow2k_denom( 7160; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 7161; CHECK-NEXT: [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096 7162; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 7163; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 7164; CHECK-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP4]], 4096 7165; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 7166; 
CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 7167; CHECK-NEXT: ret void 7168; 7169; GFX6-LABEL: udiv_v2i32_pow2k_denom: 7170; GFX6: ; %bb.0: 7171; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 7172; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 7173; GFX6-NEXT: s_mov_b32 s7, 0xf000 7174; GFX6-NEXT: s_mov_b32 s6, -1 7175; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7176; GFX6-NEXT: s_lshr_b32 s0, s0, 12 7177; GFX6-NEXT: s_lshr_b32 s1, s1, 12 7178; GFX6-NEXT: v_mov_b32_e32 v0, s0 7179; GFX6-NEXT: v_mov_b32_e32 v1, s1 7180; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 7181; GFX6-NEXT: s_endpgm 7182; 7183; GFX9-LABEL: udiv_v2i32_pow2k_denom: 7184; GFX9: ; %bb.0: 7185; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7186; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 7187; GFX9-NEXT: v_mov_b32_e32 v2, 0 7188; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7189; GFX9-NEXT: s_lshr_b32 s0, s4, 12 7190; GFX9-NEXT: s_lshr_b32 s1, s5, 12 7191; GFX9-NEXT: v_mov_b32_e32 v0, s0 7192; GFX9-NEXT: v_mov_b32_e32 v1, s1 7193; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 7194; GFX9-NEXT: s_endpgm 7195; 7196; GFX90A-LABEL: udiv_v2i32_pow2k_denom: 7197; GFX90A: ; %bb.0: 7198; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7199; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 7200; GFX90A-NEXT: v_mov_b32_e32 v2, 0 7201; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 7202; GFX90A-NEXT: s_lshr_b32 s0, s4, 12 7203; GFX90A-NEXT: s_lshr_b32 s1, s5, 12 7204; GFX90A-NEXT: v_mov_b32_e32 v0, s0 7205; GFX90A-NEXT: v_mov_b32_e32 v1, s1 7206; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 7207; GFX90A-NEXT: s_endpgm 7208 %r = udiv <2 x i32> %x, <i32 4096, i32 4096> 7209 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 7210 ret void 7211} 7212 7213define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 7214; CHECK-LABEL: @udiv_v2i32_mixed_pow2k_denom( 7215; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 7216; 
CHECK-NEXT: [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096 7217; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 7218; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 7219; CHECK-NEXT: [[TMP5:%.*]] = udiv i32 [[TMP4]], 4095 7220; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 7221; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 7222; CHECK-NEXT: ret void 7223; 7224; GFX6-LABEL: udiv_v2i32_mixed_pow2k_denom: 7225; GFX6: ; %bb.0: 7226; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 7227; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 7228; GFX6-NEXT: v_mov_b32_e32 v0, 0x100101 7229; GFX6-NEXT: s_mov_b32 s7, 0xf000 7230; GFX6-NEXT: s_mov_b32 s6, -1 7231; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7232; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 7233; GFX6-NEXT: s_lshr_b32 s0, s0, 12 7234; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v0 7235; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 7236; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 7237; GFX6-NEXT: v_lshrrev_b32_e32 v1, 11, v0 7238; GFX6-NEXT: v_mov_b32_e32 v0, s0 7239; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 7240; GFX6-NEXT: s_endpgm 7241; 7242; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom: 7243; GFX9: ; %bb.0: 7244; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7245; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 7246; GFX9-NEXT: v_mov_b32_e32 v2, 0 7247; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7248; GFX9-NEXT: s_mul_hi_u32 s1, s5, 0x100101 7249; GFX9-NEXT: s_lshr_b32 s0, s4, 12 7250; GFX9-NEXT: s_sub_i32 s4, s5, s1 7251; GFX9-NEXT: s_lshr_b32 s4, s4, 1 7252; GFX9-NEXT: s_add_i32 s4, s4, s1 7253; GFX9-NEXT: s_lshr_b32 s1, s4, 11 7254; GFX9-NEXT: v_mov_b32_e32 v0, s0 7255; GFX9-NEXT: v_mov_b32_e32 v1, s1 7256; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 7257; GFX9-NEXT: s_endpgm 7258; 7259; GFX90A-LABEL: udiv_v2i32_mixed_pow2k_denom: 7260; GFX90A: ; %bb.0: 7261; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7262; GFX90A-NEXT: 
s_load_dwordx2 s[4:5], s[0:1], 0x2c 7263; GFX90A-NEXT: v_mov_b32_e32 v2, 0 7264; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 7265; GFX90A-NEXT: s_mul_hi_u32 s1, s5, 0x100101 7266; GFX90A-NEXT: s_lshr_b32 s0, s4, 12 7267; GFX90A-NEXT: s_sub_i32 s4, s5, s1 7268; GFX90A-NEXT: s_lshr_b32 s4, s4, 1 7269; GFX90A-NEXT: s_add_i32 s4, s4, s1 7270; GFX90A-NEXT: s_lshr_b32 s1, s4, 11 7271; GFX90A-NEXT: v_mov_b32_e32 v0, s0 7272; GFX90A-NEXT: v_mov_b32_e32 v1, s1 7273; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 7274; GFX90A-NEXT: s_endpgm 7275 %r = udiv <2 x i32> %x, <i32 4096, i32 4095> 7276 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 7277 ret void 7278} 7279 7280define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 7281; CHECK-LABEL: @udiv_v2i32_pow2_shl_denom( 7282; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 7283; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 7284; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 7285; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 7286; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 7287; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 7288; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 7289; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 7290; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 7291; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 7292; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 7293; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 7294; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 7295; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 7296; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 7297; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 7298; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 7299; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 
7300; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 7301; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 7302; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 7303; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 7304; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 7305; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 7306; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 7307; CHECK-NEXT: [[TMP25:%.*]] = add i32 [[TMP21]], 1 7308; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]] 7309; CHECK-NEXT: [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]] 7310; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]] 7311; CHECK-NEXT: [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]] 7312; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP26]], 1 7313; CHECK-NEXT: [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]] 7314; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x i32> undef, i32 [[TMP31]], i64 0 7315; CHECK-NEXT: [[TMP33:%.*]] = extractelement <2 x i32> [[X]], i64 1 7316; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 7317; CHECK-NEXT: [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float 7318; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 7319; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000 7320; CHECK-NEXT: [[TMP38:%.*]] = fptoui float [[TMP37]] to i32 7321; CHECK-NEXT: [[TMP39:%.*]] = sub i32 0, [[TMP34]] 7322; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]] 7323; CHECK-NEXT: [[TMP41:%.*]] = zext i32 [[TMP38]] to i64 7324; CHECK-NEXT: [[TMP42:%.*]] = zext i32 [[TMP40]] to i64 7325; CHECK-NEXT: [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]] 7326; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 7327; CHECK-NEXT: [[TMP45:%.*]] = lshr i64 [[TMP43]], 32 7328; CHECK-NEXT: [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32 7329; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]] 
7330; CHECK-NEXT: [[TMP48:%.*]] = zext i32 [[TMP33]] to i64 7331; CHECK-NEXT: [[TMP49:%.*]] = zext i32 [[TMP47]] to i64 7332; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]] 7333; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 7334; CHECK-NEXT: [[TMP52:%.*]] = lshr i64 [[TMP50]], 32 7335; CHECK-NEXT: [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32 7336; CHECK-NEXT: [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]] 7337; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]] 7338; CHECK-NEXT: [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]] 7339; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP53]], 1 7340; CHECK-NEXT: [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]] 7341; CHECK-NEXT: [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]] 7342; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]] 7343; CHECK-NEXT: [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]] 7344; CHECK-NEXT: [[TMP62:%.*]] = add i32 [[TMP58]], 1 7345; CHECK-NEXT: [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]] 7346; CHECK-NEXT: [[TMP64:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP63]], i64 1 7347; CHECK-NEXT: store <2 x i32> [[TMP64]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 7348; CHECK-NEXT: ret void 7349; 7350; GFX6-LABEL: udiv_v2i32_pow2_shl_denom: 7351; GFX6: ; %bb.0: 7352; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 7353; GFX6-NEXT: s_movk_i32 s4, 0x1000 7354; GFX6-NEXT: s_mov_b32 s7, 0xf000 7355; GFX6-NEXT: s_mov_b32 s6, -1 7356; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7357; GFX6-NEXT: s_lshl_b32 s8, s4, s2 7358; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 7359; GFX6-NEXT: s_lshl_b32 s9, s4, s3 7360; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 7361; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 7362; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 7363; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 7364; GFX6-NEXT: s_mov_b32 s0, 0x4f7ffffe 7365; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 7366; GFX6-NEXT: v_mul_f32_e32 v0, s0, v0 7367; GFX6-NEXT: 
v_cvt_u32_f32_e32 v0, v0 7368; GFX6-NEXT: v_mul_f32_e32 v1, s0, v1 7369; GFX6-NEXT: s_sub_i32 s0, 0, s8 7370; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 7371; GFX6-NEXT: v_mul_lo_u32 v2, s0, v0 7372; GFX6-NEXT: s_sub_i32 s0, 0, s9 7373; GFX6-NEXT: v_mul_lo_u32 v3, s0, v1 7374; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 7375; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 7376; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 7377; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7378; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 7379; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 7380; GFX6-NEXT: v_mul_hi_u32 v1, s3, v1 7381; GFX6-NEXT: v_mul_lo_u32 v2, v0, s8 7382; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 7383; GFX6-NEXT: v_mul_lo_u32 v4, v1, s9 7384; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 7385; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v2 7386; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] 7387; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v2 7388; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 7389; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 7390; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 7391; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 7392; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s3, v4 7393; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 7394; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2 7395; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 7396; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s9, v2 7397; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 7398; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 7399; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 7400; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 7401; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 7402; GFX6-NEXT: s_endpgm 7403; 7404; GFX9-LABEL: udiv_v2i32_pow2_shl_denom: 7405; GFX9: ; %bb.0: 7406; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 7407; GFX9-NEXT: s_movk_i32 s4, 0x1000 7408; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7409; GFX9-NEXT: s_lshl_b32 s5, s4, s3 7410; GFX9-NEXT: s_lshl_b32 s4, s4, s2 7411; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 7412; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 7413; GFX9-NEXT: 
s_mov_b32 s2, 0x4f7ffffe 7414; GFX9-NEXT: s_sub_i32 s3, 0, s5 7415; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 7416; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 7417; GFX9-NEXT: v_mul_f32_e32 v0, s2, v0 7418; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 7419; GFX9-NEXT: v_mul_f32_e32 v1, s2, v1 7420; GFX9-NEXT: s_sub_i32 s2, 0, s4 7421; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 7422; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 7423; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1 7424; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 7425; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 7426; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 7427; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 7428; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 7429; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7430; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 7431; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 7432; GFX9-NEXT: v_mul_hi_u32 v1, s3, v1 7433; GFX9-NEXT: v_mov_b32_e32 v2, 0 7434; GFX9-NEXT: v_mul_lo_u32 v3, v0, s4 7435; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 7436; GFX9-NEXT: v_mul_lo_u32 v4, v1, s5 7437; GFX9-NEXT: v_add_u32_e32 v6, 1, v1 7438; GFX9-NEXT: v_sub_u32_e32 v3, s2, v3 7439; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 7440; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc 7441; GFX9-NEXT: v_subrev_u32_e32 v5, s4, v3 7442; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 7443; GFX9-NEXT: v_sub_u32_e32 v4, s3, v4 7444; GFX9-NEXT: v_add_u32_e32 v5, 1, v0 7445; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 7446; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc 7447; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v4 7448; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v4 7449; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 7450; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc 7451; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 7452; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 7453; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 7454; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 7455; GFX9-NEXT: s_endpgm 7456; 7457; GFX90A-LABEL: udiv_v2i32_pow2_shl_denom: 7458; GFX90A: ; %bb.0: 7459; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 7460; 
GFX90A-NEXT: s_movk_i32 s8, 0x1000 7461; GFX90A-NEXT: s_mov_b32 s9, 0x4f7ffffe 7462; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 7463; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c 7464; GFX90A-NEXT: v_mov_b32_e32 v2, 0 7465; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 7466; GFX90A-NEXT: s_lshl_b32 s2, s8, s2 7467; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s2 7468; GFX90A-NEXT: s_lshl_b32 s0, s8, s3 7469; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s0 7470; GFX90A-NEXT: s_sub_i32 s1, 0, s2 7471; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 7472; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v1 7473; GFX90A-NEXT: v_mul_f32_e32 v0, s9, v0 7474; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 7475; GFX90A-NEXT: v_mul_f32_e32 v1, s9, v1 7476; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 7477; GFX90A-NEXT: v_mul_lo_u32 v3, s1, v0 7478; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v3 7479; GFX90A-NEXT: v_add_u32_e32 v0, v0, v3 7480; GFX90A-NEXT: v_mul_hi_u32 v0, s6, v0 7481; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s2 7482; GFX90A-NEXT: v_sub_u32_e32 v3, s6, v3 7483; GFX90A-NEXT: v_add_u32_e32 v4, 1, v0 7484; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 7485; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 7486; GFX90A-NEXT: v_subrev_u32_e32 v4, s2, v3 7487; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 7488; GFX90A-NEXT: s_sub_i32 s1, 0, s0 7489; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s2, v3 7490; GFX90A-NEXT: v_mul_lo_u32 v3, s1, v1 7491; GFX90A-NEXT: v_mul_hi_u32 v3, v1, v3 7492; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 7493; GFX90A-NEXT: v_mul_hi_u32 v1, s7, v1 7494; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s0 7495; GFX90A-NEXT: v_add_u32_e32 v4, 1, v0 7496; GFX90A-NEXT: v_sub_u32_e32 v3, s7, v3 7497; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 7498; GFX90A-NEXT: v_add_u32_e32 v4, 1, v1 7499; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v3 7500; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 7501; GFX90A-NEXT: v_subrev_u32_e32 v4, s0, v3 7502; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 7503; GFX90A-NEXT: v_add_u32_e32 v4, 1, v1 7504; GFX90A-NEXT: 
v_cmp_le_u32_e32 vcc, s0, v3 7505; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 7506; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 7507; GFX90A-NEXT: s_endpgm 7508 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 7509 %r = udiv <2 x i32> %x, %shl.y 7510 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 7511 ret void 7512} 7513 7514define amdgpu_kernel void @urem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 7515; CHECK-LABEL: @urem_i32_oddk_denom( 7516; CHECK-NEXT: [[R:%.*]] = urem i32 [[X:%.*]], 1235195 7517; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 7518; CHECK-NEXT: ret void 7519; 7520; GFX6-LABEL: urem_i32_oddk_denom: 7521; GFX6: ; %bb.0: 7522; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 7523; GFX6-NEXT: v_mov_b32_e32 v0, 0xb2a50881 7524; GFX6-NEXT: s_mov_b32 s2, 0x12d8fb 7525; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 7526; GFX6-NEXT: s_mov_b32 s3, 0xf000 7527; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7528; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 7529; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v0 7530; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 7531; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 7532; GFX6-NEXT: v_lshrrev_b32_e32 v0, 20, v0 7533; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 7534; GFX6-NEXT: s_mov_b32 s2, -1 7535; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 7536; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 7537; GFX6-NEXT: s_endpgm 7538; 7539; GFX9-LABEL: urem_i32_oddk_denom: 7540; GFX9: ; %bb.0: 7541; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7542; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 7543; GFX9-NEXT: v_mov_b32_e32 v0, 0 7544; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7545; GFX9-NEXT: s_mul_hi_u32 s0, s4, 0xb2a50881 7546; GFX9-NEXT: s_sub_i32 s1, s4, s0 7547; GFX9-NEXT: s_lshr_b32 s1, s1, 1 7548; GFX9-NEXT: s_add_i32 s1, s1, s0 7549; GFX9-NEXT: s_lshr_b32 s0, s1, 20 7550; GFX9-NEXT: s_mul_i32 s0, s0, 0x12d8fb 7551; GFX9-NEXT: s_sub_i32 s0, s4, s0 7552; GFX9-NEXT: v_mov_b32_e32 v1, s0 7553; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 7554; 
GFX9-NEXT: s_endpgm 7555; 7556; GFX90A-LABEL: urem_i32_oddk_denom: 7557; GFX90A: ; %bb.0: 7558; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7559; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 7560; GFX90A-NEXT: v_mov_b32_e32 v0, 0 7561; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 7562; GFX90A-NEXT: s_mul_hi_u32 s0, s4, 0xb2a50881 7563; GFX90A-NEXT: s_sub_i32 s1, s4, s0 7564; GFX90A-NEXT: s_lshr_b32 s1, s1, 1 7565; GFX90A-NEXT: s_add_i32 s1, s1, s0 7566; GFX90A-NEXT: s_lshr_b32 s0, s1, 20 7567; GFX90A-NEXT: s_mul_i32 s0, s0, 0x12d8fb 7568; GFX90A-NEXT: s_sub_i32 s0, s4, s0 7569; GFX90A-NEXT: v_mov_b32_e32 v1, s0 7570; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] 7571; GFX90A-NEXT: s_endpgm 7572 %r = urem i32 %x, 1235195 7573 store i32 %r, i32 addrspace(1)* %out 7574 ret void 7575} 7576 7577define amdgpu_kernel void @urem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 7578; CHECK-LABEL: @urem_i32_pow2k_denom( 7579; CHECK-NEXT: [[R:%.*]] = urem i32 [[X:%.*]], 4096 7580; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 7581; CHECK-NEXT: ret void 7582; 7583; GFX6-LABEL: urem_i32_pow2k_denom: 7584; GFX6: ; %bb.0: 7585; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 7586; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 7587; GFX6-NEXT: s_mov_b32 s7, 0xf000 7588; GFX6-NEXT: s_mov_b32 s6, -1 7589; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7590; GFX6-NEXT: s_and_b32 s0, s0, 0xfff 7591; GFX6-NEXT: v_mov_b32_e32 v0, s0 7592; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 7593; GFX6-NEXT: s_endpgm 7594; 7595; GFX9-LABEL: urem_i32_pow2k_denom: 7596; GFX9: ; %bb.0: 7597; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7598; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 7599; GFX9-NEXT: v_mov_b32_e32 v0, 0 7600; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7601; GFX9-NEXT: s_and_b32 s0, s4, 0xfff 7602; GFX9-NEXT: v_mov_b32_e32 v1, s0 7603; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 7604; GFX9-NEXT: s_endpgm 7605; 7606; GFX90A-LABEL: urem_i32_pow2k_denom: 7607; GFX90A: ; %bb.0: 7608; GFX90A-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x24 7609; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 7610; GFX90A-NEXT: v_mov_b32_e32 v0, 0 7611; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 7612; GFX90A-NEXT: s_and_b32 s0, s4, 0xfff 7613; GFX90A-NEXT: v_mov_b32_e32 v1, s0 7614; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] 7615; GFX90A-NEXT: s_endpgm 7616 %r = urem i32 %x, 4096 7617 store i32 %r, i32 addrspace(1)* %out 7618 ret void 7619} 7620 7621define amdgpu_kernel void @urem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 7622; CHECK-LABEL: @urem_i32_pow2_shl_denom( 7623; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 7624; CHECK-NEXT: [[R:%.*]] = urem i32 [[X:%.*]], [[SHL_Y]] 7625; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 7626; CHECK-NEXT: ret void 7627; 7628; GFX6-LABEL: urem_i32_pow2_shl_denom: 7629; GFX6: ; %bb.0: 7630; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 7631; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 7632; GFX6-NEXT: s_mov_b32 s7, 0xf000 7633; GFX6-NEXT: s_mov_b32 s6, -1 7634; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7635; GFX6-NEXT: s_lshl_b32 s1, 0x1000, s1 7636; GFX6-NEXT: s_add_i32 s1, s1, -1 7637; GFX6-NEXT: s_and_b32 s0, s0, s1 7638; GFX6-NEXT: v_mov_b32_e32 v0, s0 7639; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 7640; GFX6-NEXT: s_endpgm 7641; 7642; GFX9-LABEL: urem_i32_pow2_shl_denom: 7643; GFX9: ; %bb.0: 7644; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7645; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 7646; GFX9-NEXT: v_mov_b32_e32 v0, 0 7647; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7648; GFX9-NEXT: s_lshl_b32 s0, 0x1000, s5 7649; GFX9-NEXT: s_add_i32 s0, s0, -1 7650; GFX9-NEXT: s_and_b32 s0, s4, s0 7651; GFX9-NEXT: v_mov_b32_e32 v1, s0 7652; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 7653; GFX9-NEXT: s_endpgm 7654; 7655; GFX90A-LABEL: urem_i32_pow2_shl_denom: 7656; GFX90A: ; %bb.0: 7657; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7658; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 7659; GFX90A-NEXT: 
v_mov_b32_e32 v0, 0 7660; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 7661; GFX90A-NEXT: s_lshl_b32 s0, 0x1000, s5 7662; GFX90A-NEXT: s_add_i32 s0, s0, -1 7663; GFX90A-NEXT: s_and_b32 s0, s4, s0 7664; GFX90A-NEXT: v_mov_b32_e32 v1, s0 7665; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] 7666; GFX90A-NEXT: s_endpgm 7667 %shl.y = shl i32 4096, %y 7668 %r = urem i32 %x, %shl.y 7669 store i32 %r, i32 addrspace(1)* %out 7670 ret void 7671} 7672 7673define amdgpu_kernel void @urem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 7674; CHECK-LABEL: @urem_v2i32_pow2k_denom( 7675; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 7676; CHECK-NEXT: [[TMP2:%.*]] = urem i32 [[TMP1]], 4096 7677; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 7678; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 7679; CHECK-NEXT: [[TMP5:%.*]] = urem i32 [[TMP4]], 4096 7680; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 7681; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 7682; CHECK-NEXT: ret void 7683; 7684; GFX6-LABEL: urem_v2i32_pow2k_denom: 7685; GFX6: ; %bb.0: 7686; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 7687; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 7688; GFX6-NEXT: s_movk_i32 s2, 0xfff 7689; GFX6-NEXT: s_mov_b32 s7, 0xf000 7690; GFX6-NEXT: s_mov_b32 s6, -1 7691; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7692; GFX6-NEXT: s_and_b32 s0, s0, s2 7693; GFX6-NEXT: s_and_b32 s1, s1, s2 7694; GFX6-NEXT: v_mov_b32_e32 v0, s0 7695; GFX6-NEXT: v_mov_b32_e32 v1, s1 7696; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 7697; GFX6-NEXT: s_endpgm 7698; 7699; GFX9-LABEL: urem_v2i32_pow2k_denom: 7700; GFX9: ; %bb.0: 7701; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7702; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 7703; GFX9-NEXT: s_movk_i32 s0, 0xfff 7704; GFX9-NEXT: v_mov_b32_e32 v2, 0 7705; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7706; GFX9-NEXT: s_and_b32 
s1, s4, s0 7707; GFX9-NEXT: s_and_b32 s0, s5, s0 7708; GFX9-NEXT: v_mov_b32_e32 v0, s1 7709; GFX9-NEXT: v_mov_b32_e32 v1, s0 7710; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 7711; GFX9-NEXT: s_endpgm 7712; 7713; GFX90A-LABEL: urem_v2i32_pow2k_denom: 7714; GFX90A: ; %bb.0: 7715; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7716; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 7717; GFX90A-NEXT: s_movk_i32 s0, 0xfff 7718; GFX90A-NEXT: v_mov_b32_e32 v2, 0 7719; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 7720; GFX90A-NEXT: s_and_b32 s1, s4, s0 7721; GFX90A-NEXT: s_and_b32 s0, s5, s0 7722; GFX90A-NEXT: v_mov_b32_e32 v0, s1 7723; GFX90A-NEXT: v_mov_b32_e32 v1, s0 7724; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 7725; GFX90A-NEXT: s_endpgm 7726 %r = urem <2 x i32> %x, <i32 4096, i32 4096> 7727 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 7728 ret void 7729} 7730 7731define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 7732; CHECK-LABEL: @urem_v2i32_pow2_shl_denom( 7733; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 7734; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 7735; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 7736; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 7737; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 7738; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 7739; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 7740; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 7741; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 7742; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 7743; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 7744; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 7745; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 7746; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 7747; CHECK-NEXT: 
[[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 7748; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 7749; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 7750; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 7751; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 7752; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 7753; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 7754; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 7755; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 7756; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 7757; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 7758; CHECK-NEXT: [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]] 7759; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]] 7760; CHECK-NEXT: [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]] 7761; CHECK-NEXT: [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]] 7762; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]] 7763; CHECK-NEXT: [[TMP30:%.*]] = insertelement <2 x i32> undef, i32 [[TMP29]], i64 0 7764; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x i32> [[X]], i64 1 7765; CHECK-NEXT: [[TMP32:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 7766; CHECK-NEXT: [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float 7767; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 7768; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000 7769; CHECK-NEXT: [[TMP36:%.*]] = fptoui float [[TMP35]] to i32 7770; CHECK-NEXT: [[TMP37:%.*]] = sub i32 0, [[TMP32]] 7771; CHECK-NEXT: [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]] 7772; CHECK-NEXT: [[TMP39:%.*]] = zext i32 [[TMP36]] to i64 7773; CHECK-NEXT: [[TMP40:%.*]] = zext i32 [[TMP38]] to i64 7774; CHECK-NEXT: [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]] 7775; CHECK-NEXT: [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32 7776; CHECK-NEXT: [[TMP43:%.*]] = lshr i64 [[TMP41]], 32 7777; CHECK-NEXT: [[TMP44:%.*]] = 
trunc i64 [[TMP43]] to i32 7778; CHECK-NEXT: [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]] 7779; CHECK-NEXT: [[TMP46:%.*]] = zext i32 [[TMP31]] to i64 7780; CHECK-NEXT: [[TMP47:%.*]] = zext i32 [[TMP45]] to i64 7781; CHECK-NEXT: [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]] 7782; CHECK-NEXT: [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32 7783; CHECK-NEXT: [[TMP50:%.*]] = lshr i64 [[TMP48]], 32 7784; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 7785; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]] 7786; CHECK-NEXT: [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]] 7787; CHECK-NEXT: [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]] 7788; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]] 7789; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]] 7790; CHECK-NEXT: [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]] 7791; CHECK-NEXT: [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]] 7792; CHECK-NEXT: [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]] 7793; CHECK-NEXT: [[TMP60:%.*]] = insertelement <2 x i32> [[TMP30]], i32 [[TMP59]], i64 1 7794; CHECK-NEXT: store <2 x i32> [[TMP60]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 7795; CHECK-NEXT: ret void 7796; 7797; GFX6-LABEL: urem_v2i32_pow2_shl_denom: 7798; GFX6: ; %bb.0: 7799; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 7800; GFX6-NEXT: s_movk_i32 s4, 0x1000 7801; GFX6-NEXT: s_mov_b32 s5, 0x4f7ffffe 7802; GFX6-NEXT: s_mov_b32 s7, 0xf000 7803; GFX6-NEXT: s_mov_b32 s6, -1 7804; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7805; GFX6-NEXT: s_lshl_b32 s2, s4, s2 7806; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 7807; GFX6-NEXT: s_lshl_b32 s3, s4, s3 7808; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 7809; GFX6-NEXT: s_sub_i32 s4, 0, s2 7810; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 7811; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 7812; GFX6-NEXT: v_mul_f32_e32 v0, s5, v0 7813; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 7814; GFX6-NEXT: v_mul_f32_e32 v1, s5, v1 7815; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 7816; 
GFX6-NEXT: v_mul_lo_u32 v2, s4, v0 7817; GFX6-NEXT: s_sub_i32 s4, 0, s3 7818; GFX6-NEXT: v_mul_lo_u32 v3, s4, v1 7819; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 7820; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 7821; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 7822; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 7823; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 7824; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7825; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 7826; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 7827; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 7828; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 7829; GFX6-NEXT: v_mul_lo_u32 v1, v1, s3 7830; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 7831; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s2, v0 7832; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 7833; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 7834; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s2, v0 7835; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 7836; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 7837; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 7838; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 7839; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 7840; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 7841; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 7842; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 7843; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 7844; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 7845; GFX6-NEXT: s_endpgm 7846; 7847; GFX9-LABEL: urem_v2i32_pow2_shl_denom: 7848; GFX9: ; %bb.0: 7849; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 7850; GFX9-NEXT: s_movk_i32 s4, 0x1000 7851; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7852; GFX9-NEXT: s_lshl_b32 s5, s4, s3 7853; GFX9-NEXT: s_lshl_b32 s4, s4, s2 7854; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 7855; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 7856; GFX9-NEXT: s_mov_b32 s2, 0x4f7ffffe 7857; GFX9-NEXT: s_sub_i32 s3, 0, s5 7858; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 7859; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 7860; GFX9-NEXT: v_mul_f32_e32 v0, s2, v0 7861; GFX9-NEXT: v_mul_f32_e32 v1, s2, v1 7862; GFX9-NEXT: 
v_cvt_u32_f32_e32 v0, v0 7863; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 7864; GFX9-NEXT: s_sub_i32 s2, 0, s4 7865; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 7866; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1 7867; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 7868; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 7869; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 7870; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 7871; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 7872; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 7873; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7874; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 7875; GFX9-NEXT: v_mul_hi_u32 v1, s3, v1 7876; GFX9-NEXT: v_mov_b32_e32 v2, 0 7877; GFX9-NEXT: v_mul_lo_u32 v0, v0, s4 7878; GFX9-NEXT: v_mul_lo_u32 v1, v1, s5 7879; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 7880; GFX9-NEXT: v_sub_u32_e32 v1, s3, v1 7881; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v0 7882; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 7883; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v1 7884; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 7885; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 7886; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 7887; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v0 7888; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 7889; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v1 7890; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 7891; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 7892; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 7893; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 7894; GFX9-NEXT: s_endpgm 7895; 7896; GFX90A-LABEL: urem_v2i32_pow2_shl_denom: 7897; GFX90A: ; %bb.0: 7898; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 7899; GFX90A-NEXT: s_movk_i32 s8, 0x1000 7900; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 7901; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c 7902; GFX90A-NEXT: v_mov_b32_e32 v2, 0 7903; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 7904; GFX90A-NEXT: s_lshl_b32 s2, s8, s2 7905; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s2 7906; GFX90A-NEXT: s_lshl_b32 s0, s8, s3 7907; GFX90A-NEXT: s_mov_b32 s3, 0x4f7ffffe 7908; GFX90A-NEXT: v_cvt_f32_u32_e32 
v1, s0 7909; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 7910; GFX90A-NEXT: s_sub_i32 s1, 0, s2 7911; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v1 7912; GFX90A-NEXT: v_mul_f32_e32 v0, s3, v0 7913; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 7914; GFX90A-NEXT: v_mul_f32_e32 v1, s3, v1 7915; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 7916; GFX90A-NEXT: v_mul_lo_u32 v3, s1, v0 7917; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v3 7918; GFX90A-NEXT: v_add_u32_e32 v0, v0, v3 7919; GFX90A-NEXT: v_mul_hi_u32 v0, s6, v0 7920; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s2 7921; GFX90A-NEXT: v_sub_u32_e32 v0, s6, v0 7922; GFX90A-NEXT: v_subrev_u32_e32 v3, s2, v0 7923; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 7924; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 7925; GFX90A-NEXT: v_subrev_u32_e32 v3, s2, v0 7926; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 7927; GFX90A-NEXT: s_sub_i32 s1, 0, s0 7928; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 7929; GFX90A-NEXT: v_mul_lo_u32 v3, s1, v1 7930; GFX90A-NEXT: v_mul_hi_u32 v3, v1, v3 7931; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 7932; GFX90A-NEXT: v_mul_hi_u32 v1, s7, v1 7933; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s0 7934; GFX90A-NEXT: v_sub_u32_e32 v1, s7, v1 7935; GFX90A-NEXT: v_subrev_u32_e32 v3, s0, v1 7936; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v1 7937; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 7938; GFX90A-NEXT: v_subrev_u32_e32 v3, s0, v1 7939; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v1 7940; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 7941; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 7942; GFX90A-NEXT: s_endpgm 7943 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 7944 %r = urem <2 x i32> %x, %shl.y 7945 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 7946 ret void 7947} 7948 7949define amdgpu_kernel void @sdiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 7950; CHECK-LABEL: @sdiv_i32_oddk_denom( 7951; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], 1235195 7952; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 7953; CHECK-NEXT: ret 
void 7954; 7955; GFX6-LABEL: sdiv_i32_oddk_denom: 7956; GFX6: ; %bb.0: 7957; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 7958; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 7959; GFX6-NEXT: v_mov_b32_e32 v0, 0xd9528441 7960; GFX6-NEXT: s_mov_b32 s7, 0xf000 7961; GFX6-NEXT: s_mov_b32 s6, -1 7962; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7963; GFX6-NEXT: v_mul_hi_i32 v0, s0, v0 7964; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 7965; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0 7966; GFX6-NEXT: v_ashrrev_i32_e32 v0, 20, v0 7967; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 7968; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 7969; GFX6-NEXT: s_endpgm 7970; 7971; GFX9-LABEL: sdiv_i32_oddk_denom: 7972; GFX9: ; %bb.0: 7973; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7974; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 7975; GFX9-NEXT: v_mov_b32_e32 v0, 0 7976; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7977; GFX9-NEXT: s_mul_hi_i32 s0, s4, 0xd9528441 7978; GFX9-NEXT: s_add_i32 s0, s0, s4 7979; GFX9-NEXT: s_lshr_b32 s1, s0, 31 7980; GFX9-NEXT: s_ashr_i32 s0, s0, 20 7981; GFX9-NEXT: s_add_i32 s0, s0, s1 7982; GFX9-NEXT: v_mov_b32_e32 v1, s0 7983; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 7984; GFX9-NEXT: s_endpgm 7985; 7986; GFX90A-LABEL: sdiv_i32_oddk_denom: 7987; GFX90A: ; %bb.0: 7988; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7989; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 7990; GFX90A-NEXT: v_mov_b32_e32 v0, 0 7991; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 7992; GFX90A-NEXT: s_mul_hi_i32 s0, s4, 0xd9528441 7993; GFX90A-NEXT: s_add_i32 s0, s0, s4 7994; GFX90A-NEXT: s_lshr_b32 s1, s0, 31 7995; GFX90A-NEXT: s_ashr_i32 s0, s0, 20 7996; GFX90A-NEXT: s_add_i32 s0, s0, s1 7997; GFX90A-NEXT: v_mov_b32_e32 v1, s0 7998; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] 7999; GFX90A-NEXT: s_endpgm 8000 %r = sdiv i32 %x, 1235195 8001 store i32 %r, i32 addrspace(1)* %out 8002 ret void 8003} 8004 8005define amdgpu_kernel void @sdiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 8006; CHECK-LABEL: 
@sdiv_i32_pow2k_denom( 8007; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], 4096 8008; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 8009; CHECK-NEXT: ret void 8010; 8011; GFX6-LABEL: sdiv_i32_pow2k_denom: 8012; GFX6: ; %bb.0: 8013; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 8014; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 8015; GFX6-NEXT: s_mov_b32 s7, 0xf000 8016; GFX6-NEXT: s_mov_b32 s6, -1 8017; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8018; GFX6-NEXT: s_ashr_i32 s1, s0, 31 8019; GFX6-NEXT: s_lshr_b32 s1, s1, 20 8020; GFX6-NEXT: s_add_i32 s0, s0, s1 8021; GFX6-NEXT: s_ashr_i32 s0, s0, 12 8022; GFX6-NEXT: v_mov_b32_e32 v0, s0 8023; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 8024; GFX6-NEXT: s_endpgm 8025; 8026; GFX9-LABEL: sdiv_i32_pow2k_denom: 8027; GFX9: ; %bb.0: 8028; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8029; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 8030; GFX9-NEXT: v_mov_b32_e32 v0, 0 8031; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8032; GFX9-NEXT: s_ashr_i32 s0, s4, 31 8033; GFX9-NEXT: s_lshr_b32 s0, s0, 20 8034; GFX9-NEXT: s_add_i32 s4, s4, s0 8035; GFX9-NEXT: s_ashr_i32 s0, s4, 12 8036; GFX9-NEXT: v_mov_b32_e32 v1, s0 8037; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 8038; GFX9-NEXT: s_endpgm 8039; 8040; GFX90A-LABEL: sdiv_i32_pow2k_denom: 8041; GFX90A: ; %bb.0: 8042; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8043; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 8044; GFX90A-NEXT: v_mov_b32_e32 v0, 0 8045; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 8046; GFX90A-NEXT: s_ashr_i32 s0, s4, 31 8047; GFX90A-NEXT: s_lshr_b32 s0, s0, 20 8048; GFX90A-NEXT: s_add_i32 s4, s4, s0 8049; GFX90A-NEXT: s_ashr_i32 s0, s4, 12 8050; GFX90A-NEXT: v_mov_b32_e32 v1, s0 8051; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] 8052; GFX90A-NEXT: s_endpgm 8053 %r = sdiv i32 %x, 4096 8054 store i32 %r, i32 addrspace(1)* %out 8055 ret void 8056} 8057 8058define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 8059; CHECK-LABEL: 
@sdiv_i32_pow2_shl_denom( 8060; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 8061; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], [[SHL_Y]] 8062; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 8063; CHECK-NEXT: ret void 8064; 8065; GFX6-LABEL: sdiv_i32_pow2_shl_denom: 8066; GFX6: ; %bb.0: 8067; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 8068; GFX6-NEXT: s_mov_b32 s7, 0xf000 8069; GFX6-NEXT: s_mov_b32 s6, -1 8070; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8071; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 8072; GFX6-NEXT: s_ashr_i32 s8, s3, 31 8073; GFX6-NEXT: s_add_i32 s3, s3, s8 8074; GFX6-NEXT: s_xor_b32 s3, s3, s8 8075; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 8076; GFX6-NEXT: s_sub_i32 s4, 0, s3 8077; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 8078; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 8079; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 8080; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 8081; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 8082; GFX6-NEXT: s_ashr_i32 s0, s2, 31 8083; GFX6-NEXT: s_add_i32 s1, s2, s0 8084; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 8085; GFX6-NEXT: s_xor_b32 s1, s1, s0 8086; GFX6-NEXT: s_xor_b32 s2, s0, s8 8087; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 8088; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 8089; GFX6-NEXT: v_mul_lo_u32 v1, v0, s3 8090; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 8091; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 8092; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1 8093; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 8094; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 8095; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 8096; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] 8097; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 8098; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 8099; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 8100; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 8101; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8102; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 8103; GFX6-NEXT: s_endpgm 8104; 8105; GFX9-LABEL: sdiv_i32_pow2_shl_denom: 8106; GFX9: ; %bb.0: 8107; 
GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 8108; GFX9-NEXT: v_mov_b32_e32 v2, 0 8109; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 8110; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8111; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 8112; GFX9-NEXT: s_ashr_i32 s4, s3, 31 8113; GFX9-NEXT: s_add_i32 s3, s3, s4 8114; GFX9-NEXT: s_xor_b32 s3, s3, s4 8115; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 8116; GFX9-NEXT: s_sub_i32 s5, 0, s3 8117; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 8118; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 8119; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 8120; GFX9-NEXT: v_mul_lo_u32 v1, s5, v0 8121; GFX9-NEXT: s_ashr_i32 s5, s2, 31 8122; GFX9-NEXT: s_add_i32 s2, s2, s5 8123; GFX9-NEXT: s_xor_b32 s2, s2, s5 8124; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 8125; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 8126; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 8127; GFX9-NEXT: v_mul_lo_u32 v1, v0, s3 8128; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 8129; GFX9-NEXT: v_sub_u32_e32 v1, s2, v1 8130; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 8131; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 8132; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v1 8133; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 8134; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 8135; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 8136; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 8137; GFX9-NEXT: s_xor_b32 s2, s5, s4 8138; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 8139; GFX9-NEXT: v_subrev_u32_e32 v0, s2, v0 8140; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 8141; GFX9-NEXT: s_endpgm 8142; 8143; GFX90A-LABEL: sdiv_i32_pow2_shl_denom: 8144; GFX90A: ; %bb.0: 8145; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 8146; GFX90A-NEXT: v_mov_b32_e32 v1, 0 8147; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 8148; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 8149; GFX90A-NEXT: s_lshl_b32 s3, 0x1000, s3 8150; GFX90A-NEXT: s_ashr_i32 s4, s3, 31 8151; GFX90A-NEXT: s_add_i32 s3, s3, s4 8152; GFX90A-NEXT: s_xor_b32 s3, s3, s4 8153; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 8154; GFX90A-NEXT: s_sub_i32 s6, 0, s3 
8155; GFX90A-NEXT: s_ashr_i32 s5, s2, 31 8156; GFX90A-NEXT: s_add_i32 s2, s2, s5 8157; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 8158; GFX90A-NEXT: s_xor_b32 s2, s2, s5 8159; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 8160; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 8161; GFX90A-NEXT: v_mul_lo_u32 v2, s6, v0 8162; GFX90A-NEXT: v_mul_hi_u32 v2, v0, v2 8163; GFX90A-NEXT: v_add_u32_e32 v0, v0, v2 8164; GFX90A-NEXT: v_mul_hi_u32 v0, s2, v0 8165; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s3 8166; GFX90A-NEXT: v_sub_u32_e32 v3, s2, v3 8167; GFX90A-NEXT: v_add_u32_e32 v2, 1, v0 8168; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 8169; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 8170; GFX90A-NEXT: v_subrev_u32_e32 v2, s3, v3 8171; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc 8172; GFX90A-NEXT: v_add_u32_e32 v4, 1, v0 8173; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 8174; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 8175; GFX90A-NEXT: s_xor_b32 s2, s5, s4 8176; GFX90A-NEXT: v_xor_b32_e32 v0, s2, v0 8177; GFX90A-NEXT: v_subrev_u32_e32 v0, s2, v0 8178; GFX90A-NEXT: global_store_dword v1, v0, s[0:1] 8179; GFX90A-NEXT: s_endpgm 8180 %shl.y = shl i32 4096, %y 8181 %r = sdiv i32 %x, %shl.y 8182 store i32 %r, i32 addrspace(1)* %out 8183 ret void 8184} 8185 8186define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 8187; CHECK-LABEL: @sdiv_v2i32_pow2k_denom( 8188; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 8189; CHECK-NEXT: [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096 8190; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 8191; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 8192; CHECK-NEXT: [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4096 8193; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 8194; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 8195; CHECK-NEXT: ret void 8196; 8197; GFX6-LABEL: sdiv_v2i32_pow2k_denom: 8198; 
GFX6: ; %bb.0: 8199; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 8200; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 8201; GFX6-NEXT: s_mov_b32 s7, 0xf000 8202; GFX6-NEXT: s_mov_b32 s6, -1 8203; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8204; GFX6-NEXT: s_ashr_i32 s2, s0, 31 8205; GFX6-NEXT: s_ashr_i32 s3, s1, 31 8206; GFX6-NEXT: s_lshr_b32 s2, s2, 20 8207; GFX6-NEXT: s_add_i32 s0, s0, s2 8208; GFX6-NEXT: s_lshr_b32 s2, s3, 20 8209; GFX6-NEXT: s_add_i32 s1, s1, s2 8210; GFX6-NEXT: s_ashr_i32 s0, s0, 12 8211; GFX6-NEXT: s_ashr_i32 s1, s1, 12 8212; GFX6-NEXT: v_mov_b32_e32 v0, s0 8213; GFX6-NEXT: v_mov_b32_e32 v1, s1 8214; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 8215; GFX6-NEXT: s_endpgm 8216; 8217; GFX9-LABEL: sdiv_v2i32_pow2k_denom: 8218; GFX9: ; %bb.0: 8219; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8220; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 8221; GFX9-NEXT: v_mov_b32_e32 v2, 0 8222; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8223; GFX9-NEXT: s_ashr_i32 s0, s4, 31 8224; GFX9-NEXT: s_ashr_i32 s1, s5, 31 8225; GFX9-NEXT: s_lshr_b32 s0, s0, 20 8226; GFX9-NEXT: s_lshr_b32 s1, s1, 20 8227; GFX9-NEXT: s_add_i32 s0, s4, s0 8228; GFX9-NEXT: s_add_i32 s1, s5, s1 8229; GFX9-NEXT: s_ashr_i32 s0, s0, 12 8230; GFX9-NEXT: s_ashr_i32 s1, s1, 12 8231; GFX9-NEXT: v_mov_b32_e32 v0, s0 8232; GFX9-NEXT: v_mov_b32_e32 v1, s1 8233; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 8234; GFX9-NEXT: s_endpgm 8235; 8236; GFX90A-LABEL: sdiv_v2i32_pow2k_denom: 8237; GFX90A: ; %bb.0: 8238; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8239; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 8240; GFX90A-NEXT: v_mov_b32_e32 v2, 0 8241; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 8242; GFX90A-NEXT: s_ashr_i32 s0, s4, 31 8243; GFX90A-NEXT: s_ashr_i32 s1, s5, 31 8244; GFX90A-NEXT: s_lshr_b32 s0, s0, 20 8245; GFX90A-NEXT: s_lshr_b32 s1, s1, 20 8246; GFX90A-NEXT: s_add_i32 s0, s4, s0 8247; GFX90A-NEXT: s_add_i32 s1, s5, s1 8248; GFX90A-NEXT: s_ashr_i32 s0, s0, 12 8249; GFX90A-NEXT: s_ashr_i32 
s1, s1, 12 8250; GFX90A-NEXT: v_mov_b32_e32 v0, s0 8251; GFX90A-NEXT: v_mov_b32_e32 v1, s1 8252; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 8253; GFX90A-NEXT: s_endpgm 8254 %r = sdiv <2 x i32> %x, <i32 4096, i32 4096> 8255 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 8256 ret void 8257} 8258 8259define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 8260; CHECK-LABEL: @ssdiv_v2i32_mixed_pow2k_denom( 8261; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 8262; CHECK-NEXT: [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096 8263; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 8264; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 8265; CHECK-NEXT: [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4095 8266; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 8267; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 8268; CHECK-NEXT: ret void 8269; 8270; GFX6-LABEL: ssdiv_v2i32_mixed_pow2k_denom: 8271; GFX6: ; %bb.0: 8272; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 8273; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 8274; GFX6-NEXT: v_mov_b32_e32 v0, 0x80080081 8275; GFX6-NEXT: s_mov_b32 s7, 0xf000 8276; GFX6-NEXT: s_mov_b32 s6, -1 8277; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8278; GFX6-NEXT: v_mul_hi_i32 v0, s1, v0 8279; GFX6-NEXT: s_ashr_i32 s2, s0, 31 8280; GFX6-NEXT: s_lshr_b32 s2, s2, 20 8281; GFX6-NEXT: s_add_i32 s0, s0, s2 8282; GFX6-NEXT: v_add_i32_e32 v0, vcc, s1, v0 8283; GFX6-NEXT: s_ashr_i32 s0, s0, 12 8284; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0 8285; GFX6-NEXT: v_ashrrev_i32_e32 v0, 11, v0 8286; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v0 8287; GFX6-NEXT: v_mov_b32_e32 v0, s0 8288; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 8289; GFX6-NEXT: s_endpgm 8290; 8291; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom: 8292; GFX9: ; %bb.0: 8293; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8294; 
GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 8295; GFX9-NEXT: v_mov_b32_e32 v2, 0 8296; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8297; GFX9-NEXT: s_ashr_i32 s0, s4, 31 8298; GFX9-NEXT: s_mul_hi_i32 s1, s5, 0x80080081 8299; GFX9-NEXT: s_lshr_b32 s0, s0, 20 8300; GFX9-NEXT: s_add_i32 s1, s1, s5 8301; GFX9-NEXT: s_add_i32 s0, s4, s0 8302; GFX9-NEXT: s_lshr_b32 s4, s1, 31 8303; GFX9-NEXT: s_ashr_i32 s1, s1, 11 8304; GFX9-NEXT: s_ashr_i32 s0, s0, 12 8305; GFX9-NEXT: s_add_i32 s1, s1, s4 8306; GFX9-NEXT: v_mov_b32_e32 v0, s0 8307; GFX9-NEXT: v_mov_b32_e32 v1, s1 8308; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 8309; GFX9-NEXT: s_endpgm 8310; 8311; GFX90A-LABEL: ssdiv_v2i32_mixed_pow2k_denom: 8312; GFX90A: ; %bb.0: 8313; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8314; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 8315; GFX90A-NEXT: v_mov_b32_e32 v2, 0 8316; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 8317; GFX90A-NEXT: s_ashr_i32 s0, s4, 31 8318; GFX90A-NEXT: s_mul_hi_i32 s1, s5, 0x80080081 8319; GFX90A-NEXT: s_lshr_b32 s0, s0, 20 8320; GFX90A-NEXT: s_add_i32 s1, s1, s5 8321; GFX90A-NEXT: s_add_i32 s0, s4, s0 8322; GFX90A-NEXT: s_lshr_b32 s4, s1, 31 8323; GFX90A-NEXT: s_ashr_i32 s1, s1, 11 8324; GFX90A-NEXT: s_ashr_i32 s0, s0, 12 8325; GFX90A-NEXT: s_add_i32 s1, s1, s4 8326; GFX90A-NEXT: v_mov_b32_e32 v0, s0 8327; GFX90A-NEXT: v_mov_b32_e32 v1, s1 8328; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 8329; GFX90A-NEXT: s_endpgm 8330 %r = sdiv <2 x i32> %x, <i32 4096, i32 4095> 8331 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 8332 ret void 8333} 8334 8335define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 8336; CHECK-LABEL: @sdiv_v2i32_pow2_shl_denom( 8337; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 8338; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 8339; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 8340; CHECK-NEXT: 
[[TMP3:%.*]] = ashr i32 [[TMP1]], 31 8341; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 8342; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 8343; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]] 8344; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]] 8345; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]] 8346; CHECK-NEXT: [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]] 8347; CHECK-NEXT: [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float 8348; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]]) 8349; CHECK-NEXT: [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000 8350; CHECK-NEXT: [[TMP13:%.*]] = fptoui float [[TMP12]] to i32 8351; CHECK-NEXT: [[TMP14:%.*]] = sub i32 0, [[TMP9]] 8352; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]] 8353; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP13]] to i64 8354; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 8355; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 8356; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 8357; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 8358; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 8359; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]] 8360; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP8]] to i64 8361; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[TMP22]] to i64 8362; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]] 8363; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32 8364; CHECK-NEXT: [[TMP27:%.*]] = lshr i64 [[TMP25]], 32 8365; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 8366; CHECK-NEXT: [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]] 8367; CHECK-NEXT: [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]] 8368; CHECK-NEXT: [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]] 8369; CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP28]], 1 8370; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]] 8371; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]] 8372; CHECK-NEXT: 
[[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]] 8373; CHECK-NEXT: [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]] 8374; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP33]], 1 8375; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]] 8376; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]] 8377; CHECK-NEXT: [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]] 8378; CHECK-NEXT: [[TMP41:%.*]] = insertelement <2 x i32> undef, i32 [[TMP40]], i64 0 8379; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x i32> [[X]], i64 1 8380; CHECK-NEXT: [[TMP43:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 8381; CHECK-NEXT: [[TMP44:%.*]] = ashr i32 [[TMP42]], 31 8382; CHECK-NEXT: [[TMP45:%.*]] = ashr i32 [[TMP43]], 31 8383; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]] 8384; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]] 8385; CHECK-NEXT: [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]] 8386; CHECK-NEXT: [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]] 8387; CHECK-NEXT: [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]] 8388; CHECK-NEXT: [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float 8389; CHECK-NEXT: [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]]) 8390; CHECK-NEXT: [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000 8391; CHECK-NEXT: [[TMP54:%.*]] = fptoui float [[TMP53]] to i32 8392; CHECK-NEXT: [[TMP55:%.*]] = sub i32 0, [[TMP50]] 8393; CHECK-NEXT: [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]] 8394; CHECK-NEXT: [[TMP57:%.*]] = zext i32 [[TMP54]] to i64 8395; CHECK-NEXT: [[TMP58:%.*]] = zext i32 [[TMP56]] to i64 8396; CHECK-NEXT: [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]] 8397; CHECK-NEXT: [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32 8398; CHECK-NEXT: [[TMP61:%.*]] = lshr i64 [[TMP59]], 32 8399; CHECK-NEXT: [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32 8400; CHECK-NEXT: [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]] 8401; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[TMP49]] to i64 8402; CHECK-NEXT: [[TMP65:%.*]] = zext 
i32 [[TMP63]] to i64 8403; CHECK-NEXT: [[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]] 8404; CHECK-NEXT: [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32 8405; CHECK-NEXT: [[TMP68:%.*]] = lshr i64 [[TMP66]], 32 8406; CHECK-NEXT: [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32 8407; CHECK-NEXT: [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]] 8408; CHECK-NEXT: [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]] 8409; CHECK-NEXT: [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]] 8410; CHECK-NEXT: [[TMP73:%.*]] = add i32 [[TMP69]], 1 8411; CHECK-NEXT: [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]] 8412; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]] 8413; CHECK-NEXT: [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]] 8414; CHECK-NEXT: [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]] 8415; CHECK-NEXT: [[TMP78:%.*]] = add i32 [[TMP74]], 1 8416; CHECK-NEXT: [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]] 8417; CHECK-NEXT: [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]] 8418; CHECK-NEXT: [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]] 8419; CHECK-NEXT: [[TMP82:%.*]] = insertelement <2 x i32> [[TMP41]], i32 [[TMP81]], i64 1 8420; CHECK-NEXT: store <2 x i32> [[TMP82]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 8421; CHECK-NEXT: ret void 8422; 8423; GFX6-LABEL: sdiv_v2i32_pow2_shl_denom: 8424; GFX6: ; %bb.0: 8425; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 8426; GFX6-NEXT: s_movk_i32 s10, 0x1000 8427; GFX6-NEXT: s_mov_b32 s12, 0x4f7ffffe 8428; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 8429; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb 8430; GFX6-NEXT: s_mov_b32 s7, 0xf000 8431; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8432; GFX6-NEXT: s_lshl_b32 s2, s10, s2 8433; GFX6-NEXT: s_ashr_i32 s11, s2, 31 8434; GFX6-NEXT: s_add_i32 s2, s2, s11 8435; GFX6-NEXT: s_xor_b32 s2, s2, s11 8436; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 8437; GFX6-NEXT: s_lshl_b32 s0, s10, s3 8438; GFX6-NEXT: s_sub_i32 s10, 0, s2 8439; GFX6-NEXT: s_ashr_i32 s3, s0, 31 8440; 
GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 8441; GFX6-NEXT: s_add_i32 s0, s0, s3 8442; GFX6-NEXT: s_ashr_i32 s1, s8, 31 8443; GFX6-NEXT: s_mov_b32 s6, -1 8444; GFX6-NEXT: v_mul_f32_e32 v0, s12, v0 8445; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 8446; GFX6-NEXT: v_mul_lo_u32 v1, s10, v0 8447; GFX6-NEXT: s_xor_b32 s10, s0, s3 8448; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s10 8449; GFX6-NEXT: s_add_i32 s0, s8, s1 8450; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 8451; GFX6-NEXT: s_xor_b32 s0, s0, s1 8452; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 8453; GFX6-NEXT: s_xor_b32 s8, s1, s11 8454; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 8455; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 8456; GFX6-NEXT: v_mul_f32_e32 v1, s12, v2 8457; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 8458; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 8459; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 8460; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s0, v2 8461; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 8462; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] 8463; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v2 8464; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 8465; GFX6-NEXT: s_sub_i32 s0, 0, s10 8466; GFX6-NEXT: v_mul_lo_u32 v3, s0, v1 8467; GFX6-NEXT: s_ashr_i32 s0, s9, 31 8468; GFX6-NEXT: s_add_i32 s1, s9, s0 8469; GFX6-NEXT: s_xor_b32 s1, s1, s0 8470; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 8471; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v0 8472; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 8473; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 8474; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 8475; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 8476; GFX6-NEXT: s_xor_b32 s2, s0, s3 8477; GFX6-NEXT: v_mul_lo_u32 v2, v1, s10 8478; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 8479; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 8480; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 8481; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2 8482; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 8483; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s10, v2 8484; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 8485; GFX6-NEXT: v_cndmask_b32_e64 v2, 
v2, v3, s[0:1] 8486; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 8487; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 8488; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 8489; GFX6-NEXT: v_xor_b32_e32 v1, s2, v1 8490; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s2, v1 8491; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 8492; GFX6-NEXT: s_endpgm 8493; 8494; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom: 8495; GFX9: ; %bb.0: 8496; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 8497; GFX9-NEXT: s_movk_i32 s8, 0x1000 8498; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 8499; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c 8500; GFX9-NEXT: s_mov_b32 s10, 0x4f7ffffe 8501; GFX9-NEXT: v_mov_b32_e32 v2, 0 8502; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8503; GFX9-NEXT: s_lshl_b32 s2, s8, s2 8504; GFX9-NEXT: s_ashr_i32 s9, s2, 31 8505; GFX9-NEXT: s_add_i32 s2, s2, s9 8506; GFX9-NEXT: s_xor_b32 s2, s2, s9 8507; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 8508; GFX9-NEXT: s_lshl_b32 s0, s8, s3 8509; GFX9-NEXT: s_ashr_i32 s1, s0, 31 8510; GFX9-NEXT: s_add_i32 s0, s0, s1 8511; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 8512; GFX9-NEXT: s_xor_b32 s0, s0, s1 8513; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s0 8514; GFX9-NEXT: s_sub_i32 s3, 0, s2 8515; GFX9-NEXT: v_mul_f32_e32 v0, s10, v0 8516; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 8517; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 8518; GFX9-NEXT: s_sub_i32 s8, 0, s0 8519; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 8520; GFX9-NEXT: v_mul_f32_e32 v1, s10, v1 8521; GFX9-NEXT: s_ashr_i32 s3, s6, 31 8522; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 8523; GFX9-NEXT: v_mul_hi_u32 v3, v0, v3 8524; GFX9-NEXT: s_add_i32 s6, s6, s3 8525; GFX9-NEXT: s_xor_b32 s6, s6, s3 8526; GFX9-NEXT: s_xor_b32 s3, s3, s9 8527; GFX9-NEXT: v_add_u32_e32 v0, v0, v3 8528; GFX9-NEXT: v_mul_hi_u32 v0, s6, v0 8529; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1 8530; GFX9-NEXT: s_ashr_i32 s8, s7, 31 8531; GFX9-NEXT: s_xor_b32 s1, s8, s1 8532; GFX9-NEXT: v_mul_lo_u32 v4, v0, s2 8533; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 8534; GFX9-NEXT: v_add_u32_e32 
v5, 1, v0 8535; GFX9-NEXT: v_sub_u32_e32 v4, s6, v4 8536; GFX9-NEXT: s_add_i32 s6, s7, s8 8537; GFX9-NEXT: s_xor_b32 s6, s6, s8 8538; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 8539; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v4 8540; GFX9-NEXT: v_mul_hi_u32 v1, s6, v1 8541; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc 8542; GFX9-NEXT: v_subrev_u32_e32 v5, s2, v4 8543; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc 8544; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 8545; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v4 8546; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 8547; GFX9-NEXT: v_mul_lo_u32 v3, v1, s0 8548; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 8549; GFX9-NEXT: v_xor_b32_e32 v0, s3, v0 8550; GFX9-NEXT: v_subrev_u32_e32 v0, s3, v0 8551; GFX9-NEXT: v_sub_u32_e32 v3, s6, v3 8552; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v3 8553; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 8554; GFX9-NEXT: v_subrev_u32_e32 v4, s0, v3 8555; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 8556; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 8557; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v3 8558; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 8559; GFX9-NEXT: v_xor_b32_e32 v1, s1, v1 8560; GFX9-NEXT: v_subrev_u32_e32 v1, s1, v1 8561; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 8562; GFX9-NEXT: s_endpgm 8563; 8564; GFX90A-LABEL: sdiv_v2i32_pow2_shl_denom: 8565; GFX90A: ; %bb.0: 8566; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 8567; GFX90A-NEXT: s_movk_i32 s8, 0x1000 8568; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 8569; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c 8570; GFX90A-NEXT: s_mov_b32 s10, 0x4f7ffffe 8571; GFX90A-NEXT: v_mov_b32_e32 v2, 0 8572; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 8573; GFX90A-NEXT: s_lshl_b32 s2, s8, s2 8574; GFX90A-NEXT: s_ashr_i32 s9, s2, 31 8575; GFX90A-NEXT: s_add_i32 s2, s2, s9 8576; GFX90A-NEXT: s_xor_b32 s2, s2, s9 8577; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s2 8578; GFX90A-NEXT: s_ashr_i32 s1, s6, 31 8579; GFX90A-NEXT: s_lshl_b32 s0, s8, s3 8580; GFX90A-NEXT: s_add_i32 s3, s6, s1 8581; 
GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 8582; GFX90A-NEXT: s_xor_b32 s6, s1, s9 8583; GFX90A-NEXT: s_xor_b32 s1, s3, s1 8584; GFX90A-NEXT: s_sub_i32 s3, 0, s2 8585; GFX90A-NEXT: v_mul_f32_e32 v0, s10, v0 8586; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 8587; GFX90A-NEXT: v_mul_lo_u32 v1, s3, v0 8588; GFX90A-NEXT: v_mul_hi_u32 v1, v0, v1 8589; GFX90A-NEXT: v_add_u32_e32 v0, v0, v1 8590; GFX90A-NEXT: v_mul_hi_u32 v0, s1, v0 8591; GFX90A-NEXT: v_mul_lo_u32 v1, v0, s2 8592; GFX90A-NEXT: v_sub_u32_e32 v1, s1, v1 8593; GFX90A-NEXT: s_ashr_i32 s1, s0, 31 8594; GFX90A-NEXT: s_add_i32 s0, s0, s1 8595; GFX90A-NEXT: s_xor_b32 s0, s0, s1 8596; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s0 8597; GFX90A-NEXT: v_add_u32_e32 v3, 1, v0 8598; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 8599; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 8600; GFX90A-NEXT: v_subrev_u32_e32 v3, s2, v1 8601; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 8602; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 8603; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v4 8604; GFX90A-NEXT: s_ashr_i32 s2, s7, 31 8605; GFX90A-NEXT: s_add_i32 s3, s7, s2 8606; GFX90A-NEXT: v_add_u32_e32 v3, 1, v0 8607; GFX90A-NEXT: v_mul_f32_e32 v1, s10, v1 8608; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 8609; GFX90A-NEXT: s_xor_b32 s1, s2, s1 8610; GFX90A-NEXT: s_xor_b32 s2, s3, s2 8611; GFX90A-NEXT: s_sub_i32 s3, 0, s0 8612; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 8613; GFX90A-NEXT: v_mul_lo_u32 v3, s3, v1 8614; GFX90A-NEXT: v_mul_hi_u32 v3, v1, v3 8615; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 8616; GFX90A-NEXT: v_mul_hi_u32 v1, s2, v1 8617; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s0 8618; GFX90A-NEXT: v_sub_u32_e32 v3, s2, v3 8619; GFX90A-NEXT: v_add_u32_e32 v4, 1, v1 8620; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v3 8621; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 8622; GFX90A-NEXT: v_subrev_u32_e32 v4, s0, v3 8623; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 8624; GFX90A-NEXT: v_add_u32_e32 v4, 1, v1 8625; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v3 8626; 
GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 8627; GFX90A-NEXT: v_xor_b32_e32 v0, s6, v0 8628; GFX90A-NEXT: v_xor_b32_e32 v1, s1, v1 8629; GFX90A-NEXT: v_subrev_u32_e32 v0, s6, v0 8630; GFX90A-NEXT: v_subrev_u32_e32 v1, s1, v1 8631; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 8632; GFX90A-NEXT: s_endpgm 8633 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 8634 %r = sdiv <2 x i32> %x, %shl.y 8635 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 8636 ret void 8637} 8638 8639define amdgpu_kernel void @srem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 8640; CHECK-LABEL: @srem_i32_oddk_denom( 8641; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], 1235195 8642; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 8643; CHECK-NEXT: ret void 8644; 8645; GFX6-LABEL: srem_i32_oddk_denom: 8646; GFX6: ; %bb.0: 8647; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 8648; GFX6-NEXT: v_mov_b32_e32 v0, 0xd9528441 8649; GFX6-NEXT: s_mov_b32 s2, 0x12d8fb 8650; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 8651; GFX6-NEXT: s_mov_b32 s3, 0xf000 8652; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8653; GFX6-NEXT: v_mul_hi_i32 v0, s4, v0 8654; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0 8655; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0 8656; GFX6-NEXT: v_ashrrev_i32_e32 v0, 20, v0 8657; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 8658; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 8659; GFX6-NEXT: s_mov_b32 s2, -1 8660; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 8661; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 8662; GFX6-NEXT: s_endpgm 8663; 8664; GFX9-LABEL: srem_i32_oddk_denom: 8665; GFX9: ; %bb.0: 8666; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8667; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 8668; GFX9-NEXT: v_mov_b32_e32 v0, 0 8669; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8670; GFX9-NEXT: s_mul_hi_i32 s0, s4, 0xd9528441 8671; GFX9-NEXT: s_add_i32 s0, s0, s4 8672; GFX9-NEXT: s_lshr_b32 s1, s0, 31 8673; GFX9-NEXT: s_ashr_i32 s0, s0, 20 8674; GFX9-NEXT: s_add_i32 s0, s0, s1 8675; GFX9-NEXT: s_mul_i32 s0, 
s0, 0x12d8fb 8676; GFX9-NEXT: s_sub_i32 s0, s4, s0 8677; GFX9-NEXT: v_mov_b32_e32 v1, s0 8678; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 8679; GFX9-NEXT: s_endpgm 8680; 8681; GFX90A-LABEL: srem_i32_oddk_denom: 8682; GFX90A: ; %bb.0: 8683; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8684; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 8685; GFX90A-NEXT: v_mov_b32_e32 v0, 0 8686; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 8687; GFX90A-NEXT: s_mul_hi_i32 s0, s4, 0xd9528441 8688; GFX90A-NEXT: s_add_i32 s0, s0, s4 8689; GFX90A-NEXT: s_lshr_b32 s1, s0, 31 8690; GFX90A-NEXT: s_ashr_i32 s0, s0, 20 8691; GFX90A-NEXT: s_add_i32 s0, s0, s1 8692; GFX90A-NEXT: s_mul_i32 s0, s0, 0x12d8fb 8693; GFX90A-NEXT: s_sub_i32 s0, s4, s0 8694; GFX90A-NEXT: v_mov_b32_e32 v1, s0 8695; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] 8696; GFX90A-NEXT: s_endpgm 8697 %r = srem i32 %x, 1235195 8698 store i32 %r, i32 addrspace(1)* %out 8699 ret void 8700} 8701 8702define amdgpu_kernel void @srem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 8703; CHECK-LABEL: @srem_i32_pow2k_denom( 8704; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], 4096 8705; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 8706; CHECK-NEXT: ret void 8707; 8708; GFX6-LABEL: srem_i32_pow2k_denom: 8709; GFX6: ; %bb.0: 8710; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 8711; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb 8712; GFX6-NEXT: s_mov_b32 s7, 0xf000 8713; GFX6-NEXT: s_mov_b32 s6, -1 8714; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8715; GFX6-NEXT: s_ashr_i32 s1, s0, 31 8716; GFX6-NEXT: s_lshr_b32 s1, s1, 20 8717; GFX6-NEXT: s_add_i32 s1, s0, s1 8718; GFX6-NEXT: s_and_b32 s1, s1, 0xfffff000 8719; GFX6-NEXT: s_sub_i32 s0, s0, s1 8720; GFX6-NEXT: v_mov_b32_e32 v0, s0 8721; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 8722; GFX6-NEXT: s_endpgm 8723; 8724; GFX9-LABEL: srem_i32_pow2k_denom: 8725; GFX9: ; %bb.0: 8726; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8727; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 8728; GFX9-NEXT: 
v_mov_b32_e32 v0, 0 8729; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8730; GFX9-NEXT: s_ashr_i32 s0, s4, 31 8731; GFX9-NEXT: s_lshr_b32 s0, s0, 20 8732; GFX9-NEXT: s_add_i32 s0, s4, s0 8733; GFX9-NEXT: s_and_b32 s0, s0, 0xfffff000 8734; GFX9-NEXT: s_sub_i32 s0, s4, s0 8735; GFX9-NEXT: v_mov_b32_e32 v1, s0 8736; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 8737; GFX9-NEXT: s_endpgm 8738; 8739; GFX90A-LABEL: srem_i32_pow2k_denom: 8740; GFX90A: ; %bb.0: 8741; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8742; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 8743; GFX90A-NEXT: v_mov_b32_e32 v0, 0 8744; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 8745; GFX90A-NEXT: s_ashr_i32 s0, s4, 31 8746; GFX90A-NEXT: s_lshr_b32 s0, s0, 20 8747; GFX90A-NEXT: s_add_i32 s0, s4, s0 8748; GFX90A-NEXT: s_and_b32 s0, s0, 0xfffff000 8749; GFX90A-NEXT: s_sub_i32 s0, s4, s0 8750; GFX90A-NEXT: v_mov_b32_e32 v1, s0 8751; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] 8752; GFX90A-NEXT: s_endpgm 8753 %r = srem i32 %x, 4096 8754 store i32 %r, i32 addrspace(1)* %out 8755 ret void 8756} 8757 8758define amdgpu_kernel void @srem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 8759; CHECK-LABEL: @srem_i32_pow2_shl_denom( 8760; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 8761; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], [[SHL_Y]] 8762; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 8763; CHECK-NEXT: ret void 8764; 8765; GFX6-LABEL: srem_i32_pow2_shl_denom: 8766; GFX6: ; %bb.0: 8767; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 8768; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 8769; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8770; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 8771; GFX6-NEXT: s_ashr_i32 s4, s3, 31 8772; GFX6-NEXT: s_add_i32 s3, s3, s4 8773; GFX6-NEXT: s_xor_b32 s4, s3, s4 8774; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s4 8775; GFX6-NEXT: s_sub_i32 s3, 0, s4 8776; GFX6-NEXT: s_ashr_i32 s5, s2, 31 8777; GFX6-NEXT: s_add_i32 s2, s2, s5 8778; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 8779; 
GFX6-NEXT: s_xor_b32 s6, s2, s5 8780; GFX6-NEXT: s_mov_b32 s2, -1 8781; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 8782; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 8783; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 8784; GFX6-NEXT: s_mov_b32 s3, 0xf000 8785; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 8786; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 8787; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 8788; GFX6-NEXT: v_mul_lo_u32 v0, v0, s4 8789; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 8790; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0 8791; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 8792; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 8793; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0 8794; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 8795; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 8796; GFX6-NEXT: v_xor_b32_e32 v0, s5, v0 8797; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s5, v0 8798; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 8799; GFX6-NEXT: s_endpgm 8800; 8801; GFX9-LABEL: srem_i32_pow2_shl_denom: 8802; GFX9: ; %bb.0: 8803; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 8804; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8805; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 8806; GFX9-NEXT: s_ashr_i32 s4, s3, 31 8807; GFX9-NEXT: s_add_i32 s3, s3, s4 8808; GFX9-NEXT: s_xor_b32 s3, s3, s4 8809; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 8810; GFX9-NEXT: s_sub_i32 s4, 0, s3 8811; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 8812; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 8813; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 8814; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 8815; GFX9-NEXT: v_mul_lo_u32 v1, s4, v0 8816; GFX9-NEXT: s_ashr_i32 s4, s2, 31 8817; GFX9-NEXT: s_add_i32 s2, s2, s4 8818; GFX9-NEXT: s_xor_b32 s2, s2, s4 8819; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 8820; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 8821; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 8822; GFX9-NEXT: v_mov_b32_e32 v1, 0 8823; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 8824; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 8825; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v0 8826; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 
8827; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 8828; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v0 8829; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 8830; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 8831; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 8832; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 8833; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8834; GFX9-NEXT: global_store_dword v1, v0, s[0:1] 8835; GFX9-NEXT: s_endpgm 8836; 8837; GFX90A-LABEL: srem_i32_pow2_shl_denom: 8838; GFX90A: ; %bb.0: 8839; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 8840; GFX90A-NEXT: v_mov_b32_e32 v1, 0 8841; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 8842; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 8843; GFX90A-NEXT: s_lshl_b32 s3, 0x1000, s3 8844; GFX90A-NEXT: s_ashr_i32 s4, s3, 31 8845; GFX90A-NEXT: s_add_i32 s3, s3, s4 8846; GFX90A-NEXT: s_xor_b32 s3, s3, s4 8847; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 8848; GFX90A-NEXT: s_sub_i32 s5, 0, s3 8849; GFX90A-NEXT: s_ashr_i32 s4, s2, 31 8850; GFX90A-NEXT: s_add_i32 s2, s2, s4 8851; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 8852; GFX90A-NEXT: s_xor_b32 s2, s2, s4 8853; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 8854; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 8855; GFX90A-NEXT: v_mul_lo_u32 v2, s5, v0 8856; GFX90A-NEXT: v_mul_hi_u32 v2, v0, v2 8857; GFX90A-NEXT: v_add_u32_e32 v0, v0, v2 8858; GFX90A-NEXT: v_mul_hi_u32 v0, s2, v0 8859; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s3 8860; GFX90A-NEXT: v_sub_u32_e32 v0, s2, v0 8861; GFX90A-NEXT: v_subrev_u32_e32 v2, s3, v0 8862; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 8863; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 8864; GFX90A-NEXT: v_subrev_u32_e32 v2, s3, v0 8865; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 8866; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 8867; GFX90A-NEXT: v_xor_b32_e32 v0, s4, v0 8868; GFX90A-NEXT: v_subrev_u32_e32 v0, s4, v0 8869; GFX90A-NEXT: global_store_dword v1, v0, s[0:1] 8870; GFX90A-NEXT: s_endpgm 8871 %shl.y = shl i32 4096, %y 8872 %r = srem i32 %x, %shl.y 8873 store i32 %r, i32 addrspace(1)* 
%out 8874 ret void 8875} 8876 8877define amdgpu_kernel void @srem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 8878; CHECK-LABEL: @srem_v2i32_pow2k_denom( 8879; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 8880; CHECK-NEXT: [[TMP2:%.*]] = srem i32 [[TMP1]], 4096 8881; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 8882; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 8883; CHECK-NEXT: [[TMP5:%.*]] = srem i32 [[TMP4]], 4096 8884; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 8885; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 8886; CHECK-NEXT: ret void 8887; 8888; GFX6-LABEL: srem_v2i32_pow2k_denom: 8889; GFX6: ; %bb.0: 8890; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 8891; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 8892; GFX6-NEXT: s_movk_i32 s2, 0xf000 8893; GFX6-NEXT: s_mov_b32 s7, 0xf000 8894; GFX6-NEXT: s_mov_b32 s6, -1 8895; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8896; GFX6-NEXT: s_ashr_i32 s3, s0, 31 8897; GFX6-NEXT: s_lshr_b32 s3, s3, 20 8898; GFX6-NEXT: s_add_i32 s3, s0, s3 8899; GFX6-NEXT: s_and_b32 s3, s3, s2 8900; GFX6-NEXT: s_sub_i32 s0, s0, s3 8901; GFX6-NEXT: s_ashr_i32 s3, s1, 31 8902; GFX6-NEXT: s_lshr_b32 s3, s3, 20 8903; GFX6-NEXT: s_add_i32 s3, s1, s3 8904; GFX6-NEXT: s_and_b32 s2, s3, s2 8905; GFX6-NEXT: s_sub_i32 s1, s1, s2 8906; GFX6-NEXT: v_mov_b32_e32 v0, s0 8907; GFX6-NEXT: v_mov_b32_e32 v1, s1 8908; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 8909; GFX6-NEXT: s_endpgm 8910; 8911; GFX9-LABEL: srem_v2i32_pow2k_denom: 8912; GFX9: ; %bb.0: 8913; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8914; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 8915; GFX9-NEXT: s_movk_i32 s6, 0xf000 8916; GFX9-NEXT: v_mov_b32_e32 v2, 0 8917; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8918; GFX9-NEXT: s_ashr_i32 s0, s4, 31 8919; GFX9-NEXT: s_ashr_i32 s1, s5, 31 8920; GFX9-NEXT: s_lshr_b32 s0, s0, 20 
8921; GFX9-NEXT: s_lshr_b32 s1, s1, 20 8922; GFX9-NEXT: s_add_i32 s0, s4, s0 8923; GFX9-NEXT: s_add_i32 s1, s5, s1 8924; GFX9-NEXT: s_and_b32 s0, s0, s6 8925; GFX9-NEXT: s_and_b32 s1, s1, s6 8926; GFX9-NEXT: s_sub_i32 s0, s4, s0 8927; GFX9-NEXT: s_sub_i32 s1, s5, s1 8928; GFX9-NEXT: v_mov_b32_e32 v0, s0 8929; GFX9-NEXT: v_mov_b32_e32 v1, s1 8930; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 8931; GFX9-NEXT: s_endpgm 8932; 8933; GFX90A-LABEL: srem_v2i32_pow2k_denom: 8934; GFX90A: ; %bb.0: 8935; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8936; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 8937; GFX90A-NEXT: s_movk_i32 s6, 0xf000 8938; GFX90A-NEXT: v_mov_b32_e32 v2, 0 8939; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 8940; GFX90A-NEXT: s_ashr_i32 s0, s4, 31 8941; GFX90A-NEXT: s_ashr_i32 s1, s5, 31 8942; GFX90A-NEXT: s_lshr_b32 s0, s0, 20 8943; GFX90A-NEXT: s_lshr_b32 s1, s1, 20 8944; GFX90A-NEXT: s_add_i32 s0, s4, s0 8945; GFX90A-NEXT: s_add_i32 s1, s5, s1 8946; GFX90A-NEXT: s_and_b32 s0, s0, s6 8947; GFX90A-NEXT: s_and_b32 s1, s1, s6 8948; GFX90A-NEXT: s_sub_i32 s0, s4, s0 8949; GFX90A-NEXT: s_sub_i32 s1, s5, s1 8950; GFX90A-NEXT: v_mov_b32_e32 v0, s0 8951; GFX90A-NEXT: v_mov_b32_e32 v1, s1 8952; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 8953; GFX90A-NEXT: s_endpgm 8954 %r = srem <2 x i32> %x, <i32 4096, i32 4096> 8955 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 8956 ret void 8957} 8958 8959define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 8960; CHECK-LABEL: @srem_v2i32_pow2_shl_denom( 8961; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 8962; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 8963; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 8964; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 8965; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 8966; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]] 8967; 
CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]] 8968; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]] 8969; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]] 8970; CHECK-NEXT: [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float 8971; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 8972; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000 8973; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP11]] to i32 8974; CHECK-NEXT: [[TMP13:%.*]] = sub i32 0, [[TMP8]] 8975; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]] 8976; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP12]] to i64 8977; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 8978; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]] 8979; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 8980; CHECK-NEXT: [[TMP19:%.*]] = lshr i64 [[TMP17]], 32 8981; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 8982; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]] 8983; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP7]] to i64 8984; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP21]] to i64 8985; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]] 8986; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32 8987; CHECK-NEXT: [[TMP26:%.*]] = lshr i64 [[TMP24]], 32 8988; CHECK-NEXT: [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32 8989; CHECK-NEXT: [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]] 8990; CHECK-NEXT: [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]] 8991; CHECK-NEXT: [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]] 8992; CHECK-NEXT: [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]] 8993; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP30]], i32 [[TMP31]], i32 [[TMP29]] 8994; CHECK-NEXT: [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]] 8995; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]] 8996; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]] 8997; CHECK-NEXT: [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]] 8998; CHECK-NEXT: 
[[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]] 8999; CHECK-NEXT: [[TMP38:%.*]] = insertelement <2 x i32> undef, i32 [[TMP37]], i64 0 9000; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i32> [[X]], i64 1 9001; CHECK-NEXT: [[TMP40:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 9002; CHECK-NEXT: [[TMP41:%.*]] = ashr i32 [[TMP39]], 31 9003; CHECK-NEXT: [[TMP42:%.*]] = ashr i32 [[TMP40]], 31 9004; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]] 9005; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]] 9006; CHECK-NEXT: [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]] 9007; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]] 9008; CHECK-NEXT: [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float 9009; CHECK-NEXT: [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]]) 9010; CHECK-NEXT: [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000 9011; CHECK-NEXT: [[TMP50:%.*]] = fptoui float [[TMP49]] to i32 9012; CHECK-NEXT: [[TMP51:%.*]] = sub i32 0, [[TMP46]] 9013; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]] 9014; CHECK-NEXT: [[TMP53:%.*]] = zext i32 [[TMP50]] to i64 9015; CHECK-NEXT: [[TMP54:%.*]] = zext i32 [[TMP52]] to i64 9016; CHECK-NEXT: [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]] 9017; CHECK-NEXT: [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32 9018; CHECK-NEXT: [[TMP57:%.*]] = lshr i64 [[TMP55]], 32 9019; CHECK-NEXT: [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32 9020; CHECK-NEXT: [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]] 9021; CHECK-NEXT: [[TMP60:%.*]] = zext i32 [[TMP45]] to i64 9022; CHECK-NEXT: [[TMP61:%.*]] = zext i32 [[TMP59]] to i64 9023; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]] 9024; CHECK-NEXT: [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32 9025; CHECK-NEXT: [[TMP64:%.*]] = lshr i64 [[TMP62]], 32 9026; CHECK-NEXT: [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32 9027; CHECK-NEXT: [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]] 9028; CHECK-NEXT: [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]] 9029; CHECK-NEXT: 
[[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]] 9030; CHECK-NEXT: [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]] 9031; CHECK-NEXT: [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]] 9032; CHECK-NEXT: [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]] 9033; CHECK-NEXT: [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]] 9034; CHECK-NEXT: [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]] 9035; CHECK-NEXT: [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]] 9036; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]] 9037; CHECK-NEXT: [[TMP76:%.*]] = insertelement <2 x i32> [[TMP38]], i32 [[TMP75]], i64 1 9038; CHECK-NEXT: store <2 x i32> [[TMP76]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 9039; CHECK-NEXT: ret void 9040; 9041; GFX6-LABEL: srem_v2i32_pow2_shl_denom: 9042; GFX6: ; %bb.0: 9043; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 9044; GFX6-NEXT: s_movk_i32 s6, 0x1000 9045; GFX6-NEXT: s_mov_b32 s10, 0x4f7ffffe 9046; GFX6-NEXT: s_mov_b32 s7, 0xf000 9047; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9048; GFX6-NEXT: s_lshl_b32 s2, s6, s2 9049; GFX6-NEXT: s_ashr_i32 s4, s2, 31 9050; GFX6-NEXT: s_add_i32 s2, s2, s4 9051; GFX6-NEXT: s_xor_b32 s2, s2, s4 9052; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 9053; GFX6-NEXT: s_lshl_b32 s3, s6, s3 9054; GFX6-NEXT: s_ashr_i32 s6, s3, 31 9055; GFX6-NEXT: s_add_i32 s3, s3, s6 9056; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 9057; GFX6-NEXT: s_sub_i32 s9, 0, s2 9058; GFX6-NEXT: s_xor_b32 s3, s3, s6 9059; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s3 9060; GFX6-NEXT: v_mul_f32_e32 v0, s10, v0 9061; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 9062; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 9063; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 9064; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 9065; GFX6-NEXT: s_mov_b32 s6, -1 9066; GFX6-NEXT: v_mul_lo_u32 v1, s9, v0 9067; GFX6-NEXT: s_sub_i32 s9, 0, s3 9068; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9069; GFX6-NEXT: s_ashr_i32 s8, s0, 31 9070; GFX6-NEXT: s_add_i32 s0, s0, s8 9071; GFX6-NEXT: v_mul_hi_u32 v1, v0, 
v1 9072; GFX6-NEXT: s_xor_b32 s0, s0, s8 9073; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 9074; GFX6-NEXT: v_mul_f32_e32 v1, s10, v2 9075; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 9076; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 9077; GFX6-NEXT: v_mul_lo_u32 v2, s9, v1 9078; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 9079; GFX6-NEXT: v_mul_hi_u32 v2, v1, v2 9080; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 9081; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v0 9082; GFX6-NEXT: s_ashr_i32 s0, s1, 31 9083; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 9084; GFX6-NEXT: s_add_i32 s1, s1, s0 9085; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 9086; GFX6-NEXT: s_xor_b32 s1, s1, s0 9087; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 9088; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 9089; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v0 9090; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 9091; GFX6-NEXT: v_mul_lo_u32 v1, v1, s3 9092; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 9093; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 9094; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 9095; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 9096; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 9097; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 9098; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 9099; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 9100; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 9101; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 9102; GFX6-NEXT: v_xor_b32_e32 v1, s0, v1 9103; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s0, v1 9104; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 9105; GFX6-NEXT: s_endpgm 9106; 9107; GFX9-LABEL: srem_v2i32_pow2_shl_denom: 9108; GFX9: ; %bb.0: 9109; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 9110; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 9111; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 9112; GFX9-NEXT: s_movk_i32 s8, 0x1000 9113; GFX9-NEXT: s_mov_b32 s9, 0x4f7ffffe 9114; GFX9-NEXT: v_mov_b32_e32 v2, 0 9115; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9116; GFX9-NEXT: s_lshl_b32 s0, s8, s6 9117; GFX9-NEXT: s_ashr_i32 s1, s0, 31 
9118; GFX9-NEXT: s_add_i32 s0, s0, s1 9119; GFX9-NEXT: s_xor_b32 s0, s0, s1 9120; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 9121; GFX9-NEXT: s_lshl_b32 s1, s8, s7 9122; GFX9-NEXT: s_ashr_i32 s6, s1, 31 9123; GFX9-NEXT: s_add_i32 s1, s1, s6 9124; GFX9-NEXT: s_xor_b32 s1, s1, s6 9125; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 9126; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s1 9127; GFX9-NEXT: s_sub_i32 s7, 0, s0 9128; GFX9-NEXT: s_ashr_i32 s6, s4, 31 9129; GFX9-NEXT: v_mul_f32_e32 v0, s9, v0 9130; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 9131; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 9132; GFX9-NEXT: s_add_i32 s4, s4, s6 9133; GFX9-NEXT: s_xor_b32 s4, s4, s6 9134; GFX9-NEXT: v_mul_f32_e32 v1, s9, v1 9135; GFX9-NEXT: v_mul_lo_u32 v3, s7, v0 9136; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 9137; GFX9-NEXT: s_sub_i32 s7, 0, s1 9138; GFX9-NEXT: v_mul_hi_u32 v3, v0, v3 9139; GFX9-NEXT: v_mul_lo_u32 v4, s7, v1 9140; GFX9-NEXT: s_ashr_i32 s7, s5, 31 9141; GFX9-NEXT: s_add_i32 s5, s5, s7 9142; GFX9-NEXT: v_add_u32_e32 v0, v0, v3 9143; GFX9-NEXT: v_mul_hi_u32 v3, v1, v4 9144; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 9145; GFX9-NEXT: s_xor_b32 s5, s5, s7 9146; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 9147; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 9148; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0 9149; GFX9-NEXT: v_mul_lo_u32 v1, v1, s1 9150; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 9151; GFX9-NEXT: v_subrev_u32_e32 v3, s0, v0 9152; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v0 9153; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 9154; GFX9-NEXT: v_subrev_u32_e32 v3, s0, v0 9155; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v0 9156; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 9157; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 9158; GFX9-NEXT: v_subrev_u32_e32 v3, s1, v1 9159; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v1 9160; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 9161; GFX9-NEXT: v_subrev_u32_e32 v3, s1, v1 9162; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v1 9163; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 9164; GFX9-NEXT: v_xor_b32_e32 v0, s6, v0 9165; GFX9-NEXT: 
v_xor_b32_e32 v1, s7, v1 9166; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v0 9167; GFX9-NEXT: v_subrev_u32_e32 v1, s7, v1 9168; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 9169; GFX9-NEXT: s_endpgm 9170; 9171; GFX90A-LABEL: srem_v2i32_pow2_shl_denom: 9172; GFX90A: ; %bb.0: 9173; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 9174; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 9175; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 9176; GFX90A-NEXT: s_movk_i32 s8, 0x1000 9177; GFX90A-NEXT: s_mov_b32 s9, 0x4f7ffffe 9178; GFX90A-NEXT: v_mov_b32_e32 v2, 0 9179; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 9180; GFX90A-NEXT: s_lshl_b32 s0, s8, s6 9181; GFX90A-NEXT: s_ashr_i32 s1, s0, 31 9182; GFX90A-NEXT: s_add_i32 s0, s0, s1 9183; GFX90A-NEXT: s_xor_b32 s0, s0, s1 9184; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s0 9185; GFX90A-NEXT: s_lshl_b32 s1, s8, s7 9186; GFX90A-NEXT: s_sub_i32 s8, 0, s0 9187; GFX90A-NEXT: s_ashr_i32 s6, s4, 31 9188; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 9189; GFX90A-NEXT: s_add_i32 s4, s4, s6 9190; GFX90A-NEXT: s_xor_b32 s4, s4, s6 9191; GFX90A-NEXT: s_ashr_i32 s7, s1, 31 9192; GFX90A-NEXT: v_mul_f32_e32 v0, s9, v0 9193; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 9194; GFX90A-NEXT: s_add_i32 s1, s1, s7 9195; GFX90A-NEXT: s_xor_b32 s1, s1, s7 9196; GFX90A-NEXT: v_mul_lo_u32 v1, s8, v0 9197; GFX90A-NEXT: v_mul_hi_u32 v1, v0, v1 9198; GFX90A-NEXT: v_add_u32_e32 v0, v0, v1 9199; GFX90A-NEXT: v_mul_hi_u32 v0, s4, v0 9200; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s0 9201; GFX90A-NEXT: v_sub_u32_e32 v0, s4, v0 9202; GFX90A-NEXT: v_subrev_u32_e32 v1, s0, v0 9203; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v0 9204; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 9205; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s1 9206; GFX90A-NEXT: v_subrev_u32_e32 v3, s0, v0 9207; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v0 9208; GFX90A-NEXT: s_ashr_i32 s0, s5, 31 9209; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v1 9210; GFX90A-NEXT: s_add_i32 s4, s5, s0 9211; GFX90A-NEXT: s_sub_i32 s5, 0, s1 9212; GFX90A-NEXT: 
v_cndmask_b32_e32 v0, v0, v3, vcc 9213; GFX90A-NEXT: v_mul_f32_e32 v1, s9, v1 9214; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 9215; GFX90A-NEXT: s_xor_b32 s4, s4, s0 9216; GFX90A-NEXT: v_xor_b32_e32 v0, s6, v0 9217; GFX90A-NEXT: v_subrev_u32_e32 v0, s6, v0 9218; GFX90A-NEXT: v_mul_lo_u32 v3, s5, v1 9219; GFX90A-NEXT: v_mul_hi_u32 v3, v1, v3 9220; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 9221; GFX90A-NEXT: v_mul_hi_u32 v1, s4, v1 9222; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s1 9223; GFX90A-NEXT: v_sub_u32_e32 v1, s4, v1 9224; GFX90A-NEXT: v_subrev_u32_e32 v3, s1, v1 9225; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s1, v1 9226; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 9227; GFX90A-NEXT: v_subrev_u32_e32 v3, s1, v1 9228; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s1, v1 9229; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 9230; GFX90A-NEXT: v_xor_b32_e32 v1, s0, v1 9231; GFX90A-NEXT: v_subrev_u32_e32 v1, s0, v1 9232; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 9233; GFX90A-NEXT: s_endpgm 9234 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 9235 %r = srem <2 x i32> %x, %shl.y 9236 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 9237 ret void 9238} 9239
; Test: unsigned i64 division of a kernel argument by the odd constant
; 1235195949943 (not a power of two). The opt expectations keep the udiv
; instruction as written; the llc expectations for each target are a long
; expansion that begins with v_rcp_f32 and is built from 32-bit
; mul_lo/mul_hi and add/sub-with-carry instructions.
9240define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 9241; CHECK-LABEL: @udiv_i64_oddk_denom( 9242; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], 1235195949943 9243; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 9244; CHECK-NEXT: ret void 9245; 9246; GFX6-LABEL: udiv_i64_oddk_denom: 9247; GFX6: ; %bb.0: 9248; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f176a73 9249; GFX6-NEXT: v_mov_b32_e32 v1, 0x4f800000 9250; GFX6-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 9251; GFX6-NEXT: v_rcp_f32_e32 v0, v0 9252; GFX6-NEXT: s_movk_i32 s2, 0xfee0 9253; GFX6-NEXT: s_mov_b32 s3, 0x68958c89 9254; GFX6-NEXT: v_mov_b32_e32 v8, 0 9255; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 9256; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 9257; GFX6-NEXT: v_trunc_f32_e32 v1, v1 9258; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, 
v1 9259; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 9260; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 9261; GFX6-NEXT: v_mov_b32_e32 v7, 0 9262; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 9263; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 9264; GFX6-NEXT: v_mul_hi_u32 v3, v0, s3 9265; GFX6-NEXT: v_mul_lo_u32 v4, v1, s3 9266; GFX6-NEXT: s_mov_b32 s11, 0xf000 9267; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9268; GFX6-NEXT: s_mov_b32 s8, s4 9269; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9270; GFX6-NEXT: v_mul_lo_u32 v3, v0, s3 9271; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 9272; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 9273; GFX6-NEXT: v_mul_hi_u32 v6, v0, v3 9274; GFX6-NEXT: v_mul_hi_u32 v4, v0, v2 9275; GFX6-NEXT: v_mul_hi_u32 v9, v1, v2 9276; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 9277; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 9278; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 9279; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 9280; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc 9281; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 9282; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc 9283; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc 9284; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9285; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 9286; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 9287; GFX6-NEXT: v_mul_lo_u32 v4, v0, s2 9288; GFX6-NEXT: v_mul_hi_u32 v5, v0, s3 9289; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 9290; GFX6-NEXT: v_mul_lo_u32 v6, v2, s3 9291; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 9292; GFX6-NEXT: v_mul_lo_u32 v5, v0, s3 9293; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 9294; GFX6-NEXT: v_mul_lo_u32 v6, v0, v4 9295; GFX6-NEXT: v_mul_hi_u32 v9, v0, v5 9296; GFX6-NEXT: v_mul_hi_u32 v10, v0, v4 9297; GFX6-NEXT: v_mul_hi_u32 v11, v2, v4 9298; GFX6-NEXT: s_movk_i32 s2, 0x11f 9299; GFX6-NEXT: v_add_i32_e32 v6, vcc, v9, v6 9300; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc 9301; GFX6-NEXT: v_mul_lo_u32 v10, v2, v5 9302; GFX6-NEXT: v_mul_hi_u32 v5, v2, v5 9303; GFX6-NEXT: v_mul_lo_u32 v2, v2, v4 9304; GFX6-NEXT: 
s_mov_b32 s3, 0x976a7377 9305; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v10 9306; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v5, vcc 9307; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v11, v7, vcc 9308; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 9309; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc 9310; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 9311; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] 9312; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 9313; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 9314; GFX6-NEXT: v_mul_lo_u32 v2, s6, v1 9315; GFX6-NEXT: v_mul_hi_u32 v3, s6, v0 9316; GFX6-NEXT: v_mul_hi_u32 v4, s6, v1 9317; GFX6-NEXT: v_mul_hi_u32 v5, s7, v1 9318; GFX6-NEXT: v_mul_lo_u32 v1, s7, v1 9319; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9320; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 9321; GFX6-NEXT: v_mul_lo_u32 v4, s7, v0 9322; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 9323; GFX6-NEXT: s_mov_b32 s4, 0x976a7376 9324; GFX6-NEXT: s_mov_b32 s10, -1 9325; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 9326; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 9327; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc 9328; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 9329; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc 9330; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 9331; GFX6-NEXT: v_mul_hi_u32 v3, v0, s3 9332; GFX6-NEXT: v_mul_lo_u32 v4, v1, s3 9333; GFX6-NEXT: v_mov_b32_e32 v5, s2 9334; GFX6-NEXT: s_mov_b32 s9, s5 9335; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9336; GFX6-NEXT: v_mul_lo_u32 v3, v0, s3 9337; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 9338; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s7, v2 9339; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s6, v3 9340; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc 9341; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s3, v3 9342; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] 9343; GFX6-NEXT: s_movk_i32 s3, 0x11e 9344; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s3, v4 9345; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 9346; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s4, v5 9347; GFX6-NEXT: 
v_cndmask_b32_e64 v5, 0, -1, s[0:1] 9348; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v4 9349; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] 9350; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 9351; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] 9352; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 1, v0 9353; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] 9354; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 9355; GFX6-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] 9356; GFX6-NEXT: v_mov_b32_e32 v6, s7 9357; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc 9358; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s3, v2 9359; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 9360; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s4, v3 9361; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 9362; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 9363; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc 9364; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 9365; GFX6-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] 9366; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 9367; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 9368; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 9369; GFX6-NEXT: s_endpgm 9370; 9371; GFX9-LABEL: udiv_i64_oddk_denom: 9372; GFX9: ; %bb.0: 9373; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f176a73 9374; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000 9375; GFX9-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 9376; GFX9-NEXT: v_rcp_f32_e32 v0, v0 9377; GFX9-NEXT: s_movk_i32 s4, 0xfee0 9378; GFX9-NEXT: s_mov_b32 s5, 0x68958c89 9379; GFX9-NEXT: v_mov_b32_e32 v8, 0 9380; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 9381; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 9382; GFX9-NEXT: v_trunc_f32_e32 v1, v1 9383; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 9384; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 9385; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 9386; GFX9-NEXT: v_mov_b32_e32 v5, 0 9387; GFX9-NEXT: v_mul_lo_u32 v2, v0, s4 9388; GFX9-NEXT: v_mul_hi_u32 v3, v0, s5 9389; GFX9-NEXT: v_mul_lo_u32 v4, v1, s5 9390; GFX9-NEXT: v_mul_lo_u32 v6, v0, s5 9391; GFX9-NEXT: v_add_u32_e32 v2, 
v3, v2 9392; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 9393; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 9394; GFX9-NEXT: v_mul_hi_u32 v4, v0, v6 9395; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 9396; GFX9-NEXT: v_mul_hi_u32 v9, v1, v2 9397; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 9398; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 9399; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v7, vcc 9400; GFX9-NEXT: v_mul_lo_u32 v7, v1, v6 9401; GFX9-NEXT: v_mul_hi_u32 v6, v1, v6 9402; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 9403; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v6, vcc 9404; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc 9405; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 9406; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v4, vcc 9407; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 9408; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3] 9409; GFX9-NEXT: v_mul_lo_u32 v4, v0, s4 9410; GFX9-NEXT: v_mul_hi_u32 v6, v0, s5 9411; GFX9-NEXT: v_mul_lo_u32 v7, v2, s5 9412; GFX9-NEXT: v_mul_lo_u32 v9, v0, s5 9413; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 9414; GFX9-NEXT: v_add_u32_e32 v4, v6, v4 9415; GFX9-NEXT: v_add_u32_e32 v4, v4, v7 9416; GFX9-NEXT: v_mul_lo_u32 v6, v0, v4 9417; GFX9-NEXT: v_mul_hi_u32 v7, v0, v9 9418; GFX9-NEXT: v_mul_hi_u32 v10, v0, v4 9419; GFX9-NEXT: v_mul_hi_u32 v11, v2, v4 9420; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 9421; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 9422; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v8, v10, vcc 9423; GFX9-NEXT: v_mul_lo_u32 v10, v2, v9 9424; GFX9-NEXT: v_mul_hi_u32 v9, v2, v9 9425; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 9426; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v10 9427; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v9, vcc 9428; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v11, v5, vcc 9429; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v6, v2 9430; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v4, vcc 9431; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3] 9432; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 9433; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 
9434; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9435; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 9436; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 9437; GFX9-NEXT: v_mul_hi_u32 v4, s6, v1 9438; GFX9-NEXT: v_mul_hi_u32 v6, s7, v1 9439; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 9440; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 9441; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v4, vcc 9442; GFX9-NEXT: v_mul_lo_u32 v4, s7, v0 9443; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 9444; GFX9-NEXT: s_movk_i32 s2, 0x11f 9445; GFX9-NEXT: s_mov_b32 s3, 0x976a7377 9446; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 9447; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 9448; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v5, vcc 9449; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 9450; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v2, vcc 9451; GFX9-NEXT: v_mul_lo_u32 v2, v0, s2 9452; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 9453; GFX9-NEXT: v_mul_lo_u32 v4, v1, s3 9454; GFX9-NEXT: v_mov_b32_e32 v6, s2 9455; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 9456; GFX9-NEXT: v_mul_lo_u32 v3, v0, s3 9457; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 9458; GFX9-NEXT: v_sub_u32_e32 v4, s7, v2 9459; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s6, v3 9460; GFX9-NEXT: v_subb_co_u32_e64 v4, s[0:1], v4, v6, vcc 9461; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s3, v3 9462; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1] 9463; GFX9-NEXT: s_movk_i32 s3, 0x11e 9464; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s3, v4 9465; GFX9-NEXT: s_mov_b32 s6, 0x976a7376 9466; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 9467; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s6, v6 9468; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 9469; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v4 9470; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v6, s[0:1] 9471; GFX9-NEXT: v_mov_b32_e32 v7, s7 9472; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v7, v2, vcc 9473; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s3, v2 9474; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 9475; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 9476; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, 
s6, v3 9477; GFX9-NEXT: v_cndmask_b32_e64 v4, 1, 2, s[0:1] 9478; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 9479; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 9480; GFX9-NEXT: v_add_co_u32_e64 v4, s[0:1], v0, v4 9481; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc 9482; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], 0, v1, s[0:1] 9483; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 9484; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 9485; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 9486; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] 9487; GFX9-NEXT: s_endpgm 9488; 9489; GFX90A-LABEL: udiv_i64_oddk_denom: 9490; GFX90A: ; %bb.0: 9491; GFX90A-NEXT: v_mov_b32_e32 v0, 0x4f176a73 9492; GFX90A-NEXT: v_mov_b32_e32 v1, 0x4f800000 9493; GFX90A-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 9494; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 9495; GFX90A-NEXT: s_movk_i32 s2, 0xfee0 9496; GFX90A-NEXT: s_mov_b32 s3, 0x68958c89 9497; GFX90A-NEXT: v_mov_b32_e32 v8, 0 9498; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 9499; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 9500; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 9501; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 9502; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 9503; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 9504; GFX90A-NEXT: v_mov_b32_e32 v2, 0 9505; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 9506; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s2 9507; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s3 9508; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 9509; GFX90A-NEXT: v_mul_lo_u32 v4, v1, s3 9510; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 9511; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s3 9512; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 9513; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 9514; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 9515; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 9516; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v4, vcc 9517; GFX90A-NEXT: v_mul_hi_u32 v9, v1, v6 9518; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 9519; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 9520; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 9521; 
GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v9, vcc 9522; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc 9523; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 9524; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 9525; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v5, vcc 9526; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v3 9527; GFX90A-NEXT: v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1] 9528; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s2 9529; GFX90A-NEXT: v_mul_hi_u32 v7, v0, s3 9530; GFX90A-NEXT: v_mul_lo_u32 v5, v3, s3 9531; GFX90A-NEXT: v_add_u32_e32 v6, v7, v6 9532; GFX90A-NEXT: v_add_u32_e32 v5, v6, v5 9533; GFX90A-NEXT: v_mul_lo_u32 v9, v0, s3 9534; GFX90A-NEXT: v_mul_lo_u32 v7, v0, v5 9535; GFX90A-NEXT: v_mul_hi_u32 v10, v0, v9 9536; GFX90A-NEXT: v_mul_hi_u32 v6, v0, v5 9537; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v10, v7 9538; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v8, v6, vcc 9539; GFX90A-NEXT: v_mul_hi_u32 v11, v3, v9 9540; GFX90A-NEXT: v_mul_lo_u32 v9, v3, v9 9541; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 9542; GFX90A-NEXT: v_mul_hi_u32 v10, v3, v5 9543; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v11, vcc 9544; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v10, v2, vcc 9545; GFX90A-NEXT: v_mul_lo_u32 v3, v3, v5 9546; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v6, v3 9547; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v7, vcc 9548; GFX90A-NEXT: v_add_u32_e32 v1, v1, v4 9549; GFX90A-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1] 9550; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 9551; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 9552; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 9553; GFX90A-NEXT: v_mul_lo_u32 v4, s6, v1 9554; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v0 9555; GFX90A-NEXT: v_mul_hi_u32 v3, s6, v1 9556; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 9557; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc 9558; GFX90A-NEXT: v_mul_hi_u32 v6, s7, v0 9559; GFX90A-NEXT: v_mul_lo_u32 v0, s7, v0 9560; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 9561; GFX90A-NEXT: v_mul_hi_u32 
v5, s7, v1 9562; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v6, vcc 9563; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v2, vcc 9564; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 9565; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 9566; GFX90A-NEXT: s_movk_i32 s2, 0x11f 9567; GFX90A-NEXT: s_mov_b32 s3, 0x976a7377 9568; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v3, vcc 9569; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s2 9570; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s3 9571; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 9572; GFX90A-NEXT: v_mul_lo_u32 v4, v1, s3 9573; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 9574; GFX90A-NEXT: v_mul_lo_u32 v5, v0, s3 9575; GFX90A-NEXT: v_sub_u32_e32 v4, s7, v3 9576; GFX90A-NEXT: v_mov_b32_e32 v6, s2 9577; GFX90A-NEXT: v_sub_co_u32_e32 v5, vcc, s6, v5 9578; GFX90A-NEXT: v_subb_co_u32_e64 v4, s[0:1], v4, v6, vcc 9579; GFX90A-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s3, v5 9580; GFX90A-NEXT: v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1] 9581; GFX90A-NEXT: s_movk_i32 s3, 0x11e 9582; GFX90A-NEXT: v_cmp_lt_u32_e64 s[0:1], s3, v4 9583; GFX90A-NEXT: s_mov_b32 s6, 0x976a7376 9584; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 9585; GFX90A-NEXT: v_cmp_lt_u32_e64 s[0:1], s6, v6 9586; GFX90A-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 9587; GFX90A-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v4 9588; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v6, s[0:1] 9589; GFX90A-NEXT: v_mov_b32_e32 v7, s7 9590; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc 9591; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s3, v3 9592; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 9593; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 9594; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s6, v5 9595; GFX90A-NEXT: v_cndmask_b32_e64 v4, 1, 2, s[0:1] 9596; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 9597; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s2, v3 9598; GFX90A-NEXT: v_add_co_u32_e64 v4, s[0:1], v0, v4 9599; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc 9600; GFX90A-NEXT: v_addc_co_u32_e64 v6, s[0:1], 0, v1, s[0:1] 9601; GFX90A-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v3 9602; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 9603; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 9604; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 9605; GFX90A-NEXT: s_endpgm 9606 %r = udiv i64 %x, 1235195949943 9607 store i64 %r, i64 addrspace(1)* %out 9608 ret void 9609} 9610
; Test: unsigned i64 division of a kernel argument by the constant 4096.
; The opt expectations keep the udiv instruction as written; the llc
; expectations for each target contain a single s_lshr_b64 by 12 and no
; reciprocal-based sequence.
9611define amdgpu_kernel void @udiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 9612; CHECK-LABEL: @udiv_i64_pow2k_denom( 9613; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], 4096 9614; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 9615; CHECK-NEXT: ret void 9616; 9617; GFX6-LABEL: udiv_i64_pow2k_denom: 9618; GFX6: ; %bb.0: 9619; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 9620; GFX6-NEXT: s_mov_b32 s7, 0xf000 9621; GFX6-NEXT: s_mov_b32 s6, -1 9622; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9623; GFX6-NEXT: s_mov_b32 s4, s0 9624; GFX6-NEXT: s_mov_b32 s5, s1 9625; GFX6-NEXT: s_lshr_b64 s[0:1], s[2:3], 12 9626; GFX6-NEXT: v_mov_b32_e32 v0, s0 9627; GFX6-NEXT: v_mov_b32_e32 v1, s1 9628; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 9629; GFX6-NEXT: s_endpgm 9630; 9631; GFX9-LABEL: udiv_i64_pow2k_denom: 9632; GFX9: ; %bb.0: 9633; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 9634; GFX9-NEXT: v_mov_b32_e32 v2, 0 9635; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9636; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 12 9637; GFX9-NEXT: v_mov_b32_e32 v0, s2 9638; GFX9-NEXT: v_mov_b32_e32 v1, s3 9639; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 9640; GFX9-NEXT: s_endpgm 9641; 9642; GFX90A-LABEL: udiv_i64_pow2k_denom: 9643; GFX90A: ; %bb.0: 9644; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 9645; GFX90A-NEXT: v_mov_b32_e32 v2, 0 9646; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 9647; GFX90A-NEXT: s_lshr_b64 s[2:3], s[2:3], 12 9648; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] 9649; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 9650; GFX90A-NEXT: s_endpgm 9651 %r = udiv i64 %x, 4096 9652 store i64 %r, i64 addrspace(1)* %out 
9653 ret void 9654} 9655
; Test: unsigned i64 division where the divisor is 4096 shifted left by the
; kernel argument %y. The opt expectations keep both the shl and the udiv;
; the llc expectations for each target add 12 to the loaded shift amount
; (s_add_i32 ..., 12) and then perform a single s_lshr_b64 by that sum.
9656define amdgpu_kernel void @udiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 9657; CHECK-LABEL: @udiv_i64_pow2_shl_denom( 9658; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 9659; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], [[SHL_Y]] 9660; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 9661; CHECK-NEXT: ret void 9662; 9663; GFX6-LABEL: udiv_i64_pow2_shl_denom: 9664; GFX6: ; %bb.0: 9665; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 9666; GFX6-NEXT: s_load_dword s8, s[0:1], 0xd 9667; GFX6-NEXT: s_mov_b32 s3, 0xf000 9668; GFX6-NEXT: s_mov_b32 s2, -1 9669; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9670; GFX6-NEXT: s_mov_b32 s0, s4 9671; GFX6-NEXT: s_add_i32 s8, s8, 12 9672; GFX6-NEXT: s_mov_b32 s1, s5 9673; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], s8 9674; GFX6-NEXT: v_mov_b32_e32 v0, s4 9675; GFX6-NEXT: v_mov_b32_e32 v1, s5 9676; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 9677; GFX6-NEXT: s_endpgm 9678; 9679; GFX9-LABEL: udiv_i64_pow2_shl_denom: 9680; GFX9: ; %bb.0: 9681; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 9682; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 9683; GFX9-NEXT: v_mov_b32_e32 v2, 0 9684; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9685; GFX9-NEXT: s_add_i32 s2, s2, 12 9686; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], s2 9687; GFX9-NEXT: v_mov_b32_e32 v0, s0 9688; GFX9-NEXT: v_mov_b32_e32 v1, s1 9689; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 9690; GFX9-NEXT: s_endpgm 9691; 9692; GFX90A-LABEL: udiv_i64_pow2_shl_denom: 9693; GFX90A: ; %bb.0: 9694; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 9695; GFX90A-NEXT: s_load_dword s2, s[0:1], 0x34 9696; GFX90A-NEXT: v_mov_b32_e32 v2, 0 9697; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 9698; GFX90A-NEXT: s_add_i32 s2, s2, 12 9699; GFX90A-NEXT: s_lshr_b64 s[0:1], s[6:7], s2 9700; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 9701; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 9702; GFX90A-NEXT: s_endpgm 9703 %shl.y = shl i64 4096, 
%y 9704 %r = udiv i64 %x, %shl.y 9705 store i64 %r, i64 addrspace(1)* %out 9706 ret void 9707} 9708
; Test: unsigned division of a <2 x i64> kernel argument where both lanes
; are divided by 4096. The opt expectations show the vector operation split
; into per-element extractelement / udiv i64 ..., 4096 / insertelement; the
; llc expectations use one s_lshr_b64 by 12 per element followed by a single
; 128-bit store.
9709define amdgpu_kernel void @udiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 9710; CHECK-LABEL: @udiv_v2i64_pow2k_denom( 9711; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 9712; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096 9713; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 9714; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 9715; CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[TMP4]], 4096 9716; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 9717; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 9718; CHECK-NEXT: ret void 9719; 9720; GFX6-LABEL: udiv_v2i64_pow2k_denom: 9721; GFX6: ; %bb.0: 9722; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 9723; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 9724; GFX6-NEXT: s_mov_b32 s7, 0xf000 9725; GFX6-NEXT: s_mov_b32 s6, -1 9726; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9727; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], 12 9728; GFX6-NEXT: s_lshr_b64 s[2:3], s[2:3], 12 9729; GFX6-NEXT: v_mov_b32_e32 v0, s0 9730; GFX6-NEXT: v_mov_b32_e32 v1, s1 9731; GFX6-NEXT: v_mov_b32_e32 v2, s2 9732; GFX6-NEXT: v_mov_b32_e32 v3, s3 9733; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 9734; GFX6-NEXT: s_endpgm 9735; 9736; GFX9-LABEL: udiv_v2i64_pow2k_denom: 9737; GFX9: ; %bb.0: 9738; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 9739; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 9740; GFX9-NEXT: v_mov_b32_e32 v4, 0 9741; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9742; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 12 9743; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], 12 9744; GFX9-NEXT: v_mov_b32_e32 v0, s0 9745; GFX9-NEXT: v_mov_b32_e32 v1, s1 9746; GFX9-NEXT: v_mov_b32_e32 v2, s4 9747; GFX9-NEXT: v_mov_b32_e32 v3, s5 9748; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 9749; GFX9-NEXT: s_endpgm 9750; 9751; 
GFX90A-LABEL: udiv_v2i64_pow2k_denom: 9752; GFX90A: ; %bb.0: 9753; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 9754; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 9755; GFX90A-NEXT: v_mov_b32_e32 v4, 0 9756; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 9757; GFX90A-NEXT: s_lshr_b64 s[0:1], s[4:5], 12 9758; GFX90A-NEXT: s_lshr_b64 s[4:5], s[6:7], 12 9759; GFX90A-NEXT: v_mov_b32_e32 v0, s0 9760; GFX90A-NEXT: v_mov_b32_e32 v1, s1 9761; GFX90A-NEXT: v_mov_b32_e32 v2, s4 9762; GFX90A-NEXT: v_mov_b32_e32 v3, s5 9763; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 9764; GFX90A-NEXT: s_endpgm 9765 %r = udiv <2 x i64> %x, <i64 4096, i64 4096> 9766 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 9767 ret void 9768} 9769 9770define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 9771; CHECK-LABEL: @udiv_v2i64_mixed_pow2k_denom( 9772; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 9773; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096 9774; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 9775; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 9776; CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[TMP4]], 4095 9777; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 9778; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 9779; CHECK-NEXT: ret void 9780; 9781; GFX6-LABEL: udiv_v2i64_mixed_pow2k_denom: 9782; GFX6: ; %bb.0: 9783; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f800000 9784; GFX6-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 9785; GFX6-NEXT: v_rcp_f32_e32 v0, v0 9786; GFX6-NEXT: s_movk_i32 s2, 0xf001 9787; GFX6-NEXT: v_mov_b32_e32 v8, 0 9788; GFX6-NEXT: v_mov_b32_e32 v7, 0 9789; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 9790; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 9791; GFX6-NEXT: v_trunc_f32_e32 v1, v1 9792; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 9793; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 9794; GFX6-NEXT: 
v_cvt_u32_f32_e32 v1, v1 9795; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 9796; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 9797; GFX6-NEXT: s_mov_b32 s7, 0xf000 9798; GFX6-NEXT: v_mul_hi_u32 v2, v0, s2 9799; GFX6-NEXT: v_mul_lo_u32 v3, v1, s2 9800; GFX6-NEXT: v_mul_lo_u32 v4, v0, s2 9801; GFX6-NEXT: s_mov_b32 s6, -1 9802; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 9803; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9804; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 9805; GFX6-NEXT: v_mul_hi_u32 v6, v0, v4 9806; GFX6-NEXT: v_mul_hi_u32 v3, v0, v2 9807; GFX6-NEXT: v_mul_hi_u32 v9, v1, v2 9808; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 9809; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 9810; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 9811; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 9812; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v3, vcc 9813; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 9814; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc 9815; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc 9816; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9817; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 9818; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 9819; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 9820; GFX6-NEXT: v_mul_hi_u32 v4, v0, s2 9821; GFX6-NEXT: v_mul_lo_u32 v5, v2, s2 9822; GFX6-NEXT: v_mul_lo_u32 v6, v0, s2 9823; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9824; GFX6-NEXT: s_lshr_b64 s[2:3], s[8:9], 12 9825; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, v0, v4 9826; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 9827; GFX6-NEXT: v_mul_lo_u32 v5, v0, v4 9828; GFX6-NEXT: v_mul_hi_u32 v9, v0, v6 9829; GFX6-NEXT: v_mul_hi_u32 v10, v0, v4 9830; GFX6-NEXT: v_mul_hi_u32 v11, v2, v4 9831; GFX6-NEXT: v_add_i32_e32 v5, vcc, v9, v5 9832; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc 9833; GFX6-NEXT: v_mul_lo_u32 v10, v2, v6 9834; GFX6-NEXT: v_mul_hi_u32 v6, v2, v6 9835; GFX6-NEXT: v_mul_lo_u32 v2, v2, v4 9836; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v10 9837; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v6, vcc 9838; GFX6-NEXT: v_addc_u32_e32 
v4, vcc, v11, v7, vcc 9839; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 9840; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc 9841; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 9842; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] 9843; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 9844; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 9845; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 9846; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 9847; GFX6-NEXT: v_mul_hi_u32 v4, s10, v1 9848; GFX6-NEXT: v_mul_hi_u32 v5, s11, v1 9849; GFX6-NEXT: v_mul_lo_u32 v1, s11, v1 9850; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9851; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 9852; GFX6-NEXT: v_mul_lo_u32 v4, s11, v0 9853; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0 9854; GFX6-NEXT: s_movk_i32 s0, 0xfff 9855; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 9856; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 9857; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc 9858; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 9859; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc 9860; GFX6-NEXT: v_mul_lo_u32 v4, v1, s0 9861; GFX6-NEXT: v_mul_hi_u32 v5, v0, s0 9862; GFX6-NEXT: v_add_i32_e32 v2, vcc, 2, v0 9863; GFX6-NEXT: v_mul_lo_u32 v8, v0, s0 9864; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 9865; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v0 9866; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc 9867; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 9868; GFX6-NEXT: v_mov_b32_e32 v5, s11 9869; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s10, v8 9870; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc 9871; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s0, v8 9872; GFX6-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v4, vcc 9873; GFX6-NEXT: s_movk_i32 s0, 0xffe 9874; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s0, v5 9875; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 9876; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 9877; GFX6-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc 9878; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v8 9879; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 9880; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 
9881; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 9882; GFX6-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[0:1] 9883; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc 9884; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 9885; GFX6-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[0:1] 9886; GFX6-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc 9887; GFX6-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[0:1] 9888; GFX6-NEXT: v_mov_b32_e32 v0, s2 9889; GFX6-NEXT: v_mov_b32_e32 v1, s3 9890; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 9891; GFX6-NEXT: s_endpgm 9892; 9893; GFX9-LABEL: udiv_v2i64_mixed_pow2k_denom: 9894; GFX9: ; %bb.0: 9895; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f800000 9896; GFX9-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 9897; GFX9-NEXT: v_rcp_f32_e32 v0, v0 9898; GFX9-NEXT: s_movk_i32 s4, 0xf001 9899; GFX9-NEXT: v_mov_b32_e32 v7, 0 9900; GFX9-NEXT: v_mov_b32_e32 v5, 0 9901; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 9902; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 9903; GFX9-NEXT: v_trunc_f32_e32 v1, v1 9904; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 9905; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 9906; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 9907; GFX9-NEXT: s_movk_i32 s8, 0xfff 9908; GFX9-NEXT: v_mul_hi_u32 v2, v0, s4 9909; GFX9-NEXT: v_mul_lo_u32 v4, v1, s4 9910; GFX9-NEXT: v_mul_lo_u32 v3, v0, s4 9911; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 9912; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 9913; GFX9-NEXT: v_mul_hi_u32 v6, v0, v3 9914; GFX9-NEXT: v_mul_lo_u32 v4, v0, v2 9915; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 9916; GFX9-NEXT: v_mul_hi_u32 v9, v1, v2 9917; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 9918; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v6, v4 9919; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v8, vcc 9920; GFX9-NEXT: v_mul_lo_u32 v8, v1, v3 9921; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 9922; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 9923; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v3, vcc 9924; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc 9925; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 9926; GFX9-NEXT: 
v_addc_co_u32_e32 v3, vcc, v7, v4, vcc 9927; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 9928; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3] 9929; GFX9-NEXT: v_mul_hi_u32 v4, v0, s4 9930; GFX9-NEXT: v_mul_lo_u32 v6, v2, s4 9931; GFX9-NEXT: v_mul_lo_u32 v8, v0, s4 9932; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 9933; GFX9-NEXT: v_sub_u32_e32 v4, v4, v0 9934; GFX9-NEXT: v_add_u32_e32 v4, v4, v6 9935; GFX9-NEXT: v_mul_lo_u32 v6, v0, v4 9936; GFX9-NEXT: v_mul_hi_u32 v9, v0, v8 9937; GFX9-NEXT: v_mul_hi_u32 v10, v0, v4 9938; GFX9-NEXT: v_mul_hi_u32 v11, v2, v4 9939; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 9940; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v9, v6 9941; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v7, v10, vcc 9942; GFX9-NEXT: v_mul_lo_u32 v10, v2, v8 9943; GFX9-NEXT: v_mul_hi_u32 v8, v2, v8 9944; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 9945; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 9946; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v10 9947; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v9, v8, vcc 9948; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v11, v5, vcc 9949; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v6, v2 9950; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc 9951; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3] 9952; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 9953; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 9954; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9955; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 9956; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 9957; GFX9-NEXT: v_mul_hi_u32 v4, s6, v1 9958; GFX9-NEXT: v_mul_hi_u32 v6, s7, v1 9959; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 9960; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 9961; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v4, vcc 9962; GFX9-NEXT: v_mul_lo_u32 v4, s7, v0 9963; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 9964; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], 12 9965; GFX9-NEXT: s_movk_i32 s4, 0xffe 9966; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 9967; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 9968; GFX9-NEXT: v_addc_co_u32_e32 v2, 
vcc, v6, v5, vcc 9969; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 9970; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v2, vcc 9971; GFX9-NEXT: v_mul_lo_u32 v2, v1, s8 9972; GFX9-NEXT: v_mul_hi_u32 v3, v0, s8 9973; GFX9-NEXT: v_mul_lo_u32 v4, v0, s8 9974; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 9975; GFX9-NEXT: v_mov_b32_e32 v3, s7 9976; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, s6, v4 9977; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v3, v2, vcc 9978; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s8, v4 9979; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc 9980; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s4, v3 9981; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 9982; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 9983; GFX9-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc 9984; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 9985; GFX9-NEXT: v_cndmask_b32_e64 v3, 1, 2, vcc 9986; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v0, v3 9987; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc 9988; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s4, v4 9989; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc 9990; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 9991; GFX9-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc 9992; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 9993; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v3, vcc 9994; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v6, vcc 9995; GFX9-NEXT: v_mov_b32_e32 v0, s2 9996; GFX9-NEXT: v_mov_b32_e32 v1, s3 9997; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] 9998; GFX9-NEXT: s_endpgm 9999; 10000; GFX90A-LABEL: udiv_v2i64_mixed_pow2k_denom: 10001; GFX90A: ; %bb.0: 10002; GFX90A-NEXT: v_mov_b32_e32 v0, 0x4f800000 10003; GFX90A-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 10004; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 10005; GFX90A-NEXT: s_movk_i32 s8, 0xf001 10006; GFX90A-NEXT: v_mov_b32_e32 v8, 0 10007; GFX90A-NEXT: v_mov_b32_e32 v4, 0 10008; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 10009; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 10010; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 10011; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 10012; 
GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 10013; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 10014; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 10015; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 10016; GFX90A-NEXT: v_mul_hi_u32 v2, v0, s8 10017; GFX90A-NEXT: v_sub_u32_e32 v2, v2, v0 10018; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s8 10019; GFX90A-NEXT: v_add_u32_e32 v2, v2, v3 10020; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s8 10021; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v2 10022; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 10023; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v2 10024; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 10025; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc 10026; GFX90A-NEXT: v_mul_hi_u32 v9, v1, v6 10027; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 10028; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 10029; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v2 10030; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v9, vcc 10031; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v4, vcc 10032; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 10033; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 10034; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v5, vcc 10035; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v2 10036; GFX90A-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[0:1] 10037; GFX90A-NEXT: v_mul_hi_u32 v6, v0, s8 10038; GFX90A-NEXT: v_mul_lo_u32 v5, v2, s8 10039; GFX90A-NEXT: v_sub_u32_e32 v6, v6, v0 10040; GFX90A-NEXT: v_add_u32_e32 v5, v6, v5 10041; GFX90A-NEXT: v_mul_lo_u32 v9, v0, s8 10042; GFX90A-NEXT: v_mul_lo_u32 v7, v0, v5 10043; GFX90A-NEXT: v_mul_hi_u32 v10, v0, v9 10044; GFX90A-NEXT: v_mul_hi_u32 v6, v0, v5 10045; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v10, v7 10046; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v8, v6, vcc 10047; GFX90A-NEXT: v_mul_hi_u32 v11, v2, v9 10048; GFX90A-NEXT: v_mul_lo_u32 v9, v2, v9 10049; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 10050; GFX90A-NEXT: v_mul_hi_u32 v10, v2, v5 10051; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v11, vcc 10052; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 
v10, v4, vcc 10053; GFX90A-NEXT: v_mul_lo_u32 v2, v2, v5 10054; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v6, v2 10055; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v7, vcc 10056; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 10057; GFX90A-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1] 10058; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 10059; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 10060; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 10061; GFX90A-NEXT: v_mul_lo_u32 v3, s6, v1 10062; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v0 10063; GFX90A-NEXT: v_mul_hi_u32 v2, s6, v1 10064; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 10065; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v8, v2, vcc 10066; GFX90A-NEXT: v_mul_hi_u32 v6, s7, v0 10067; GFX90A-NEXT: v_mul_lo_u32 v0, s7, v0 10068; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 10069; GFX90A-NEXT: v_mul_hi_u32 v5, s7, v1 10070; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v2, v6, vcc 10071; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v4, vcc 10072; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 10073; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 10074; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v2, vcc 10075; GFX90A-NEXT: s_movk_i32 s0, 0xfff 10076; GFX90A-NEXT: v_mul_lo_u32 v2, v1, s0 10077; GFX90A-NEXT: v_mul_hi_u32 v3, v0, s0 10078; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 10079; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s0 10080; GFX90A-NEXT: v_mov_b32_e32 v5, s7 10081; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, s6, v3 10082; GFX90A-NEXT: v_subb_co_u32_e32 v2, vcc, v5, v2, vcc 10083; GFX90A-NEXT: v_subrev_co_u32_e32 v5, vcc, s0, v3 10084; GFX90A-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc 10085; GFX90A-NEXT: s_movk_i32 s0, 0xffe 10086; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s0, v5 10087; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 10088; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 10089; GFX90A-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc 10090; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 10091; GFX90A-NEXT: v_cndmask_b32_e64 v5, 1, 2, vcc 10092; GFX90A-NEXT: 
v_add_co_u32_e32 v5, vcc, v0, v5 10093; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc 10094; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s0, v3 10095; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 10096; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 10097; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v3, vcc 10098; GFX90A-NEXT: s_lshr_b64 s[4:5], s[4:5], 12 10099; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 10100; GFX90A-NEXT: v_cndmask_b32_e32 v2, v0, v5, vcc 10101; GFX90A-NEXT: v_cndmask_b32_e32 v3, v1, v6, vcc 10102; GFX90A-NEXT: v_mov_b32_e32 v0, s4 10103; GFX90A-NEXT: v_mov_b32_e32 v1, s5 10104; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 10105; GFX90A-NEXT: s_endpgm 10106 %r = udiv <2 x i64> %x, <i64 4096, i64 4095> 10107 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 10108 ret void 10109} 10110 10111define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 10112; CHECK-LABEL: @udiv_v2i64_pow2_shl_denom( 10113; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 10114; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 10115; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 10116; CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[TMP1]], [[TMP2]] 10117; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 10118; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 10119; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 10120; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP5]], [[TMP6]] 10121; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 10122; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 10123; CHECK-NEXT: ret void 10124; 10125; GFX6-LABEL: udiv_v2i64_pow2_shl_denom: 10126; GFX6: ; %bb.0: 10127; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 10128; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 10129; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 
0x11 10130; GFX6-NEXT: s_mov_b32 s7, 0xf000 10131; GFX6-NEXT: s_mov_b32 s6, -1 10132; GFX6-NEXT: s_waitcnt lgkmcnt(0) 10133; GFX6-NEXT: s_add_i32 s0, s0, 12 10134; GFX6-NEXT: s_add_i32 s2, s2, 12 10135; GFX6-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 10136; GFX6-NEXT: s_lshr_b64 s[2:3], s[10:11], s2 10137; GFX6-NEXT: v_mov_b32_e32 v0, s0 10138; GFX6-NEXT: v_mov_b32_e32 v1, s1 10139; GFX6-NEXT: v_mov_b32_e32 v2, s2 10140; GFX6-NEXT: v_mov_b32_e32 v3, s3 10141; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 10142; GFX6-NEXT: s_endpgm 10143; 10144; GFX9-LABEL: udiv_v2i64_pow2_shl_denom: 10145; GFX9: ; %bb.0: 10146; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 10147; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 10148; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 10149; GFX9-NEXT: v_mov_b32_e32 v4, 0 10150; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10151; GFX9-NEXT: s_add_i32 s0, s8, 12 10152; GFX9-NEXT: s_add_i32 s8, s10, 12 10153; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], s0 10154; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], s8 10155; GFX9-NEXT: v_mov_b32_e32 v0, s0 10156; GFX9-NEXT: v_mov_b32_e32 v1, s1 10157; GFX9-NEXT: v_mov_b32_e32 v2, s4 10158; GFX9-NEXT: v_mov_b32_e32 v3, s5 10159; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 10160; GFX9-NEXT: s_endpgm 10161; 10162; GFX90A-LABEL: udiv_v2i64_pow2_shl_denom: 10163; GFX90A: ; %bb.0: 10164; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 10165; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 10166; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 10167; GFX90A-NEXT: v_mov_b32_e32 v4, 0 10168; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 10169; GFX90A-NEXT: s_add_i32 s0, s8, 12 10170; GFX90A-NEXT: s_add_i32 s8, s10, 12 10171; GFX90A-NEXT: s_lshr_b64 s[0:1], s[4:5], s0 10172; GFX90A-NEXT: s_lshr_b64 s[4:5], s[6:7], s8 10173; GFX90A-NEXT: v_mov_b32_e32 v0, s0 10174; GFX90A-NEXT: v_mov_b32_e32 v1, s1 10175; GFX90A-NEXT: v_mov_b32_e32 v2, s4 10176; GFX90A-NEXT: v_mov_b32_e32 v3, s5 10177; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], 
s[2:3] 10178; GFX90A-NEXT: s_endpgm 10179 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 10180 %r = udiv <2 x i64> %x, %shl.y 10181 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 10182 ret void 10183} 10184 10185define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 10186; CHECK-LABEL: @urem_i64_oddk_denom( 10187; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], 1235195393993 10188; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 10189; CHECK-NEXT: ret void 10190; 10191; GFX6-LABEL: urem_i64_oddk_denom: 10192; GFX6: ; %bb.0: 10193; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f1761f8 10194; GFX6-NEXT: v_mov_b32_e32 v1, 0x4f800000 10195; GFX6-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 10196; GFX6-NEXT: v_rcp_f32_e32 v0, v0 10197; GFX6-NEXT: s_movk_i32 s2, 0xfee0 10198; GFX6-NEXT: s_mov_b32 s3, 0x689e0837 10199; GFX6-NEXT: v_mov_b32_e32 v8, 0 10200; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 10201; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 10202; GFX6-NEXT: v_trunc_f32_e32 v1, v1 10203; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 10204; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 10205; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 10206; GFX6-NEXT: v_mov_b32_e32 v7, 0 10207; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 10208; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 10209; GFX6-NEXT: v_mul_hi_u32 v3, v0, s3 10210; GFX6-NEXT: v_mul_lo_u32 v4, v1, s3 10211; GFX6-NEXT: s_mov_b32 s12, 0x9761f7c9 10212; GFX6-NEXT: s_waitcnt lgkmcnt(0) 10213; GFX6-NEXT: s_mov_b32 s8, s4 10214; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10215; GFX6-NEXT: v_mul_lo_u32 v3, v0, s3 10216; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 10217; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 10218; GFX6-NEXT: v_mul_hi_u32 v6, v0, v3 10219; GFX6-NEXT: v_mul_hi_u32 v4, v0, v2 10220; GFX6-NEXT: v_mul_hi_u32 v9, v1, v2 10221; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 10222; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 10223; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 10224; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 10225; GFX6-NEXT: 
v_addc_u32_e32 v4, vcc, v8, v4, vcc 10226; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 10227; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc 10228; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc 10229; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10230; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 10231; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 10232; GFX6-NEXT: v_mul_lo_u32 v4, v0, s2 10233; GFX6-NEXT: v_mul_hi_u32 v5, v0, s3 10234; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 10235; GFX6-NEXT: v_mul_lo_u32 v6, v2, s3 10236; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 10237; GFX6-NEXT: v_mul_lo_u32 v5, v0, s3 10238; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 10239; GFX6-NEXT: v_mul_lo_u32 v6, v0, v4 10240; GFX6-NEXT: v_mul_hi_u32 v9, v0, v5 10241; GFX6-NEXT: v_mul_hi_u32 v10, v0, v4 10242; GFX6-NEXT: v_mul_hi_u32 v11, v2, v4 10243; GFX6-NEXT: s_movk_i32 s4, 0x11f 10244; GFX6-NEXT: v_add_i32_e32 v6, vcc, v9, v6 10245; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v8, v10, vcc 10246; GFX6-NEXT: v_mul_lo_u32 v10, v2, v5 10247; GFX6-NEXT: v_mul_hi_u32 v5, v2, v5 10248; GFX6-NEXT: v_mul_lo_u32 v2, v2, v4 10249; GFX6-NEXT: s_mov_b32 s9, s5 10250; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v10 10251; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v9, v5, vcc 10252; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v11, v7, vcc 10253; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 10254; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc 10255; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 10256; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] 10257; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 10258; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 10259; GFX6-NEXT: v_mul_lo_u32 v2, s6, v1 10260; GFX6-NEXT: v_mul_hi_u32 v3, s6, v0 10261; GFX6-NEXT: v_mul_hi_u32 v4, s6, v1 10262; GFX6-NEXT: v_mul_hi_u32 v5, s7, v1 10263; GFX6-NEXT: v_mul_lo_u32 v1, s7, v1 10264; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10265; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 10266; GFX6-NEXT: v_mul_lo_u32 v4, s7, v0 10267; GFX6-NEXT: v_mul_hi_u32 v0, s7, 
v0 10268; GFX6-NEXT: s_movk_i32 s5, 0x11e 10269; GFX6-NEXT: s_mov_b32 s11, 0xf000 10270; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 10271; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 10272; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc 10273; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 10274; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc 10275; GFX6-NEXT: v_mul_lo_u32 v2, v0, s4 10276; GFX6-NEXT: v_mul_hi_u32 v3, v0, s12 10277; GFX6-NEXT: v_mul_lo_u32 v1, v1, s12 10278; GFX6-NEXT: v_mul_lo_u32 v0, v0, s12 10279; GFX6-NEXT: s_mov_b32 s10, -1 10280; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10281; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 10282; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s7, v1 10283; GFX6-NEXT: v_mov_b32_e32 v3, s4 10284; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 10285; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc 10286; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s12, v0 10287; GFX6-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] 10288; GFX6-NEXT: v_cmp_lt_u32_e64 s[2:3], s5, v5 10289; GFX6-NEXT: s_mov_b32 s6, 0x9761f7c8 10290; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] 10291; GFX6-NEXT: v_cmp_lt_u32_e64 s[2:3], s6, v4 10292; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] 10293; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 10294; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, v5 10295; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s12, v4 10296; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] 10297; GFX6-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] 10298; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 10299; GFX6-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] 10300; GFX6-NEXT: v_mov_b32_e32 v5, s7 10301; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc 10302; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s5, v1 10303; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 10304; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s6, v0 10305; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 10306; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s4, v1 10307; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 10308; GFX6-NEXT: 
v_cmp_ne_u32_e32 vcc, 0, v5 10309; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 10310; GFX6-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] 10311; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 10312; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 10313; GFX6-NEXT: s_endpgm 10314; 10315; GFX9-LABEL: urem_i64_oddk_denom: 10316; GFX9: ; %bb.0: 10317; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f1761f8 10318; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000 10319; GFX9-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 10320; GFX9-NEXT: v_rcp_f32_e32 v0, v0 10321; GFX9-NEXT: s_movk_i32 s4, 0xfee0 10322; GFX9-NEXT: s_mov_b32 s5, 0x689e0837 10323; GFX9-NEXT: v_mov_b32_e32 v8, 0 10324; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 10325; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 10326; GFX9-NEXT: v_trunc_f32_e32 v1, v1 10327; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 10328; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 10329; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 10330; GFX9-NEXT: v_mov_b32_e32 v5, 0 10331; GFX9-NEXT: s_movk_i32 s8, 0x11f 10332; GFX9-NEXT: v_mul_lo_u32 v2, v0, s4 10333; GFX9-NEXT: v_mul_hi_u32 v3, v0, s5 10334; GFX9-NEXT: v_mul_lo_u32 v4, v1, s5 10335; GFX9-NEXT: v_mul_lo_u32 v6, v0, s5 10336; GFX9-NEXT: s_mov_b32 s9, 0x9761f7c9 10337; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 10338; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 10339; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 10340; GFX9-NEXT: v_mul_hi_u32 v4, v0, v6 10341; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 10342; GFX9-NEXT: v_mul_hi_u32 v9, v1, v2 10343; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 10344; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 10345; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v7, vcc 10346; GFX9-NEXT: v_mul_lo_u32 v7, v1, v6 10347; GFX9-NEXT: v_mul_hi_u32 v6, v1, v6 10348; GFX9-NEXT: s_mov_b32 s10, 0x9761f7c8 10349; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 10350; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v6, vcc 10351; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc 10352; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 10353; GFX9-NEXT: 
v_addc_co_u32_e32 v3, vcc, v8, v4, vcc 10354; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 10355; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3] 10356; GFX9-NEXT: v_mul_lo_u32 v4, v0, s4 10357; GFX9-NEXT: v_mul_hi_u32 v6, v0, s5 10358; GFX9-NEXT: v_mul_lo_u32 v7, v2, s5 10359; GFX9-NEXT: v_mul_lo_u32 v9, v0, s5 10360; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 10361; GFX9-NEXT: v_add_u32_e32 v4, v6, v4 10362; GFX9-NEXT: v_add_u32_e32 v4, v4, v7 10363; GFX9-NEXT: v_mul_lo_u32 v6, v0, v4 10364; GFX9-NEXT: v_mul_hi_u32 v7, v0, v9 10365; GFX9-NEXT: v_mul_hi_u32 v10, v0, v4 10366; GFX9-NEXT: v_mul_hi_u32 v11, v2, v4 10367; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 10368; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 10369; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v8, v10, vcc 10370; GFX9-NEXT: v_mul_lo_u32 v10, v2, v9 10371; GFX9-NEXT: v_mul_hi_u32 v9, v2, v9 10372; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 10373; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v10 10374; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v9, vcc 10375; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v11, v5, vcc 10376; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v6, v2 10377; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v4, vcc 10378; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3] 10379; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 10380; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 10381; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10382; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 10383; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 10384; GFX9-NEXT: v_mul_hi_u32 v4, s6, v1 10385; GFX9-NEXT: v_mul_hi_u32 v6, s7, v1 10386; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 10387; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 10388; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v4, vcc 10389; GFX9-NEXT: v_mul_lo_u32 v4, s7, v0 10390; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 10391; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 10392; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 10393; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v5, vcc 10394; GFX9-NEXT: v_add_co_u32_e32 v0, 
vcc, v0, v1 10395; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v2, vcc 10396; GFX9-NEXT: v_mul_lo_u32 v2, v0, s8 10397; GFX9-NEXT: v_mul_hi_u32 v3, v0, s9 10398; GFX9-NEXT: v_mul_lo_u32 v1, v1, s9 10399; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9 10400; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 10401; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 10402; GFX9-NEXT: v_sub_u32_e32 v2, s7, v1 10403; GFX9-NEXT: v_mov_b32_e32 v3, s8 10404; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 10405; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, vcc 10406; GFX9-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s9, v0 10407; GFX9-NEXT: v_subbrev_co_u32_e64 v6, s[2:3], 0, v2, s[0:1] 10408; GFX9-NEXT: s_movk_i32 s6, 0x11e 10409; GFX9-NEXT: v_cmp_lt_u32_e64 s[2:3], s6, v6 10410; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 10411; GFX9-NEXT: v_cmp_lt_u32_e64 s[2:3], s10, v4 10412; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1] 10413; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] 10414; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s8, v6 10415; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s9, v4 10416; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] 10417; GFX9-NEXT: v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1] 10418; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 10419; GFX9-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] 10420; GFX9-NEXT: v_mov_b32_e32 v4, s7 10421; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v4, v1, vcc 10422; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s6, v1 10423; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc 10424; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s10, v0 10425; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[0:1] 10426; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 10427; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s8, v1 10428; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc 10429; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 10430; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 10431; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 10432; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] 10433; GFX9-NEXT: s_endpgm 10434; 10435; GFX90A-LABEL: 
urem_i64_oddk_denom: 10436; GFX90A: ; %bb.0: 10437; GFX90A-NEXT: v_mov_b32_e32 v0, 0x4f1761f8 10438; GFX90A-NEXT: v_mov_b32_e32 v1, 0x4f800000 10439; GFX90A-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 10440; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 10441; GFX90A-NEXT: s_movk_i32 s2, 0xfee0 10442; GFX90A-NEXT: s_mov_b32 s3, 0x689e0837 10443; GFX90A-NEXT: v_mov_b32_e32 v8, 0 10444; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 10445; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 10446; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 10447; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 10448; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 10449; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 10450; GFX90A-NEXT: v_mov_b32_e32 v2, 0 10451; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 10452; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s2 10453; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s3 10454; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 10455; GFX90A-NEXT: v_mul_lo_u32 v4, v1, s3 10456; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 10457; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s3 10458; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 10459; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 10460; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 10461; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 10462; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v4, vcc 10463; GFX90A-NEXT: v_mul_hi_u32 v9, v1, v6 10464; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 10465; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 10466; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 10467; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v9, vcc 10468; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc 10469; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 10470; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 10471; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v5, vcc 10472; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v3 10473; GFX90A-NEXT: v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1] 10474; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s2 10475; GFX90A-NEXT: v_mul_hi_u32 v7, v0, s3 10476; GFX90A-NEXT: v_mul_lo_u32 v5, v3, s3 10477; GFX90A-NEXT: 
v_add_u32_e32 v6, v7, v6 10478; GFX90A-NEXT: v_add_u32_e32 v5, v6, v5 10479; GFX90A-NEXT: v_mul_lo_u32 v9, v0, s3 10480; GFX90A-NEXT: v_mul_lo_u32 v7, v0, v5 10481; GFX90A-NEXT: v_mul_hi_u32 v10, v0, v9 10482; GFX90A-NEXT: v_mul_hi_u32 v6, v0, v5 10483; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v10, v7 10484; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v8, v6, vcc 10485; GFX90A-NEXT: v_mul_hi_u32 v11, v3, v9 10486; GFX90A-NEXT: v_mul_lo_u32 v9, v3, v9 10487; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 10488; GFX90A-NEXT: v_mul_hi_u32 v10, v3, v5 10489; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v11, vcc 10490; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v10, v2, vcc 10491; GFX90A-NEXT: v_mul_lo_u32 v3, v3, v5 10492; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v6, v3 10493; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v7, vcc 10494; GFX90A-NEXT: v_add_u32_e32 v1, v1, v4 10495; GFX90A-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1] 10496; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 10497; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 10498; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 10499; GFX90A-NEXT: v_mul_lo_u32 v4, s6, v1 10500; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v0 10501; GFX90A-NEXT: v_mul_hi_u32 v3, s6, v1 10502; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 10503; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc 10504; GFX90A-NEXT: v_mul_hi_u32 v6, s7, v0 10505; GFX90A-NEXT: v_mul_lo_u32 v0, s7, v0 10506; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 10507; GFX90A-NEXT: v_mul_hi_u32 v5, s7, v1 10508; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v6, vcc 10509; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v2, vcc 10510; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 10511; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 10512; GFX90A-NEXT: s_movk_i32 s8, 0x11f 10513; GFX90A-NEXT: s_mov_b32 s9, 0x9761f7c9 10514; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v3, vcc 10515; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s8 10516; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s9 10517; GFX90A-NEXT: v_add_u32_e32 v3, 
v4, v3 10518; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s9 10519; GFX90A-NEXT: v_add_u32_e32 v1, v3, v1 10520; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s9 10521; GFX90A-NEXT: v_sub_u32_e32 v3, s7, v1 10522; GFX90A-NEXT: v_mov_b32_e32 v4, s8 10523; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 10524; GFX90A-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, vcc 10525; GFX90A-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s9, v0 10526; GFX90A-NEXT: v_subbrev_co_u32_e64 v6, s[2:3], 0, v3, s[0:1] 10527; GFX90A-NEXT: s_movk_i32 s6, 0x11e 10528; GFX90A-NEXT: v_cmp_lt_u32_e64 s[2:3], s6, v6 10529; GFX90A-NEXT: s_mov_b32 s10, 0x9761f7c8 10530; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 10531; GFX90A-NEXT: v_cmp_lt_u32_e64 s[2:3], s10, v5 10532; GFX90A-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1] 10533; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] 10534; GFX90A-NEXT: v_cmp_eq_u32_e64 s[2:3], s8, v6 10535; GFX90A-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s9, v5 10536; GFX90A-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] 10537; GFX90A-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] 10538; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 10539; GFX90A-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[0:1] 10540; GFX90A-NEXT: v_mov_b32_e32 v5, s7 10541; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v5, v1, vcc 10542; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s6, v1 10543; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 10544; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s10, v0 10545; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[0:1] 10546; GFX90A-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 10547; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s8, v1 10548; GFX90A-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 10549; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 10550; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 10551; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 10552; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 10553; GFX90A-NEXT: s_endpgm 10554 %r = urem i64 %x, 1235195393993 10555 store i64 %r, i64 addrspace(1)* %out 10556 ret void 10557} 10558 
10559define amdgpu_kernel void @urem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 10560; CHECK-LABEL: @urem_i64_pow2k_denom( 10561; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], 4096 10562; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 10563; CHECK-NEXT: ret void 10564; 10565; GFX6-LABEL: urem_i64_pow2k_denom: 10566; GFX6: ; %bb.0: 10567; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 10568; GFX6-NEXT: s_mov_b32 s3, 0xf000 10569; GFX6-NEXT: s_mov_b32 s2, -1 10570; GFX6-NEXT: v_mov_b32_e32 v1, 0 10571; GFX6-NEXT: s_waitcnt lgkmcnt(0) 10572; GFX6-NEXT: s_mov_b32 s0, s4 10573; GFX6-NEXT: s_and_b32 s4, s6, 0xfff 10574; GFX6-NEXT: s_mov_b32 s1, s5 10575; GFX6-NEXT: v_mov_b32_e32 v0, s4 10576; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 10577; GFX6-NEXT: s_endpgm 10578; 10579; GFX9-LABEL: urem_i64_pow2k_denom: 10580; GFX9: ; %bb.0: 10581; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 10582; GFX9-NEXT: v_mov_b32_e32 v1, 0 10583; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10584; GFX9-NEXT: s_and_b32 s2, s2, 0xfff 10585; GFX9-NEXT: v_mov_b32_e32 v0, s2 10586; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] 10587; GFX9-NEXT: s_endpgm 10588; 10589; GFX90A-LABEL: urem_i64_pow2k_denom: 10590; GFX90A: ; %bb.0: 10591; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 10592; GFX90A-NEXT: v_mov_b32_e32 v1, 0 10593; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 10594; GFX90A-NEXT: s_and_b32 s2, s2, 0xfff 10595; GFX90A-NEXT: v_mov_b32_e32 v0, s2 10596; GFX90A-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] 10597; GFX90A-NEXT: s_endpgm 10598 %r = urem i64 %x, 4096 10599 store i64 %r, i64 addrspace(1)* %out 10600 ret void 10601} 10602 10603define amdgpu_kernel void @urem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 10604; CHECK-LABEL: @urem_i64_pow2_shl_denom( 10605; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 10606; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], [[SHL_Y]] 10607; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 
10608; CHECK-NEXT: ret void 10609; 10610; GFX6-LABEL: urem_i64_pow2_shl_denom: 10611; GFX6: ; %bb.0: 10612; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 10613; GFX6-NEXT: s_load_dword s8, s[0:1], 0xd 10614; GFX6-NEXT: s_mov_b32 s3, 0xf000 10615; GFX6-NEXT: s_mov_b32 s2, -1 10616; GFX6-NEXT: s_waitcnt lgkmcnt(0) 10617; GFX6-NEXT: s_mov_b32 s0, s4 10618; GFX6-NEXT: s_mov_b32 s1, s5 10619; GFX6-NEXT: s_mov_b64 s[4:5], 0x1000 10620; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s8 10621; GFX6-NEXT: s_add_u32 s4, s4, -1 10622; GFX6-NEXT: s_addc_u32 s5, s5, -1 10623; GFX6-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] 10624; GFX6-NEXT: v_mov_b32_e32 v0, s4 10625; GFX6-NEXT: v_mov_b32_e32 v1, s5 10626; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 10627; GFX6-NEXT: s_endpgm 10628; 10629; GFX9-LABEL: urem_i64_pow2_shl_denom: 10630; GFX9: ; %bb.0: 10631; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 10632; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 10633; GFX9-NEXT: s_mov_b64 s[0:1], 0x1000 10634; GFX9-NEXT: v_mov_b32_e32 v2, 0 10635; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10636; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 10637; GFX9-NEXT: s_add_u32 s0, s0, -1 10638; GFX9-NEXT: s_addc_u32 s1, s1, -1 10639; GFX9-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1] 10640; GFX9-NEXT: v_mov_b32_e32 v0, s0 10641; GFX9-NEXT: v_mov_b32_e32 v1, s1 10642; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 10643; GFX9-NEXT: s_endpgm 10644; 10645; GFX90A-LABEL: urem_i64_pow2_shl_denom: 10646; GFX90A: ; %bb.0: 10647; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 10648; GFX90A-NEXT: s_load_dword s2, s[0:1], 0x34 10649; GFX90A-NEXT: s_mov_b64 s[0:1], 0x1000 10650; GFX90A-NEXT: v_mov_b32_e32 v2, 0 10651; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 10652; GFX90A-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 10653; GFX90A-NEXT: s_add_u32 s0, s0, -1 10654; GFX90A-NEXT: s_addc_u32 s1, s1, -1 10655; GFX90A-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1] 10656; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 10657; GFX90A-NEXT: 
global_store_dwordx2 v2, v[0:1], s[4:5] 10658; GFX90A-NEXT: s_endpgm 10659 %shl.y = shl i64 4096, %y 10660 %r = urem i64 %x, %shl.y 10661 store i64 %r, i64 addrspace(1)* %out 10662 ret void 10663} 10664 10665define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 10666; CHECK-LABEL: @urem_v2i64_pow2k_denom( 10667; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 10668; CHECK-NEXT: [[TMP2:%.*]] = urem i64 [[TMP1]], 4096 10669; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 10670; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 10671; CHECK-NEXT: [[TMP5:%.*]] = urem i64 [[TMP4]], 4096 10672; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 10673; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 10674; CHECK-NEXT: ret void 10675; 10676; GFX6-LABEL: urem_v2i64_pow2k_denom: 10677; GFX6: ; %bb.0: 10678; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 10679; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 10680; GFX6-NEXT: s_movk_i32 s8, 0xfff 10681; GFX6-NEXT: v_mov_b32_e32 v1, 0 10682; GFX6-NEXT: s_mov_b32 s7, 0xf000 10683; GFX6-NEXT: s_mov_b32 s6, -1 10684; GFX6-NEXT: s_waitcnt lgkmcnt(0) 10685; GFX6-NEXT: s_and_b32 s0, s0, s8 10686; GFX6-NEXT: s_and_b32 s1, s2, s8 10687; GFX6-NEXT: v_mov_b32_e32 v0, s0 10688; GFX6-NEXT: v_mov_b32_e32 v2, s1 10689; GFX6-NEXT: v_mov_b32_e32 v3, v1 10690; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 10691; GFX6-NEXT: s_endpgm 10692; 10693; GFX9-LABEL: urem_v2i64_pow2k_denom: 10694; GFX9: ; %bb.0: 10695; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 10696; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 10697; GFX9-NEXT: s_movk_i32 s0, 0xfff 10698; GFX9-NEXT: v_mov_b32_e32 v1, 0 10699; GFX9-NEXT: v_mov_b32_e32 v3, v1 10700; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10701; GFX9-NEXT: s_and_b32 s1, s4, s0 10702; GFX9-NEXT: s_and_b32 s0, s6, s0 10703; GFX9-NEXT: 
v_mov_b32_e32 v0, s1 10704; GFX9-NEXT: v_mov_b32_e32 v2, s0 10705; GFX9-NEXT: global_store_dwordx4 v1, v[0:3], s[2:3] 10706; GFX9-NEXT: s_endpgm 10707; 10708; GFX90A-LABEL: urem_v2i64_pow2k_denom: 10709; GFX90A: ; %bb.0: 10710; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 10711; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 10712; GFX90A-NEXT: s_movk_i32 s0, 0xfff 10713; GFX90A-NEXT: v_mov_b32_e32 v1, 0 10714; GFX90A-NEXT: v_mov_b32_e32 v3, v1 10715; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 10716; GFX90A-NEXT: s_and_b32 s1, s4, s0 10717; GFX90A-NEXT: s_and_b32 s0, s6, s0 10718; GFX90A-NEXT: v_mov_b32_e32 v0, s1 10719; GFX90A-NEXT: v_mov_b32_e32 v2, s0 10720; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[2:3] 10721; GFX90A-NEXT: s_endpgm 10722 %r = urem <2 x i64> %x, <i64 4096, i64 4096> 10723 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 10724 ret void 10725} 10726 10727define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 10728; CHECK-LABEL: @urem_v2i64_pow2_shl_denom( 10729; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 10730; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 10731; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 10732; CHECK-NEXT: [[TMP3:%.*]] = urem i64 [[TMP1]], [[TMP2]] 10733; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 10734; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 10735; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 10736; CHECK-NEXT: [[TMP7:%.*]] = urem i64 [[TMP5]], [[TMP6]] 10737; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 10738; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 10739; CHECK-NEXT: ret void 10740; 10741; GFX6-LABEL: urem_v2i64_pow2_shl_denom: 10742; GFX6: ; %bb.0: 10743; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 10744; GFX6-NEXT: 
s_load_dwordx4 s[8:11], s[0:1], 0xd 10745; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 10746; GFX6-NEXT: s_mov_b64 s[12:13], 0x1000 10747; GFX6-NEXT: s_mov_b32 s7, 0xf000 10748; GFX6-NEXT: s_mov_b32 s6, -1 10749; GFX6-NEXT: s_waitcnt lgkmcnt(0) 10750; GFX6-NEXT: s_lshl_b64 s[2:3], s[12:13], s2 10751; GFX6-NEXT: s_lshl_b64 s[0:1], s[12:13], s0 10752; GFX6-NEXT: s_add_u32 s0, s0, -1 10753; GFX6-NEXT: s_addc_u32 s1, s1, -1 10754; GFX6-NEXT: s_and_b64 s[0:1], s[8:9], s[0:1] 10755; GFX6-NEXT: s_add_u32 s2, s2, -1 10756; GFX6-NEXT: s_addc_u32 s3, s3, -1 10757; GFX6-NEXT: s_and_b64 s[2:3], s[10:11], s[2:3] 10758; GFX6-NEXT: v_mov_b32_e32 v0, s0 10759; GFX6-NEXT: v_mov_b32_e32 v1, s1 10760; GFX6-NEXT: v_mov_b32_e32 v2, s2 10761; GFX6-NEXT: v_mov_b32_e32 v3, s3 10762; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 10763; GFX6-NEXT: s_endpgm 10764; 10765; GFX9-LABEL: urem_v2i64_pow2_shl_denom: 10766; GFX9: ; %bb.0: 10767; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 10768; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 10769; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 10770; GFX9-NEXT: s_mov_b64 s[0:1], 0x1000 10771; GFX9-NEXT: v_mov_b32_e32 v4, 0 10772; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10773; GFX9-NEXT: s_lshl_b64 s[10:11], s[0:1], s10 10774; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 10775; GFX9-NEXT: s_add_u32 s0, s0, -1 10776; GFX9-NEXT: s_addc_u32 s1, s1, -1 10777; GFX9-NEXT: s_and_b64 s[0:1], s[4:5], s[0:1] 10778; GFX9-NEXT: s_add_u32 s4, s10, -1 10779; GFX9-NEXT: s_addc_u32 s5, s11, -1 10780; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] 10781; GFX9-NEXT: v_mov_b32_e32 v0, s0 10782; GFX9-NEXT: v_mov_b32_e32 v1, s1 10783; GFX9-NEXT: v_mov_b32_e32 v2, s4 10784; GFX9-NEXT: v_mov_b32_e32 v3, s5 10785; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 10786; GFX9-NEXT: s_endpgm 10787; 10788; GFX90A-LABEL: urem_v2i64_pow2_shl_denom: 10789; GFX90A: ; %bb.0: 10790; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 10791; GFX90A-NEXT: s_load_dwordx4 s[4:7], 
s[0:1], 0x34 10792; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 10793; GFX90A-NEXT: s_mov_b64 s[0:1], 0x1000 10794; GFX90A-NEXT: v_mov_b32_e32 v4, 0 10795; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 10796; GFX90A-NEXT: s_lshl_b64 s[10:11], s[0:1], s10 10797; GFX90A-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 10798; GFX90A-NEXT: s_add_u32 s0, s0, -1 10799; GFX90A-NEXT: s_addc_u32 s1, s1, -1 10800; GFX90A-NEXT: s_and_b64 s[0:1], s[4:5], s[0:1] 10801; GFX90A-NEXT: s_add_u32 s4, s10, -1 10802; GFX90A-NEXT: s_addc_u32 s5, s11, -1 10803; GFX90A-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] 10804; GFX90A-NEXT: v_mov_b32_e32 v0, s0 10805; GFX90A-NEXT: v_mov_b32_e32 v1, s1 10806; GFX90A-NEXT: v_mov_b32_e32 v2, s4 10807; GFX90A-NEXT: v_mov_b32_e32 v3, s5 10808; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 10809; GFX90A-NEXT: s_endpgm 10810 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 10811 %r = urem <2 x i64> %x, %shl.y 10812 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 10813 ret void 10814} 10815 10816define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 10817; CHECK-LABEL: @sdiv_i64_oddk_denom( 10818; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], 1235195 10819; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 10820; CHECK-NEXT: ret void 10821; 10822; GFX6-LABEL: sdiv_i64_oddk_denom: 10823; GFX6: ; %bb.0: 10824; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f800000 10825; GFX6-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 10826; GFX6-NEXT: v_rcp_f32_e32 v0, v0 10827; GFX6-NEXT: s_mov_b32 s2, 0xffed2705 10828; GFX6-NEXT: v_mov_b32_e32 v8, 0 10829; GFX6-NEXT: v_mov_b32_e32 v7, 0 10830; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 10831; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 10832; GFX6-NEXT: v_trunc_f32_e32 v1, v1 10833; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 10834; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 10835; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 10836; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 10837; GFX6-NEXT: s_mov_b32 s7, 0xf000 10838; 
GFX6-NEXT: v_mul_lo_u32 v2, v1, s2 10839; GFX6-NEXT: v_mul_hi_u32 v3, v0, s2 10840; GFX6-NEXT: v_mul_lo_u32 v4, v0, s2 10841; GFX6-NEXT: s_mov_b32 s6, -1 10842; GFX6-NEXT: s_waitcnt lgkmcnt(0) 10843; GFX6-NEXT: s_mov_b32 s4, s8 10844; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10845; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 10846; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 10847; GFX6-NEXT: v_mul_hi_u32 v6, v0, v4 10848; GFX6-NEXT: v_mul_hi_u32 v3, v0, v2 10849; GFX6-NEXT: v_mul_hi_u32 v9, v1, v2 10850; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 10851; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 10852; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 10853; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 10854; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v3, vcc 10855; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 10856; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc 10857; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc 10858; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10859; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 10860; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 10861; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 10862; GFX6-NEXT: v_mul_lo_u32 v4, v2, s2 10863; GFX6-NEXT: v_mul_hi_u32 v5, v0, s2 10864; GFX6-NEXT: s_mov_b32 s5, s9 10865; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 10866; GFX6-NEXT: v_mul_lo_u32 v5, v0, s2 10867; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, v0, v4 10868; GFX6-NEXT: v_mul_lo_u32 v10, v0, v4 10869; GFX6-NEXT: v_mul_hi_u32 v11, v0, v5 10870; GFX6-NEXT: v_mul_hi_u32 v12, v0, v4 10871; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5 10872; GFX6-NEXT: v_mul_lo_u32 v5, v2, v5 10873; GFX6-NEXT: v_mul_hi_u32 v6, v2, v4 10874; GFX6-NEXT: v_add_i32_e32 v10, vcc, v11, v10 10875; GFX6-NEXT: v_addc_u32_e32 v11, vcc, v8, v12, vcc 10876; GFX6-NEXT: v_mul_lo_u32 v2, v2, v4 10877; GFX6-NEXT: v_add_i32_e32 v5, vcc, v10, v5 10878; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v11, v9, vcc 10879; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v6, v7, vcc 10880; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 10881; GFX6-NEXT: v_addc_u32_e32 v4, 
vcc, v8, v4, vcc 10882; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 10883; GFX6-NEXT: s_ashr_i32 s2, s11, 31 10884; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] 10885; GFX6-NEXT: s_add_u32 s0, s10, s2 10886; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 10887; GFX6-NEXT: s_mov_b32 s3, s2 10888; GFX6-NEXT: s_addc_u32 s1, s11, s2 10889; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 10890; GFX6-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] 10891; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 10892; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 10893; GFX6-NEXT: v_mul_hi_u32 v4, s0, v1 10894; GFX6-NEXT: v_mul_hi_u32 v5, s1, v1 10895; GFX6-NEXT: v_mul_lo_u32 v1, s1, v1 10896; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10897; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 10898; GFX6-NEXT: v_mul_lo_u32 v4, s1, v0 10899; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 10900; GFX6-NEXT: s_mov_b32 s3, 0x12d8fb 10901; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 10902; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 10903; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc 10904; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 10905; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc 10906; GFX6-NEXT: v_mul_lo_u32 v4, v1, s3 10907; GFX6-NEXT: v_mul_hi_u32 v5, v0, s3 10908; GFX6-NEXT: v_add_i32_e32 v2, vcc, 2, v0 10909; GFX6-NEXT: v_mul_lo_u32 v8, v0, s3 10910; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 10911; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v0 10912; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc 10913; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 10914; GFX6-NEXT: v_mov_b32_e32 v5, s1 10915; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s0, v8 10916; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc 10917; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s3, v8 10918; GFX6-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v4, vcc 10919; GFX6-NEXT: s_mov_b32 s0, 0x12d8fa 10920; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s0, v5 10921; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 10922; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 10923; GFX6-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc 10924; 
GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v8 10925; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 10926; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 10927; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 10928; GFX6-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[0:1] 10929; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 10930; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 10931; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc 10932; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 10933; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 10934; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 10935; GFX6-NEXT: v_xor_b32_e32 v1, s2, v1 10936; GFX6-NEXT: v_mov_b32_e32 v2, s2 10937; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 10938; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 10939; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 10940; GFX6-NEXT: s_endpgm 10941; 10942; GFX9-LABEL: sdiv_i64_oddk_denom: 10943; GFX9: ; %bb.0: 10944; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f800000 10945; GFX9-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 10946; GFX9-NEXT: v_rcp_f32_e32 v0, v0 10947; GFX9-NEXT: s_mov_b32 s8, 0xffed2705 10948; GFX9-NEXT: v_mov_b32_e32 v7, 0 10949; GFX9-NEXT: v_mov_b32_e32 v5, 0 10950; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 10951; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 10952; GFX9-NEXT: v_trunc_f32_e32 v1, v1 10953; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 10954; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 10955; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 10956; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 10957; GFX9-NEXT: v_mul_lo_u32 v2, v1, s8 10958; GFX9-NEXT: v_mul_hi_u32 v3, v0, s8 10959; GFX9-NEXT: v_mul_lo_u32 v4, v0, s8 10960; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10961; GFX9-NEXT: s_ashr_i32 s0, s7, 31 10962; GFX9-NEXT: s_mov_b32 s1, s0 10963; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 10964; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 10965; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 10966; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 10967; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 10968; GFX9-NEXT: v_mul_hi_u32 v9, v1, v2 10969; 
GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 10970; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 10971; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v8, vcc 10972; GFX9-NEXT: v_mul_lo_u32 v8, v1, v4 10973; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 10974; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8 10975; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc 10976; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc 10977; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 10978; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v4, vcc 10979; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 10980; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3] 10981; GFX9-NEXT: v_mul_lo_u32 v4, v2, s8 10982; GFX9-NEXT: v_mul_hi_u32 v6, v0, s8 10983; GFX9-NEXT: v_mul_lo_u32 v8, v0, s8 10984; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 10985; GFX9-NEXT: v_add_u32_e32 v4, v6, v4 10986; GFX9-NEXT: v_sub_u32_e32 v4, v4, v0 10987; GFX9-NEXT: v_mul_lo_u32 v10, v0, v4 10988; GFX9-NEXT: v_mul_hi_u32 v11, v0, v8 10989; GFX9-NEXT: v_mul_hi_u32 v12, v0, v4 10990; GFX9-NEXT: v_mul_hi_u32 v9, v2, v8 10991; GFX9-NEXT: v_mul_lo_u32 v8, v2, v8 10992; GFX9-NEXT: v_mul_hi_u32 v6, v2, v4 10993; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 10994; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v7, v12, vcc 10995; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 10996; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v10, v8 10997; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v9, vcc 10998; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v5, vcc 10999; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 11000; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc 11001; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3] 11002; GFX9-NEXT: s_add_u32 s2, s6, s0 11003; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 11004; GFX9-NEXT: s_addc_u32 s3, s7, s0 11005; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 11006; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1] 11007; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 11008; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 11009; GFX9-NEXT: v_mul_hi_u32 v4, s2, v1 11010; GFX9-NEXT: 
v_mul_hi_u32 v6, s3, v1 11011; GFX9-NEXT: v_mul_lo_u32 v1, s3, v1 11012; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 11013; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v4, vcc 11014; GFX9-NEXT: v_mul_lo_u32 v4, s3, v0 11015; GFX9-NEXT: v_mul_hi_u32 v0, s3, v0 11016; GFX9-NEXT: s_mov_b32 s1, 0x12d8fb 11017; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 11018; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 11019; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v5, vcc 11020; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 11021; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v2, vcc 11022; GFX9-NEXT: v_mul_lo_u32 v2, v1, s1 11023; GFX9-NEXT: v_mul_hi_u32 v3, v0, s1 11024; GFX9-NEXT: v_mul_lo_u32 v4, v0, s1 11025; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 11026; GFX9-NEXT: v_mov_b32_e32 v3, s3 11027; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, s2, v4 11028; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v3, v2, vcc 11029; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s1, v4 11030; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc 11031; GFX9-NEXT: s_mov_b32 s1, 0x12d8fa 11032; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s1, v3 11033; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 11034; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 11035; GFX9-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc 11036; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 11037; GFX9-NEXT: v_cndmask_b32_e64 v3, 1, 2, vcc 11038; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v0, v3 11039; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc 11040; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s1, v4 11041; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc 11042; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 11043; GFX9-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc 11044; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 11045; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 11046; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 11047; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 11048; GFX9-NEXT: v_xor_b32_e32 v1, s0, v1 11049; GFX9-NEXT: v_mov_b32_e32 v2, s0 11050; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 11051; GFX9-NEXT: 
v_subb_co_u32_e32 v1, vcc, v1, v2, vcc 11052; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] 11053; GFX9-NEXT: s_endpgm 11054; 11055; GFX90A-LABEL: sdiv_i64_oddk_denom: 11056; GFX90A: ; %bb.0: 11057; GFX90A-NEXT: v_mov_b32_e32 v0, 0x4f800000 11058; GFX90A-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 11059; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 11060; GFX90A-NEXT: s_mov_b32 s2, 0xffed2705 11061; GFX90A-NEXT: v_mov_b32_e32 v8, 0 11062; GFX90A-NEXT: v_mov_b32_e32 v2, 0 11063; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 11064; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 11065; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 11066; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 11067; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 11068; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 11069; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 11070; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s2 11071; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s2 11072; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 11073; GFX90A-NEXT: v_sub_u32_e32 v3, v3, v0 11074; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s2 11075; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 11076; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 11077; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 11078; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 11079; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v4, vcc 11080; GFX90A-NEXT: v_mul_hi_u32 v9, v1, v6 11081; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 11082; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 11083; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 11084; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v9, vcc 11085; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc 11086; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 11087; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 11088; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v5, vcc 11089; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v3 11090; GFX90A-NEXT: v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1] 11091; GFX90A-NEXT: v_mul_lo_u32 v5, v3, s2 11092; GFX90A-NEXT: v_mul_hi_u32 v6, v0, s2 11093; GFX90A-NEXT: v_add_u32_e32 v5, v6, v5 11094; 
GFX90A-NEXT: v_sub_u32_e32 v5, v5, v0 11095; GFX90A-NEXT: v_mul_lo_u32 v7, v0, s2 11096; GFX90A-NEXT: v_mul_hi_u32 v9, v3, v7 11097; GFX90A-NEXT: v_mul_lo_u32 v10, v3, v7 11098; GFX90A-NEXT: v_mul_lo_u32 v12, v0, v5 11099; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v7 11100; GFX90A-NEXT: v_mul_hi_u32 v11, v0, v5 11101; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v7, v12 11102; GFX90A-NEXT: v_addc_co_u32_e32 v11, vcc, v8, v11, vcc 11103; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v7, v10 11104; GFX90A-NEXT: v_mul_hi_u32 v6, v3, v5 11105; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v11, v9, vcc 11106; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v2, vcc 11107; GFX90A-NEXT: v_mul_lo_u32 v3, v3, v5 11108; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v7, v3 11109; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v6, vcc 11110; GFX90A-NEXT: v_add_u32_e32 v1, v1, v4 11111; GFX90A-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1] 11112; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 11113; GFX90A-NEXT: s_ashr_i32 s0, s7, 31 11114; GFX90A-NEXT: s_add_u32 s2, s6, s0 11115; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 11116; GFX90A-NEXT: s_mov_b32 s1, s0 11117; GFX90A-NEXT: s_addc_u32 s3, s7, s0 11118; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 11119; GFX90A-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1] 11120; GFX90A-NEXT: v_mul_lo_u32 v4, s2, v1 11121; GFX90A-NEXT: v_mul_hi_u32 v5, s2, v0 11122; GFX90A-NEXT: v_mul_hi_u32 v3, s2, v1 11123; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 11124; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc 11125; GFX90A-NEXT: v_mul_hi_u32 v6, s3, v0 11126; GFX90A-NEXT: v_mul_lo_u32 v0, s3, v0 11127; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 11128; GFX90A-NEXT: v_mul_hi_u32 v5, s3, v1 11129; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v6, vcc 11130; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v2, vcc 11131; GFX90A-NEXT: v_mul_lo_u32 v1, s3, v1 11132; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 11133; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v3, vcc 11134; GFX90A-NEXT: s_mov_b32 
s1, 0x12d8fb 11135; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s1 11136; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s1 11137; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 11138; GFX90A-NEXT: v_mul_lo_u32 v4, v0, s1 11139; GFX90A-NEXT: v_mov_b32_e32 v5, s3 11140; GFX90A-NEXT: v_sub_co_u32_e32 v4, vcc, s2, v4 11141; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v3, vcc 11142; GFX90A-NEXT: v_subrev_co_u32_e32 v5, vcc, s1, v4 11143; GFX90A-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v3, vcc 11144; GFX90A-NEXT: s_mov_b32 s1, 0x12d8fa 11145; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s1, v5 11146; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 11147; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 11148; GFX90A-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc 11149; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 11150; GFX90A-NEXT: v_cndmask_b32_e64 v5, 1, 2, vcc 11151; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v0, v5 11152; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc 11153; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s1, v4 11154; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc 11155; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 11156; GFX90A-NEXT: v_cndmask_b32_e32 v3, -1, v4, vcc 11157; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 11158; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc 11159; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 11160; GFX90A-NEXT: v_xor_b32_e32 v0, s0, v0 11161; GFX90A-NEXT: v_xor_b32_e32 v1, s0, v1 11162; GFX90A-NEXT: v_mov_b32_e32 v3, s0 11163; GFX90A-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 11164; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc 11165; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 11166; GFX90A-NEXT: s_endpgm 11167 %r = sdiv i64 %x, 1235195 11168 store i64 %r, i64 addrspace(1)* %out 11169 ret void 11170} 11171 11172define amdgpu_kernel void @sdiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 11173; CHECK-LABEL: @sdiv_i64_pow2k_denom( 11174; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], 4096 11175; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 11176; CHECK-NEXT: ret 
void 11177; 11178; GFX6-LABEL: sdiv_i64_pow2k_denom: 11179; GFX6: ; %bb.0: 11180; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 11181; GFX6-NEXT: s_mov_b32 s7, 0xf000 11182; GFX6-NEXT: s_mov_b32 s6, -1 11183; GFX6-NEXT: s_waitcnt lgkmcnt(0) 11184; GFX6-NEXT: s_mov_b32 s4, s0 11185; GFX6-NEXT: s_ashr_i32 s0, s3, 31 11186; GFX6-NEXT: s_lshr_b32 s0, s0, 20 11187; GFX6-NEXT: s_add_u32 s0, s2, s0 11188; GFX6-NEXT: s_mov_b32 s5, s1 11189; GFX6-NEXT: s_addc_u32 s1, s3, 0 11190; GFX6-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 11191; GFX6-NEXT: v_mov_b32_e32 v0, s0 11192; GFX6-NEXT: v_mov_b32_e32 v1, s1 11193; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 11194; GFX6-NEXT: s_endpgm 11195; 11196; GFX9-LABEL: sdiv_i64_pow2k_denom: 11197; GFX9: ; %bb.0: 11198; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 11199; GFX9-NEXT: v_mov_b32_e32 v2, 0 11200; GFX9-NEXT: s_waitcnt lgkmcnt(0) 11201; GFX9-NEXT: s_ashr_i32 s4, s3, 31 11202; GFX9-NEXT: s_lshr_b32 s4, s4, 20 11203; GFX9-NEXT: s_add_u32 s2, s2, s4 11204; GFX9-NEXT: s_addc_u32 s3, s3, 0 11205; GFX9-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 11206; GFX9-NEXT: v_mov_b32_e32 v0, s2 11207; GFX9-NEXT: v_mov_b32_e32 v1, s3 11208; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 11209; GFX9-NEXT: s_endpgm 11210; 11211; GFX90A-LABEL: sdiv_i64_pow2k_denom: 11212; GFX90A: ; %bb.0: 11213; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 11214; GFX90A-NEXT: v_mov_b32_e32 v2, 0 11215; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 11216; GFX90A-NEXT: s_ashr_i32 s4, s3, 31 11217; GFX90A-NEXT: s_lshr_b32 s4, s4, 20 11218; GFX90A-NEXT: s_add_u32 s2, s2, s4 11219; GFX90A-NEXT: s_addc_u32 s3, s3, 0 11220; GFX90A-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 11221; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] 11222; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 11223; GFX90A-NEXT: s_endpgm 11224 %r = sdiv i64 %x, 4096 11225 store i64 %r, i64 addrspace(1)* %out 11226 ret void 11227} 11228 11229define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 
addrspace(1)* %out, i64 %x, i64 %y) { 11230; CHECK-LABEL: @sdiv_i64_pow2_shl_denom( 11231; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 11232; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], [[SHL_Y]] 11233; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 11234; CHECK-NEXT: ret void 11235; 11236; GFX6-LABEL: sdiv_i64_pow2_shl_denom: 11237; GFX6: ; %bb.0: 11238; GFX6-NEXT: s_load_dword s4, s[0:1], 0xd 11239; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000 11240; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 11241; GFX6-NEXT: s_mov_b32 s7, 0xf000 11242; GFX6-NEXT: s_mov_b32 s6, -1 11243; GFX6-NEXT: s_waitcnt lgkmcnt(0) 11244; GFX6-NEXT: s_lshl_b64 s[4:5], s[2:3], s4 11245; GFX6-NEXT: s_ashr_i32 s2, s5, 31 11246; GFX6-NEXT: s_add_u32 s4, s4, s2 11247; GFX6-NEXT: s_mov_b32 s3, s2 11248; GFX6-NEXT: s_addc_u32 s5, s5, s2 11249; GFX6-NEXT: s_xor_b64 s[12:13], s[4:5], s[2:3] 11250; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s12 11251; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s13 11252; GFX6-NEXT: s_sub_u32 s4, 0, s12 11253; GFX6-NEXT: s_subb_u32 s5, 0, s13 11254; GFX6-NEXT: s_ashr_i32 s14, s11, 31 11255; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 11256; GFX6-NEXT: v_rcp_f32_e32 v0, v0 11257; GFX6-NEXT: s_mov_b32 s15, s14 11258; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 11259; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 11260; GFX6-NEXT: v_trunc_f32_e32 v1, v1 11261; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 11262; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 11263; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 11264; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 11265; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 11266; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0 11267; GFX6-NEXT: v_mul_lo_u32 v4, s4, v0 11268; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 11269; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 11270; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 11271; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 11272; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 11273; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 11274; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 11275; GFX6-NEXT: 
v_add_i32_e32 v3, vcc, v3, v5 11276; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 11277; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 11278; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 11279; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 11280; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc 11281; GFX6-NEXT: v_mov_b32_e32 v4, 0 11282; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc 11283; GFX6-NEXT: v_mov_b32_e32 v6, 0 11284; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 11285; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc 11286; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 11287; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 11288; GFX6-NEXT: v_mul_lo_u32 v5, s4, v2 11289; GFX6-NEXT: v_mul_hi_u32 v7, s4, v0 11290; GFX6-NEXT: v_mul_lo_u32 v8, s5, v0 11291; GFX6-NEXT: s_mov_b32 s5, s9 11292; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5 11293; GFX6-NEXT: v_mul_lo_u32 v7, s4, v0 11294; GFX6-NEXT: v_add_i32_e32 v5, vcc, v8, v5 11295; GFX6-NEXT: v_mul_lo_u32 v10, v0, v5 11296; GFX6-NEXT: v_mul_hi_u32 v11, v0, v7 11297; GFX6-NEXT: v_mul_hi_u32 v12, v0, v5 11298; GFX6-NEXT: v_mul_hi_u32 v9, v2, v7 11299; GFX6-NEXT: v_mul_lo_u32 v7, v2, v7 11300; GFX6-NEXT: v_mul_hi_u32 v8, v2, v5 11301; GFX6-NEXT: v_add_i32_e32 v10, vcc, v11, v10 11302; GFX6-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc 11303; GFX6-NEXT: v_mul_lo_u32 v2, v2, v5 11304; GFX6-NEXT: v_add_i32_e32 v7, vcc, v10, v7 11305; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc 11306; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc 11307; GFX6-NEXT: v_add_i32_e32 v2, vcc, v7, v2 11308; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc 11309; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 11310; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[0:1] 11311; GFX6-NEXT: s_add_u32 s0, s10, s14 11312; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 11313; GFX6-NEXT: s_addc_u32 s1, s11, s14 11314; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 11315; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] 11316; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 11317; GFX6-NEXT: v_mul_hi_u32 v3, s10, 
v0 11318; GFX6-NEXT: v_mul_hi_u32 v5, s10, v1 11319; GFX6-NEXT: v_mul_hi_u32 v7, s11, v1 11320; GFX6-NEXT: v_mul_lo_u32 v1, s11, v1 11321; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 11322; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 11323; GFX6-NEXT: v_mul_lo_u32 v5, s11, v0 11324; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0 11325; GFX6-NEXT: s_mov_b32 s4, s8 11326; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 11327; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 11328; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc 11329; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 11330; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc 11331; GFX6-NEXT: v_mul_lo_u32 v2, s12, v1 11332; GFX6-NEXT: v_mul_hi_u32 v3, s12, v0 11333; GFX6-NEXT: v_mul_lo_u32 v4, s13, v0 11334; GFX6-NEXT: v_mov_b32_e32 v5, s13 11335; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 11336; GFX6-NEXT: v_mul_lo_u32 v3, s12, v0 11337; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 11338; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s11, v2 11339; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s10, v3 11340; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc 11341; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s12, v3 11342; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] 11343; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v4 11344; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 11345; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v5 11346; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 11347; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v4 11348; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] 11349; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 11350; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] 11351; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 1, v0 11352; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] 11353; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 11354; GFX6-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] 11355; GFX6-NEXT: v_mov_b32_e32 v6, s11 11356; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc 11357; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 11358; GFX6-NEXT: 
v_cndmask_b32_e64 v6, 0, -1, vcc 11359; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v3 11360; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 11361; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s13, v2 11362; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc 11363; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 11364; GFX6-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] 11365; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 11366; GFX6-NEXT: s_xor_b64 s[0:1], s[14:15], s[2:3] 11367; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 11368; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 11369; GFX6-NEXT: v_xor_b32_e32 v1, s1, v1 11370; GFX6-NEXT: v_mov_b32_e32 v2, s1 11371; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 11372; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 11373; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 11374; GFX6-NEXT: s_endpgm 11375; 11376; GFX9-LABEL: sdiv_i64_pow2_shl_denom: 11377; GFX9: ; %bb.0: 11378; GFX9-NEXT: s_load_dword s4, s[0:1], 0x34 11379; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 11380; GFX9-NEXT: v_mov_b32_e32 v2, 0 11381; GFX9-NEXT: s_waitcnt lgkmcnt(0) 11382; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 11383; GFX9-NEXT: s_ashr_i32 s8, s3, 31 11384; GFX9-NEXT: s_add_u32 s2, s2, s8 11385; GFX9-NEXT: s_mov_b32 s9, s8 11386; GFX9-NEXT: s_addc_u32 s3, s3, s8 11387; GFX9-NEXT: s_xor_b64 s[10:11], s[2:3], s[8:9] 11388; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s10 11389; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s11 11390; GFX9-NEXT: s_sub_u32 s12, 0, s10 11391; GFX9-NEXT: s_subb_u32 s4, 0, s11 11392; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 11393; GFX9-NEXT: v_rcp_f32_e32 v0, v0 11394; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 11395; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 11396; GFX9-NEXT: v_trunc_f32_e32 v1, v1 11397; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 11398; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 11399; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 11400; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1 11401; GFX9-NEXT: v_mul_hi_u32 v4, s12, v0 11402; GFX9-NEXT: v_mul_lo_u32 v6, s4, v0 11403; GFX9-NEXT: 
v_mul_lo_u32 v5, s12, v0 11404; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 11405; GFX9-NEXT: v_add_u32_e32 v3, v3, v6 11406; GFX9-NEXT: v_mul_hi_u32 v4, v0, v5 11407; GFX9-NEXT: v_mul_lo_u32 v6, v0, v3 11408; GFX9-NEXT: v_mul_hi_u32 v8, v0, v3 11409; GFX9-NEXT: v_mul_hi_u32 v7, v1, v5 11410; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 11411; GFX9-NEXT: v_mul_hi_u32 v9, v1, v3 11412; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 11413; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc 11414; GFX9-NEXT: v_mul_lo_u32 v3, v1, v3 11415; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 11416; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v7, vcc 11417; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v2, vcc 11418; GFX9-NEXT: v_mov_b32_e32 v6, 0 11419; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 11420; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v5, vcc 11421; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v3 11422; GFX9-NEXT: v_addc_co_u32_e64 v3, vcc, v1, v4, s[2:3] 11423; GFX9-NEXT: v_mul_lo_u32 v5, s12, v3 11424; GFX9-NEXT: v_mul_hi_u32 v7, s12, v0 11425; GFX9-NEXT: v_mul_lo_u32 v8, s4, v0 11426; GFX9-NEXT: v_mul_lo_u32 v9, s12, v0 11427; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 11428; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 11429; GFX9-NEXT: v_add_u32_e32 v5, v5, v8 11430; GFX9-NEXT: v_mul_lo_u32 v10, v0, v5 11431; GFX9-NEXT: v_mul_hi_u32 v11, v0, v9 11432; GFX9-NEXT: v_mul_hi_u32 v12, v0, v5 11433; GFX9-NEXT: v_mul_hi_u32 v8, v3, v9 11434; GFX9-NEXT: v_mul_lo_u32 v9, v3, v9 11435; GFX9-NEXT: v_mul_hi_u32 v7, v3, v5 11436; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 11437; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v12, vcc 11438; GFX9-NEXT: v_mul_lo_u32 v3, v3, v5 11439; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 11440; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v8, vcc 11441; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc 11442; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v8, v3 11443; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v5, vcc 11444; GFX9-NEXT: v_add_u32_e32 v1, v1, v4 11445; GFX9-NEXT: 
v_addc_co_u32_e64 v1, vcc, v1, v5, s[2:3] 11446; GFX9-NEXT: s_waitcnt lgkmcnt(0) 11447; GFX9-NEXT: s_ashr_i32 s2, s7, 31 11448; GFX9-NEXT: s_add_u32 s0, s6, s2 11449; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 11450; GFX9-NEXT: s_mov_b32 s3, s2 11451; GFX9-NEXT: s_addc_u32 s1, s7, s2 11452; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 11453; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[2:3] 11454; GFX9-NEXT: v_mul_lo_u32 v3, s6, v1 11455; GFX9-NEXT: v_mul_hi_u32 v4, s6, v0 11456; GFX9-NEXT: v_mul_hi_u32 v5, s6, v1 11457; GFX9-NEXT: v_mul_hi_u32 v7, s7, v1 11458; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 11459; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 11460; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc 11461; GFX9-NEXT: v_mul_lo_u32 v5, s7, v0 11462; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 11463; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 11464; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v4, v0, vcc 11465; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v2, vcc 11466; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 11467; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v3, vcc 11468; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1 11469; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0 11470; GFX9-NEXT: v_mul_lo_u32 v5, s11, v0 11471; GFX9-NEXT: v_mov_b32_e32 v6, s11 11472; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 11473; GFX9-NEXT: v_mul_lo_u32 v4, s10, v0 11474; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 11475; GFX9-NEXT: v_sub_u32_e32 v5, s7, v3 11476; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, s6, v4 11477; GFX9-NEXT: v_subb_co_u32_e64 v5, s[0:1], v5, v6, vcc 11478; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s10, v4 11479; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1] 11480; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v5 11481; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 11482; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v6 11483; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 11484; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v5 11485; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v6, s[0:1] 11486; GFX9-NEXT: v_mov_b32_e32 v7, s7 
11487; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc 11488; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 11489; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 11490; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 11491; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v4 11492; GFX9-NEXT: v_cndmask_b32_e64 v5, 1, 2, s[0:1] 11493; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc 11494; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s11, v3 11495; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v0, v5 11496; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v4, vcc 11497; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], 0, v1, s[0:1] 11498; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 11499; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc 11500; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[8:9] 11501; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 11502; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 11503; GFX9-NEXT: v_xor_b32_e32 v1, s1, v1 11504; GFX9-NEXT: v_mov_b32_e32 v3, s1 11505; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 11506; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc 11507; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 11508; GFX9-NEXT: s_endpgm 11509; 11510; GFX90A-LABEL: sdiv_i64_pow2_shl_denom: 11511; GFX90A: ; %bb.0: 11512; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x34 11513; GFX90A-NEXT: s_mov_b64 s[2:3], 0x1000 11514; GFX90A-NEXT: v_mov_b32_e32 v2, 0 11515; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 11516; GFX90A-NEXT: s_lshl_b64 s[4:5], s[2:3], s4 11517; GFX90A-NEXT: s_ashr_i32 s2, s5, 31 11518; GFX90A-NEXT: s_add_u32 s4, s4, s2 11519; GFX90A-NEXT: s_mov_b32 s3, s2 11520; GFX90A-NEXT: s_addc_u32 s5, s5, s2 11521; GFX90A-NEXT: s_xor_b64 s[8:9], s[4:5], s[2:3] 11522; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s8 11523; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s9 11524; GFX90A-NEXT: s_sub_u32 s10, 0, s8 11525; GFX90A-NEXT: s_subb_u32 s11, 0, s9 11526; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 11527; GFX90A-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 11528; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 11529; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 11530; 
GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 11531; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 11532; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 11533; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 11534; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 11535; GFX90A-NEXT: v_mul_lo_u32 v3, s10, v1 11536; GFX90A-NEXT: v_mul_hi_u32 v5, s10, v0 11537; GFX90A-NEXT: v_mul_lo_u32 v4, s11, v0 11538; GFX90A-NEXT: v_add_u32_e32 v3, v5, v3 11539; GFX90A-NEXT: v_mul_lo_u32 v6, s10, v0 11540; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 11541; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 11542; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 11543; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 11544; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 11545; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc 11546; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 11547; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 11548; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 11549; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 11550; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v8, vcc 11551; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc 11552; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 11553; GFX90A-NEXT: v_mov_b32_e32 v6, 0 11554; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 11555; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v5, vcc 11556; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v3 11557; GFX90A-NEXT: v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1] 11558; GFX90A-NEXT: v_mul_lo_u32 v5, s10, v3 11559; GFX90A-NEXT: v_mul_hi_u32 v7, s10, v0 11560; GFX90A-NEXT: v_add_u32_e32 v5, v7, v5 11561; GFX90A-NEXT: v_mul_lo_u32 v7, s11, v0 11562; GFX90A-NEXT: v_add_u32_e32 v5, v5, v7 11563; GFX90A-NEXT: v_mul_lo_u32 v8, s10, v0 11564; GFX90A-NEXT: v_mul_hi_u32 v9, v3, v8 11565; GFX90A-NEXT: v_mul_lo_u32 v10, v3, v8 11566; GFX90A-NEXT: v_mul_lo_u32 v12, v0, v5 11567; GFX90A-NEXT: v_mul_hi_u32 v8, v0, v8 11568; GFX90A-NEXT: v_mul_hi_u32 v11, v0, v5 11569; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12 11570; GFX90A-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc 11571; GFX90A-NEXT: 
v_add_co_u32_e32 v8, vcc, v8, v10 11572; GFX90A-NEXT: v_mul_hi_u32 v7, v3, v5 11573; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v9, vcc 11574; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v2, vcc 11575; GFX90A-NEXT: v_mul_lo_u32 v3, v3, v5 11576; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v8, v3 11577; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v7, vcc 11578; GFX90A-NEXT: v_add_u32_e32 v1, v1, v4 11579; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 11580; GFX90A-NEXT: s_ashr_i32 s10, s7, 31 11581; GFX90A-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1] 11582; GFX90A-NEXT: s_add_u32 s0, s6, s10 11583; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 11584; GFX90A-NEXT: s_mov_b32 s11, s10 11585; GFX90A-NEXT: s_addc_u32 s1, s7, s10 11586; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 11587; GFX90A-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] 11588; GFX90A-NEXT: v_mul_lo_u32 v4, s6, v1 11589; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v0 11590; GFX90A-NEXT: v_mul_hi_u32 v3, s6, v1 11591; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 11592; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 11593; GFX90A-NEXT: v_mul_hi_u32 v7, s7, v0 11594; GFX90A-NEXT: v_mul_lo_u32 v0, s7, v0 11595; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 11596; GFX90A-NEXT: v_mul_hi_u32 v5, s7, v1 11597; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v7, vcc 11598; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v2, vcc 11599; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 11600; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 11601; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v3, vcc 11602; GFX90A-NEXT: v_mul_lo_u32 v3, s8, v1 11603; GFX90A-NEXT: v_mul_hi_u32 v4, s8, v0 11604; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 11605; GFX90A-NEXT: v_mul_lo_u32 v4, s9, v0 11606; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 11607; GFX90A-NEXT: v_mul_lo_u32 v5, s8, v0 11608; GFX90A-NEXT: v_sub_u32_e32 v4, s7, v3 11609; GFX90A-NEXT: v_mov_b32_e32 v6, s9 11610; GFX90A-NEXT: v_sub_co_u32_e32 v5, vcc, s6, v5 11611; GFX90A-NEXT: v_subb_co_u32_e64 v4, s[0:1], v4, v6, 
vcc 11612; GFX90A-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s8, v5 11613; GFX90A-NEXT: v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1] 11614; GFX90A-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v4 11615; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 11616; GFX90A-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v6 11617; GFX90A-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 11618; GFX90A-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v4 11619; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v6, s[0:1] 11620; GFX90A-NEXT: v_mov_b32_e32 v7, s7 11621; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc 11622; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 11623; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 11624; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 11625; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s8, v5 11626; GFX90A-NEXT: v_cndmask_b32_e64 v4, 1, 2, s[0:1] 11627; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 11628; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s9, v3 11629; GFX90A-NEXT: v_add_co_u32_e64 v4, s[0:1], v0, v4 11630; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc 11631; GFX90A-NEXT: v_addc_co_u32_e64 v6, s[0:1], 0, v1, s[0:1] 11632; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 11633; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 11634; GFX90A-NEXT: s_xor_b64 s[0:1], s[10:11], s[2:3] 11635; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 11636; GFX90A-NEXT: v_xor_b32_e32 v0, s0, v0 11637; GFX90A-NEXT: v_xor_b32_e32 v1, s1, v1 11638; GFX90A-NEXT: v_mov_b32_e32 v3, s1 11639; GFX90A-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 11640; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc 11641; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 11642; GFX90A-NEXT: s_endpgm 11643 %shl.y = shl i64 4096, %y 11644 %r = sdiv i64 %x, %shl.y 11645 store i64 %r, i64 addrspace(1)* %out 11646 ret void 11647} 11648 11649define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 11650; CHECK-LABEL: @sdiv_v2i64_pow2k_denom( 11651; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 11652; 
CHECK-NEXT: [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096 11653; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 11654; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 11655; CHECK-NEXT: [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4096 11656; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 11657; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 11658; CHECK-NEXT: ret void 11659; 11660; GFX6-LABEL: sdiv_v2i64_pow2k_denom: 11661; GFX6: ; %bb.0: 11662; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 11663; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 11664; GFX6-NEXT: s_mov_b32 s7, 0xf000 11665; GFX6-NEXT: s_mov_b32 s6, -1 11666; GFX6-NEXT: s_waitcnt lgkmcnt(0) 11667; GFX6-NEXT: s_ashr_i32 s8, s1, 31 11668; GFX6-NEXT: s_lshr_b32 s8, s8, 20 11669; GFX6-NEXT: s_add_u32 s0, s0, s8 11670; GFX6-NEXT: s_addc_u32 s1, s1, 0 11671; GFX6-NEXT: s_ashr_i32 s8, s3, 31 11672; GFX6-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 11673; GFX6-NEXT: s_lshr_b32 s8, s8, 20 11674; GFX6-NEXT: s_add_u32 s2, s2, s8 11675; GFX6-NEXT: s_addc_u32 s3, s3, 0 11676; GFX6-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 11677; GFX6-NEXT: v_mov_b32_e32 v0, s0 11678; GFX6-NEXT: v_mov_b32_e32 v1, s1 11679; GFX6-NEXT: v_mov_b32_e32 v2, s2 11680; GFX6-NEXT: v_mov_b32_e32 v3, s3 11681; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 11682; GFX6-NEXT: s_endpgm 11683; 11684; GFX9-LABEL: sdiv_v2i64_pow2k_denom: 11685; GFX9: ; %bb.0: 11686; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 11687; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 11688; GFX9-NEXT: v_mov_b32_e32 v4, 0 11689; GFX9-NEXT: s_waitcnt lgkmcnt(0) 11690; GFX9-NEXT: s_ashr_i32 s0, s5, 31 11691; GFX9-NEXT: s_lshr_b32 s0, s0, 20 11692; GFX9-NEXT: s_add_u32 s0, s4, s0 11693; GFX9-NEXT: s_addc_u32 s1, s5, 0 11694; GFX9-NEXT: s_ashr_i32 s4, s7, 31 11695; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 11696; GFX9-NEXT: s_lshr_b32 s4, s4, 20 11697; GFX9-NEXT: s_add_u32 s4, s6, 
s4 11698; GFX9-NEXT: s_addc_u32 s5, s7, 0 11699; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 12 11700; GFX9-NEXT: v_mov_b32_e32 v0, s0 11701; GFX9-NEXT: v_mov_b32_e32 v1, s1 11702; GFX9-NEXT: v_mov_b32_e32 v2, s4 11703; GFX9-NEXT: v_mov_b32_e32 v3, s5 11704; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 11705; GFX9-NEXT: s_endpgm 11706; 11707; GFX90A-LABEL: sdiv_v2i64_pow2k_denom: 11708; GFX90A: ; %bb.0: 11709; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 11710; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 11711; GFX90A-NEXT: v_mov_b32_e32 v4, 0 11712; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 11713; GFX90A-NEXT: s_ashr_i32 s0, s5, 31 11714; GFX90A-NEXT: s_lshr_b32 s0, s0, 20 11715; GFX90A-NEXT: s_add_u32 s0, s4, s0 11716; GFX90A-NEXT: s_addc_u32 s1, s5, 0 11717; GFX90A-NEXT: s_ashr_i32 s4, s7, 31 11718; GFX90A-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 11719; GFX90A-NEXT: s_lshr_b32 s4, s4, 20 11720; GFX90A-NEXT: s_add_u32 s4, s6, s4 11721; GFX90A-NEXT: s_addc_u32 s5, s7, 0 11722; GFX90A-NEXT: s_ashr_i64 s[4:5], s[4:5], 12 11723; GFX90A-NEXT: v_mov_b32_e32 v0, s0 11724; GFX90A-NEXT: v_mov_b32_e32 v1, s1 11725; GFX90A-NEXT: v_mov_b32_e32 v2, s4 11726; GFX90A-NEXT: v_mov_b32_e32 v3, s5 11727; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 11728; GFX90A-NEXT: s_endpgm 11729 %r = sdiv <2 x i64> %x, <i64 4096, i64 4096> 11730 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 11731 ret void 11732} 11733 11734define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 11735; CHECK-LABEL: @ssdiv_v2i64_mixed_pow2k_denom( 11736; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 11737; CHECK-NEXT: [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096 11738; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 11739; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 11740; CHECK-NEXT: [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4095 11741; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 
[[TMP5]], i64 1 11742; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 11743; CHECK-NEXT: ret void 11744; 11745; GFX6-LABEL: ssdiv_v2i64_mixed_pow2k_denom: 11746; GFX6: ; %bb.0: 11747; GFX6-NEXT: v_mov_b32_e32 v0, 0x457ff000 11748; GFX6-NEXT: v_mov_b32_e32 v1, 0x4f800000 11749; GFX6-NEXT: v_mac_f32_e32 v0, 0, v1 11750; GFX6-NEXT: v_rcp_f32_e32 v0, v0 11751; GFX6-NEXT: s_movk_i32 s6, 0xf001 11752; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 11753; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 11754; GFX6-NEXT: s_mov_b32 s7, 0xf000 11755; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 11756; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 11757; GFX6-NEXT: v_trunc_f32_e32 v1, v1 11758; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 11759; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 11760; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 11761; GFX6-NEXT: s_waitcnt lgkmcnt(0) 11762; GFX6-NEXT: s_ashr_i32 s0, s9, 31 11763; GFX6-NEXT: s_lshr_b32 s0, s0, 20 11764; GFX6-NEXT: v_mul_hi_u32 v2, v0, s6 11765; GFX6-NEXT: v_mul_lo_u32 v3, v1, s6 11766; GFX6-NEXT: s_add_u32 s2, s8, s0 11767; GFX6-NEXT: s_addc_u32 s3, s9, 0 11768; GFX6-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 11769; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 11770; GFX6-NEXT: v_mul_lo_u32 v3, v0, s6 11771; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 11772; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 11773; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 11774; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 11775; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 11776; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 11777; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 11778; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 11779; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 11780; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 11781; GFX6-NEXT: s_ashr_i32 s8, s11, 31 11782; GFX6-NEXT: s_mov_b32 s9, s8 11783; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 11784; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc 11785; GFX6-NEXT: v_mov_b32_e32 v4, 0 11786; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc 11787; 
GFX6-NEXT: v_mov_b32_e32 v6, 0 11788; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 11789; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc 11790; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 11791; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 11792; GFX6-NEXT: v_mul_lo_u32 v5, v2, s6 11793; GFX6-NEXT: v_mul_hi_u32 v7, v0, s6 11794; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5 11795; GFX6-NEXT: v_mul_lo_u32 v7, v0, s6 11796; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, v0, v5 11797; GFX6-NEXT: v_mul_lo_u32 v10, v0, v5 11798; GFX6-NEXT: v_mul_hi_u32 v11, v0, v7 11799; GFX6-NEXT: v_mul_hi_u32 v12, v0, v5 11800; GFX6-NEXT: v_mul_hi_u32 v9, v2, v7 11801; GFX6-NEXT: v_mul_lo_u32 v7, v2, v7 11802; GFX6-NEXT: v_mul_hi_u32 v8, v2, v5 11803; GFX6-NEXT: v_add_i32_e32 v10, vcc, v11, v10 11804; GFX6-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc 11805; GFX6-NEXT: v_mul_lo_u32 v2, v2, v5 11806; GFX6-NEXT: v_add_i32_e32 v7, vcc, v10, v7 11807; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc 11808; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc 11809; GFX6-NEXT: v_add_i32_e32 v2, vcc, v7, v2 11810; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc 11811; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 11812; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[0:1] 11813; GFX6-NEXT: s_add_u32 s0, s10, s8 11814; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 11815; GFX6-NEXT: s_addc_u32 s1, s11, s8 11816; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 11817; GFX6-NEXT: s_xor_b64 s[0:1], s[0:1], s[8:9] 11818; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 11819; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 11820; GFX6-NEXT: v_mul_hi_u32 v5, s0, v1 11821; GFX6-NEXT: v_mul_hi_u32 v7, s1, v1 11822; GFX6-NEXT: v_mul_lo_u32 v1, s1, v1 11823; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 11824; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 11825; GFX6-NEXT: v_mul_lo_u32 v5, s1, v0 11826; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 11827; GFX6-NEXT: s_movk_i32 s9, 0xfff 11828; GFX6-NEXT: s_mov_b32 s6, -1 11829; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 11830; 
GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 11831; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc 11832; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 11833; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc 11834; GFX6-NEXT: v_mul_lo_u32 v4, v1, s9 11835; GFX6-NEXT: v_mul_hi_u32 v5, v0, s9 11836; GFX6-NEXT: v_add_i32_e32 v2, vcc, 2, v0 11837; GFX6-NEXT: v_mul_lo_u32 v8, v0, s9 11838; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 11839; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v0 11840; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc 11841; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 11842; GFX6-NEXT: v_mov_b32_e32 v5, s1 11843; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s0, v8 11844; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc 11845; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s9, v8 11846; GFX6-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v4, vcc 11847; GFX6-NEXT: s_movk_i32 s0, 0xffe 11848; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s0, v5 11849; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 11850; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 11851; GFX6-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc 11852; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v8 11853; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 11854; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 11855; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 11856; GFX6-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[0:1] 11857; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 11858; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 11859; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc 11860; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 11861; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 11862; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 11863; GFX6-NEXT: v_xor_b32_e32 v1, s8, v1 11864; GFX6-NEXT: v_mov_b32_e32 v3, s8 11865; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s8, v0 11866; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc 11867; GFX6-NEXT: v_mov_b32_e32 v0, s2 11868; GFX6-NEXT: v_mov_b32_e32 v1, s3 11869; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 11870; GFX6-NEXT: s_endpgm 11871; 11872; GFX9-LABEL: 
ssdiv_v2i64_mixed_pow2k_denom: 11873; GFX9: ; %bb.0: 11874; GFX9-NEXT: v_mov_b32_e32 v0, 0x457ff000 11875; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000 11876; GFX9-NEXT: v_mac_f32_e32 v0, 0, v1 11877; GFX9-NEXT: v_rcp_f32_e32 v0, v0 11878; GFX9-NEXT: s_movk_i32 s8, 0xf001 11879; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 11880; GFX9-NEXT: v_mov_b32_e32 v4, 0 11881; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 11882; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 11883; GFX9-NEXT: v_trunc_f32_e32 v1, v1 11884; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 11885; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 11886; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 11887; GFX9-NEXT: s_waitcnt lgkmcnt(0) 11888; GFX9-NEXT: s_ashr_i32 s2, s5, 31 11889; GFX9-NEXT: s_lshr_b32 s2, s2, 20 11890; GFX9-NEXT: v_mul_hi_u32 v2, v0, s8 11891; GFX9-NEXT: v_mul_lo_u32 v3, v1, s8 11892; GFX9-NEXT: v_mul_lo_u32 v5, v0, s8 11893; GFX9-NEXT: s_add_u32 s4, s4, s2 11894; GFX9-NEXT: s_addc_u32 s5, s5, 0 11895; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 11896; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 11897; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 11898; GFX9-NEXT: v_mul_hi_u32 v6, v0, v5 11899; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 11900; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 11901; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 11902; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v6, v3 11903; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc 11904; GFX9-NEXT: v_mul_lo_u32 v7, v1, v5 11905; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5 11906; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 12 11907; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 11908; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 11909; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc 11910; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v4, vcc 11911; GFX9-NEXT: v_mov_b32_e32 v6, 0 11912; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 11913; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc 11914; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 11915; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3] 11916; GFX9-NEXT: 
v_mul_lo_u32 v5, v2, s8 11917; GFX9-NEXT: v_mul_hi_u32 v7, v0, s8 11918; GFX9-NEXT: v_mul_lo_u32 v8, v0, s8 11919; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 11920; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 11921; GFX9-NEXT: v_sub_u32_e32 v5, v5, v0 11922; GFX9-NEXT: v_mul_lo_u32 v10, v0, v5 11923; GFX9-NEXT: v_mul_hi_u32 v11, v0, v8 11924; GFX9-NEXT: v_mul_hi_u32 v12, v0, v5 11925; GFX9-NEXT: v_mul_hi_u32 v9, v2, v8 11926; GFX9-NEXT: v_mul_lo_u32 v8, v2, v8 11927; GFX9-NEXT: v_mul_hi_u32 v7, v2, v5 11928; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 11929; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v12, vcc 11930; GFX9-NEXT: v_mul_lo_u32 v2, v2, v5 11931; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v10, v8 11932; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v9, vcc 11933; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v4, vcc 11934; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 11935; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v5, vcc 11936; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[2:3] 11937; GFX9-NEXT: s_ashr_i32 s2, s7, 31 11938; GFX9-NEXT: s_add_u32 s6, s6, s2 11939; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 11940; GFX9-NEXT: s_mov_b32 s3, s2 11941; GFX9-NEXT: s_addc_u32 s7, s7, s2 11942; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 11943; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3] 11944; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 11945; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 11946; GFX9-NEXT: v_mul_hi_u32 v5, s6, v1 11947; GFX9-NEXT: v_mul_hi_u32 v7, s7, v1 11948; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 11949; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 11950; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc 11951; GFX9-NEXT: v_mul_lo_u32 v5, s7, v0 11952; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 11953; GFX9-NEXT: s_movk_i32 s3, 0xfff 11954; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 11955; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 11956; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v7, v4, vcc 11957; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 11958; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v6, 
v2, vcc 11959; GFX9-NEXT: v_mul_lo_u32 v2, v1, s3 11960; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 11961; GFX9-NEXT: v_mul_lo_u32 v5, v0, s3 11962; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 11963; GFX9-NEXT: v_mov_b32_e32 v3, s7 11964; GFX9-NEXT: v_sub_co_u32_e32 v5, vcc, s6, v5 11965; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v3, v2, vcc 11966; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s3, v5 11967; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc 11968; GFX9-NEXT: s_movk_i32 s3, 0xffe 11969; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s3, v3 11970; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 11971; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 11972; GFX9-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc 11973; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 11974; GFX9-NEXT: v_cndmask_b32_e64 v3, 1, 2, vcc 11975; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v0, v3 11976; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc 11977; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s3, v5 11978; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 11979; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 11980; GFX9-NEXT: v_cndmask_b32_e32 v2, -1, v5, vcc 11981; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 11982; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 11983; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 11984; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 11985; GFX9-NEXT: v_xor_b32_e32 v1, s2, v1 11986; GFX9-NEXT: v_mov_b32_e32 v3, s2 11987; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s2, v0 11988; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc 11989; GFX9-NEXT: v_mov_b32_e32 v0, s4 11990; GFX9-NEXT: v_mov_b32_e32 v1, s5 11991; GFX9-NEXT: s_waitcnt lgkmcnt(0) 11992; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 11993; GFX9-NEXT: s_endpgm 11994; 11995; GFX90A-LABEL: ssdiv_v2i64_mixed_pow2k_denom: 11996; GFX90A: ; %bb.0: 11997; GFX90A-NEXT: v_mov_b32_e32 v0, 0x457ff000 11998; GFX90A-NEXT: v_mov_b32_e32 v1, 0x4f800000 11999; GFX90A-NEXT: v_mac_f32_e32 v0, 0, v1 12000; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 12001; GFX90A-NEXT: s_movk_i32 s8, 0xf001 12002; GFX90A-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x24 12003; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 12004; GFX90A-NEXT: v_mov_b32_e32 v4, 0 12005; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 12006; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 12007; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 12008; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 12009; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 12010; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 12011; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 12012; GFX90A-NEXT: s_ashr_i32 s0, s5, 31 12013; GFX90A-NEXT: s_lshr_b32 s0, s0, 20 12014; GFX90A-NEXT: v_mul_hi_u32 v2, v0, s8 12015; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s8 12016; GFX90A-NEXT: v_add_u32_e32 v2, v2, v3 12017; GFX90A-NEXT: v_sub_u32_e32 v2, v2, v0 12018; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s8 12019; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v2 12020; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 12021; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v2 12022; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 12023; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 12024; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 12025; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 12026; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 12027; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v2 12028; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v8, vcc 12029; GFX90A-NEXT: s_add_u32 s0, s4, s0 12030; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v4, vcc 12031; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 12032; GFX90A-NEXT: s_addc_u32 s1, s5, 0 12033; GFX90A-NEXT: v_mov_b32_e32 v6, 0 12034; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 12035; GFX90A-NEXT: s_ashr_i64 s[4:5], s[0:1], 12 12036; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc 12037; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v2 12038; GFX90A-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[0:1] 12039; GFX90A-NEXT: v_mul_lo_u32 v5, v2, s8 12040; GFX90A-NEXT: v_mul_hi_u32 v7, v0, s8 12041; GFX90A-NEXT: v_add_u32_e32 v5, v7, v5 12042; GFX90A-NEXT: v_sub_u32_e32 v5, v5, v0 12043; GFX90A-NEXT: v_mul_lo_u32 v8, v0, s8 12044; 
GFX90A-NEXT: v_mul_hi_u32 v9, v2, v8 12045; GFX90A-NEXT: v_mul_lo_u32 v10, v2, v8 12046; GFX90A-NEXT: v_mul_lo_u32 v12, v0, v5 12047; GFX90A-NEXT: v_mul_hi_u32 v8, v0, v8 12048; GFX90A-NEXT: v_mul_hi_u32 v11, v0, v5 12049; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12 12050; GFX90A-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc 12051; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10 12052; GFX90A-NEXT: v_mul_hi_u32 v7, v2, v5 12053; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v9, vcc 12054; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v4, vcc 12055; GFX90A-NEXT: v_mul_lo_u32 v2, v2, v5 12056; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 12057; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v7, vcc 12058; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 12059; GFX90A-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1] 12060; GFX90A-NEXT: s_ashr_i32 s0, s7, 31 12061; GFX90A-NEXT: s_add_u32 s6, s6, s0 12062; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 12063; GFX90A-NEXT: s_mov_b32 s1, s0 12064; GFX90A-NEXT: s_addc_u32 s7, s7, s0 12065; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 12066; GFX90A-NEXT: s_xor_b64 s[6:7], s[6:7], s[0:1] 12067; GFX90A-NEXT: v_mul_lo_u32 v3, s6, v1 12068; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v0 12069; GFX90A-NEXT: v_mul_hi_u32 v2, s6, v1 12070; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 12071; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc 12072; GFX90A-NEXT: v_mul_hi_u32 v7, s7, v0 12073; GFX90A-NEXT: v_mul_lo_u32 v0, s7, v0 12074; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 12075; GFX90A-NEXT: v_mul_hi_u32 v5, s7, v1 12076; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v2, v7, vcc 12077; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v4, vcc 12078; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 12079; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 12080; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v2, vcc 12081; GFX90A-NEXT: s_movk_i32 s1, 0xfff 12082; GFX90A-NEXT: v_mul_lo_u32 v2, v1, s1 12083; GFX90A-NEXT: v_mul_hi_u32 v3, v0, s1 12084; GFX90A-NEXT: 
v_add_u32_e32 v2, v3, v2 12085; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s1 12086; GFX90A-NEXT: v_mov_b32_e32 v5, s7 12087; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, s6, v3 12088; GFX90A-NEXT: v_subb_co_u32_e32 v2, vcc, v5, v2, vcc 12089; GFX90A-NEXT: v_subrev_co_u32_e32 v5, vcc, s1, v3 12090; GFX90A-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc 12091; GFX90A-NEXT: s_movk_i32 s1, 0xffe 12092; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s1, v5 12093; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 12094; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 12095; GFX90A-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc 12096; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 12097; GFX90A-NEXT: v_cndmask_b32_e64 v5, 1, 2, vcc 12098; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v0, v5 12099; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc 12100; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s1, v3 12101; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 12102; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 12103; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v3, vcc 12104; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 12105; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc 12106; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 12107; GFX90A-NEXT: v_xor_b32_e32 v0, s0, v0 12108; GFX90A-NEXT: v_xor_b32_e32 v1, s0, v1 12109; GFX90A-NEXT: v_mov_b32_e32 v3, s0 12110; GFX90A-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v0 12111; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc 12112; GFX90A-NEXT: v_mov_b32_e32 v0, s4 12113; GFX90A-NEXT: v_mov_b32_e32 v1, s5 12114; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 12115; GFX90A-NEXT: s_endpgm 12116 %r = sdiv <2 x i64> %x, <i64 4096, i64 4095> 12117 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 12118 ret void 12119} 12120 12121define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 12122; CHECK-LABEL: @sdiv_v2i64_pow2_shl_denom( 12123; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 12124; CHECK-NEXT: [[TMP1:%.*]] = 
extractelement <2 x i64> [[X:%.*]], i64 0 12125; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 12126; CHECK-NEXT: [[TMP3:%.*]] = sdiv i64 [[TMP1]], [[TMP2]] 12127; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 12128; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 12129; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 12130; CHECK-NEXT: [[TMP7:%.*]] = sdiv i64 [[TMP5]], [[TMP6]] 12131; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 12132; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 12133; CHECK-NEXT: ret void 12134; 12135; GFX6-LABEL: sdiv_v2i64_pow2_shl_denom: 12136; GFX6: ; %bb.0: 12137; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11 12138; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000 12139; GFX6-NEXT: s_mov_b32 s18, 0x4f800000 12140; GFX6-NEXT: s_mov_b32 s19, 0x5f7ffffc 12141; GFX6-NEXT: s_mov_b32 s20, 0x2f800000 12142; GFX6-NEXT: s_waitcnt lgkmcnt(0) 12143; GFX6-NEXT: s_lshl_b64 s[12:13], s[2:3], s6 12144; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 12145; GFX6-NEXT: s_ashr_i32 s16, s3, 31 12146; GFX6-NEXT: s_add_u32 s2, s2, s16 12147; GFX6-NEXT: s_mov_b32 s17, s16 12148; GFX6-NEXT: s_addc_u32 s3, s3, s16 12149; GFX6-NEXT: s_xor_b64 s[14:15], s[2:3], s[16:17] 12150; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s14 12151; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s15 12152; GFX6-NEXT: s_mov_b32 s21, 0xcf800000 12153; GFX6-NEXT: s_sub_u32 s6, 0, s14 12154; GFX6-NEXT: s_subb_u32 s7, 0, s15 12155; GFX6-NEXT: v_mac_f32_e32 v0, s18, v1 12156; GFX6-NEXT: v_rcp_f32_e32 v0, v0 12157; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 12158; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 12159; GFX6-NEXT: v_mul_f32_e32 v0, s19, v0 12160; GFX6-NEXT: v_mul_f32_e32 v1, s20, v0 12161; GFX6-NEXT: v_trunc_f32_e32 v1, v1 12162; GFX6-NEXT: v_mac_f32_e32 v0, s21, v1 12163; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v1 12164; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v0 
12165; GFX6-NEXT: v_mul_lo_u32 v0, s6, v2 12166; GFX6-NEXT: v_mul_hi_u32 v1, s6, v3 12167; GFX6-NEXT: v_mul_lo_u32 v4, s7, v3 12168; GFX6-NEXT: v_mul_lo_u32 v5, s6, v3 12169; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 12170; GFX6-NEXT: v_add_i32_e32 v1, vcc, v0, v4 12171; GFX6-NEXT: v_mul_lo_u32 v0, v3, v1 12172; GFX6-NEXT: v_mul_hi_u32 v4, v3, v5 12173; GFX6-NEXT: v_mul_hi_u32 v6, v3, v1 12174; GFX6-NEXT: v_mul_hi_u32 v7, v2, v1 12175; GFX6-NEXT: v_add_i32_e32 v0, vcc, v4, v0 12176; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc 12177; GFX6-NEXT: v_mul_lo_u32 v6, v2, v5 12178; GFX6-NEXT: v_mul_hi_u32 v5, v2, v5 12179; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v6 12180; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc 12181; GFX6-NEXT: v_mul_lo_u32 v5, v2, v1 12182; GFX6-NEXT: v_mov_b32_e32 v0, 0 12183; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v7, v0, vcc 12184; GFX6-NEXT: v_mov_b32_e32 v1, 0 12185; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 12186; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v1, v6, vcc 12187; GFX6-NEXT: v_add_i32_e64 v3, s[2:3], v3, v4 12188; GFX6-NEXT: v_addc_u32_e64 v4, vcc, v2, v5, s[2:3] 12189; GFX6-NEXT: v_mul_lo_u32 v6, s6, v4 12190; GFX6-NEXT: v_mul_hi_u32 v7, s6, v3 12191; GFX6-NEXT: v_mul_lo_u32 v8, s7, v3 12192; GFX6-NEXT: s_mov_b32 s7, 0xf000 12193; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 12194; GFX6-NEXT: v_mul_lo_u32 v7, s6, v3 12195; GFX6-NEXT: v_add_i32_e32 v6, vcc, v8, v6 12196; GFX6-NEXT: v_mul_lo_u32 v10, v3, v6 12197; GFX6-NEXT: v_mul_hi_u32 v11, v3, v7 12198; GFX6-NEXT: v_mul_hi_u32 v12, v3, v6 12199; GFX6-NEXT: v_mul_hi_u32 v9, v4, v7 12200; GFX6-NEXT: v_mul_lo_u32 v7, v4, v7 12201; GFX6-NEXT: v_mul_hi_u32 v8, v4, v6 12202; GFX6-NEXT: v_add_i32_e32 v10, vcc, v11, v10 12203; GFX6-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc 12204; GFX6-NEXT: v_mul_lo_u32 v4, v4, v6 12205; GFX6-NEXT: v_add_i32_e32 v7, vcc, v10, v7 12206; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc 12207; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v8, v0, vcc 12208; GFX6-NEXT: v_add_i32_e32 
v4, vcc, v7, v4 12209; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v1, v6, vcc 12210; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 12211; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v2, v6, s[2:3] 12212; GFX6-NEXT: s_waitcnt lgkmcnt(0) 12213; GFX6-NEXT: s_ashr_i32 s2, s9, 31 12214; GFX6-NEXT: s_add_u32 s0, s8, s2 12215; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 12216; GFX6-NEXT: s_mov_b32 s3, s2 12217; GFX6-NEXT: s_addc_u32 s1, s9, s2 12218; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc 12219; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[2:3] 12220; GFX6-NEXT: v_mul_lo_u32 v4, s8, v2 12221; GFX6-NEXT: v_mul_hi_u32 v5, s8, v3 12222; GFX6-NEXT: v_mul_hi_u32 v6, s8, v2 12223; GFX6-NEXT: v_mul_hi_u32 v7, s9, v2 12224; GFX6-NEXT: v_mul_lo_u32 v2, s9, v2 12225; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 12226; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 12227; GFX6-NEXT: v_mul_lo_u32 v6, s9, v3 12228; GFX6-NEXT: v_mul_hi_u32 v3, s9, v3 12229; GFX6-NEXT: s_xor_b64 s[16:17], s[2:3], s[16:17] 12230; GFX6-NEXT: s_ashr_i32 s2, s13, 31 12231; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 12232; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc 12233; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v7, v0, vcc 12234; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 12235; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v1, v4, vcc 12236; GFX6-NEXT: v_mul_lo_u32 v4, s14, v3 12237; GFX6-NEXT: v_mul_hi_u32 v5, s14, v2 12238; GFX6-NEXT: v_mul_lo_u32 v6, s15, v2 12239; GFX6-NEXT: v_mov_b32_e32 v7, s15 12240; GFX6-NEXT: s_mov_b32 s3, s2 12241; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 12242; GFX6-NEXT: v_mul_lo_u32 v5, s14, v2 12243; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 12244; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s9, v4 12245; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s8, v5 12246; GFX6-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v7, vcc 12247; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s14, v5 12248; GFX6-NEXT: v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1] 12249; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v6 12250; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] 12251; 
GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v7 12252; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 12253; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v6 12254; GFX6-NEXT: v_cndmask_b32_e64 v6, v8, v7, s[0:1] 12255; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 2, v2 12256; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1] 12257; GFX6-NEXT: v_add_i32_e64 v9, s[0:1], 1, v2 12258; GFX6-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] 12259; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 12260; GFX6-NEXT: s_add_u32 s8, s12, s2 12261; GFX6-NEXT: v_cndmask_b32_e64 v6, v10, v8, s[0:1] 12262; GFX6-NEXT: v_mov_b32_e32 v8, s9 12263; GFX6-NEXT: s_addc_u32 s9, s13, s2 12264; GFX6-NEXT: s_xor_b64 s[8:9], s[8:9], s[2:3] 12265; GFX6-NEXT: v_cvt_f32_u32_e32 v10, s8 12266; GFX6-NEXT: v_cvt_f32_u32_e32 v11, s9 12267; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v8, v4, vcc 12268; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s15, v4 12269; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 12270; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s14, v5 12271; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 12272; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s15, v4 12273; GFX6-NEXT: v_mac_f32_e32 v10, s18, v11 12274; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v5, vcc 12275; GFX6-NEXT: v_rcp_f32_e32 v5, v10 12276; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 12277; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 12278; GFX6-NEXT: v_cndmask_b32_e64 v4, v9, v7, s[0:1] 12279; GFX6-NEXT: v_mul_f32_e32 v5, s19, v5 12280; GFX6-NEXT: v_mul_f32_e32 v6, s20, v5 12281; GFX6-NEXT: v_trunc_f32_e32 v6, v6 12282; GFX6-NEXT: v_mac_f32_e32 v5, s21, v6 12283; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v5 12284; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v6 12285; GFX6-NEXT: s_sub_u32 s12, 0, s8 12286; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 12287; GFX6-NEXT: v_mul_hi_u32 v4, s12, v5 12288; GFX6-NEXT: v_mul_lo_u32 v7, s12, v6 12289; GFX6-NEXT: s_subb_u32 s13, 0, s9 12290; GFX6-NEXT: v_mul_lo_u32 v8, s13, v5 12291; GFX6-NEXT: v_xor_b32_e32 v2, s16, v2 12292; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v7 12293; 
GFX6-NEXT: v_mul_lo_u32 v7, s12, v5 12294; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v8 12295; GFX6-NEXT: v_mul_lo_u32 v8, v5, v4 12296; GFX6-NEXT: v_mul_hi_u32 v9, v5, v7 12297; GFX6-NEXT: v_mul_hi_u32 v10, v5, v4 12298; GFX6-NEXT: v_mul_hi_u32 v11, v6, v4 12299; GFX6-NEXT: v_mul_lo_u32 v4, v6, v4 12300; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 12301; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc 12302; GFX6-NEXT: v_mul_lo_u32 v10, v6, v7 12303; GFX6-NEXT: v_mul_hi_u32 v7, v6, v7 12304; GFX6-NEXT: v_xor_b32_e32 v3, s17, v3 12305; GFX6-NEXT: s_mov_b32 s6, -1 12306; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v10 12307; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc 12308; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v11, v0, vcc 12309; GFX6-NEXT: v_add_i32_e32 v4, vcc, v7, v4 12310; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v1, v8, vcc 12311; GFX6-NEXT: v_add_i32_e64 v4, s[0:1], v5, v4 12312; GFX6-NEXT: v_addc_u32_e64 v5, vcc, v6, v7, s[0:1] 12313; GFX6-NEXT: v_mul_lo_u32 v8, s12, v5 12314; GFX6-NEXT: v_mul_hi_u32 v9, s12, v4 12315; GFX6-NEXT: v_mul_lo_u32 v10, s13, v4 12316; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 12317; GFX6-NEXT: v_mul_lo_u32 v9, s12, v4 12318; GFX6-NEXT: v_add_i32_e32 v8, vcc, v10, v8 12319; GFX6-NEXT: v_mul_lo_u32 v12, v4, v8 12320; GFX6-NEXT: v_mul_hi_u32 v13, v4, v9 12321; GFX6-NEXT: v_mul_hi_u32 v14, v4, v8 12322; GFX6-NEXT: v_mul_hi_u32 v11, v5, v9 12323; GFX6-NEXT: v_mul_lo_u32 v9, v5, v9 12324; GFX6-NEXT: v_mul_hi_u32 v10, v5, v8 12325; GFX6-NEXT: v_add_i32_e32 v12, vcc, v13, v12 12326; GFX6-NEXT: v_addc_u32_e32 v13, vcc, 0, v14, vcc 12327; GFX6-NEXT: v_mul_lo_u32 v5, v5, v8 12328; GFX6-NEXT: v_add_i32_e32 v9, vcc, v12, v9 12329; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v13, v11, vcc 12330; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v10, v0, vcc 12331; GFX6-NEXT: v_add_i32_e32 v5, vcc, v9, v5 12332; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v1, v8, vcc 12333; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v7 12334; GFX6-NEXT: s_ashr_i32 s12, s11, 31 12335; GFX6-NEXT: v_addc_u32_e64 v6, vcc, 
v6, v8, s[0:1] 12336; GFX6-NEXT: s_add_u32 s0, s10, s12 12337; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 12338; GFX6-NEXT: s_mov_b32 s13, s12 12339; GFX6-NEXT: s_addc_u32 s1, s11, s12 12340; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 12341; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[12:13] 12342; GFX6-NEXT: v_mul_lo_u32 v6, s10, v5 12343; GFX6-NEXT: v_mul_hi_u32 v7, s10, v4 12344; GFX6-NEXT: v_mul_hi_u32 v9, s10, v5 12345; GFX6-NEXT: v_mul_hi_u32 v10, s11, v5 12346; GFX6-NEXT: v_mul_lo_u32 v5, s11, v5 12347; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 12348; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc 12349; GFX6-NEXT: v_mul_lo_u32 v9, s11, v4 12350; GFX6-NEXT: v_mul_hi_u32 v4, s11, v4 12351; GFX6-NEXT: v_mov_b32_e32 v8, s17 12352; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v9 12353; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v7, v4, vcc 12354; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v10, v0, vcc 12355; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 12356; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v1, v0, vcc 12357; GFX6-NEXT: v_mul_lo_u32 v6, s8, v5 12358; GFX6-NEXT: v_mul_hi_u32 v7, s8, v4 12359; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s16, v2 12360; GFX6-NEXT: v_mul_lo_u32 v2, s9, v4 12361; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v3, v8, vcc 12362; GFX6-NEXT: v_add_i32_e32 v3, vcc, v7, v6 12363; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 12364; GFX6-NEXT: v_mul_lo_u32 v3, s8, v4 12365; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s11, v2 12366; GFX6-NEXT: v_mov_b32_e32 v7, s9 12367; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s10, v3 12368; GFX6-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v7, vcc 12369; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s8, v3 12370; GFX6-NEXT: v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1] 12371; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6 12372; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] 12373; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 12374; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 12375; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6 12376; GFX6-NEXT: v_cndmask_b32_e64 v6, v8, v7, s[0:1] 12377; 
GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 2, v4 12378; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v5, s[0:1] 12379; GFX6-NEXT: v_add_i32_e64 v9, s[0:1], 1, v4 12380; GFX6-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v5, s[0:1] 12381; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 12382; GFX6-NEXT: v_cndmask_b32_e64 v6, v10, v8, s[0:1] 12383; GFX6-NEXT: v_mov_b32_e32 v8, s11 12384; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v8, v2, vcc 12385; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 12386; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 12387; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 12388; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 12389; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s9, v2 12390; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc 12391; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 12392; GFX6-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[0:1] 12393; GFX6-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc 12394; GFX6-NEXT: s_xor_b64 s[0:1], s[12:13], s[2:3] 12395; GFX6-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc 12396; GFX6-NEXT: v_xor_b32_e32 v3, s0, v3 12397; GFX6-NEXT: v_xor_b32_e32 v4, s1, v2 12398; GFX6-NEXT: v_mov_b32_e32 v5, s1 12399; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s0, v3 12400; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v4, v5, vcc 12401; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 12402; GFX6-NEXT: s_endpgm 12403; 12404; GFX9-LABEL: sdiv_v2i64_pow2_shl_denom: 12405; GFX9: ; %bb.0: 12406; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 12407; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 12408; GFX9-NEXT: s_mov_b32 s16, 0x4f800000 12409; GFX9-NEXT: s_mov_b32 s17, 0x5f7ffffc 12410; GFX9-NEXT: s_mov_b32 s18, 0x2f800000 12411; GFX9-NEXT: s_waitcnt lgkmcnt(0) 12412; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s6 12413; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 12414; GFX9-NEXT: s_ashr_i32 s12, s3, 31 12415; GFX9-NEXT: s_add_u32 s2, s2, s12 12416; GFX9-NEXT: s_mov_b32 s13, s12 12417; GFX9-NEXT: s_addc_u32 s3, s3, s12 12418; GFX9-NEXT: s_xor_b64 s[10:11], s[2:3], s[12:13] 12419; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s10 12420; GFX9-NEXT: 
v_cvt_f32_u32_e32 v1, s11 12421; GFX9-NEXT: s_mov_b32 s19, 0xcf800000 12422; GFX9-NEXT: s_sub_u32 s14, 0, s10 12423; GFX9-NEXT: s_subb_u32 s4, 0, s11 12424; GFX9-NEXT: v_mac_f32_e32 v0, s16, v1 12425; GFX9-NEXT: v_rcp_f32_e32 v0, v0 12426; GFX9-NEXT: v_mul_f32_e32 v0, s17, v0 12427; GFX9-NEXT: v_mul_f32_e32 v1, s18, v0 12428; GFX9-NEXT: v_trunc_f32_e32 v1, v1 12429; GFX9-NEXT: v_mac_f32_e32 v0, s19, v1 12430; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v1 12431; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 12432; GFX9-NEXT: v_mul_lo_u32 v0, s14, v2 12433; GFX9-NEXT: v_mul_hi_u32 v1, s14, v3 12434; GFX9-NEXT: v_mul_lo_u32 v5, s4, v3 12435; GFX9-NEXT: v_mul_lo_u32 v4, s14, v3 12436; GFX9-NEXT: v_add_u32_e32 v0, v1, v0 12437; GFX9-NEXT: v_add_u32_e32 v5, v0, v5 12438; GFX9-NEXT: v_mul_hi_u32 v1, v3, v4 12439; GFX9-NEXT: v_mul_lo_u32 v6, v3, v5 12440; GFX9-NEXT: v_mul_hi_u32 v7, v3, v5 12441; GFX9-NEXT: v_mul_hi_u32 v8, v2, v5 12442; GFX9-NEXT: v_mul_lo_u32 v5, v2, v5 12443; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v6 12444; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc 12445; GFX9-NEXT: v_mul_lo_u32 v7, v2, v4 12446; GFX9-NEXT: v_mul_hi_u32 v4, v2, v4 12447; GFX9-NEXT: v_mov_b32_e32 v0, 0 12448; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v7 12449; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v4, vcc 12450; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v8, v0, vcc 12451; GFX9-NEXT: v_mov_b32_e32 v1, 0 12452; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 12453; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v6, vcc 12454; GFX9-NEXT: v_add_co_u32_e64 v3, s[2:3], v3, v4 12455; GFX9-NEXT: v_addc_co_u32_e64 v4, vcc, v2, v5, s[2:3] 12456; GFX9-NEXT: v_mul_lo_u32 v6, s14, v4 12457; GFX9-NEXT: v_mul_hi_u32 v7, s14, v3 12458; GFX9-NEXT: v_mul_lo_u32 v8, s4, v3 12459; GFX9-NEXT: v_mul_lo_u32 v9, s14, v3 12460; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 12461; GFX9-NEXT: v_add_u32_e32 v6, v7, v6 12462; GFX9-NEXT: v_add_u32_e32 v6, v6, v8 12463; GFX9-NEXT: v_mul_lo_u32 v10, v3, v6 12464; GFX9-NEXT: v_mul_hi_u32 
v11, v3, v9 12465; GFX9-NEXT: v_mul_hi_u32 v12, v3, v6 12466; GFX9-NEXT: v_mul_hi_u32 v8, v4, v9 12467; GFX9-NEXT: v_mul_lo_u32 v9, v4, v9 12468; GFX9-NEXT: v_mul_hi_u32 v7, v4, v6 12469; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 12470; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v12, vcc 12471; GFX9-NEXT: v_mul_lo_u32 v4, v4, v6 12472; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 12473; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v8, vcc 12474; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v0, vcc 12475; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 12476; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v1, v6, vcc 12477; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 12478; GFX9-NEXT: s_waitcnt lgkmcnt(0) 12479; GFX9-NEXT: s_ashr_i32 s14, s5, 31 12480; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v2, v6, s[2:3] 12481; GFX9-NEXT: s_add_u32 s2, s4, s14 12482; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 12483; GFX9-NEXT: s_mov_b32 s15, s14 12484; GFX9-NEXT: s_addc_u32 s3, s5, s14 12485; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc 12486; GFX9-NEXT: s_xor_b64 s[4:5], s[2:3], s[14:15] 12487; GFX9-NEXT: v_mul_lo_u32 v4, s4, v2 12488; GFX9-NEXT: v_mul_hi_u32 v5, s4, v3 12489; GFX9-NEXT: v_mul_hi_u32 v6, s4, v2 12490; GFX9-NEXT: v_mul_hi_u32 v7, s5, v2 12491; GFX9-NEXT: v_mul_lo_u32 v2, s5, v2 12492; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 12493; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc 12494; GFX9-NEXT: v_mul_lo_u32 v6, s5, v3 12495; GFX9-NEXT: v_mul_hi_u32 v3, s5, v3 12496; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 12497; GFX9-NEXT: s_xor_b64 s[12:13], s[14:15], s[12:13] 12498; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 12499; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc 12500; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v0, vcc 12501; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 12502; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v4, vcc 12503; GFX9-NEXT: v_mul_lo_u32 v4, s10, v3 12504; GFX9-NEXT: v_mul_hi_u32 v5, s10, v2 12505; GFX9-NEXT: v_mul_lo_u32 v6, s11, v2 12506; 
GFX9-NEXT: v_mov_b32_e32 v7, s11 12507; GFX9-NEXT: v_add_u32_e32 v4, v5, v4 12508; GFX9-NEXT: v_mul_lo_u32 v5, s10, v2 12509; GFX9-NEXT: v_add_u32_e32 v4, v4, v6 12510; GFX9-NEXT: v_sub_u32_e32 v6, s5, v4 12511; GFX9-NEXT: v_sub_co_u32_e32 v5, vcc, s4, v5 12512; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v7, vcc 12513; GFX9-NEXT: v_subrev_co_u32_e64 v7, s[0:1], s10, v5 12514; GFX9-NEXT: v_subbrev_co_u32_e64 v6, s[0:1], 0, v6, s[0:1] 12515; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v6 12516; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] 12517; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v7 12518; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 12519; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v6 12520; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v7, s[0:1] 12521; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 12522; GFX9-NEXT: v_cndmask_b32_e64 v6, 1, 2, s[0:1] 12523; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], v2, v6 12524; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v3, s[0:1] 12525; GFX9-NEXT: s_ashr_i32 s4, s9, 31 12526; GFX9-NEXT: s_add_u32 s0, s8, s4 12527; GFX9-NEXT: v_mov_b32_e32 v8, s5 12528; GFX9-NEXT: s_mov_b32 s5, s4 12529; GFX9-NEXT: s_addc_u32 s1, s9, s4 12530; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[4:5] 12531; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v8, v4, vcc 12532; GFX9-NEXT: v_cvt_f32_u32_e32 v8, s8 12533; GFX9-NEXT: v_cvt_f32_u32_e32 v9, s9 12534; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v4 12535; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc 12536; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v5 12537; GFX9-NEXT: v_mac_f32_e32 v8, s16, v9 12538; GFX9-NEXT: v_rcp_f32_e32 v8, v8 12539; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 12540; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s11, v4 12541; GFX9-NEXT: v_cndmask_b32_e32 v4, v10, v5, vcc 12542; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 12543; GFX9-NEXT: v_mul_f32_e32 v4, s17, v8 12544; GFX9-NEXT: v_mul_f32_e32 v5, s18, v4 12545; GFX9-NEXT: v_trunc_f32_e32 v5, v5 12546; GFX9-NEXT: v_mac_f32_e32 v4, s19, v5 12547; GFX9-NEXT: v_cvt_u32_f32_e32 
v4, v4 12548; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 12549; GFX9-NEXT: s_sub_u32 s10, 0, s8 12550; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 12551; GFX9-NEXT: s_subb_u32 s11, 0, s9 12552; GFX9-NEXT: v_mul_hi_u32 v6, s10, v4 12553; GFX9-NEXT: v_mul_lo_u32 v8, s10, v5 12554; GFX9-NEXT: v_mul_lo_u32 v9, s11, v4 12555; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc 12556; GFX9-NEXT: v_mul_lo_u32 v7, s10, v4 12557; GFX9-NEXT: v_add_u32_e32 v6, v6, v8 12558; GFX9-NEXT: v_add_u32_e32 v6, v6, v9 12559; GFX9-NEXT: v_mul_lo_u32 v8, v4, v6 12560; GFX9-NEXT: v_mul_hi_u32 v9, v4, v7 12561; GFX9-NEXT: v_mul_hi_u32 v10, v4, v6 12562; GFX9-NEXT: v_mul_hi_u32 v11, v5, v6 12563; GFX9-NEXT: v_mul_lo_u32 v6, v5, v6 12564; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 12565; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v10, vcc 12566; GFX9-NEXT: v_mul_lo_u32 v10, v5, v7 12567; GFX9-NEXT: v_mul_hi_u32 v7, v5, v7 12568; GFX9-NEXT: v_xor_b32_e32 v2, s12, v2 12569; GFX9-NEXT: v_xor_b32_e32 v3, s13, v3 12570; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10 12571; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v9, v7, vcc 12572; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v0, vcc 12573; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 12574; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v1, v8, vcc 12575; GFX9-NEXT: v_add_co_u32_e64 v4, s[0:1], v4, v6 12576; GFX9-NEXT: v_addc_co_u32_e64 v6, vcc, v5, v7, s[0:1] 12577; GFX9-NEXT: v_mul_lo_u32 v8, s10, v6 12578; GFX9-NEXT: v_mul_hi_u32 v9, s10, v4 12579; GFX9-NEXT: v_mul_lo_u32 v10, s11, v4 12580; GFX9-NEXT: v_mul_lo_u32 v11, s10, v4 12581; GFX9-NEXT: v_add_u32_e32 v5, v5, v7 12582; GFX9-NEXT: v_add_u32_e32 v8, v9, v8 12583; GFX9-NEXT: v_add_u32_e32 v8, v8, v10 12584; GFX9-NEXT: v_mul_lo_u32 v12, v4, v8 12585; GFX9-NEXT: v_mul_hi_u32 v13, v4, v11 12586; GFX9-NEXT: v_mul_hi_u32 v14, v4, v8 12587; GFX9-NEXT: v_mul_hi_u32 v10, v6, v11 12588; GFX9-NEXT: v_mul_lo_u32 v11, v6, v11 12589; GFX9-NEXT: v_mul_hi_u32 v9, v6, v8 12590; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v13, v12 12591; 
GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v14, vcc 12592; GFX9-NEXT: v_mul_lo_u32 v6, v6, v8 12593; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v12, v11 12594; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v13, v10, vcc 12595; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v9, v0, vcc 12596; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v10, v6 12597; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v1, v8, vcc 12598; GFX9-NEXT: s_ashr_i32 s10, s7, 31 12599; GFX9-NEXT: v_addc_co_u32_e64 v5, vcc, v5, v8, s[0:1] 12600; GFX9-NEXT: s_add_u32 s0, s6, s10 12601; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 12602; GFX9-NEXT: s_mov_b32 s11, s10 12603; GFX9-NEXT: s_addc_u32 s1, s7, s10 12604; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc 12605; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] 12606; GFX9-NEXT: v_mul_lo_u32 v6, s6, v5 12607; GFX9-NEXT: v_mul_hi_u32 v7, s6, v4 12608; GFX9-NEXT: v_mul_hi_u32 v9, s6, v5 12609; GFX9-NEXT: v_mul_hi_u32 v10, s7, v5 12610; GFX9-NEXT: v_mul_lo_u32 v5, s7, v5 12611; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 12612; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v9, vcc 12613; GFX9-NEXT: v_mul_lo_u32 v9, s7, v4 12614; GFX9-NEXT: v_mul_hi_u32 v4, s7, v4 12615; GFX9-NEXT: v_mov_b32_e32 v8, s13 12616; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v9 12617; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc 12618; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v10, v0, vcc 12619; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 12620; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v6, vcc 12621; GFX9-NEXT: v_mul_lo_u32 v6, s8, v5 12622; GFX9-NEXT: v_mul_hi_u32 v7, s8, v4 12623; GFX9-NEXT: v_mul_lo_u32 v9, s9, v4 12624; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s12, v2 12625; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v3, v8, vcc 12626; GFX9-NEXT: v_add_u32_e32 v3, v7, v6 12627; GFX9-NEXT: v_mul_lo_u32 v6, s8, v4 12628; GFX9-NEXT: v_add_u32_e32 v3, v3, v9 12629; GFX9-NEXT: v_sub_u32_e32 v7, s7, v3 12630; GFX9-NEXT: v_mov_b32_e32 v8, s9 12631; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, s6, v6 12632; GFX9-NEXT: 
v_subb_co_u32_e64 v7, s[0:1], v7, v8, vcc 12633; GFX9-NEXT: v_subrev_co_u32_e64 v8, s[0:1], s8, v6 12634; GFX9-NEXT: v_subbrev_co_u32_e64 v7, s[0:1], 0, v7, s[0:1] 12635; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v7 12636; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] 12637; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8 12638; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] 12639; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v7 12640; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v8, s[0:1] 12641; GFX9-NEXT: v_mov_b32_e32 v9, s7 12642; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v9, v3, vcc 12643; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 12644; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 12645; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc 12646; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v6 12647; GFX9-NEXT: v_cndmask_b32_e64 v7, 1, 2, s[0:1] 12648; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 12649; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s9, v3 12650; GFX9-NEXT: v_add_co_u32_e64 v7, s[0:1], v4, v7 12651; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v6, vcc 12652; GFX9-NEXT: v_addc_co_u32_e64 v8, s[0:1], 0, v5, s[0:1] 12653; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 12654; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v7, vcc 12655; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], s[4:5] 12656; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v8, vcc 12657; GFX9-NEXT: v_xor_b32_e32 v3, s0, v3 12658; GFX9-NEXT: v_xor_b32_e32 v4, s1, v4 12659; GFX9-NEXT: v_mov_b32_e32 v5, s1 12660; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s0, v3 12661; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v5, vcc 12662; GFX9-NEXT: s_waitcnt lgkmcnt(0) 12663; GFX9-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] 12664; GFX9-NEXT: s_endpgm 12665; 12666; GFX90A-LABEL: sdiv_v2i64_pow2_shl_denom: 12667; GFX90A: ; %bb.0: 12668; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 12669; GFX90A-NEXT: s_mov_b64 s[2:3], 0x1000 12670; GFX90A-NEXT: s_mov_b32 s16, 0x4f800000 12671; GFX90A-NEXT: s_mov_b32 s17, 0x5f7ffffc 12672; GFX90A-NEXT: s_mov_b32 s18, 0x2f800000 12673; GFX90A-NEXT: s_waitcnt 
lgkmcnt(0) 12674; GFX90A-NEXT: s_lshl_b64 s[8:9], s[2:3], s6 12675; GFX90A-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 12676; GFX90A-NEXT: s_ashr_i32 s10, s3, 31 12677; GFX90A-NEXT: s_add_u32 s2, s2, s10 12678; GFX90A-NEXT: s_mov_b32 s11, s10 12679; GFX90A-NEXT: s_addc_u32 s3, s3, s10 12680; GFX90A-NEXT: s_xor_b64 s[12:13], s[2:3], s[10:11] 12681; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s12 12682; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s13 12683; GFX90A-NEXT: s_mov_b32 s19, 0xcf800000 12684; GFX90A-NEXT: s_sub_u32 s14, 0, s12 12685; GFX90A-NEXT: s_subb_u32 s15, 0, s13 12686; GFX90A-NEXT: v_mac_f32_e32 v0, s16, v1 12687; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 12688; GFX90A-NEXT: v_mov_b32_e32 v4, 0 12689; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 12690; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 12691; GFX90A-NEXT: v_mul_f32_e32 v0, s17, v0 12692; GFX90A-NEXT: v_mul_f32_e32 v1, s18, v0 12693; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 12694; GFX90A-NEXT: v_mac_f32_e32 v0, s19, v1 12695; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 12696; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 12697; GFX90A-NEXT: v_mul_hi_u32 v3, s14, v0 12698; GFX90A-NEXT: v_mul_lo_u32 v5, s14, v1 12699; GFX90A-NEXT: v_mul_lo_u32 v2, s15, v0 12700; GFX90A-NEXT: v_add_u32_e32 v3, v3, v5 12701; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 12702; GFX90A-NEXT: v_mul_lo_u32 v6, s14, v0 12703; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v2 12704; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 12705; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v2 12706; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 12707; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 12708; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 12709; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 12710; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 12711; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v2 12712; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v8, vcc 12713; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v4, vcc 12714; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v2 12715; GFX90A-NEXT: v_mov_b32_e32 v2, 0 12716; GFX90A-NEXT: 
v_add_co_u32_e32 v3, vcc, v3, v6 12717; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v2, v5, vcc 12718; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v3 12719; GFX90A-NEXT: v_addc_co_u32_e64 v3, vcc, v1, v5, s[0:1] 12720; GFX90A-NEXT: v_mul_lo_u32 v6, s14, v3 12721; GFX90A-NEXT: v_mul_hi_u32 v7, s14, v0 12722; GFX90A-NEXT: v_add_u32_e32 v6, v7, v6 12723; GFX90A-NEXT: v_mul_lo_u32 v7, s15, v0 12724; GFX90A-NEXT: v_add_u32_e32 v6, v6, v7 12725; GFX90A-NEXT: v_mul_lo_u32 v8, s14, v0 12726; GFX90A-NEXT: v_mul_hi_u32 v9, v3, v8 12727; GFX90A-NEXT: v_mul_lo_u32 v10, v3, v8 12728; GFX90A-NEXT: v_mul_lo_u32 v12, v0, v6 12729; GFX90A-NEXT: v_mul_hi_u32 v8, v0, v8 12730; GFX90A-NEXT: v_mul_hi_u32 v11, v0, v6 12731; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12 12732; GFX90A-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc 12733; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10 12734; GFX90A-NEXT: v_mul_hi_u32 v7, v3, v6 12735; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v9, vcc 12736; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v4, vcc 12737; GFX90A-NEXT: v_mul_lo_u32 v3, v3, v6 12738; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v8, v3 12739; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v2, v7, vcc 12740; GFX90A-NEXT: v_add_u32_e32 v1, v1, v5 12741; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 12742; GFX90A-NEXT: s_ashr_i32 s14, s5, 31 12743; GFX90A-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v6, s[0:1] 12744; GFX90A-NEXT: s_add_u32 s0, s4, s14 12745; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 12746; GFX90A-NEXT: s_mov_b32 s15, s14 12747; GFX90A-NEXT: s_addc_u32 s1, s5, s14 12748; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 12749; GFX90A-NEXT: s_xor_b64 s[4:5], s[0:1], s[14:15] 12750; GFX90A-NEXT: v_mul_lo_u32 v5, s4, v1 12751; GFX90A-NEXT: v_mul_hi_u32 v6, s4, v0 12752; GFX90A-NEXT: v_mul_hi_u32 v3, s4, v1 12753; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 12754; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 12755; GFX90A-NEXT: v_mul_hi_u32 v7, s5, v0 12756; GFX90A-NEXT: v_mul_lo_u32 v0, s5, 
v0 12757; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0 12758; GFX90A-NEXT: v_mul_hi_u32 v6, s5, v1 12759; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v7, vcc 12760; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc 12761; GFX90A-NEXT: v_mul_lo_u32 v1, s5, v1 12762; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 12763; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v3, vcc 12764; GFX90A-NEXT: v_mul_lo_u32 v3, s12, v1 12765; GFX90A-NEXT: v_mul_hi_u32 v5, s12, v0 12766; GFX90A-NEXT: v_add_u32_e32 v3, v5, v3 12767; GFX90A-NEXT: v_mul_lo_u32 v5, s13, v0 12768; GFX90A-NEXT: v_add_u32_e32 v3, v3, v5 12769; GFX90A-NEXT: v_mul_lo_u32 v6, s12, v0 12770; GFX90A-NEXT: v_sub_u32_e32 v5, s5, v3 12771; GFX90A-NEXT: v_mov_b32_e32 v7, s13 12772; GFX90A-NEXT: v_sub_co_u32_e32 v6, vcc, s4, v6 12773; GFX90A-NEXT: v_subb_co_u32_e64 v5, s[0:1], v5, v7, vcc 12774; GFX90A-NEXT: v_subrev_co_u32_e64 v7, s[0:1], s12, v6 12775; GFX90A-NEXT: v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1] 12776; GFX90A-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v5 12777; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] 12778; GFX90A-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v7 12779; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 12780; GFX90A-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v5 12781; GFX90A-NEXT: v_cndmask_b32_e64 v5, v8, v7, s[0:1] 12782; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 12783; GFX90A-NEXT: v_cndmask_b32_e64 v5, 1, 2, s[0:1] 12784; GFX90A-NEXT: v_mov_b32_e32 v8, s5 12785; GFX90A-NEXT: v_add_co_u32_e64 v5, s[0:1], v0, v5 12786; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v8, v3, vcc 12787; GFX90A-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1] 12788; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s13, v3 12789; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 12790; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s12, v6 12791; GFX90A-NEXT: s_xor_b64 s[0:1], s[14:15], s[10:11] 12792; GFX90A-NEXT: s_ashr_i32 s4, s9, 31 12793; GFX90A-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 12794; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s13, v3 12795; 
GFX90A-NEXT: s_add_u32 s8, s8, s4 12796; GFX90A-NEXT: v_cndmask_b32_e32 v3, v8, v6, vcc 12797; GFX90A-NEXT: s_mov_b32 s5, s4 12798; GFX90A-NEXT: s_addc_u32 s9, s9, s4 12799; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 12800; GFX90A-NEXT: s_xor_b64 s[8:9], s[8:9], s[4:5] 12801; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc 12802; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, s8 12803; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s9 12804; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc 12805; GFX90A-NEXT: v_xor_b32_e32 v0, s0, v0 12806; GFX90A-NEXT: s_sub_u32 s10, 0, s8 12807; GFX90A-NEXT: v_mac_f32_e32 v3, s16, v5 12808; GFX90A-NEXT: v_rcp_f32_e32 v3, v3 12809; GFX90A-NEXT: v_xor_b32_e32 v1, s1, v1 12810; GFX90A-NEXT: v_mov_b32_e32 v6, s1 12811; GFX90A-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 12812; GFX90A-NEXT: v_mul_f32_e32 v3, s17, v3 12813; GFX90A-NEXT: v_mul_f32_e32 v5, s18, v3 12814; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 12815; GFX90A-NEXT: v_mac_f32_e32 v3, s19, v5 12816; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 12817; GFX90A-NEXT: v_cvt_u32_f32_e32 v5, v5 12818; GFX90A-NEXT: s_subb_u32 s11, 0, s9 12819; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc 12820; GFX90A-NEXT: v_mul_hi_u32 v7, s10, v3 12821; GFX90A-NEXT: v_mul_lo_u32 v8, s10, v5 12822; GFX90A-NEXT: v_mul_lo_u32 v6, s11, v3 12823; GFX90A-NEXT: v_add_u32_e32 v7, v7, v8 12824; GFX90A-NEXT: v_add_u32_e32 v6, v7, v6 12825; GFX90A-NEXT: v_mul_lo_u32 v9, s10, v3 12826; GFX90A-NEXT: v_mul_lo_u32 v8, v3, v6 12827; GFX90A-NEXT: v_mul_hi_u32 v10, v3, v9 12828; GFX90A-NEXT: v_mul_hi_u32 v7, v3, v6 12829; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v10, v8 12830; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc 12831; GFX90A-NEXT: v_mul_hi_u32 v11, v5, v9 12832; GFX90A-NEXT: v_mul_lo_u32 v9, v5, v9 12833; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v9 12834; GFX90A-NEXT: v_mul_hi_u32 v10, v5, v6 12835; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v11, vcc 12836; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v10, v4, vcc 12837; GFX90A-NEXT: 
v_mul_lo_u32 v6, v5, v6 12838; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 12839; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v2, v8, vcc 12840; GFX90A-NEXT: v_add_co_u32_e64 v3, s[0:1], v3, v6 12841; GFX90A-NEXT: v_addc_co_u32_e64 v6, vcc, v5, v7, s[0:1] 12842; GFX90A-NEXT: v_mul_lo_u32 v8, s10, v6 12843; GFX90A-NEXT: v_mul_hi_u32 v9, s10, v3 12844; GFX90A-NEXT: v_add_u32_e32 v8, v9, v8 12845; GFX90A-NEXT: v_mul_lo_u32 v9, s11, v3 12846; GFX90A-NEXT: v_add_u32_e32 v8, v8, v9 12847; GFX90A-NEXT: v_mul_lo_u32 v10, s10, v3 12848; GFX90A-NEXT: v_mul_hi_u32 v11, v6, v10 12849; GFX90A-NEXT: v_mul_lo_u32 v12, v6, v10 12850; GFX90A-NEXT: v_mul_lo_u32 v14, v3, v8 12851; GFX90A-NEXT: v_mul_hi_u32 v10, v3, v10 12852; GFX90A-NEXT: v_mul_hi_u32 v13, v3, v8 12853; GFX90A-NEXT: v_add_co_u32_e32 v10, vcc, v10, v14 12854; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc 12855; GFX90A-NEXT: v_add_co_u32_e32 v10, vcc, v10, v12 12856; GFX90A-NEXT: v_mul_hi_u32 v9, v6, v8 12857; GFX90A-NEXT: v_addc_co_u32_e32 v10, vcc, v13, v11, vcc 12858; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v4, vcc 12859; GFX90A-NEXT: v_mul_lo_u32 v6, v6, v8 12860; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, v10, v6 12861; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v2, v9, vcc 12862; GFX90A-NEXT: v_add_u32_e32 v5, v5, v7 12863; GFX90A-NEXT: s_ashr_i32 s10, s7, 31 12864; GFX90A-NEXT: v_addc_co_u32_e64 v5, vcc, v5, v8, s[0:1] 12865; GFX90A-NEXT: s_add_u32 s0, s6, s10 12866; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 12867; GFX90A-NEXT: s_mov_b32 s11, s10 12868; GFX90A-NEXT: s_addc_u32 s1, s7, s10 12869; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc 12870; GFX90A-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] 12871; GFX90A-NEXT: v_mul_lo_u32 v7, s6, v5 12872; GFX90A-NEXT: v_mul_hi_u32 v8, s6, v3 12873; GFX90A-NEXT: v_mul_hi_u32 v6, s6, v5 12874; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 12875; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc 12876; GFX90A-NEXT: v_mul_hi_u32 v9, s7, v3 12877; GFX90A-NEXT: 
v_mul_lo_u32 v3, s7, v3 12878; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v7, v3 12879; GFX90A-NEXT: v_mul_hi_u32 v8, s7, v5 12880; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v9, vcc 12881; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v8, v4, vcc 12882; GFX90A-NEXT: v_mul_lo_u32 v5, s7, v5 12883; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 12884; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v6, vcc 12885; GFX90A-NEXT: v_mul_lo_u32 v5, s8, v2 12886; GFX90A-NEXT: v_mul_hi_u32 v6, s8, v3 12887; GFX90A-NEXT: v_add_u32_e32 v5, v6, v5 12888; GFX90A-NEXT: v_mul_lo_u32 v6, s9, v3 12889; GFX90A-NEXT: v_add_u32_e32 v5, v5, v6 12890; GFX90A-NEXT: v_mul_lo_u32 v7, s8, v3 12891; GFX90A-NEXT: v_sub_u32_e32 v6, s7, v5 12892; GFX90A-NEXT: v_mov_b32_e32 v8, s9 12893; GFX90A-NEXT: v_sub_co_u32_e32 v7, vcc, s6, v7 12894; GFX90A-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v8, vcc 12895; GFX90A-NEXT: v_subrev_co_u32_e64 v8, s[0:1], s8, v7 12896; GFX90A-NEXT: v_subbrev_co_u32_e64 v6, s[0:1], 0, v6, s[0:1] 12897; GFX90A-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6 12898; GFX90A-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] 12899; GFX90A-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8 12900; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] 12901; GFX90A-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6 12902; GFX90A-NEXT: v_cndmask_b32_e64 v6, v9, v8, s[0:1] 12903; GFX90A-NEXT: v_mov_b32_e32 v9, s7 12904; GFX90A-NEXT: v_subb_co_u32_e32 v5, vcc, v9, v5, vcc 12905; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s9, v5 12906; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 12907; GFX90A-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc 12908; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s8, v7 12909; GFX90A-NEXT: v_cndmask_b32_e64 v6, 1, 2, s[0:1] 12910; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 12911; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s9, v5 12912; GFX90A-NEXT: v_add_co_u32_e64 v6, s[0:1], v3, v6 12913; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v7, vcc 12914; GFX90A-NEXT: v_addc_co_u32_e64 v8, s[0:1], 0, v2, s[0:1] 12915; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 
12916; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 12917; GFX90A-NEXT: s_xor_b64 s[0:1], s[10:11], s[4:5] 12918; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc 12919; GFX90A-NEXT: v_xor_b32_e32 v3, s0, v3 12920; GFX90A-NEXT: v_xor_b32_e32 v5, s1, v2 12921; GFX90A-NEXT: v_mov_b32_e32 v6, s1 12922; GFX90A-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v3 12923; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v6, vcc 12924; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 12925; GFX90A-NEXT: s_endpgm 12926 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 12927 %r = sdiv <2 x i64> %x, %shl.y 12928 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 12929 ret void 12930} 12931 12932define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 12933; CHECK-LABEL: @srem_i64_oddk_denom( 12934; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], 1235195 12935; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 12936; CHECK-NEXT: ret void 12937; 12938; GFX6-LABEL: srem_i64_oddk_denom: 12939; GFX6: ; %bb.0: 12940; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f800000 12941; GFX6-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 12942; GFX6-NEXT: v_rcp_f32_e32 v0, v0 12943; GFX6-NEXT: s_mov_b32 s2, 0xffed2705 12944; GFX6-NEXT: v_mov_b32_e32 v8, 0 12945; GFX6-NEXT: v_mov_b32_e32 v7, 0 12946; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 12947; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 12948; GFX6-NEXT: v_trunc_f32_e32 v1, v1 12949; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 12950; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 12951; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 12952; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 12953; GFX6-NEXT: s_mov_b32 s7, 0xf000 12954; GFX6-NEXT: v_mul_lo_u32 v2, v1, s2 12955; GFX6-NEXT: v_mul_hi_u32 v3, v0, s2 12956; GFX6-NEXT: v_mul_lo_u32 v4, v0, s2 12957; GFX6-NEXT: s_mov_b32 s6, -1 12958; GFX6-NEXT: s_waitcnt lgkmcnt(0) 12959; GFX6-NEXT: s_mov_b32 s4, s8 12960; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 12961; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 
12962; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 12963; GFX6-NEXT: v_mul_hi_u32 v6, v0, v4 12964; GFX6-NEXT: v_mul_hi_u32 v3, v0, v2 12965; GFX6-NEXT: v_mul_hi_u32 v9, v1, v2 12966; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 12967; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 12968; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 12969; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 12970; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v3, vcc 12971; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 12972; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc 12973; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v9, v7, vcc 12974; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 12975; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 12976; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 12977; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 12978; GFX6-NEXT: v_mul_lo_u32 v4, v2, s2 12979; GFX6-NEXT: v_mul_hi_u32 v5, v0, s2 12980; GFX6-NEXT: s_mov_b32 s5, s9 12981; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 12982; GFX6-NEXT: v_mul_lo_u32 v5, v0, s2 12983; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, v0, v4 12984; GFX6-NEXT: v_mul_lo_u32 v10, v0, v4 12985; GFX6-NEXT: v_mul_hi_u32 v11, v0, v5 12986; GFX6-NEXT: v_mul_hi_u32 v12, v0, v4 12987; GFX6-NEXT: v_mul_hi_u32 v9, v2, v5 12988; GFX6-NEXT: v_mul_lo_u32 v5, v2, v5 12989; GFX6-NEXT: v_mul_hi_u32 v6, v2, v4 12990; GFX6-NEXT: v_add_i32_e32 v10, vcc, v11, v10 12991; GFX6-NEXT: v_addc_u32_e32 v11, vcc, v8, v12, vcc 12992; GFX6-NEXT: v_mul_lo_u32 v2, v2, v4 12993; GFX6-NEXT: v_add_i32_e32 v5, vcc, v10, v5 12994; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v11, v9, vcc 12995; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v6, v7, vcc 12996; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 12997; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc 12998; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 12999; GFX6-NEXT: s_ashr_i32 s2, s11, 31 13000; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v4, s[0:1] 13001; GFX6-NEXT: s_add_u32 s0, s10, s2 13002; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 13003; GFX6-NEXT: s_mov_b32 s3, s2 13004; GFX6-NEXT: s_addc_u32 s1, s11, s2 
13005; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 13006; GFX6-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] 13007; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 13008; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 13009; GFX6-NEXT: v_mul_hi_u32 v4, s0, v1 13010; GFX6-NEXT: v_mul_hi_u32 v5, s1, v1 13011; GFX6-NEXT: v_mul_lo_u32 v1, s1, v1 13012; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 13013; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v4, vcc 13014; GFX6-NEXT: v_mul_lo_u32 v4, s1, v0 13015; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 13016; GFX6-NEXT: s_mov_b32 s3, 0x12d8fb 13017; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 13018; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 13019; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc 13020; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 13021; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v8, v2, vcc 13022; GFX6-NEXT: v_mul_lo_u32 v1, v1, s3 13023; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3 13024; GFX6-NEXT: v_mul_lo_u32 v0, v0, s3 13025; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 13026; GFX6-NEXT: v_mov_b32_e32 v2, s1 13027; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 13028; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 13029; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v0 13030; GFX6-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v1, vcc 13031; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s3, v2 13032; GFX6-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v3, vcc 13033; GFX6-NEXT: s_mov_b32 s0, 0x12d8fa 13034; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s0, v2 13035; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 13036; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 13037; GFX6-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc 13038; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 13039; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v0 13040; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 13041; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 13042; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 13043; GFX6-NEXT: v_cndmask_b32_e64 v5, -1, v5, s[0:1] 13044; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 13045; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 13046; GFX6-NEXT: 
v_cndmask_b32_e64 v0, v0, v2, s[0:1] 13047; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 13048; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 13049; GFX6-NEXT: v_xor_b32_e32 v1, s2, v1 13050; GFX6-NEXT: v_mov_b32_e32 v2, s2 13051; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 13052; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 13053; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 13054; GFX6-NEXT: s_endpgm 13055; 13056; GFX9-LABEL: srem_i64_oddk_denom: 13057; GFX9: ; %bb.0: 13058; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f800000 13059; GFX9-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 13060; GFX9-NEXT: v_rcp_f32_e32 v0, v0 13061; GFX9-NEXT: s_mov_b32 s8, 0xffed2705 13062; GFX9-NEXT: v_mov_b32_e32 v7, 0 13063; GFX9-NEXT: v_mov_b32_e32 v5, 0 13064; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 13065; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 13066; GFX9-NEXT: v_trunc_f32_e32 v1, v1 13067; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 13068; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 13069; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 13070; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 13071; GFX9-NEXT: v_mul_lo_u32 v2, v1, s8 13072; GFX9-NEXT: v_mul_hi_u32 v3, v0, s8 13073; GFX9-NEXT: v_mul_lo_u32 v4, v0, s8 13074; GFX9-NEXT: s_waitcnt lgkmcnt(0) 13075; GFX9-NEXT: s_ashr_i32 s0, s7, 31 13076; GFX9-NEXT: s_mov_b32 s1, s0 13077; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 13078; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 13079; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 13080; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 13081; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 13082; GFX9-NEXT: v_mul_hi_u32 v9, v1, v2 13083; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 13084; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 13085; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v8, vcc 13086; GFX9-NEXT: v_mul_lo_u32 v8, v1, v4 13087; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 13088; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8 13089; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc 13090; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v5, vcc 13091; GFX9-NEXT: v_add_co_u32_e32 
v2, vcc, v3, v2 13092; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v4, vcc 13093; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v2 13094; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3] 13095; GFX9-NEXT: v_mul_lo_u32 v4, v2, s8 13096; GFX9-NEXT: v_mul_hi_u32 v6, v0, s8 13097; GFX9-NEXT: v_mul_lo_u32 v8, v0, s8 13098; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 13099; GFX9-NEXT: v_add_u32_e32 v4, v6, v4 13100; GFX9-NEXT: v_sub_u32_e32 v4, v4, v0 13101; GFX9-NEXT: v_mul_lo_u32 v10, v0, v4 13102; GFX9-NEXT: v_mul_hi_u32 v11, v0, v8 13103; GFX9-NEXT: v_mul_hi_u32 v12, v0, v4 13104; GFX9-NEXT: v_mul_hi_u32 v9, v2, v8 13105; GFX9-NEXT: v_mul_lo_u32 v8, v2, v8 13106; GFX9-NEXT: v_mul_hi_u32 v6, v2, v4 13107; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 13108; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v7, v12, vcc 13109; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 13110; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v10, v8 13111; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v9, vcc 13112; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v5, vcc 13113; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 13114; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc 13115; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3] 13116; GFX9-NEXT: s_add_u32 s2, s6, s0 13117; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 13118; GFX9-NEXT: s_addc_u32 s3, s7, s0 13119; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 13120; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1] 13121; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 13122; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 13123; GFX9-NEXT: v_mul_hi_u32 v4, s2, v1 13124; GFX9-NEXT: v_mul_hi_u32 v6, s3, v1 13125; GFX9-NEXT: v_mul_lo_u32 v1, s3, v1 13126; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 13127; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v4, vcc 13128; GFX9-NEXT: v_mul_lo_u32 v4, s3, v0 13129; GFX9-NEXT: v_mul_hi_u32 v0, s3, v0 13130; GFX9-NEXT: s_mov_b32 s1, 0x12d8fb 13131; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 13132; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 13133; GFX9-NEXT: 
v_addc_co_u32_e32 v2, vcc, v6, v5, vcc 13134; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 13135; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v2, vcc 13136; GFX9-NEXT: v_mul_lo_u32 v1, v1, s1 13137; GFX9-NEXT: v_mul_hi_u32 v2, v0, s1 13138; GFX9-NEXT: v_mul_lo_u32 v0, v0, s1 13139; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 13140; GFX9-NEXT: v_mov_b32_e32 v2, s3 13141; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 13142; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 13143; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s1, v0 13144; GFX9-NEXT: v_subbrev_co_u32_e32 v3, vcc, 0, v1, vcc 13145; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s1, v2 13146; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v3, vcc 13147; GFX9-NEXT: s_mov_b32 s1, 0x12d8fa 13148; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s1, v2 13149; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 13150; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 13151; GFX9-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc 13152; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 13153; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 13154; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 13155; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s1, v0 13156; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc 13157; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 13158; GFX9-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc 13159; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 13160; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 13161; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 13162; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 13163; GFX9-NEXT: v_xor_b32_e32 v1, s0, v1 13164; GFX9-NEXT: v_mov_b32_e32 v2, s0 13165; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 13166; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc 13167; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[4:5] 13168; GFX9-NEXT: s_endpgm 13169; 13170; GFX90A-LABEL: srem_i64_oddk_denom: 13171; GFX90A: ; %bb.0: 13172; GFX90A-NEXT: v_mov_b32_e32 v0, 0x4f800000 13173; GFX90A-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 13174; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 13175; GFX90A-NEXT: s_mov_b32 s2, 
0xffed2705 13176; GFX90A-NEXT: v_mov_b32_e32 v8, 0 13177; GFX90A-NEXT: v_mov_b32_e32 v2, 0 13178; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 13179; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 13180; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 13181; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 13182; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 13183; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 13184; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 13185; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s2 13186; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s2 13187; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 13188; GFX90A-NEXT: v_sub_u32_e32 v3, v3, v0 13189; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s2 13190; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 13191; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 13192; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 13193; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 13194; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v4, vcc 13195; GFX90A-NEXT: v_mul_hi_u32 v9, v1, v6 13196; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 13197; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 13198; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 13199; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v9, vcc 13200; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc 13201; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 13202; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 13203; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v5, vcc 13204; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v3 13205; GFX90A-NEXT: v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1] 13206; GFX90A-NEXT: v_mul_lo_u32 v5, v3, s2 13207; GFX90A-NEXT: v_mul_hi_u32 v6, v0, s2 13208; GFX90A-NEXT: v_add_u32_e32 v5, v6, v5 13209; GFX90A-NEXT: v_sub_u32_e32 v5, v5, v0 13210; GFX90A-NEXT: v_mul_lo_u32 v7, v0, s2 13211; GFX90A-NEXT: v_mul_hi_u32 v9, v3, v7 13212; GFX90A-NEXT: v_mul_lo_u32 v10, v3, v7 13213; GFX90A-NEXT: v_mul_lo_u32 v12, v0, v5 13214; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v7 13215; GFX90A-NEXT: v_mul_hi_u32 v11, v0, v5 13216; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v7, v12 13217; 
GFX90A-NEXT: v_addc_co_u32_e32 v11, vcc, v8, v11, vcc 13218; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v7, v10 13219; GFX90A-NEXT: v_mul_hi_u32 v6, v3, v5 13220; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v11, v9, vcc 13221; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v2, vcc 13222; GFX90A-NEXT: v_mul_lo_u32 v3, v3, v5 13223; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v7, v3 13224; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v6, vcc 13225; GFX90A-NEXT: v_add_u32_e32 v1, v1, v4 13226; GFX90A-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1] 13227; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 13228; GFX90A-NEXT: s_ashr_i32 s0, s7, 31 13229; GFX90A-NEXT: s_add_u32 s2, s6, s0 13230; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 13231; GFX90A-NEXT: s_mov_b32 s1, s0 13232; GFX90A-NEXT: s_addc_u32 s3, s7, s0 13233; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 13234; GFX90A-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1] 13235; GFX90A-NEXT: v_mul_lo_u32 v4, s2, v1 13236; GFX90A-NEXT: v_mul_hi_u32 v5, s2, v0 13237; GFX90A-NEXT: v_mul_hi_u32 v3, s2, v1 13238; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 13239; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc 13240; GFX90A-NEXT: v_mul_hi_u32 v6, s3, v0 13241; GFX90A-NEXT: v_mul_lo_u32 v0, s3, v0 13242; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 13243; GFX90A-NEXT: v_mul_hi_u32 v5, s3, v1 13244; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v6, vcc 13245; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v2, vcc 13246; GFX90A-NEXT: v_mul_lo_u32 v1, s3, v1 13247; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 13248; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v8, v3, vcc 13249; GFX90A-NEXT: s_mov_b32 s1, 0x12d8fb 13250; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s1 13251; GFX90A-NEXT: v_mul_hi_u32 v3, v0, s1 13252; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s1 13253; GFX90A-NEXT: v_add_u32_e32 v1, v3, v1 13254; GFX90A-NEXT: v_mov_b32_e32 v3, s3 13255; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 13256; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc 13257; GFX90A-NEXT: 
v_subrev_co_u32_e32 v3, vcc, s1, v0 13258; GFX90A-NEXT: v_subbrev_co_u32_e32 v4, vcc, 0, v1, vcc 13259; GFX90A-NEXT: v_subrev_co_u32_e32 v5, vcc, s1, v3 13260; GFX90A-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v4, vcc 13261; GFX90A-NEXT: s_mov_b32 s1, 0x12d8fa 13262; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s1, v3 13263; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 13264; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 13265; GFX90A-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc 13266; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 13267; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 13268; GFX90A-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc 13269; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s1, v0 13270; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 13271; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 13272; GFX90A-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc 13273; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 13274; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 13275; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 13276; GFX90A-NEXT: v_xor_b32_e32 v0, s0, v0 13277; GFX90A-NEXT: v_xor_b32_e32 v1, s0, v1 13278; GFX90A-NEXT: v_mov_b32_e32 v3, s0 13279; GFX90A-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 13280; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc 13281; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 13282; GFX90A-NEXT: s_endpgm 13283 %r = srem i64 %x, 1235195 13284 store i64 %r, i64 addrspace(1)* %out 13285 ret void 13286} 13287 13288define amdgpu_kernel void @srem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 13289; CHECK-LABEL: @srem_i64_pow2k_denom( 13290; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], 4096 13291; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 13292; CHECK-NEXT: ret void 13293; 13294; GFX6-LABEL: srem_i64_pow2k_denom: 13295; GFX6: ; %bb.0: 13296; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 13297; GFX6-NEXT: s_mov_b32 s3, 0xf000 13298; GFX6-NEXT: s_mov_b32 s2, -1 13299; GFX6-NEXT: s_waitcnt lgkmcnt(0) 13300; GFX6-NEXT: s_mov_b32 s0, s4 13301; GFX6-NEXT: 
s_ashr_i32 s4, s7, 31 13302; GFX6-NEXT: s_lshr_b32 s4, s4, 20 13303; GFX6-NEXT: s_add_u32 s4, s6, s4 13304; GFX6-NEXT: s_mov_b32 s1, s5 13305; GFX6-NEXT: s_addc_u32 s5, s7, 0 13306; GFX6-NEXT: s_and_b32 s4, s4, 0xfffff000 13307; GFX6-NEXT: s_sub_u32 s4, s6, s4 13308; GFX6-NEXT: s_subb_u32 s5, s7, s5 13309; GFX6-NEXT: v_mov_b32_e32 v0, s4 13310; GFX6-NEXT: v_mov_b32_e32 v1, s5 13311; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 13312; GFX6-NEXT: s_endpgm 13313; 13314; GFX9-LABEL: srem_i64_pow2k_denom: 13315; GFX9: ; %bb.0: 13316; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 13317; GFX9-NEXT: v_mov_b32_e32 v2, 0 13318; GFX9-NEXT: s_waitcnt lgkmcnt(0) 13319; GFX9-NEXT: s_ashr_i32 s4, s3, 31 13320; GFX9-NEXT: s_lshr_b32 s4, s4, 20 13321; GFX9-NEXT: s_add_u32 s4, s2, s4 13322; GFX9-NEXT: s_addc_u32 s5, s3, 0 13323; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000 13324; GFX9-NEXT: s_sub_u32 s2, s2, s4 13325; GFX9-NEXT: s_subb_u32 s3, s3, s5 13326; GFX9-NEXT: v_mov_b32_e32 v0, s2 13327; GFX9-NEXT: v_mov_b32_e32 v1, s3 13328; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 13329; GFX9-NEXT: s_endpgm 13330; 13331; GFX90A-LABEL: srem_i64_pow2k_denom: 13332; GFX90A: ; %bb.0: 13333; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 13334; GFX90A-NEXT: v_mov_b32_e32 v2, 0 13335; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 13336; GFX90A-NEXT: s_ashr_i32 s4, s3, 31 13337; GFX90A-NEXT: s_lshr_b32 s4, s4, 20 13338; GFX90A-NEXT: s_add_u32 s4, s2, s4 13339; GFX90A-NEXT: s_addc_u32 s5, s3, 0 13340; GFX90A-NEXT: s_and_b32 s4, s4, 0xfffff000 13341; GFX90A-NEXT: s_sub_u32 s2, s2, s4 13342; GFX90A-NEXT: s_subb_u32 s3, s3, s5 13343; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] 13344; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 13345; GFX90A-NEXT: s_endpgm 13346 %r = srem i64 %x, 4096 13347 store i64 %r, i64 addrspace(1)* %out 13348 ret void 13349} 13350 13351define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 13352; 
CHECK-LABEL: @srem_i64_pow2_shl_denom( 13353; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 13354; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], [[SHL_Y]] 13355; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 13356; CHECK-NEXT: ret void 13357; 13358; GFX6-LABEL: srem_i64_pow2_shl_denom: 13359; GFX6: ; %bb.0: 13360; GFX6-NEXT: s_load_dword s4, s[0:1], 0xd 13361; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000 13362; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 13363; GFX6-NEXT: s_mov_b32 s7, 0xf000 13364; GFX6-NEXT: s_mov_b32 s6, -1 13365; GFX6-NEXT: s_waitcnt lgkmcnt(0) 13366; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 13367; GFX6-NEXT: s_ashr_i32 s4, s3, 31 13368; GFX6-NEXT: s_add_u32 s2, s2, s4 13369; GFX6-NEXT: s_mov_b32 s5, s4 13370; GFX6-NEXT: s_addc_u32 s3, s3, s4 13371; GFX6-NEXT: s_xor_b64 s[12:13], s[2:3], s[4:5] 13372; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s12 13373; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s13 13374; GFX6-NEXT: s_sub_u32 s2, 0, s12 13375; GFX6-NEXT: s_subb_u32 s3, 0, s13 13376; GFX6-NEXT: s_ashr_i32 s14, s11, 31 13377; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 13378; GFX6-NEXT: v_rcp_f32_e32 v0, v0 13379; GFX6-NEXT: s_mov_b32 s15, s14 13380; GFX6-NEXT: s_mov_b32 s4, s8 13381; GFX6-NEXT: s_mov_b32 s5, s9 13382; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 13383; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 13384; GFX6-NEXT: v_trunc_f32_e32 v1, v1 13385; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 13386; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 13387; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 13388; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 13389; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 13390; GFX6-NEXT: v_mul_lo_u32 v5, s3, v0 13391; GFX6-NEXT: v_mul_lo_u32 v4, s2, v0 13392; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 13393; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 13394; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 13395; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 13396; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 13397; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 13398; GFX6-NEXT: v_mul_lo_u32 v2, 
v1, v2 13399; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 13400; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 13401; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 13402; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 13403; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 13404; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc 13405; GFX6-NEXT: v_mov_b32_e32 v4, 0 13406; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc 13407; GFX6-NEXT: v_mov_b32_e32 v6, 0 13408; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 13409; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v5, vcc 13410; GFX6-NEXT: v_add_i32_e64 v0, s[0:1], v0, v2 13411; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v1, v3, s[0:1] 13412; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 13413; GFX6-NEXT: v_mul_hi_u32 v7, s2, v0 13414; GFX6-NEXT: v_mul_lo_u32 v8, s3, v0 13415; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5 13416; GFX6-NEXT: v_mul_lo_u32 v7, s2, v0 13417; GFX6-NEXT: v_add_i32_e32 v5, vcc, v8, v5 13418; GFX6-NEXT: v_mul_lo_u32 v10, v0, v5 13419; GFX6-NEXT: v_mul_hi_u32 v11, v0, v7 13420; GFX6-NEXT: v_mul_hi_u32 v12, v0, v5 13421; GFX6-NEXT: v_mul_hi_u32 v9, v2, v7 13422; GFX6-NEXT: v_mul_lo_u32 v7, v2, v7 13423; GFX6-NEXT: v_mul_hi_u32 v8, v2, v5 13424; GFX6-NEXT: v_add_i32_e32 v10, vcc, v11, v10 13425; GFX6-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc 13426; GFX6-NEXT: v_mul_lo_u32 v2, v2, v5 13427; GFX6-NEXT: v_add_i32_e32 v7, vcc, v10, v7 13428; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc 13429; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v8, v4, vcc 13430; GFX6-NEXT: v_add_i32_e32 v2, vcc, v7, v2 13431; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc 13432; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 13433; GFX6-NEXT: v_addc_u32_e64 v1, vcc, v1, v5, s[0:1] 13434; GFX6-NEXT: s_add_u32 s0, s10, s14 13435; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 13436; GFX6-NEXT: s_addc_u32 s1, s11, s14 13437; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 13438; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] 13439; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 13440; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 13441; 
GFX6-NEXT: v_mul_hi_u32 v5, s10, v1 13442; GFX6-NEXT: v_mul_hi_u32 v7, s11, v1 13443; GFX6-NEXT: v_mul_lo_u32 v1, s11, v1 13444; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 13445; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 13446; GFX6-NEXT: v_mul_lo_u32 v5, s11, v0 13447; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0 13448; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 13449; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 13450; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v7, v4, vcc 13451; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 13452; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v6, v2, vcc 13453; GFX6-NEXT: v_mul_lo_u32 v1, s12, v1 13454; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0 13455; GFX6-NEXT: v_mul_lo_u32 v3, s13, v0 13456; GFX6-NEXT: v_mul_lo_u32 v0, s12, v0 13457; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 13458; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 13459; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s11, v1 13460; GFX6-NEXT: v_mov_b32_e32 v3, s13 13461; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s10, v0 13462; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc 13463; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s12, v0 13464; GFX6-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] 13465; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s13, v5 13466; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] 13467; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v4 13468; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] 13469; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 13470; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s13, v5 13471; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s12, v4 13472; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] 13473; GFX6-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] 13474; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 13475; GFX6-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] 13476; GFX6-NEXT: v_mov_b32_e32 v5, s11 13477; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc 13478; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s13, v1 13479; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 13480; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 13481; 
GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 13482; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s13, v1 13483; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 13484; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 13485; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 13486; GFX6-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] 13487; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 13488; GFX6-NEXT: v_xor_b32_e32 v0, s14, v0 13489; GFX6-NEXT: v_xor_b32_e32 v1, s14, v1 13490; GFX6-NEXT: v_mov_b32_e32 v2, s14 13491; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s14, v0 13492; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 13493; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 13494; GFX6-NEXT: s_endpgm 13495; 13496; GFX9-LABEL: srem_i64_pow2_shl_denom: 13497; GFX9: ; %bb.0: 13498; GFX9-NEXT: s_load_dword s4, s[0:1], 0x34 13499; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 13500; GFX9-NEXT: v_mov_b32_e32 v2, 0 13501; GFX9-NEXT: s_waitcnt lgkmcnt(0) 13502; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 13503; GFX9-NEXT: s_ashr_i32 s4, s3, 31 13504; GFX9-NEXT: s_add_u32 s2, s2, s4 13505; GFX9-NEXT: s_mov_b32 s5, s4 13506; GFX9-NEXT: s_addc_u32 s3, s3, s4 13507; GFX9-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5] 13508; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 13509; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 13510; GFX9-NEXT: s_sub_u32 s10, 0, s8 13511; GFX9-NEXT: s_subb_u32 s4, 0, s9 13512; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 13513; GFX9-NEXT: v_rcp_f32_e32 v0, v0 13514; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 13515; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 13516; GFX9-NEXT: v_trunc_f32_e32 v1, v1 13517; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 13518; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 13519; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 13520; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1 13521; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0 13522; GFX9-NEXT: v_mul_lo_u32 v6, s4, v0 13523; GFX9-NEXT: v_mul_lo_u32 v5, s10, v0 13524; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 13525; GFX9-NEXT: v_add_u32_e32 v3, v3, v6 13526; GFX9-NEXT: v_mul_hi_u32 v4, v0, v5 
13527; GFX9-NEXT: v_mul_lo_u32 v6, v0, v3 13528; GFX9-NEXT: v_mul_hi_u32 v8, v0, v3 13529; GFX9-NEXT: v_mul_hi_u32 v7, v1, v5 13530; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 13531; GFX9-NEXT: v_mul_hi_u32 v9, v1, v3 13532; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 13533; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc 13534; GFX9-NEXT: v_mul_lo_u32 v3, v1, v3 13535; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 13536; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v7, vcc 13537; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v2, vcc 13538; GFX9-NEXT: v_mov_b32_e32 v6, 0 13539; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 13540; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v5, vcc 13541; GFX9-NEXT: v_add_co_u32_e64 v0, s[2:3], v0, v3 13542; GFX9-NEXT: v_addc_co_u32_e64 v3, vcc, v1, v4, s[2:3] 13543; GFX9-NEXT: v_mul_lo_u32 v5, s10, v3 13544; GFX9-NEXT: v_mul_hi_u32 v7, s10, v0 13545; GFX9-NEXT: v_mul_lo_u32 v8, s4, v0 13546; GFX9-NEXT: v_mul_lo_u32 v9, s10, v0 13547; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 13548; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 13549; GFX9-NEXT: v_add_u32_e32 v5, v5, v8 13550; GFX9-NEXT: v_mul_lo_u32 v10, v0, v5 13551; GFX9-NEXT: v_mul_hi_u32 v11, v0, v9 13552; GFX9-NEXT: v_mul_hi_u32 v12, v0, v5 13553; GFX9-NEXT: v_mul_hi_u32 v8, v3, v9 13554; GFX9-NEXT: v_mul_lo_u32 v9, v3, v9 13555; GFX9-NEXT: v_mul_hi_u32 v7, v3, v5 13556; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 13557; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v12, vcc 13558; GFX9-NEXT: v_mul_lo_u32 v3, v3, v5 13559; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 13560; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v8, vcc 13561; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc 13562; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v8, v3 13563; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v5, vcc 13564; GFX9-NEXT: v_add_u32_e32 v1, v1, v4 13565; GFX9-NEXT: s_waitcnt lgkmcnt(0) 13566; GFX9-NEXT: s_ashr_i32 s10, s7, 31 13567; GFX9-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[2:3] 13568; GFX9-NEXT: s_add_u32 s0, s6, 
s10 13569; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 13570; GFX9-NEXT: s_mov_b32 s11, s10 13571; GFX9-NEXT: s_addc_u32 s1, s7, s10 13572; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 13573; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] 13574; GFX9-NEXT: v_mul_lo_u32 v3, s6, v1 13575; GFX9-NEXT: v_mul_hi_u32 v4, s6, v0 13576; GFX9-NEXT: v_mul_hi_u32 v5, s6, v1 13577; GFX9-NEXT: v_mul_hi_u32 v7, s7, v1 13578; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 13579; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 13580; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc 13581; GFX9-NEXT: v_mul_lo_u32 v5, s7, v0 13582; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 13583; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 13584; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v4, v0, vcc 13585; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7, v2, vcc 13586; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 13587; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v3, vcc 13588; GFX9-NEXT: v_mul_lo_u32 v1, s8, v1 13589; GFX9-NEXT: v_mul_hi_u32 v3, s8, v0 13590; GFX9-NEXT: v_mul_lo_u32 v4, s9, v0 13591; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 13592; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 13593; GFX9-NEXT: v_add_u32_e32 v1, v1, v4 13594; GFX9-NEXT: v_sub_u32_e32 v3, s7, v1 13595; GFX9-NEXT: v_mov_b32_e32 v4, s9 13596; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 13597; GFX9-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, vcc 13598; GFX9-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s8, v0 13599; GFX9-NEXT: v_subbrev_co_u32_e64 v6, s[2:3], 0, v3, s[0:1] 13600; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v6 13601; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 13602; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v5 13603; GFX9-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1] 13604; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] 13605; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v6 13606; GFX9-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s8, v5 13607; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] 13608; GFX9-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] 13609; 
GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 13610; GFX9-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[0:1] 13611; GFX9-NEXT: v_mov_b32_e32 v5, s7 13612; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v5, v1, vcc 13613; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 13614; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 13615; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 13616; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[0:1] 13617; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 13618; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s9, v1 13619; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 13620; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 13621; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 13622; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 13623; GFX9-NEXT: v_xor_b32_e32 v0, s10, v0 13624; GFX9-NEXT: v_xor_b32_e32 v1, s10, v1 13625; GFX9-NEXT: v_mov_b32_e32 v3, s10 13626; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s10, v0 13627; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc 13628; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 13629; GFX9-NEXT: s_endpgm 13630; 13631; GFX90A-LABEL: srem_i64_pow2_shl_denom: 13632; GFX90A: ; %bb.0: 13633; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x34 13634; GFX90A-NEXT: s_mov_b64 s[2:3], 0x1000 13635; GFX90A-NEXT: v_mov_b32_e32 v2, 0 13636; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 13637; GFX90A-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 13638; GFX90A-NEXT: s_ashr_i32 s4, s3, 31 13639; GFX90A-NEXT: s_add_u32 s2, s2, s4 13640; GFX90A-NEXT: s_mov_b32 s5, s4 13641; GFX90A-NEXT: s_addc_u32 s3, s3, s4 13642; GFX90A-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5] 13643; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s8 13644; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s9 13645; GFX90A-NEXT: s_sub_u32 s2, 0, s8 13646; GFX90A-NEXT: s_subb_u32 s3, 0, s9 13647; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 13648; GFX90A-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 13649; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 13650; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 13651; GFX90A-NEXT: s_ashr_i32 s10, s7, 31 13652; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 
13653; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 13654; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 13655; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 13656; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 13657; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 13658; GFX90A-NEXT: s_mov_b32 s11, s10 13659; GFX90A-NEXT: v_mul_lo_u32 v3, s2, v1 13660; GFX90A-NEXT: v_mul_hi_u32 v5, s2, v0 13661; GFX90A-NEXT: v_mul_lo_u32 v4, s3, v0 13662; GFX90A-NEXT: v_add_u32_e32 v3, v5, v3 13663; GFX90A-NEXT: v_mul_lo_u32 v6, s2, v0 13664; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 13665; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 13666; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 13667; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 13668; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 13669; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc 13670; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 13671; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 13672; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 13673; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 13674; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v8, vcc 13675; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc 13676; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 13677; GFX90A-NEXT: v_mov_b32_e32 v6, 0 13678; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 13679; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v5, vcc 13680; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v3 13681; GFX90A-NEXT: v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1] 13682; GFX90A-NEXT: v_mul_lo_u32 v5, s2, v3 13683; GFX90A-NEXT: v_mul_hi_u32 v7, s2, v0 13684; GFX90A-NEXT: v_add_u32_e32 v5, v7, v5 13685; GFX90A-NEXT: v_mul_lo_u32 v7, s3, v0 13686; GFX90A-NEXT: v_add_u32_e32 v5, v5, v7 13687; GFX90A-NEXT: v_mul_lo_u32 v8, s2, v0 13688; GFX90A-NEXT: v_mul_hi_u32 v9, v3, v8 13689; GFX90A-NEXT: v_mul_lo_u32 v10, v3, v8 13690; GFX90A-NEXT: v_mul_lo_u32 v12, v0, v5 13691; GFX90A-NEXT: v_mul_hi_u32 v8, v0, v8 13692; GFX90A-NEXT: v_mul_hi_u32 v11, v0, v5 13693; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12 13694; GFX90A-NEXT: v_addc_co_u32_e32 v11, vcc, 0, 
v11, vcc 13695; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10 13696; GFX90A-NEXT: v_mul_hi_u32 v7, v3, v5 13697; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v9, vcc 13698; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v2, vcc 13699; GFX90A-NEXT: v_mul_lo_u32 v3, v3, v5 13700; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v8, v3 13701; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v6, v7, vcc 13702; GFX90A-NEXT: v_add_u32_e32 v1, v1, v4 13703; GFX90A-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1] 13704; GFX90A-NEXT: s_add_u32 s0, s6, s10 13705; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 13706; GFX90A-NEXT: s_addc_u32 s1, s7, s10 13707; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 13708; GFX90A-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] 13709; GFX90A-NEXT: v_mul_lo_u32 v4, s6, v1 13710; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v0 13711; GFX90A-NEXT: v_mul_hi_u32 v3, s6, v1 13712; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 13713; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 13714; GFX90A-NEXT: v_mul_hi_u32 v7, s7, v0 13715; GFX90A-NEXT: v_mul_lo_u32 v0, s7, v0 13716; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 13717; GFX90A-NEXT: v_mul_hi_u32 v5, s7, v1 13718; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v7, vcc 13719; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v2, vcc 13720; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 13721; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 13722; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v3, vcc 13723; GFX90A-NEXT: v_mul_lo_u32 v1, s8, v1 13724; GFX90A-NEXT: v_mul_hi_u32 v3, s8, v0 13725; GFX90A-NEXT: v_add_u32_e32 v1, v3, v1 13726; GFX90A-NEXT: v_mul_lo_u32 v3, s9, v0 13727; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 13728; GFX90A-NEXT: v_mul_lo_u32 v0, s8, v0 13729; GFX90A-NEXT: v_sub_u32_e32 v3, s7, v1 13730; GFX90A-NEXT: v_mov_b32_e32 v4, s9 13731; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 13732; GFX90A-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, vcc 13733; GFX90A-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s8, v0 13734; GFX90A-NEXT: 
v_subbrev_co_u32_e64 v6, s[2:3], 0, v3, s[0:1] 13735; GFX90A-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v6 13736; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 13737; GFX90A-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v5 13738; GFX90A-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1] 13739; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] 13740; GFX90A-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v6 13741; GFX90A-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s8, v5 13742; GFX90A-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] 13743; GFX90A-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] 13744; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 13745; GFX90A-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[0:1] 13746; GFX90A-NEXT: v_mov_b32_e32 v5, s7 13747; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v5, v1, vcc 13748; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 13749; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 13750; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 13751; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[0:1] 13752; GFX90A-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 13753; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s9, v1 13754; GFX90A-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 13755; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 13756; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 13757; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 13758; GFX90A-NEXT: v_xor_b32_e32 v0, s10, v0 13759; GFX90A-NEXT: v_xor_b32_e32 v1, s10, v1 13760; GFX90A-NEXT: v_mov_b32_e32 v3, s10 13761; GFX90A-NEXT: v_subrev_co_u32_e32 v0, vcc, s10, v0 13762; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc 13763; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 13764; GFX90A-NEXT: s_endpgm 13765 %shl.y = shl i64 4096, %y 13766 %r = srem i64 %x, %shl.y 13767 store i64 %r, i64 addrspace(1)* %out 13768 ret void 13769} 13770 ; Test case for srem of a <2 x i64> kernel argument by the constant splat <i64 4096, i64 4096>. The opt assertions below expect the vector srem to be split into two scalar `srem i64 ..., 4096` operations that are re-inserted into a vector before the store. The llc assertions for tahiti, gfx900 and gfx90a expect each 64-bit element to be lowered to a scalar ashr / lshr-by-20 / add-with-carry / and-with-0xf000 / sub-with-borrow sequence, with no reciprocal-based division expansion. These assertions are autogenerated (see the NOTE lines at the top of the file) and should be regenerated with the update scripts rather than edited by hand. 13771define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 13772; CHECK-LABEL: @srem_v2i64_pow2k_denom( 13773; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 13774; 
CHECK-NEXT: [[TMP2:%.*]] = srem i64 [[TMP1]], 4096 13775; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 13776; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 13777; CHECK-NEXT: [[TMP5:%.*]] = srem i64 [[TMP4]], 4096 13778; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 13779; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 13780; CHECK-NEXT: ret void 13781; 13782; GFX6-LABEL: srem_v2i64_pow2k_denom: 13783; GFX6: ; %bb.0: 13784; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 13785; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 13786; GFX6-NEXT: s_movk_i32 s8, 0xf000 13787; GFX6-NEXT: s_mov_b32 s7, 0xf000 13788; GFX6-NEXT: s_mov_b32 s6, -1 13789; GFX6-NEXT: s_waitcnt lgkmcnt(0) 13790; GFX6-NEXT: s_ashr_i32 s9, s1, 31 13791; GFX6-NEXT: s_lshr_b32 s9, s9, 20 13792; GFX6-NEXT: s_add_u32 s9, s0, s9 13793; GFX6-NEXT: s_addc_u32 s10, s1, 0 13794; GFX6-NEXT: s_and_b32 s9, s9, s8 13795; GFX6-NEXT: s_sub_u32 s0, s0, s9 13796; GFX6-NEXT: s_subb_u32 s1, s1, s10 13797; GFX6-NEXT: s_ashr_i32 s9, s3, 31 13798; GFX6-NEXT: s_lshr_b32 s9, s9, 20 13799; GFX6-NEXT: s_add_u32 s9, s2, s9 13800; GFX6-NEXT: s_addc_u32 s10, s3, 0 13801; GFX6-NEXT: s_and_b32 s8, s9, s8 13802; GFX6-NEXT: s_sub_u32 s2, s2, s8 13803; GFX6-NEXT: s_subb_u32 s3, s3, s10 13804; GFX6-NEXT: v_mov_b32_e32 v0, s0 13805; GFX6-NEXT: v_mov_b32_e32 v1, s1 13806; GFX6-NEXT: v_mov_b32_e32 v2, s2 13807; GFX6-NEXT: v_mov_b32_e32 v3, s3 13808; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 13809; GFX6-NEXT: s_endpgm 13810; 13811; GFX9-LABEL: srem_v2i64_pow2k_denom: 13812; GFX9: ; %bb.0: 13813; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 13814; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 13815; GFX9-NEXT: s_movk_i32 s8, 0xf000 13816; GFX9-NEXT: v_mov_b32_e32 v4, 0 13817; GFX9-NEXT: s_waitcnt lgkmcnt(0) 13818; GFX9-NEXT: s_ashr_i32 s0, s5, 31 13819; GFX9-NEXT: s_lshr_b32 s0, s0, 20 13820; GFX9-NEXT: 
s_add_u32 s0, s4, s0 13821; GFX9-NEXT: s_addc_u32 s1, s5, 0 13822; GFX9-NEXT: s_and_b32 s0, s0, s8 13823; GFX9-NEXT: s_sub_u32 s0, s4, s0 13824; GFX9-NEXT: s_subb_u32 s1, s5, s1 13825; GFX9-NEXT: s_ashr_i32 s4, s7, 31 13826; GFX9-NEXT: s_lshr_b32 s4, s4, 20 13827; GFX9-NEXT: s_add_u32 s4, s6, s4 13828; GFX9-NEXT: s_addc_u32 s5, s7, 0 13829; GFX9-NEXT: s_and_b32 s4, s4, s8 13830; GFX9-NEXT: s_sub_u32 s4, s6, s4 13831; GFX9-NEXT: s_subb_u32 s5, s7, s5 13832; GFX9-NEXT: v_mov_b32_e32 v0, s0 13833; GFX9-NEXT: v_mov_b32_e32 v1, s1 13834; GFX9-NEXT: v_mov_b32_e32 v2, s4 13835; GFX9-NEXT: v_mov_b32_e32 v3, s5 13836; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 13837; GFX9-NEXT: s_endpgm 13838; 13839; GFX90A-LABEL: srem_v2i64_pow2k_denom: 13840; GFX90A: ; %bb.0: 13841; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 13842; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 13843; GFX90A-NEXT: s_movk_i32 s8, 0xf000 13844; GFX90A-NEXT: v_mov_b32_e32 v4, 0 13845; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 13846; GFX90A-NEXT: s_ashr_i32 s0, s5, 31 13847; GFX90A-NEXT: s_lshr_b32 s0, s0, 20 13848; GFX90A-NEXT: s_add_u32 s0, s4, s0 13849; GFX90A-NEXT: s_addc_u32 s1, s5, 0 13850; GFX90A-NEXT: s_and_b32 s0, s0, s8 13851; GFX90A-NEXT: s_sub_u32 s0, s4, s0 13852; GFX90A-NEXT: s_subb_u32 s1, s5, s1 13853; GFX90A-NEXT: s_ashr_i32 s4, s7, 31 13854; GFX90A-NEXT: s_lshr_b32 s4, s4, 20 13855; GFX90A-NEXT: s_add_u32 s4, s6, s4 13856; GFX90A-NEXT: s_addc_u32 s5, s7, 0 13857; GFX90A-NEXT: s_and_b32 s4, s4, s8 13858; GFX90A-NEXT: s_sub_u32 s4, s6, s4 13859; GFX90A-NEXT: s_subb_u32 s5, s7, s5 13860; GFX90A-NEXT: v_mov_b32_e32 v0, s0 13861; GFX90A-NEXT: v_mov_b32_e32 v1, s1 13862; GFX90A-NEXT: v_mov_b32_e32 v2, s4 13863; GFX90A-NEXT: v_mov_b32_e32 v3, s5 13864; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 13865; GFX90A-NEXT: s_endpgm 13866 %r = srem <2 x i64> %x, <i64 4096, i64 4096> 13867 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 13868 ret void 13869} 13870 13871define amdgpu_kernel 
void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 13872; CHECK-LABEL: @srem_v2i64_pow2_shl_denom( 13873; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 13874; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 13875; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 13876; CHECK-NEXT: [[TMP3:%.*]] = srem i64 [[TMP1]], [[TMP2]] 13877; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 13878; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 13879; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 13880; CHECK-NEXT: [[TMP7:%.*]] = srem i64 [[TMP5]], [[TMP6]] 13881; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 13882; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 13883; CHECK-NEXT: ret void 13884; 13885; GFX6-LABEL: srem_v2i64_pow2_shl_denom: 13886; GFX6: ; %bb.0: 13887; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11 13888; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000 13889; GFX6-NEXT: s_mov_b32 s18, 0x4f800000 13890; GFX6-NEXT: s_mov_b32 s19, 0x5f7ffffc 13891; GFX6-NEXT: s_mov_b32 s20, 0x2f800000 13892; GFX6-NEXT: s_waitcnt lgkmcnt(0) 13893; GFX6-NEXT: s_lshl_b64 s[14:15], s[2:3], s6 13894; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 13895; GFX6-NEXT: s_ashr_i32 s4, s3, 31 13896; GFX6-NEXT: s_add_u32 s2, s2, s4 13897; GFX6-NEXT: s_mov_b32 s5, s4 13898; GFX6-NEXT: s_addc_u32 s3, s3, s4 13899; GFX6-NEXT: s_xor_b64 s[16:17], s[2:3], s[4:5] 13900; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s16 13901; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s17 13902; GFX6-NEXT: s_mov_b32 s21, 0xcf800000 13903; GFX6-NEXT: s_sub_u32 s6, 0, s16 13904; GFX6-NEXT: s_subb_u32 s7, 0, s17 13905; GFX6-NEXT: v_mac_f32_e32 v0, s18, v1 13906; GFX6-NEXT: v_rcp_f32_e32 v0, v0 13907; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 13908; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 13909; 
GFX6-NEXT: v_mul_f32_e32 v0, s19, v0 13910; GFX6-NEXT: v_mul_f32_e32 v1, s20, v0 13911; GFX6-NEXT: v_trunc_f32_e32 v1, v1 13912; GFX6-NEXT: v_mac_f32_e32 v0, s21, v1 13913; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v1 13914; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v0 13915; GFX6-NEXT: s_waitcnt lgkmcnt(0) 13916; GFX6-NEXT: s_ashr_i32 s12, s9, 31 13917; GFX6-NEXT: s_add_u32 s0, s8, s12 13918; GFX6-NEXT: v_mul_lo_u32 v0, s6, v2 13919; GFX6-NEXT: v_mul_hi_u32 v1, s6, v3 13920; GFX6-NEXT: v_mul_lo_u32 v4, s7, v3 13921; GFX6-NEXT: v_mul_lo_u32 v5, s6, v3 13922; GFX6-NEXT: s_mov_b32 s13, s12 13923; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 13924; GFX6-NEXT: v_add_i32_e32 v1, vcc, v0, v4 13925; GFX6-NEXT: v_mul_lo_u32 v0, v3, v1 13926; GFX6-NEXT: v_mul_hi_u32 v4, v3, v5 13927; GFX6-NEXT: v_mul_hi_u32 v6, v3, v1 13928; GFX6-NEXT: v_mul_hi_u32 v7, v2, v1 13929; GFX6-NEXT: s_addc_u32 s1, s9, s12 13930; GFX6-NEXT: v_add_i32_e32 v0, vcc, v4, v0 13931; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc 13932; GFX6-NEXT: v_mul_lo_u32 v6, v2, v5 13933; GFX6-NEXT: v_mul_hi_u32 v5, v2, v5 13934; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[12:13] 13935; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v6 13936; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc 13937; GFX6-NEXT: v_mul_lo_u32 v5, v2, v1 13938; GFX6-NEXT: v_mov_b32_e32 v0, 0 13939; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v7, v0, vcc 13940; GFX6-NEXT: v_mov_b32_e32 v1, 0 13941; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 13942; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v1, v6, vcc 13943; GFX6-NEXT: v_add_i32_e64 v3, s[2:3], v3, v4 13944; GFX6-NEXT: v_addc_u32_e64 v4, vcc, v2, v5, s[2:3] 13945; GFX6-NEXT: v_mul_lo_u32 v6, s6, v4 13946; GFX6-NEXT: v_mul_hi_u32 v7, s6, v3 13947; GFX6-NEXT: v_mul_lo_u32 v8, s7, v3 13948; GFX6-NEXT: s_mov_b32 s7, 0xf000 13949; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 13950; GFX6-NEXT: v_mul_lo_u32 v7, s6, v3 13951; GFX6-NEXT: v_add_i32_e32 v6, vcc, v8, v6 13952; GFX6-NEXT: v_mul_lo_u32 v10, v3, v6 13953; GFX6-NEXT: v_mul_hi_u32 v11, v3, v7 13954; 
GFX6-NEXT: v_mul_hi_u32 v12, v3, v6 13955; GFX6-NEXT: v_mul_hi_u32 v9, v4, v7 13956; GFX6-NEXT: v_mul_lo_u32 v7, v4, v7 13957; GFX6-NEXT: v_mul_hi_u32 v8, v4, v6 13958; GFX6-NEXT: v_add_i32_e32 v10, vcc, v11, v10 13959; GFX6-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc 13960; GFX6-NEXT: v_mul_lo_u32 v4, v4, v6 13961; GFX6-NEXT: v_add_i32_e32 v7, vcc, v10, v7 13962; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc 13963; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v8, v0, vcc 13964; GFX6-NEXT: v_add_i32_e32 v4, vcc, v7, v4 13965; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v1, v6, vcc 13966; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 13967; GFX6-NEXT: v_addc_u32_e64 v2, vcc, v2, v6, s[2:3] 13968; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 13969; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc 13970; GFX6-NEXT: v_mul_lo_u32 v4, s8, v2 13971; GFX6-NEXT: v_mul_hi_u32 v5, s8, v3 13972; GFX6-NEXT: v_mul_hi_u32 v6, s8, v2 13973; GFX6-NEXT: v_mul_hi_u32 v7, s9, v2 13974; GFX6-NEXT: v_mul_lo_u32 v2, s9, v2 13975; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 13976; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 13977; GFX6-NEXT: v_mul_lo_u32 v6, s9, v3 13978; GFX6-NEXT: v_mul_hi_u32 v3, s9, v3 13979; GFX6-NEXT: s_mov_b32 s6, -1 13980; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 13981; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc 13982; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v7, v0, vcc 13983; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 13984; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v1, v4, vcc 13985; GFX6-NEXT: v_mul_lo_u32 v3, s16, v3 13986; GFX6-NEXT: v_mul_hi_u32 v4, s16, v2 13987; GFX6-NEXT: v_mul_lo_u32 v5, s17, v2 13988; GFX6-NEXT: v_mul_lo_u32 v2, s16, v2 13989; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 13990; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 13991; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s9, v3 13992; GFX6-NEXT: v_mov_b32_e32 v5, s17 13993; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s8, v2 13994; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc 13995; GFX6-NEXT: v_subrev_i32_e64 v6, s[0:1], s16, v2 13996; GFX6-NEXT: 
v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1] 13997; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s17, v7 13998; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] 13999; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s16, v6 14000; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, s[0:1] 14001; GFX6-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] 14002; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s17, v7 14003; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s16, v6 14004; GFX6-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[2:3] 14005; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] 14006; GFX6-NEXT: s_ashr_i32 s2, s15, 31 14007; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 14008; GFX6-NEXT: s_add_u32 s8, s14, s2 14009; GFX6-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[0:1] 14010; GFX6-NEXT: v_mov_b32_e32 v7, s9 14011; GFX6-NEXT: s_mov_b32 s3, s2 14012; GFX6-NEXT: s_addc_u32 s9, s15, s2 14013; GFX6-NEXT: s_xor_b64 s[8:9], s[8:9], s[2:3] 14014; GFX6-NEXT: v_cvt_f32_u32_e32 v8, s8 14015; GFX6-NEXT: v_cvt_f32_u32_e32 v9, s9 14016; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc 14017; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s17, v3 14018; GFX6-NEXT: v_mac_f32_e32 v8, s18, v9 14019; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 14020; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s16, v2 14021; GFX6-NEXT: v_rcp_f32_e32 v8, v8 14022; GFX6-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc 14023; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s17, v3 14024; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc 14025; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 14026; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 14027; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] 14028; GFX6-NEXT: v_mul_f32_e32 v5, s19, v8 14029; GFX6-NEXT: v_mul_f32_e32 v6, s20, v5 14030; GFX6-NEXT: v_trunc_f32_e32 v6, v6 14031; GFX6-NEXT: v_mac_f32_e32 v5, s21, v6 14032; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v5 14033; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v6 14034; GFX6-NEXT: s_sub_u32 s2, 0, s8 14035; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 14036; GFX6-NEXT: v_mul_hi_u32 v4, s2, v5 14037; GFX6-NEXT: v_mul_lo_u32 v7, s2, v6 
14038; GFX6-NEXT: s_subb_u32 s3, 0, s9 14039; GFX6-NEXT: v_mul_lo_u32 v8, s3, v5 14040; GFX6-NEXT: s_ashr_i32 s14, s11, 31 14041; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v7 14042; GFX6-NEXT: v_mul_lo_u32 v7, s2, v5 14043; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v8 14044; GFX6-NEXT: v_mul_lo_u32 v8, v5, v4 14045; GFX6-NEXT: v_mul_hi_u32 v9, v5, v7 14046; GFX6-NEXT: v_mul_hi_u32 v10, v5, v4 14047; GFX6-NEXT: v_mul_hi_u32 v11, v6, v4 14048; GFX6-NEXT: v_mul_lo_u32 v4, v6, v4 14049; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 14050; GFX6-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc 14051; GFX6-NEXT: v_mul_lo_u32 v10, v6, v7 14052; GFX6-NEXT: v_mul_hi_u32 v7, v6, v7 14053; GFX6-NEXT: s_mov_b32 s15, s14 14054; GFX6-NEXT: v_xor_b32_e32 v2, s12, v2 14055; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v10 14056; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc 14057; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v11, v0, vcc 14058; GFX6-NEXT: v_add_i32_e32 v4, vcc, v7, v4 14059; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v1, v8, vcc 14060; GFX6-NEXT: v_add_i32_e64 v4, s[0:1], v5, v4 14061; GFX6-NEXT: v_addc_u32_e64 v5, vcc, v6, v7, s[0:1] 14062; GFX6-NEXT: v_mul_lo_u32 v8, s2, v5 14063; GFX6-NEXT: v_mul_hi_u32 v9, s2, v4 14064; GFX6-NEXT: v_mul_lo_u32 v10, s3, v4 14065; GFX6-NEXT: v_xor_b32_e32 v3, s12, v3 14066; GFX6-NEXT: v_add_i32_e32 v8, vcc, v9, v8 14067; GFX6-NEXT: v_mul_lo_u32 v9, s2, v4 14068; GFX6-NEXT: v_add_i32_e32 v8, vcc, v10, v8 14069; GFX6-NEXT: v_mul_lo_u32 v12, v4, v8 14070; GFX6-NEXT: v_mul_hi_u32 v13, v4, v9 14071; GFX6-NEXT: v_mul_hi_u32 v14, v4, v8 14072; GFX6-NEXT: v_mul_hi_u32 v11, v5, v9 14073; GFX6-NEXT: v_mul_lo_u32 v9, v5, v9 14074; GFX6-NEXT: v_mul_hi_u32 v10, v5, v8 14075; GFX6-NEXT: v_add_i32_e32 v12, vcc, v13, v12 14076; GFX6-NEXT: v_addc_u32_e32 v13, vcc, 0, v14, vcc 14077; GFX6-NEXT: v_mul_lo_u32 v5, v5, v8 14078; GFX6-NEXT: v_add_i32_e32 v9, vcc, v12, v9 14079; GFX6-NEXT: v_addc_u32_e32 v9, vcc, v13, v11, vcc 14080; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v10, v0, vcc 14081; 
GFX6-NEXT: v_add_i32_e32 v5, vcc, v9, v5 14082; GFX6-NEXT: v_addc_u32_e32 v8, vcc, v1, v8, vcc 14083; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v7 14084; GFX6-NEXT: v_addc_u32_e64 v6, vcc, v6, v8, s[0:1] 14085; GFX6-NEXT: s_add_u32 s0, s10, s14 14086; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 14087; GFX6-NEXT: s_addc_u32 s1, s11, s14 14088; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 14089; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] 14090; GFX6-NEXT: v_mul_lo_u32 v6, s10, v5 14091; GFX6-NEXT: v_mul_hi_u32 v7, s10, v4 14092; GFX6-NEXT: v_mul_hi_u32 v9, s10, v5 14093; GFX6-NEXT: v_mul_hi_u32 v10, s11, v5 14094; GFX6-NEXT: v_mul_lo_u32 v5, s11, v5 14095; GFX6-NEXT: v_add_i32_e32 v6, vcc, v7, v6 14096; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v9, vcc 14097; GFX6-NEXT: v_mul_lo_u32 v9, s11, v4 14098; GFX6-NEXT: v_mul_hi_u32 v4, s11, v4 14099; GFX6-NEXT: v_mov_b32_e32 v8, s12 14100; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v9 14101; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v7, v4, vcc 14102; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v10, v0, vcc 14103; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 14104; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v1, v0, vcc 14105; GFX6-NEXT: v_mul_lo_u32 v5, s8, v0 14106; GFX6-NEXT: v_mul_hi_u32 v6, s8, v4 14107; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s12, v2 14108; GFX6-NEXT: v_mul_lo_u32 v2, s9, v4 14109; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v3, v8, vcc 14110; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v5 14111; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 14112; GFX6-NEXT: v_mul_lo_u32 v3, s8, v4 14113; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s11, v2 14114; GFX6-NEXT: v_mov_b32_e32 v5, s9 14115; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s10, v3 14116; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc 14117; GFX6-NEXT: v_subrev_i32_e64 v6, s[0:1], s8, v3 14118; GFX6-NEXT: v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1] 14119; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v7 14120; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] 14121; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v6 14122; GFX6-NEXT: 
v_subb_u32_e64 v4, s[0:1], v4, v5, s[0:1] 14123; GFX6-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] 14124; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v7 14125; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s8, v6 14126; GFX6-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[2:3] 14127; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] 14128; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 14129; GFX6-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[0:1] 14130; GFX6-NEXT: v_mov_b32_e32 v7, s11 14131; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v7, v2, vcc 14132; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 14133; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 14134; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 14135; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 14136; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s9, v2 14137; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc 14138; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 14139; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 14140; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] 14141; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 14142; GFX6-NEXT: v_xor_b32_e32 v3, s14, v3 14143; GFX6-NEXT: v_xor_b32_e32 v4, s14, v2 14144; GFX6-NEXT: v_mov_b32_e32 v5, s14 14145; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s14, v3 14146; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v4, v5, vcc 14147; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 14148; GFX6-NEXT: s_endpgm 14149; 14150; GFX9-LABEL: srem_v2i64_pow2_shl_denom: 14151; GFX9: ; %bb.0: 14152; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 14153; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 14154; GFX9-NEXT: s_mov_b32 s16, 0x4f800000 14155; GFX9-NEXT: s_mov_b32 s17, 0x5f7ffffc 14156; GFX9-NEXT: s_mov_b32 s18, 0x2f800000 14157; GFX9-NEXT: s_waitcnt lgkmcnt(0) 14158; GFX9-NEXT: s_lshl_b64 s[10:11], s[2:3], s6 14159; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 14160; GFX9-NEXT: s_ashr_i32 s4, s3, 31 14161; GFX9-NEXT: s_add_u32 s2, s2, s4 14162; GFX9-NEXT: s_mov_b32 s5, s4 14163; GFX9-NEXT: s_addc_u32 s3, s3, s4 14164; GFX9-NEXT: s_xor_b64 s[12:13], s[2:3], s[4:5] 14165; 
GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12 14166; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 14167; GFX9-NEXT: s_mov_b32 s19, 0xcf800000 14168; GFX9-NEXT: s_sub_u32 s8, 0, s12 14169; GFX9-NEXT: s_subb_u32 s4, 0, s13 14170; GFX9-NEXT: v_mac_f32_e32 v0, s16, v1 14171; GFX9-NEXT: v_rcp_f32_e32 v0, v0 14172; GFX9-NEXT: v_mul_f32_e32 v0, s17, v0 14173; GFX9-NEXT: v_mul_f32_e32 v1, s18, v0 14174; GFX9-NEXT: v_trunc_f32_e32 v1, v1 14175; GFX9-NEXT: v_mac_f32_e32 v0, s19, v1 14176; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v1 14177; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 14178; GFX9-NEXT: v_mul_lo_u32 v0, s8, v2 14179; GFX9-NEXT: v_mul_hi_u32 v1, s8, v3 14180; GFX9-NEXT: v_mul_lo_u32 v5, s4, v3 14181; GFX9-NEXT: v_mul_lo_u32 v4, s8, v3 14182; GFX9-NEXT: v_add_u32_e32 v0, v1, v0 14183; GFX9-NEXT: v_add_u32_e32 v5, v0, v5 14184; GFX9-NEXT: v_mul_hi_u32 v1, v3, v4 14185; GFX9-NEXT: v_mul_lo_u32 v6, v3, v5 14186; GFX9-NEXT: v_mul_hi_u32 v7, v3, v5 14187; GFX9-NEXT: v_mul_hi_u32 v8, v2, v5 14188; GFX9-NEXT: v_mul_lo_u32 v5, v2, v5 14189; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v6 14190; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc 14191; GFX9-NEXT: v_mul_lo_u32 v7, v2, v4 14192; GFX9-NEXT: v_mul_hi_u32 v4, v2, v4 14193; GFX9-NEXT: v_mov_b32_e32 v0, 0 14194; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v7 14195; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v4, vcc 14196; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v8, v0, vcc 14197; GFX9-NEXT: v_mov_b32_e32 v1, 0 14198; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 14199; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v6, vcc 14200; GFX9-NEXT: v_add_co_u32_e64 v3, s[2:3], v3, v4 14201; GFX9-NEXT: v_addc_co_u32_e64 v4, vcc, v2, v5, s[2:3] 14202; GFX9-NEXT: v_mul_lo_u32 v6, s8, v4 14203; GFX9-NEXT: v_mul_hi_u32 v7, s8, v3 14204; GFX9-NEXT: v_mul_lo_u32 v8, s4, v3 14205; GFX9-NEXT: v_mul_lo_u32 v9, s8, v3 14206; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 14207; GFX9-NEXT: v_add_u32_e32 v6, v7, v6 14208; GFX9-NEXT: v_add_u32_e32 v6, v6, v8 14209; GFX9-NEXT: 
v_mul_lo_u32 v10, v3, v6 14210; GFX9-NEXT: v_mul_hi_u32 v11, v3, v9 14211; GFX9-NEXT: v_mul_hi_u32 v12, v3, v6 14212; GFX9-NEXT: v_mul_hi_u32 v8, v4, v9 14213; GFX9-NEXT: v_mul_lo_u32 v9, v4, v9 14214; GFX9-NEXT: v_mul_hi_u32 v7, v4, v6 14215; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v11, v10 14216; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v12, vcc 14217; GFX9-NEXT: v_mul_lo_u32 v4, v4, v6 14218; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 14219; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v8, vcc 14220; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v0, vcc 14221; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 14222; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v1, v6, vcc 14223; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 14224; GFX9-NEXT: s_waitcnt lgkmcnt(0) 14225; GFX9-NEXT: s_ashr_i32 s8, s5, 31 14226; GFX9-NEXT: v_addc_co_u32_e64 v2, vcc, v2, v6, s[2:3] 14227; GFX9-NEXT: s_add_u32 s2, s4, s8 14228; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 14229; GFX9-NEXT: s_mov_b32 s9, s8 14230; GFX9-NEXT: s_addc_u32 s3, s5, s8 14231; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc 14232; GFX9-NEXT: s_xor_b64 s[14:15], s[2:3], s[8:9] 14233; GFX9-NEXT: v_mul_lo_u32 v4, s14, v2 14234; GFX9-NEXT: v_mul_hi_u32 v5, s14, v3 14235; GFX9-NEXT: v_mul_hi_u32 v6, s14, v2 14236; GFX9-NEXT: v_mul_hi_u32 v7, s15, v2 14237; GFX9-NEXT: v_mul_lo_u32 v2, s15, v2 14238; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 14239; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc 14240; GFX9-NEXT: v_mul_lo_u32 v6, s15, v3 14241; GFX9-NEXT: v_mul_hi_u32 v3, s15, v3 14242; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 14243; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 14244; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc 14245; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v0, vcc 14246; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 14247; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v4, vcc 14248; GFX9-NEXT: v_mul_lo_u32 v3, s12, v3 14249; GFX9-NEXT: v_mul_hi_u32 v4, s12, v2 14250; GFX9-NEXT: v_mul_lo_u32 v5, s13, v2 14251; 
GFX9-NEXT: v_mul_lo_u32 v2, s12, v2 14252; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 14253; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 14254; GFX9-NEXT: v_sub_u32_e32 v4, s15, v3 14255; GFX9-NEXT: v_mov_b32_e32 v5, s13 14256; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s14, v2 14257; GFX9-NEXT: v_subb_co_u32_e64 v4, s[0:1], v4, v5, vcc 14258; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s12, v2 14259; GFX9-NEXT: v_subbrev_co_u32_e64 v7, s[2:3], 0, v4, s[0:1] 14260; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s13, v7 14261; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] 14262; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v6 14263; GFX9-NEXT: v_subb_co_u32_e64 v4, s[0:1], v4, v5, s[0:1] 14264; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] 14265; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s13, v7 14266; GFX9-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s12, v6 14267; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[2:3] 14268; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1] 14269; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 14270; GFX9-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[0:1] 14271; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[0:1] 14272; GFX9-NEXT: s_ashr_i32 s0, s11, 31 14273; GFX9-NEXT: s_add_u32 s2, s10, s0 14274; GFX9-NEXT: s_mov_b32 s1, s0 14275; GFX9-NEXT: s_addc_u32 s3, s11, s0 14276; GFX9-NEXT: v_mov_b32_e32 v6, s15 14277; GFX9-NEXT: s_xor_b64 s[10:11], s[2:3], s[0:1] 14278; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v6, v3, vcc 14279; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s10 14280; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s11 14281; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s13, v3 14282; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 14283; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v2 14284; GFX9-NEXT: v_mac_f32_e32 v6, s16, v7 14285; GFX9-NEXT: v_rcp_f32_e32 v6, v6 14286; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc 14287; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s13, v3 14288; GFX9-NEXT: v_cndmask_b32_e32 v7, v8, v9, vcc 14289; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 14290; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 14291; GFX9-NEXT: 
v_mul_f32_e32 v5, s17, v6 14292; GFX9-NEXT: v_mul_f32_e32 v6, s18, v5 14293; GFX9-NEXT: v_trunc_f32_e32 v6, v6 14294; GFX9-NEXT: v_mac_f32_e32 v5, s19, v6 14295; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 14296; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v6 14297; GFX9-NEXT: s_sub_u32 s2, 0, s10 14298; GFX9-NEXT: s_subb_u32 s3, 0, s11 14299; GFX9-NEXT: v_mul_hi_u32 v7, s2, v5 14300; GFX9-NEXT: v_mul_lo_u32 v8, s2, v6 14301; GFX9-NEXT: v_mul_lo_u32 v9, s3, v5 14302; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 14303; GFX9-NEXT: v_mul_lo_u32 v4, s2, v5 14304; GFX9-NEXT: v_add_u32_e32 v7, v7, v8 14305; GFX9-NEXT: v_add_u32_e32 v7, v7, v9 14306; GFX9-NEXT: v_mul_lo_u32 v8, v5, v7 14307; GFX9-NEXT: v_mul_hi_u32 v9, v5, v4 14308; GFX9-NEXT: v_mul_hi_u32 v10, v5, v7 14309; GFX9-NEXT: v_mul_hi_u32 v11, v6, v7 14310; GFX9-NEXT: v_mul_lo_u32 v7, v6, v7 14311; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 14312; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v10, vcc 14313; GFX9-NEXT: v_mul_lo_u32 v10, v6, v4 14314; GFX9-NEXT: v_mul_hi_u32 v4, v6, v4 14315; GFX9-NEXT: s_ashr_i32 s12, s7, 31 14316; GFX9-NEXT: s_mov_b32 s13, s12 14317; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10 14318; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v9, v4, vcc 14319; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v0, vcc 14320; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 14321; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v1, v8, vcc 14322; GFX9-NEXT: v_add_co_u32_e64 v4, s[0:1], v5, v4 14323; GFX9-NEXT: v_addc_co_u32_e64 v5, vcc, v6, v7, s[0:1] 14324; GFX9-NEXT: v_mul_lo_u32 v8, s2, v5 14325; GFX9-NEXT: v_mul_hi_u32 v9, s2, v4 14326; GFX9-NEXT: v_mul_lo_u32 v10, s3, v4 14327; GFX9-NEXT: v_mul_lo_u32 v11, s2, v4 14328; GFX9-NEXT: v_add_u32_e32 v6, v6, v7 14329; GFX9-NEXT: v_add_u32_e32 v8, v9, v8 14330; GFX9-NEXT: v_add_u32_e32 v8, v8, v10 14331; GFX9-NEXT: v_mul_lo_u32 v12, v4, v8 14332; GFX9-NEXT: v_mul_hi_u32 v13, v4, v11 14333; GFX9-NEXT: v_mul_hi_u32 v14, v4, v8 14334; GFX9-NEXT: v_mul_hi_u32 v10, v5, v11 14335; GFX9-NEXT: 
v_mul_lo_u32 v11, v5, v11 14336; GFX9-NEXT: v_mul_hi_u32 v9, v5, v8 14337; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v13, v12 14338; GFX9-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v14, vcc 14339; GFX9-NEXT: v_mul_lo_u32 v5, v5, v8 14340; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v12, v11 14341; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v13, v10, vcc 14342; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v9, v0, vcc 14343; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v10, v5 14344; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v1, v8, vcc 14345; GFX9-NEXT: v_addc_co_u32_e64 v6, vcc, v6, v8, s[0:1] 14346; GFX9-NEXT: s_add_u32 s0, s6, s12 14347; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 14348; GFX9-NEXT: s_addc_u32 s1, s7, s12 14349; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc 14350; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13] 14351; GFX9-NEXT: v_mul_lo_u32 v6, s6, v5 14352; GFX9-NEXT: v_mul_hi_u32 v7, s6, v4 14353; GFX9-NEXT: v_mul_hi_u32 v9, s6, v5 14354; GFX9-NEXT: v_mul_hi_u32 v10, s7, v5 14355; GFX9-NEXT: v_mul_lo_u32 v5, s7, v5 14356; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 14357; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v9, vcc 14358; GFX9-NEXT: v_mul_lo_u32 v9, s7, v4 14359; GFX9-NEXT: v_mul_hi_u32 v4, s7, v4 14360; GFX9-NEXT: v_xor_b32_e32 v2, s8, v2 14361; GFX9-NEXT: v_xor_b32_e32 v3, s8, v3 14362; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v9 14363; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc 14364; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v10, v0, vcc 14365; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 14366; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v6, vcc 14367; GFX9-NEXT: v_mul_lo_u32 v5, s10, v1 14368; GFX9-NEXT: v_mul_hi_u32 v6, s10, v4 14369; GFX9-NEXT: v_mul_lo_u32 v7, s11, v4 14370; GFX9-NEXT: v_mul_lo_u32 v4, s10, v4 14371; GFX9-NEXT: v_mov_b32_e32 v8, s8 14372; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s8, v2 14373; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v3, v8, vcc 14374; GFX9-NEXT: v_add_u32_e32 v3, v6, v5 14375; GFX9-NEXT: v_add_u32_e32 v3, v3, v7 14376; GFX9-NEXT: 
v_sub_u32_e32 v5, s7, v3 14377; GFX9-NEXT: v_mov_b32_e32 v6, s11 14378; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, s6, v4 14379; GFX9-NEXT: v_subb_co_u32_e64 v5, s[0:1], v5, v6, vcc 14380; GFX9-NEXT: v_subrev_co_u32_e64 v7, s[0:1], s10, v4 14381; GFX9-NEXT: v_subbrev_co_u32_e64 v8, s[2:3], 0, v5, s[0:1] 14382; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s11, v8 14383; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] 14384; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s10, v7 14385; GFX9-NEXT: v_subb_co_u32_e64 v5, s[0:1], v5, v6, s[0:1] 14386; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[2:3] 14387; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s11, v8 14388; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s10, v7 14389; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[2:3] 14390; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1] 14391; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 14392; GFX9-NEXT: v_cndmask_b32_e64 v6, v7, v6, s[0:1] 14393; GFX9-NEXT: v_mov_b32_e32 v7, s7 14394; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc 14395; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 14396; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 14397; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v4 14398; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[0:1] 14399; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 14400; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s11, v3 14401; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc 14402; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 14403; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc 14404; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 14405; GFX9-NEXT: v_xor_b32_e32 v4, s12, v4 14406; GFX9-NEXT: v_xor_b32_e32 v5, s12, v3 14407; GFX9-NEXT: v_mov_b32_e32 v6, s12 14408; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s12, v4 14409; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v5, v6, vcc 14410; GFX9-NEXT: s_waitcnt lgkmcnt(0) 14411; GFX9-NEXT: global_store_dwordx4 v0, v[1:4], s[4:5] 14412; GFX9-NEXT: s_endpgm 14413; 14414; GFX90A-LABEL: srem_v2i64_pow2_shl_denom: 14415; GFX90A: ; %bb.0: 14416; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 
0x44 14417; GFX90A-NEXT: s_mov_b64 s[2:3], 0x1000 14418; GFX90A-NEXT: s_mov_b32 s16, 0x4f800000 14419; GFX90A-NEXT: s_mov_b32 s17, 0x5f7ffffc 14420; GFX90A-NEXT: s_mov_b32 s18, 0x2f800000 14421; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 14422; GFX90A-NEXT: s_lshl_b64 s[10:11], s[2:3], s6 14423; GFX90A-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 14424; GFX90A-NEXT: s_ashr_i32 s4, s3, 31 14425; GFX90A-NEXT: s_add_u32 s2, s2, s4 14426; GFX90A-NEXT: s_mov_b32 s5, s4 14427; GFX90A-NEXT: s_addc_u32 s3, s3, s4 14428; GFX90A-NEXT: s_xor_b64 s[12:13], s[2:3], s[4:5] 14429; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s12 14430; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s13 14431; GFX90A-NEXT: s_mov_b32 s19, 0xcf800000 14432; GFX90A-NEXT: s_sub_u32 s2, 0, s12 14433; GFX90A-NEXT: s_subb_u32 s3, 0, s13 14434; GFX90A-NEXT: v_mac_f32_e32 v0, s16, v1 14435; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 14436; GFX90A-NEXT: v_mov_b32_e32 v4, 0 14437; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 14438; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 14439; GFX90A-NEXT: v_mul_f32_e32 v0, s17, v0 14440; GFX90A-NEXT: v_mul_f32_e32 v1, s18, v0 14441; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 14442; GFX90A-NEXT: v_mac_f32_e32 v0, s19, v1 14443; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 14444; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 14445; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 14446; GFX90A-NEXT: s_ashr_i32 s14, s5, 31 14447; GFX90A-NEXT: s_mov_b32 s15, s14 14448; GFX90A-NEXT: v_mul_hi_u32 v3, s2, v0 14449; GFX90A-NEXT: v_mul_lo_u32 v5, s2, v1 14450; GFX90A-NEXT: v_mul_lo_u32 v2, s3, v0 14451; GFX90A-NEXT: v_add_u32_e32 v3, v3, v5 14452; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 14453; GFX90A-NEXT: v_mul_lo_u32 v6, s2, v0 14454; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v2 14455; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 14456; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v2 14457; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 14458; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 14459; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 14460; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 
14461; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 14462; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v2 14463; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v8, vcc 14464; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v4, vcc 14465; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v2 14466; GFX90A-NEXT: v_mov_b32_e32 v2, 0 14467; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 14468; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v2, v5, vcc 14469; GFX90A-NEXT: v_add_co_u32_e64 v0, s[0:1], v0, v3 14470; GFX90A-NEXT: v_addc_co_u32_e64 v3, vcc, v1, v5, s[0:1] 14471; GFX90A-NEXT: v_mul_lo_u32 v6, s2, v3 14472; GFX90A-NEXT: v_mul_hi_u32 v7, s2, v0 14473; GFX90A-NEXT: v_add_u32_e32 v6, v7, v6 14474; GFX90A-NEXT: v_mul_lo_u32 v7, s3, v0 14475; GFX90A-NEXT: v_add_u32_e32 v6, v6, v7 14476; GFX90A-NEXT: v_mul_lo_u32 v8, s2, v0 14477; GFX90A-NEXT: v_mul_hi_u32 v9, v3, v8 14478; GFX90A-NEXT: v_mul_lo_u32 v10, v3, v8 14479; GFX90A-NEXT: v_mul_lo_u32 v12, v0, v6 14480; GFX90A-NEXT: v_mul_hi_u32 v8, v0, v8 14481; GFX90A-NEXT: v_mul_hi_u32 v11, v0, v6 14482; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12 14483; GFX90A-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v11, vcc 14484; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v10 14485; GFX90A-NEXT: v_mul_hi_u32 v7, v3, v6 14486; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v11, v9, vcc 14487; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v4, vcc 14488; GFX90A-NEXT: v_mul_lo_u32 v3, v3, v6 14489; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v8, v3 14490; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v2, v7, vcc 14491; GFX90A-NEXT: v_add_u32_e32 v1, v1, v5 14492; GFX90A-NEXT: v_addc_co_u32_e64 v1, vcc, v1, v6, s[0:1] 14493; GFX90A-NEXT: s_add_u32 s0, s4, s14 14494; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 14495; GFX90A-NEXT: s_addc_u32 s1, s5, s14 14496; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 14497; GFX90A-NEXT: s_xor_b64 s[4:5], s[0:1], s[14:15] 14498; GFX90A-NEXT: v_mul_lo_u32 v5, s4, v1 14499; GFX90A-NEXT: v_mul_hi_u32 v6, s4, v0 14500; GFX90A-NEXT: v_mul_hi_u32 v3, s4, v1 
14501; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 14502; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 14503; GFX90A-NEXT: v_mul_hi_u32 v7, s5, v0 14504; GFX90A-NEXT: v_mul_lo_u32 v0, s5, v0 14505; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0 14506; GFX90A-NEXT: v_mul_hi_u32 v6, s5, v1 14507; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v7, vcc 14508; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc 14509; GFX90A-NEXT: v_mul_lo_u32 v1, s5, v1 14510; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 14511; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v3, vcc 14512; GFX90A-NEXT: v_mul_lo_u32 v1, s12, v1 14513; GFX90A-NEXT: v_mul_hi_u32 v3, s12, v0 14514; GFX90A-NEXT: v_add_u32_e32 v1, v3, v1 14515; GFX90A-NEXT: v_mul_lo_u32 v3, s13, v0 14516; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 14517; GFX90A-NEXT: v_mul_lo_u32 v0, s12, v0 14518; GFX90A-NEXT: v_sub_u32_e32 v3, s5, v1 14519; GFX90A-NEXT: v_mov_b32_e32 v5, s13 14520; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0 14521; GFX90A-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v5, vcc 14522; GFX90A-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s12, v0 14523; GFX90A-NEXT: v_subbrev_co_u32_e64 v7, s[2:3], 0, v3, s[0:1] 14524; GFX90A-NEXT: v_cmp_le_u32_e64 s[2:3], s13, v7 14525; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] 14526; GFX90A-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v6 14527; GFX90A-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v5, s[0:1] 14528; GFX90A-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] 14529; GFX90A-NEXT: v_cmp_eq_u32_e64 s[2:3], s13, v7 14530; GFX90A-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s12, v6 14531; GFX90A-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[2:3] 14532; GFX90A-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] 14533; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 14534; GFX90A-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[0:1] 14535; GFX90A-NEXT: v_mov_b32_e32 v6, s5 14536; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v6, v1, vcc 14537; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s13, v1 14538; GFX90A-NEXT: v_cndmask_b32_e64 v3, v7, v3, 
s[0:1] 14539; GFX90A-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 14540; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 14541; GFX90A-NEXT: s_ashr_i32 s0, s11, 31 14542; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 14543; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s13, v1 14544; GFX90A-NEXT: s_add_u32 s2, s10, s0 14545; GFX90A-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc 14546; GFX90A-NEXT: s_mov_b32 s1, s0 14547; GFX90A-NEXT: s_addc_u32 s3, s11, s0 14548; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 14549; GFX90A-NEXT: s_xor_b64 s[4:5], s[2:3], s[0:1] 14550; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc 14551; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 14552; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, s4 14553; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s5 14554; GFX90A-NEXT: v_xor_b32_e32 v0, s14, v0 14555; GFX90A-NEXT: s_sub_u32 s2, 0, s4 14556; GFX90A-NEXT: v_xor_b32_e32 v1, s14, v1 14557; GFX90A-NEXT: v_mac_f32_e32 v3, s16, v5 14558; GFX90A-NEXT: v_rcp_f32_e32 v3, v3 14559; GFX90A-NEXT: v_mov_b32_e32 v6, s14 14560; GFX90A-NEXT: v_subrev_co_u32_e32 v0, vcc, s14, v0 14561; GFX90A-NEXT: v_mul_f32_e32 v3, s17, v3 14562; GFX90A-NEXT: v_mul_f32_e32 v5, s18, v3 14563; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 14564; GFX90A-NEXT: v_mac_f32_e32 v3, s19, v5 14565; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 14566; GFX90A-NEXT: v_cvt_u32_f32_e32 v5, v5 14567; GFX90A-NEXT: s_subb_u32 s3, 0, s5 14568; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc 14569; GFX90A-NEXT: v_mul_hi_u32 v7, s2, v3 14570; GFX90A-NEXT: v_mul_lo_u32 v8, s2, v5 14571; GFX90A-NEXT: v_mul_lo_u32 v6, s3, v3 14572; GFX90A-NEXT: v_add_u32_e32 v7, v7, v8 14573; GFX90A-NEXT: v_add_u32_e32 v6, v7, v6 14574; GFX90A-NEXT: v_mul_lo_u32 v9, s2, v3 14575; GFX90A-NEXT: v_mul_lo_u32 v8, v3, v6 14576; GFX90A-NEXT: v_mul_hi_u32 v10, v3, v9 14577; GFX90A-NEXT: v_mul_hi_u32 v7, v3, v6 14578; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v10, v8 14579; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc 14580; GFX90A-NEXT: v_mul_hi_u32 v11, v5, v9 14581; GFX90A-NEXT: 
v_mul_lo_u32 v9, v5, v9 14582; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, v8, v9 14583; GFX90A-NEXT: v_mul_hi_u32 v10, v5, v6 14584; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v11, vcc 14585; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v10, v4, vcc 14586; GFX90A-NEXT: v_mul_lo_u32 v6, v5, v6 14587; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 14588; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v2, v8, vcc 14589; GFX90A-NEXT: v_add_co_u32_e64 v3, s[0:1], v3, v6 14590; GFX90A-NEXT: v_addc_co_u32_e64 v6, vcc, v5, v7, s[0:1] 14591; GFX90A-NEXT: v_mul_lo_u32 v8, s2, v6 14592; GFX90A-NEXT: v_mul_hi_u32 v9, s2, v3 14593; GFX90A-NEXT: v_add_u32_e32 v8, v9, v8 14594; GFX90A-NEXT: v_mul_lo_u32 v9, s3, v3 14595; GFX90A-NEXT: v_add_u32_e32 v8, v8, v9 14596; GFX90A-NEXT: v_mul_lo_u32 v10, s2, v3 14597; GFX90A-NEXT: v_mul_hi_u32 v11, v6, v10 14598; GFX90A-NEXT: v_mul_lo_u32 v12, v6, v10 14599; GFX90A-NEXT: v_mul_lo_u32 v14, v3, v8 14600; GFX90A-NEXT: v_mul_hi_u32 v10, v3, v10 14601; GFX90A-NEXT: v_mul_hi_u32 v13, v3, v8 14602; GFX90A-NEXT: v_add_co_u32_e32 v10, vcc, v10, v14 14603; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v13, vcc 14604; GFX90A-NEXT: v_add_co_u32_e32 v10, vcc, v10, v12 14605; GFX90A-NEXT: v_mul_hi_u32 v9, v6, v8 14606; GFX90A-NEXT: v_addc_co_u32_e32 v10, vcc, v13, v11, vcc 14607; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v4, vcc 14608; GFX90A-NEXT: v_mul_lo_u32 v6, v6, v8 14609; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, v10, v6 14610; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, v2, v9, vcc 14611; GFX90A-NEXT: v_add_u32_e32 v5, v5, v7 14612; GFX90A-NEXT: s_ashr_i32 s10, s7, 31 14613; GFX90A-NEXT: v_addc_co_u32_e64 v5, vcc, v5, v8, s[0:1] 14614; GFX90A-NEXT: s_add_u32 s0, s6, s10 14615; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 14616; GFX90A-NEXT: s_mov_b32 s11, s10 14617; GFX90A-NEXT: s_addc_u32 s1, s7, s10 14618; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc 14619; GFX90A-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] 14620; GFX90A-NEXT: v_mul_lo_u32 v7, s6, v5 14621; 
GFX90A-NEXT: v_mul_hi_u32 v8, s6, v3 14622; GFX90A-NEXT: v_mul_hi_u32 v6, s6, v5 14623; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 14624; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc 14625; GFX90A-NEXT: v_mul_hi_u32 v9, s7, v3 14626; GFX90A-NEXT: v_mul_lo_u32 v3, s7, v3 14627; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v7, v3 14628; GFX90A-NEXT: v_mul_hi_u32 v8, s7, v5 14629; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v9, vcc 14630; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v8, v4, vcc 14631; GFX90A-NEXT: v_mul_lo_u32 v5, s7, v5 14632; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 14633; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v2, v6, vcc 14634; GFX90A-NEXT: v_mul_lo_u32 v2, s4, v2 14635; GFX90A-NEXT: v_mul_hi_u32 v5, s4, v3 14636; GFX90A-NEXT: v_add_u32_e32 v2, v5, v2 14637; GFX90A-NEXT: v_mul_lo_u32 v5, s5, v3 14638; GFX90A-NEXT: v_add_u32_e32 v2, v2, v5 14639; GFX90A-NEXT: v_mul_lo_u32 v3, s4, v3 14640; GFX90A-NEXT: v_sub_u32_e32 v5, s7, v2 14641; GFX90A-NEXT: v_mov_b32_e32 v6, s5 14642; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, s6, v3 14643; GFX90A-NEXT: v_subb_co_u32_e64 v5, s[0:1], v5, v6, vcc 14644; GFX90A-NEXT: v_subrev_co_u32_e64 v7, s[0:1], s4, v3 14645; GFX90A-NEXT: v_subbrev_co_u32_e64 v8, s[2:3], 0, v5, s[0:1] 14646; GFX90A-NEXT: v_cmp_le_u32_e64 s[2:3], s5, v8 14647; GFX90A-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] 14648; GFX90A-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v7 14649; GFX90A-NEXT: v_subb_co_u32_e64 v5, s[0:1], v5, v6, s[0:1] 14650; GFX90A-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[2:3] 14651; GFX90A-NEXT: v_cmp_eq_u32_e64 s[2:3], s5, v8 14652; GFX90A-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s4, v7 14653; GFX90A-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[2:3] 14654; GFX90A-NEXT: v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1] 14655; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 14656; GFX90A-NEXT: v_cndmask_b32_e64 v6, v7, v6, s[0:1] 14657; GFX90A-NEXT: v_mov_b32_e32 v7, s7 14658; GFX90A-NEXT: v_subb_co_u32_e32 v2, vcc, v7, v2, vcc 14659; GFX90A-NEXT: 
v_cmp_le_u32_e32 vcc, s5, v2 14660; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 14661; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 14662; GFX90A-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[0:1] 14663; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 14664; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v2 14665; GFX90A-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc 14666; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 14667; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 14668; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 14669; GFX90A-NEXT: v_xor_b32_e32 v3, s10, v3 14670; GFX90A-NEXT: v_xor_b32_e32 v5, s10, v2 14671; GFX90A-NEXT: v_mov_b32_e32 v6, s10 14672; GFX90A-NEXT: v_subrev_co_u32_e32 v2, vcc, s10, v3 14673; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v6, vcc 14674; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] 14675; GFX90A-NEXT: s_endpgm 14676 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 14677 %r = srem <2 x i64> %x, %shl.y 14678 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 14679 ret void 14680} 14681