1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --force-update 2; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare -amdgpu-bypass-slow-div=0 %s | FileCheck %s 3; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX6 %s 4; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX9 %s 5; RUN: llc -mtriple=amdgcn-- -mcpu=gfx90a -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX90A %s 6 7define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) { 8; CHECK-LABEL: @udiv_i32( 9; CHECK-NEXT: [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float 10; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]]) 11; CHECK-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000 12; CHECK-NEXT: [[TMP4:%.*]] = fptoui float [[TMP3]] to i32 13; CHECK-NEXT: [[TMP5:%.*]] = sub i32 0, [[Y]] 14; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]] 15; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP4]] to i64 16; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP6]] to i64 17; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]] 18; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 19; CHECK-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP9]], 32 20; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 21; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]] 22; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64 23; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP13]] to i64 24; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]] 25; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32 26; CHECK-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP16]], 32 27; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 28; CHECK-NEXT: [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]] 29; CHECK-NEXT: [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]] 30; CHECK-NEXT: [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]] 31; CHECK-NEXT: [[TMP23:%.*]] = add 
i32 [[TMP19]], 1 32; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP19]] 33; CHECK-NEXT: [[TMP25:%.*]] = sub i32 [[TMP21]], [[Y]] 34; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP22]], i32 [[TMP25]], i32 [[TMP21]] 35; CHECK-NEXT: [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[Y]] 36; CHECK-NEXT: [[TMP28:%.*]] = add i32 [[TMP24]], 1 37; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP24]] 38; CHECK-NEXT: store i32 [[TMP29]], i32 addrspace(1)* [[OUT:%.*]], align 4 39; CHECK-NEXT: ret void 40; 41; GFX6-LABEL: udiv_i32: 42; GFX6: ; %bb.0: 43; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 44; GFX6-NEXT: s_mov_b32 s7, 0xf000 45; GFX6-NEXT: s_mov_b32 s6, -1 46; GFX6-NEXT: s_waitcnt lgkmcnt(0) 47; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 48; GFX6-NEXT: s_sub_i32 s4, 0, s3 49; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 50; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 51; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 52; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 53; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 54; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 55; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 56; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 57; GFX6-NEXT: v_mul_lo_u32 v1, v0, s3 58; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 59; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s2, v1 60; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1 61; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 62; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 63; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] 64; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 65; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 66; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 67; GFX6-NEXT: s_waitcnt lgkmcnt(0) 68; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 69; GFX6-NEXT: s_endpgm 70; 71; GFX9-LABEL: udiv_i32: 72; GFX9: ; %bb.0: 73; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 74; GFX9-NEXT: v_mov_b32_e32 v2, 0 75; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 76; GFX9-NEXT: s_waitcnt lgkmcnt(0) 77; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 78; 
GFX9-NEXT: s_sub_i32 s4, 0, s3 79; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 80; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 81; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 82; GFX9-NEXT: v_mul_lo_u32 v1, s4, v0 83; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 84; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 85; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 86; GFX9-NEXT: v_mul_lo_u32 v1, v0, s3 87; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 88; GFX9-NEXT: v_sub_u32_e32 v1, s2, v1 89; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 90; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 91; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v1 92; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 93; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 94; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 95; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 96; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 97; GFX9-NEXT: s_endpgm 98; 99; GFX90A-LABEL: udiv_i32: 100; GFX90A: ; %bb.0: 101; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 102; GFX90A-NEXT: v_mov_b32_e32 v1, 0 103; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 104; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 105; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 106; GFX90A-NEXT: s_sub_i32 s4, 0, s3 107; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 108; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 109; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 110; GFX90A-NEXT: v_mul_lo_u32 v2, s4, v0 111; GFX90A-NEXT: v_mul_hi_u32 v2, v0, v2 112; GFX90A-NEXT: v_add_u32_e32 v0, v0, v2 113; GFX90A-NEXT: v_mul_hi_u32 v0, s2, v0 114; GFX90A-NEXT: v_mul_lo_u32 v2, v0, s3 115; GFX90A-NEXT: v_sub_u32_e32 v2, s2, v2 116; GFX90A-NEXT: v_add_u32_e32 v3, 1, v0 117; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 118; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 119; GFX90A-NEXT: v_subrev_u32_e32 v3, s3, v2 120; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc 121; GFX90A-NEXT: v_add_u32_e32 v3, 1, v0 122; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 123; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 124; GFX90A-NEXT: global_store_dword v1, v0, s[0:1] 125; GFX90A-NEXT: s_endpgm 
126 %r = udiv i32 %x, %y 127 store i32 %r, i32 addrspace(1)* %out 128 ret void 129} 130 131define amdgpu_kernel void @urem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) { 132; CHECK-LABEL: @urem_i32( 133; CHECK-NEXT: [[TMP1:%.*]] = uitofp i32 [[Y:%.*]] to float 134; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP1]]) 135; CHECK-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP2]], 0x41EFFFFFC0000000 136; CHECK-NEXT: [[TMP4:%.*]] = fptoui float [[TMP3]] to i32 137; CHECK-NEXT: [[TMP5:%.*]] = sub i32 0, [[Y]] 138; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], [[TMP4]] 139; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP4]] to i64 140; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP6]] to i64 141; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP7]], [[TMP8]] 142; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32 143; CHECK-NEXT: [[TMP11:%.*]] = lshr i64 [[TMP9]], 32 144; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 145; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP4]], [[TMP12]] 146; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[X:%.*]] to i64 147; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP13]] to i64 148; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]] 149; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32 150; CHECK-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP16]], 32 151; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 152; CHECK-NEXT: [[TMP20:%.*]] = mul i32 [[TMP19]], [[Y]] 153; CHECK-NEXT: [[TMP21:%.*]] = sub i32 [[X]], [[TMP20]] 154; CHECK-NEXT: [[TMP22:%.*]] = icmp uge i32 [[TMP21]], [[Y]] 155; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP21]], [[Y]] 156; CHECK-NEXT: [[TMP24:%.*]] = select i1 [[TMP22]], i32 [[TMP23]], i32 [[TMP21]] 157; CHECK-NEXT: [[TMP25:%.*]] = icmp uge i32 [[TMP24]], [[Y]] 158; CHECK-NEXT: [[TMP26:%.*]] = sub i32 [[TMP24]], [[Y]] 159; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP25]], i32 [[TMP26]], i32 [[TMP24]] 160; CHECK-NEXT: store i32 [[TMP27]], i32 addrspace(1)* [[OUT:%.*]], align 4 161; CHECK-NEXT: ret void 162; 163; 
GFX6-LABEL: urem_i32: 164; GFX6: ; %bb.0: 165; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 166; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 167; GFX6-NEXT: s_mov_b32 s3, 0xf000 168; GFX6-NEXT: s_waitcnt lgkmcnt(0) 169; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 170; GFX6-NEXT: s_sub_i32 s2, 0, s5 171; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 172; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 173; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 174; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 175; GFX6-NEXT: s_mov_b32 s2, -1 176; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 177; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 178; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 179; GFX6-NEXT: v_mul_lo_u32 v0, v0, s5 180; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 181; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s5, v0 182; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v0 183; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 184; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s5, v0 185; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v0 186; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 187; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 188; GFX6-NEXT: s_endpgm 189; 190; GFX9-LABEL: urem_i32: 191; GFX9: ; %bb.0: 192; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 193; GFX9-NEXT: s_waitcnt lgkmcnt(0) 194; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 195; GFX9-NEXT: s_sub_i32 s4, 0, s3 196; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 197; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 198; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 199; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 200; GFX9-NEXT: v_mul_lo_u32 v1, s4, v0 201; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 202; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 203; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 204; GFX9-NEXT: v_mov_b32_e32 v1, 0 205; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 206; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 207; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v0 208; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 209; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 210; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v0 211; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 212; 
GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 213; GFX9-NEXT: s_waitcnt lgkmcnt(0) 214; GFX9-NEXT: global_store_dword v1, v0, s[0:1] 215; GFX9-NEXT: s_endpgm 216; 217; GFX90A-LABEL: urem_i32: 218; GFX90A: ; %bb.0: 219; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 220; GFX90A-NEXT: v_mov_b32_e32 v1, 0 221; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 222; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 223; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 224; GFX90A-NEXT: s_sub_i32 s4, 0, s3 225; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 226; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 227; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 228; GFX90A-NEXT: v_mul_lo_u32 v2, s4, v0 229; GFX90A-NEXT: v_mul_hi_u32 v2, v0, v2 230; GFX90A-NEXT: v_add_u32_e32 v0, v0, v2 231; GFX90A-NEXT: v_mul_hi_u32 v0, s2, v0 232; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s3 233; GFX90A-NEXT: v_sub_u32_e32 v0, s2, v0 234; GFX90A-NEXT: v_subrev_u32_e32 v2, s3, v0 235; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 236; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 237; GFX90A-NEXT: v_subrev_u32_e32 v2, s3, v0 238; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 239; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 240; GFX90A-NEXT: global_store_dword v1, v0, s[0:1] 241; GFX90A-NEXT: s_endpgm 242 %r = urem i32 %x, %y 243 store i32 %r, i32 addrspace(1)* %out 244 ret void 245} 246 247define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) { 248; CHECK-LABEL: @sdiv_i32( 249; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31 250; CHECK-NEXT: [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31 251; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 252; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[X]], [[TMP1]] 253; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[Y]], [[TMP2]] 254; CHECK-NEXT: [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP1]] 255; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP2]] 256; CHECK-NEXT: [[TMP8:%.*]] = uitofp i32 [[TMP7]] to float 257; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP8]]) 258; 
CHECK-NEXT: [[TMP10:%.*]] = fmul fast float [[TMP9]], 0x41EFFFFFC0000000 259; CHECK-NEXT: [[TMP11:%.*]] = fptoui float [[TMP10]] to i32 260; CHECK-NEXT: [[TMP12:%.*]] = sub i32 0, [[TMP7]] 261; CHECK-NEXT: [[TMP13:%.*]] = mul i32 [[TMP12]], [[TMP11]] 262; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP11]] to i64 263; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP13]] to i64 264; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP14]], [[TMP15]] 265; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32 266; CHECK-NEXT: [[TMP18:%.*]] = lshr i64 [[TMP16]], 32 267; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 268; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP11]], [[TMP19]] 269; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP6]] to i64 270; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP20]] to i64 271; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP21]], [[TMP22]] 272; CHECK-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32 273; CHECK-NEXT: [[TMP25:%.*]] = lshr i64 [[TMP23]], 32 274; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32 275; CHECK-NEXT: [[TMP27:%.*]] = mul i32 [[TMP26]], [[TMP7]] 276; CHECK-NEXT: [[TMP28:%.*]] = sub i32 [[TMP6]], [[TMP27]] 277; CHECK-NEXT: [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP7]] 278; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP26]], 1 279; CHECK-NEXT: [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]] 280; CHECK-NEXT: [[TMP32:%.*]] = sub i32 [[TMP28]], [[TMP7]] 281; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP29]], i32 [[TMP32]], i32 [[TMP28]] 282; CHECK-NEXT: [[TMP34:%.*]] = icmp uge i32 [[TMP33]], [[TMP7]] 283; CHECK-NEXT: [[TMP35:%.*]] = add i32 [[TMP31]], 1 284; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP34]], i32 [[TMP35]], i32 [[TMP31]] 285; CHECK-NEXT: [[TMP37:%.*]] = xor i32 [[TMP36]], [[TMP3]] 286; CHECK-NEXT: [[TMP38:%.*]] = sub i32 [[TMP37]], [[TMP3]] 287; CHECK-NEXT: store i32 [[TMP38]], i32 addrspace(1)* [[OUT:%.*]], align 4 288; CHECK-NEXT: ret void 289; 290; GFX6-LABEL: sdiv_i32: 291; GFX6: ; %bb.0: 292; GFX6-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0xb 293; GFX6-NEXT: s_mov_b32 s7, 0xf000 294; GFX6-NEXT: s_mov_b32 s6, -1 295; GFX6-NEXT: s_waitcnt lgkmcnt(0) 296; GFX6-NEXT: s_ashr_i32 s8, s3, 31 297; GFX6-NEXT: s_add_i32 s3, s3, s8 298; GFX6-NEXT: s_xor_b32 s3, s3, s8 299; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 300; GFX6-NEXT: s_sub_i32 s4, 0, s3 301; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 302; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 303; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 304; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 305; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 306; GFX6-NEXT: s_ashr_i32 s0, s2, 31 307; GFX6-NEXT: s_add_i32 s1, s2, s0 308; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 309; GFX6-NEXT: s_xor_b32 s1, s1, s0 310; GFX6-NEXT: s_xor_b32 s2, s0, s8 311; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 312; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 313; GFX6-NEXT: v_mul_lo_u32 v1, v0, s3 314; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 315; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 316; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1 317; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 318; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 319; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] 320; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 321; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 322; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 323; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 324; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 325; GFX6-NEXT: s_waitcnt lgkmcnt(0) 326; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 327; GFX6-NEXT: s_endpgm 328; 329; GFX9-LABEL: sdiv_i32: 330; GFX9: ; %bb.0: 331; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 332; GFX9-NEXT: v_mov_b32_e32 v2, 0 333; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 334; GFX9-NEXT: s_waitcnt lgkmcnt(0) 335; GFX9-NEXT: s_ashr_i32 s4, s3, 31 336; GFX9-NEXT: s_add_i32 s3, s3, s4 337; GFX9-NEXT: s_xor_b32 s3, s3, s4 338; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 339; GFX9-NEXT: s_sub_i32 s5, 0, s3 340; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 341; GFX9-NEXT: 
v_mul_f32_e32 v0, 0x4f7ffffe, v0 342; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 343; GFX9-NEXT: v_mul_lo_u32 v1, s5, v0 344; GFX9-NEXT: s_ashr_i32 s5, s2, 31 345; GFX9-NEXT: s_add_i32 s2, s2, s5 346; GFX9-NEXT: s_xor_b32 s2, s2, s5 347; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 348; GFX9-NEXT: s_xor_b32 s4, s5, s4 349; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 350; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 351; GFX9-NEXT: v_mul_lo_u32 v1, v0, s3 352; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 353; GFX9-NEXT: v_sub_u32_e32 v1, s2, v1 354; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 355; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 356; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v1 357; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 358; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 359; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 360; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 361; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 362; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 363; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 364; GFX9-NEXT: s_endpgm 365; 366; GFX90A-LABEL: sdiv_i32: 367; GFX90A: ; %bb.0: 368; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 369; GFX90A-NEXT: v_mov_b32_e32 v1, 0 370; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 371; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 372; GFX90A-NEXT: s_ashr_i32 s4, s3, 31 373; GFX90A-NEXT: s_add_i32 s3, s3, s4 374; GFX90A-NEXT: s_xor_b32 s3, s3, s4 375; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 376; GFX90A-NEXT: s_ashr_i32 s5, s2, 31 377; GFX90A-NEXT: s_add_i32 s2, s2, s5 378; GFX90A-NEXT: s_xor_b32 s4, s5, s4 379; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 380; GFX90A-NEXT: s_xor_b32 s2, s2, s5 381; GFX90A-NEXT: s_sub_i32 s5, 0, s3 382; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 383; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 384; GFX90A-NEXT: v_mul_lo_u32 v2, s5, v0 385; GFX90A-NEXT: v_mul_hi_u32 v2, v0, v2 386; GFX90A-NEXT: v_add_u32_e32 v0, v0, v2 387; GFX90A-NEXT: v_mul_hi_u32 v0, s2, v0 388; GFX90A-NEXT: v_mul_lo_u32 v2, v0, s3 389; GFX90A-NEXT: v_sub_u32_e32 v2, s2, v2 390; GFX90A-NEXT: 
v_add_u32_e32 v3, 1, v0 391; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 392; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 393; GFX90A-NEXT: v_subrev_u32_e32 v3, s3, v2 394; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc 395; GFX90A-NEXT: v_add_u32_e32 v3, 1, v0 396; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 397; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 398; GFX90A-NEXT: v_xor_b32_e32 v0, s4, v0 399; GFX90A-NEXT: v_subrev_u32_e32 v0, s4, v0 400; GFX90A-NEXT: global_store_dword v1, v0, s[0:1] 401; GFX90A-NEXT: s_endpgm 402 %r = sdiv i32 %x, %y 403 store i32 %r, i32 addrspace(1)* %out 404 ret void 405} 406 407define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) { 408; CHECK-LABEL: @srem_i32( 409; CHECK-NEXT: [[TMP1:%.*]] = ashr i32 [[X:%.*]], 31 410; CHECK-NEXT: [[TMP2:%.*]] = ashr i32 [[Y:%.*]], 31 411; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[X]], [[TMP1]] 412; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[Y]], [[TMP2]] 413; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP1]] 414; CHECK-NEXT: [[TMP6:%.*]] = xor i32 [[TMP4]], [[TMP2]] 415; CHECK-NEXT: [[TMP7:%.*]] = uitofp i32 [[TMP6]] to float 416; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 417; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP8]], 0x41EFFFFFC0000000 418; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP9]] to i32 419; CHECK-NEXT: [[TMP11:%.*]] = sub i32 0, [[TMP6]] 420; CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], [[TMP10]] 421; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP10]] to i64 422; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP12]] to i64 423; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP13]], [[TMP14]] 424; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32 425; CHECK-NEXT: [[TMP17:%.*]] = lshr i64 [[TMP15]], 32 426; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 427; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP10]], [[TMP18]] 428; CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP5]] to i64 429; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP19]] to 
i64 430; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP20]], [[TMP21]] 431; CHECK-NEXT: [[TMP23:%.*]] = trunc i64 [[TMP22]] to i32 432; CHECK-NEXT: [[TMP24:%.*]] = lshr i64 [[TMP22]], 32 433; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32 434; CHECK-NEXT: [[TMP26:%.*]] = mul i32 [[TMP25]], [[TMP6]] 435; CHECK-NEXT: [[TMP27:%.*]] = sub i32 [[TMP5]], [[TMP26]] 436; CHECK-NEXT: [[TMP28:%.*]] = icmp uge i32 [[TMP27]], [[TMP6]] 437; CHECK-NEXT: [[TMP29:%.*]] = sub i32 [[TMP27]], [[TMP6]] 438; CHECK-NEXT: [[TMP30:%.*]] = select i1 [[TMP28]], i32 [[TMP29]], i32 [[TMP27]] 439; CHECK-NEXT: [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP6]] 440; CHECK-NEXT: [[TMP32:%.*]] = sub i32 [[TMP30]], [[TMP6]] 441; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP30]] 442; CHECK-NEXT: [[TMP34:%.*]] = xor i32 [[TMP33]], [[TMP1]] 443; CHECK-NEXT: [[TMP35:%.*]] = sub i32 [[TMP34]], [[TMP1]] 444; CHECK-NEXT: store i32 [[TMP35]], i32 addrspace(1)* [[OUT:%.*]], align 4 445; CHECK-NEXT: ret void 446; 447; GFX6-LABEL: srem_i32: 448; GFX6: ; %bb.0: 449; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 450; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 451; GFX6-NEXT: s_waitcnt lgkmcnt(0) 452; GFX6-NEXT: s_ashr_i32 s4, s3, 31 453; GFX6-NEXT: s_add_i32 s3, s3, s4 454; GFX6-NEXT: s_xor_b32 s4, s3, s4 455; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s4 456; GFX6-NEXT: s_sub_i32 s3, 0, s4 457; GFX6-NEXT: s_ashr_i32 s5, s2, 31 458; GFX6-NEXT: s_add_i32 s2, s2, s5 459; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 460; GFX6-NEXT: s_xor_b32 s6, s2, s5 461; GFX6-NEXT: s_mov_b32 s2, -1 462; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 463; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 464; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 465; GFX6-NEXT: s_mov_b32 s3, 0xf000 466; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 467; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 468; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 469; GFX6-NEXT: v_mul_lo_u32 v0, v0, s4 470; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 471; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, 
s4, v0 472; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 473; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 474; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0 475; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 476; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 477; GFX6-NEXT: v_xor_b32_e32 v0, s5, v0 478; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s5, v0 479; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 480; GFX6-NEXT: s_endpgm 481; 482; GFX9-LABEL: srem_i32: 483; GFX9: ; %bb.0: 484; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 485; GFX9-NEXT: s_waitcnt lgkmcnt(0) 486; GFX9-NEXT: s_ashr_i32 s4, s3, 31 487; GFX9-NEXT: s_add_i32 s3, s3, s4 488; GFX9-NEXT: s_xor_b32 s3, s3, s4 489; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 490; GFX9-NEXT: s_sub_i32 s4, 0, s3 491; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 492; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 493; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 494; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 495; GFX9-NEXT: v_mul_lo_u32 v1, s4, v0 496; GFX9-NEXT: s_ashr_i32 s4, s2, 31 497; GFX9-NEXT: s_add_i32 s2, s2, s4 498; GFX9-NEXT: s_xor_b32 s2, s2, s4 499; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 500; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 501; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 502; GFX9-NEXT: v_mov_b32_e32 v1, 0 503; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 504; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 505; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v0 506; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 507; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 508; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v0 509; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 510; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 511; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 512; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 513; GFX9-NEXT: s_waitcnt lgkmcnt(0) 514; GFX9-NEXT: global_store_dword v1, v0, s[0:1] 515; GFX9-NEXT: s_endpgm 516; 517; GFX90A-LABEL: srem_i32: 518; GFX90A: ; %bb.0: 519; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 520; GFX90A-NEXT: v_mov_b32_e32 v1, 0 521; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 
522; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 523; GFX90A-NEXT: s_ashr_i32 s4, s3, 31 524; GFX90A-NEXT: s_add_i32 s3, s3, s4 525; GFX90A-NEXT: s_xor_b32 s3, s3, s4 526; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 527; GFX90A-NEXT: s_sub_i32 s5, 0, s3 528; GFX90A-NEXT: s_ashr_i32 s4, s2, 31 529; GFX90A-NEXT: s_add_i32 s2, s2, s4 530; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 531; GFX90A-NEXT: s_xor_b32 s2, s2, s4 532; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 533; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 534; GFX90A-NEXT: v_mul_lo_u32 v2, s5, v0 535; GFX90A-NEXT: v_mul_hi_u32 v2, v0, v2 536; GFX90A-NEXT: v_add_u32_e32 v0, v0, v2 537; GFX90A-NEXT: v_mul_hi_u32 v0, s2, v0 538; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s3 539; GFX90A-NEXT: v_sub_u32_e32 v0, s2, v0 540; GFX90A-NEXT: v_subrev_u32_e32 v2, s3, v0 541; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 542; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 543; GFX90A-NEXT: v_subrev_u32_e32 v2, s3, v0 544; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 545; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 546; GFX90A-NEXT: v_xor_b32_e32 v0, s4, v0 547; GFX90A-NEXT: v_subrev_u32_e32 v0, s4, v0 548; GFX90A-NEXT: global_store_dword v1, v0, s[0:1] 549; GFX90A-NEXT: s_endpgm 550 %r = srem i32 %x, %y 551 store i32 %r, i32 addrspace(1)* %out 552 ret void 553} 554 555define amdgpu_kernel void @udiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) { 556; CHECK-LABEL: @udiv_i16( 557; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32 558; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32 559; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 560; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 561; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 562; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 563; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 564; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 565; CHECK-NEXT: [[TMP9:%.*]] = call fast float 
@llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 566; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 567; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 568; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 569; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 570; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 571; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 572; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 65535 573; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16 574; CHECK-NEXT: store i16 [[TMP17]], i16 addrspace(1)* [[OUT:%.*]], align 2 575; CHECK-NEXT: ret void 576; 577; GFX6-LABEL: udiv_i16: 578; GFX6: ; %bb.0: 579; GFX6-NEXT: s_load_dword s2, s[0:1], 0xb 580; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 581; GFX6-NEXT: s_waitcnt lgkmcnt(0) 582; GFX6-NEXT: s_lshr_b32 s3, s2, 16 583; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 584; GFX6-NEXT: s_and_b32 s2, s2, 0xffff 585; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s2 586; GFX6-NEXT: s_mov_b32 s3, 0xf000 587; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 588; GFX6-NEXT: s_mov_b32 s2, -1 589; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 590; GFX6-NEXT: v_trunc_f32_e32 v2, v2 591; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 592; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 593; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 594; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 595; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 596; GFX6-NEXT: s_endpgm 597; 598; GFX9-LABEL: udiv_i16: 599; GFX9: ; %bb.0: 600; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 601; GFX9-NEXT: v_mov_b32_e32 v3, 0 602; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 603; GFX9-NEXT: s_waitcnt lgkmcnt(0) 604; GFX9-NEXT: s_lshr_b32 s3, s2, 16 605; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 606; GFX9-NEXT: s_and_b32 s2, s2, 0xffff 607; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 608; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 609; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 610; 
GFX9-NEXT: v_trunc_f32_e32 v2, v2 611; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 612; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 613; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 614; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc 615; GFX9-NEXT: global_store_short v3, v0, s[0:1] 616; GFX9-NEXT: s_endpgm 617; 618; GFX90A-LABEL: udiv_i16: 619; GFX90A: ; %bb.0: 620; GFX90A-NEXT: s_load_dword s2, s[0:1], 0x2c 621; GFX90A-NEXT: v_mov_b32_e32 v3, 0 622; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 623; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 624; GFX90A-NEXT: s_lshr_b32 s3, s2, 16 625; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 626; GFX90A-NEXT: s_and_b32 s2, s2, 0xffff 627; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s2 628; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v0 629; GFX90A-NEXT: v_mul_f32_e32 v2, v1, v2 630; GFX90A-NEXT: v_trunc_f32_e32 v2, v2 631; GFX90A-NEXT: v_cvt_u32_f32_e32 v4, v2 632; GFX90A-NEXT: v_mad_f32 v1, -v2, v0, v1 633; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 634; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc 635; GFX90A-NEXT: global_store_short v3, v0, s[0:1] 636; GFX90A-NEXT: s_endpgm 637 %r = udiv i16 %x, %y 638 store i16 %r, i16 addrspace(1)* %out 639 ret void 640} 641 642define amdgpu_kernel void @urem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) { 643; CHECK-LABEL: @urem_i16( 644; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[X:%.*]] to i32 645; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[Y:%.*]] to i32 646; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 647; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 648; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 649; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 650; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 651; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 652; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 653; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] 
to i32 654; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 655; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 656; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 657; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 658; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 659; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]] 660; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]] 661; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 65535 662; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16 663; CHECK-NEXT: store i16 [[TMP19]], i16 addrspace(1)* [[OUT:%.*]], align 2 664; CHECK-NEXT: ret void 665; 666; GFX6-LABEL: urem_i16: 667; GFX6: ; %bb.0: 668; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 669; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 670; GFX6-NEXT: s_waitcnt lgkmcnt(0) 671; GFX6-NEXT: s_lshr_b32 s2, s4, 16 672; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 673; GFX6-NEXT: s_and_b32 s3, s4, 0xffff 674; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3 675; GFX6-NEXT: s_mov_b32 s3, 0xf000 676; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 677; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 678; GFX6-NEXT: v_trunc_f32_e32 v2, v2 679; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 680; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 681; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 682; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 683; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 684; GFX6-NEXT: s_mov_b32 s2, -1 685; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 686; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 687; GFX6-NEXT: s_endpgm 688; 689; GFX9-LABEL: urem_i16: 690; GFX9: ; %bb.0: 691; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 692; GFX9-NEXT: s_waitcnt lgkmcnt(0) 693; GFX9-NEXT: s_lshr_b32 s3, s2, 16 694; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 695; GFX9-NEXT: s_and_b32 s4, s2, 0xffff 696; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s4 697; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 698; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 699; 
GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 700; GFX9-NEXT: v_trunc_f32_e32 v2, v2 701; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v2 702; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 703; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 704; GFX9-NEXT: v_mov_b32_e32 v1, 0 705; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 706; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 707; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 708; GFX9-NEXT: s_waitcnt lgkmcnt(0) 709; GFX9-NEXT: global_store_short v1, v0, s[0:1] 710; GFX9-NEXT: s_endpgm 711; 712; GFX90A-LABEL: urem_i16: 713; GFX90A: ; %bb.0: 714; GFX90A-NEXT: s_load_dword s2, s[0:1], 0x2c 715; GFX90A-NEXT: v_mov_b32_e32 v3, 0 716; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 717; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 718; GFX90A-NEXT: s_lshr_b32 s3, s2, 16 719; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 720; GFX90A-NEXT: s_and_b32 s4, s2, 0xffff 721; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s4 722; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v0 723; GFX90A-NEXT: v_mul_f32_e32 v2, v1, v2 724; GFX90A-NEXT: v_trunc_f32_e32 v2, v2 725; GFX90A-NEXT: v_cvt_u32_f32_e32 v4, v2 726; GFX90A-NEXT: v_mad_f32 v1, -v2, v0, v1 727; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 728; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc 729; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s3 730; GFX90A-NEXT: v_sub_u32_e32 v0, s2, v0 731; GFX90A-NEXT: global_store_short v3, v0, s[0:1] 732; GFX90A-NEXT: s_endpgm 733 %r = urem i16 %x, %y 734 store i16 %r, i16 addrspace(1)* %out 735 ret void 736} 737 738define amdgpu_kernel void @sdiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) { 739; CHECK-LABEL: @sdiv_i16( 740; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32 741; CHECK-NEXT: [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32 742; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 743; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 744; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 745; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 746; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 747; CHECK-NEXT: 
[[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 748; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 749; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 750; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 751; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 752; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 753; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 754; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 755; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 756; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 757; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 758; CHECK-NEXT: [[TMP19:%.*]] = shl i32 [[TMP18]], 16 759; CHECK-NEXT: [[TMP20:%.*]] = ashr i32 [[TMP19]], 16 760; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16 761; CHECK-NEXT: store i16 [[TMP21]], i16 addrspace(1)* [[OUT:%.*]], align 2 762; CHECK-NEXT: ret void 763; 764; GFX6-LABEL: sdiv_i16: 765; GFX6: ; %bb.0: 766; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 767; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 768; GFX6-NEXT: s_mov_b32 s3, 0xf000 769; GFX6-NEXT: s_mov_b32 s2, -1 770; GFX6-NEXT: s_waitcnt lgkmcnt(0) 771; GFX6-NEXT: s_ashr_i32 s5, s4, 16 772; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 773; GFX6-NEXT: s_sext_i32_i16 s4, s4 774; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 775; GFX6-NEXT: s_xor_b32 s4, s4, s5 776; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 777; GFX6-NEXT: s_ashr_i32 s4, s4, 30 778; GFX6-NEXT: s_or_b32 s4, s4, 1 779; GFX6-NEXT: v_mov_b32_e32 v3, s4 780; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 781; GFX6-NEXT: v_trunc_f32_e32 v2, v2 782; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 783; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 784; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 785; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 786; GFX6-NEXT: 
v_add_i32_e32 v0, vcc, v0, v2 787; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 788; GFX6-NEXT: s_endpgm 789; 790; GFX9-LABEL: sdiv_i16: 791; GFX9: ; %bb.0: 792; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 793; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 794; GFX9-NEXT: v_mov_b32_e32 v1, 0 795; GFX9-NEXT: s_waitcnt lgkmcnt(0) 796; GFX9-NEXT: s_ashr_i32 s0, s4, 16 797; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 798; GFX9-NEXT: s_sext_i32_i16 s1, s4 799; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 800; GFX9-NEXT: s_xor_b32 s0, s1, s0 801; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 802; GFX9-NEXT: s_ashr_i32 s0, s0, 30 803; GFX9-NEXT: s_or_b32 s4, s0, 1 804; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 805; GFX9-NEXT: v_trunc_f32_e32 v3, v3 806; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 807; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 808; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 809; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 810; GFX9-NEXT: s_cselect_b32 s0, s4, 0 811; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 812; GFX9-NEXT: global_store_short v1, v0, s[2:3] 813; GFX9-NEXT: s_endpgm 814; 815; GFX90A-LABEL: sdiv_i16: 816; GFX90A: ; %bb.0: 817; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 818; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 819; GFX90A-NEXT: v_mov_b32_e32 v1, 0 820; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 821; GFX90A-NEXT: s_ashr_i32 s0, s4, 16 822; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s0 823; GFX90A-NEXT: s_sext_i32_i16 s1, s4 824; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s1 825; GFX90A-NEXT: s_xor_b32 s0, s1, s0 826; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 827; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 828; GFX90A-NEXT: s_or_b32 s4, s0, 1 829; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 830; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 831; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 832; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 833; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 834; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 835; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 836; GFX90A-NEXT: v_add_u32_e32 v0, s0, v3 837; GFX90A-NEXT: 
global_store_short v1, v0, s[2:3] 838; GFX90A-NEXT: s_endpgm 839 %r = sdiv i16 %x, %y 840 store i16 %r, i16 addrspace(1)* %out 841 ret void 842} 843 844define amdgpu_kernel void @srem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) { 845; CHECK-LABEL: @srem_i16( 846; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32 847; CHECK-NEXT: [[TMP2:%.*]] = sext i16 [[Y:%.*]] to i32 848; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 849; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 850; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 851; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 852; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 853; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 854; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 855; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 856; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 857; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 858; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 859; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 860; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 861; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 862; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 863; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 864; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]] 865; CHECK-NEXT: [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]] 866; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 16 867; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 16 868; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16 869; CHECK-NEXT: store i16 [[TMP23]], i16 addrspace(1)* [[OUT:%.*]], align 2 870; CHECK-NEXT: ret void 871; 872; GFX6-LABEL: srem_i16: 873; GFX6: ; %bb.0: 874; GFX6-NEXT: s_load_dword s4, s[0:1], 
0xb 875; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 876; GFX6-NEXT: s_waitcnt lgkmcnt(0) 877; GFX6-NEXT: s_ashr_i32 s2, s4, 16 878; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s2 879; GFX6-NEXT: s_sext_i32_i16 s3, s4 880; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s3 881; GFX6-NEXT: s_xor_b32 s3, s3, s2 882; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 883; GFX6-NEXT: s_ashr_i32 s3, s3, 30 884; GFX6-NEXT: s_or_b32 s3, s3, 1 885; GFX6-NEXT: v_mov_b32_e32 v3, s3 886; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 887; GFX6-NEXT: v_trunc_f32_e32 v2, v2 888; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 889; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 890; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 891; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 892; GFX6-NEXT: s_mov_b32 s3, 0xf000 893; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 894; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 895; GFX6-NEXT: s_mov_b32 s2, -1 896; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 897; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0 898; GFX6-NEXT: s_endpgm 899; 900; GFX9-LABEL: srem_i16: 901; GFX9: ; %bb.0: 902; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 903; GFX9-NEXT: s_waitcnt lgkmcnt(0) 904; GFX9-NEXT: s_ashr_i32 s5, s4, 16 905; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s5 906; GFX9-NEXT: s_sext_i32_i16 s2, s4 907; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s2 908; GFX9-NEXT: s_xor_b32 s2, s2, s5 909; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 910; GFX9-NEXT: s_ashr_i32 s2, s2, 30 911; GFX9-NEXT: s_or_b32 s6, s2, 1 912; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 913; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 914; GFX9-NEXT: v_trunc_f32_e32 v2, v2 915; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 916; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 917; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| 918; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec 919; GFX9-NEXT: s_cselect_b32 s2, s6, 0 920; GFX9-NEXT: v_add_u32_e32 v0, s2, v2 921; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 922; GFX9-NEXT: v_mov_b32_e32 v1, 0 923; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 924; GFX9-NEXT: s_waitcnt lgkmcnt(0) 925; GFX9-NEXT: 
global_store_short v1, v0, s[0:1] 926; GFX9-NEXT: s_endpgm 927; 928; GFX90A-LABEL: srem_i16: 929; GFX90A: ; %bb.0: 930; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 931; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 932; GFX90A-NEXT: v_mov_b32_e32 v1, 0 933; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 934; GFX90A-NEXT: s_ashr_i32 s5, s4, 16 935; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s5 936; GFX90A-NEXT: s_sext_i32_i16 s0, s4 937; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s0 938; GFX90A-NEXT: s_xor_b32 s0, s0, s5 939; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 940; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 941; GFX90A-NEXT: s_or_b32 s6, s0, 1 942; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 943; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 944; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 945; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 946; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 947; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 948; GFX90A-NEXT: s_cselect_b32 s0, s6, 0 949; GFX90A-NEXT: v_add_u32_e32 v0, s0, v3 950; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s5 951; GFX90A-NEXT: v_sub_u32_e32 v0, s4, v0 952; GFX90A-NEXT: global_store_short v1, v0, s[2:3] 953; GFX90A-NEXT: s_endpgm 954 %r = srem i16 %x, %y 955 store i16 %r, i16 addrspace(1)* %out 956 ret void 957} 958 959define amdgpu_kernel void @udiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) { 960; CHECK-LABEL: @udiv_i8( 961; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32 962; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32 963; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 964; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 965; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 966; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 967; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 968; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 969; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 970; 
CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 971; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 972; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 973; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 974; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 975; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 976; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 255 977; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 978; CHECK-NEXT: store i8 [[TMP17]], i8 addrspace(1)* [[OUT:%.*]], align 1 979; CHECK-NEXT: ret void 980; 981; GFX6-LABEL: udiv_i8: 982; GFX6: ; %bb.0: 983; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 984; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 985; GFX6-NEXT: s_mov_b32 s3, 0xf000 986; GFX6-NEXT: s_mov_b32 s2, -1 987; GFX6-NEXT: s_waitcnt lgkmcnt(0) 988; GFX6-NEXT: v_cvt_f32_ubyte1_e32 v0, s4 989; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 990; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 991; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 992; GFX6-NEXT: v_trunc_f32_e32 v1, v1 993; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 994; GFX6-NEXT: v_mad_f32 v1, -v1, v0, v2 995; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 996; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 997; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 998; GFX6-NEXT: s_endpgm 999; 1000; GFX9-LABEL: udiv_i8: 1001; GFX9: ; %bb.0: 1002; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 1003; GFX9-NEXT: v_mov_b32_e32 v2, 0 1004; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1005; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1006; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s2 1007; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 1008; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 1009; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 1010; GFX9-NEXT: v_trunc_f32_e32 v1, v1 1011; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v1 1012; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v3 1013; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 1014; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc 1015; 
GFX9-NEXT: global_store_byte v2, v0, s[0:1] 1016; GFX9-NEXT: s_endpgm 1017; 1018; GFX90A-LABEL: udiv_i8: 1019; GFX90A: ; %bb.0: 1020; GFX90A-NEXT: s_load_dword s2, s[0:1], 0x2c 1021; GFX90A-NEXT: v_mov_b32_e32 v2, 0 1022; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1023; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 1024; GFX90A-NEXT: v_cvt_f32_ubyte1_e32 v0, s2 1025; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v0 1026; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 1027; GFX90A-NEXT: v_mul_f32_e32 v1, v3, v1 1028; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 1029; GFX90A-NEXT: v_cvt_u32_f32_e32 v4, v1 1030; GFX90A-NEXT: v_mad_f32 v1, -v1, v0, v3 1031; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 1032; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc 1033; GFX90A-NEXT: global_store_byte v2, v0, s[0:1] 1034; GFX90A-NEXT: s_endpgm 1035 %r = udiv i8 %x, %y 1036 store i8 %r, i8 addrspace(1)* %out 1037 ret void 1038} 1039 1040define amdgpu_kernel void @urem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) { 1041; CHECK-LABEL: @urem_i8( 1042; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[X:%.*]] to i32 1043; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[Y:%.*]] to i32 1044; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 1045; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 1046; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 1047; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 1048; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 1049; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 1050; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 1051; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 1052; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 1053; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 1054; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 1055; CHECK-NEXT: [[TMP14:%.*]] = 
select i1 [[TMP13]], i32 1, i32 0 1056; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 1057; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]] 1058; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]] 1059; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 255 1060; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i8 1061; CHECK-NEXT: store i8 [[TMP19]], i8 addrspace(1)* [[OUT:%.*]], align 1 1062; CHECK-NEXT: ret void 1063; 1064; GFX6-LABEL: urem_i8: 1065; GFX6: ; %bb.0: 1066; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 1067; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1068; GFX6-NEXT: s_mov_b32 s3, 0xf000 1069; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1070; GFX6-NEXT: v_cvt_f32_ubyte1_e32 v0, s4 1071; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 1072; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 1073; GFX6-NEXT: s_lshr_b32 s2, s4, 8 1074; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 1075; GFX6-NEXT: v_trunc_f32_e32 v1, v1 1076; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 1077; GFX6-NEXT: v_mad_f32 v1, -v1, v0, v2 1078; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 1079; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 1080; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 1081; GFX6-NEXT: s_mov_b32 s2, -1 1082; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 1083; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 1084; GFX6-NEXT: s_endpgm 1085; 1086; GFX9-LABEL: urem_i8: 1087; GFX9: ; %bb.0: 1088; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 1089; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1090; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s2 1091; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 1092; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 1093; GFX9-NEXT: s_lshr_b32 s3, s2, 8 1094; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1095; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1 1096; GFX9-NEXT: v_trunc_f32_e32 v1, v1 1097; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v1 1098; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v2 1099; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 1100; GFX9-NEXT: v_mov_b32_e32 v1, 0 1101; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 1102; GFX9-NEXT: 
v_mul_lo_u32 v0, v0, s3 1103; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 1104; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1105; GFX9-NEXT: global_store_byte v1, v0, s[0:1] 1106; GFX9-NEXT: s_endpgm 1107; 1108; GFX90A-LABEL: urem_i8: 1109; GFX90A: ; %bb.0: 1110; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 1111; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1112; GFX90A-NEXT: v_mov_b32_e32 v2, 0 1113; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 1114; GFX90A-NEXT: v_cvt_f32_ubyte1_e32 v0, s4 1115; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v0 1116; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v3, s4 1117; GFX90A-NEXT: s_lshr_b32 s0, s4, 8 1118; GFX90A-NEXT: v_mul_f32_e32 v1, v3, v1 1119; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 1120; GFX90A-NEXT: v_cvt_u32_f32_e32 v4, v1 1121; GFX90A-NEXT: v_mad_f32 v1, -v1, v0, v3 1122; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 1123; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc 1124; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s0 1125; GFX90A-NEXT: v_sub_u32_e32 v0, s4, v0 1126; GFX90A-NEXT: global_store_byte v2, v0, s[2:3] 1127; GFX90A-NEXT: s_endpgm 1128 %r = urem i8 %x, %y 1129 store i8 %r, i8 addrspace(1)* %out 1130 ret void 1131} 1132 1133define amdgpu_kernel void @sdiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) { 1134; CHECK-LABEL: @sdiv_i8( 1135; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32 1136; CHECK-NEXT: [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32 1137; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 1138; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 1139; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 1140; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 1141; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 1142; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 1143; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 1144; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 1145; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 1146; CHECK-NEXT: [[TMP12:%.*]] = call fast float 
@llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 1147; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 1148; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 1149; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 1150; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 1151; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 1152; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 1153; CHECK-NEXT: [[TMP19:%.*]] = shl i32 [[TMP18]], 24 1154; CHECK-NEXT: [[TMP20:%.*]] = ashr i32 [[TMP19]], 24 1155; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i8 1156; CHECK-NEXT: store i8 [[TMP21]], i8 addrspace(1)* [[OUT:%.*]], align 1 1157; CHECK-NEXT: ret void 1158; 1159; GFX6-LABEL: sdiv_i8: 1160; GFX6: ; %bb.0: 1161; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 1162; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1163; GFX6-NEXT: s_mov_b32 s3, 0xf000 1164; GFX6-NEXT: s_mov_b32 s2, -1 1165; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1166; GFX6-NEXT: s_bfe_i32 s5, s4, 0x80008 1167; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 1168; GFX6-NEXT: s_sext_i32_i8 s4, s4 1169; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 1170; GFX6-NEXT: s_xor_b32 s4, s4, s5 1171; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 1172; GFX6-NEXT: s_ashr_i32 s4, s4, 30 1173; GFX6-NEXT: s_or_b32 s4, s4, 1 1174; GFX6-NEXT: v_mov_b32_e32 v3, s4 1175; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 1176; GFX6-NEXT: v_trunc_f32_e32 v2, v2 1177; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 1178; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 1179; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 1180; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 1181; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 1182; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 1183; GFX6-NEXT: s_endpgm 1184; 1185; GFX9-LABEL: sdiv_i8: 1186; GFX9: ; %bb.0: 1187; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 1188; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1189; GFX9-NEXT: v_mov_b32_e32 v1, 0 
1190; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1191; GFX9-NEXT: s_bfe_i32 s0, s4, 0x80008 1192; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 1193; GFX9-NEXT: s_sext_i32_i8 s1, s4 1194; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 1195; GFX9-NEXT: s_xor_b32 s0, s1, s0 1196; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 1197; GFX9-NEXT: s_ashr_i32 s0, s0, 30 1198; GFX9-NEXT: s_or_b32 s4, s0, 1 1199; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 1200; GFX9-NEXT: v_trunc_f32_e32 v3, v3 1201; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 1202; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 1203; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 1204; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 1205; GFX9-NEXT: s_cselect_b32 s0, s4, 0 1206; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 1207; GFX9-NEXT: global_store_byte v1, v0, s[2:3] 1208; GFX9-NEXT: s_endpgm 1209; 1210; GFX90A-LABEL: sdiv_i8: 1211; GFX90A: ; %bb.0: 1212; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 1213; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1214; GFX90A-NEXT: v_mov_b32_e32 v1, 0 1215; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 1216; GFX90A-NEXT: s_bfe_i32 s0, s4, 0x80008 1217; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s0 1218; GFX90A-NEXT: s_sext_i32_i8 s1, s4 1219; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s1 1220; GFX90A-NEXT: s_xor_b32 s0, s1, s0 1221; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 1222; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 1223; GFX90A-NEXT: s_or_b32 s4, s0, 1 1224; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 1225; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 1226; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 1227; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 1228; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 1229; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 1230; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 1231; GFX90A-NEXT: v_add_u32_e32 v0, s0, v3 1232; GFX90A-NEXT: global_store_byte v1, v0, s[2:3] 1233; GFX90A-NEXT: s_endpgm 1234 %r = sdiv i8 %x, %y 1235 store i8 %r, i8 addrspace(1)* %out 1236 ret void 1237} 1238 1239define amdgpu_kernel void @srem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) { 1240; CHECK-LABEL: 
@srem_i8( 1241; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[X:%.*]] to i32 1242; CHECK-NEXT: [[TMP2:%.*]] = sext i8 [[Y:%.*]] to i32 1243; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 1244; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 1245; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 1246; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 1247; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 1248; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 1249; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 1250; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 1251; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 1252; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 1253; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 1254; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 1255; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 1256; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 1257; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 1258; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 1259; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]] 1260; CHECK-NEXT: [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]] 1261; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 24 1262; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 24 1263; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i8 1264; CHECK-NEXT: store i8 [[TMP23]], i8 addrspace(1)* [[OUT:%.*]], align 1 1265; CHECK-NEXT: ret void 1266; 1267; GFX6-LABEL: srem_i8: 1268; GFX6: ; %bb.0: 1269; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 1270; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1271; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1272; GFX6-NEXT: s_bfe_i32 s2, s4, 0x80008 1273; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s2 1274; GFX6-NEXT: s_sext_i32_i8 s5, s4 1275; 
GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 1276; GFX6-NEXT: s_xor_b32 s2, s5, s2 1277; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 1278; GFX6-NEXT: s_ashr_i32 s2, s2, 30 1279; GFX6-NEXT: s_or_b32 s2, s2, 1 1280; GFX6-NEXT: v_mov_b32_e32 v3, s2 1281; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 1282; GFX6-NEXT: v_trunc_f32_e32 v2, v2 1283; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 1284; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 1285; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 1286; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 1287; GFX6-NEXT: s_lshr_b32 s3, s4, 8 1288; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 1289; GFX6-NEXT: v_mul_lo_u32 v0, v0, s3 1290; GFX6-NEXT: s_mov_b32 s3, 0xf000 1291; GFX6-NEXT: s_mov_b32 s2, -1 1292; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 1293; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 1294; GFX6-NEXT: s_endpgm 1295; 1296; GFX9-LABEL: srem_i8: 1297; GFX9: ; %bb.0: 1298; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 1299; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1300; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1301; GFX9-NEXT: s_bfe_i32 s0, s4, 0x80008 1302; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 1303; GFX9-NEXT: s_sext_i32_i8 s1, s4 1304; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s1 1305; GFX9-NEXT: s_xor_b32 s0, s1, s0 1306; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 1307; GFX9-NEXT: s_ashr_i32 s0, s0, 30 1308; GFX9-NEXT: s_lshr_b32 s5, s4, 8 1309; GFX9-NEXT: s_or_b32 s6, s0, 1 1310; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 1311; GFX9-NEXT: v_trunc_f32_e32 v2, v2 1312; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 1313; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 1314; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 1315; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 1316; GFX9-NEXT: s_cselect_b32 s0, s6, 0 1317; GFX9-NEXT: v_add_u32_e32 v0, s0, v2 1318; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 1319; GFX9-NEXT: v_mov_b32_e32 v1, 0 1320; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 1321; GFX9-NEXT: global_store_byte v1, v0, s[2:3] 1322; GFX9-NEXT: s_endpgm 1323; 1324; GFX90A-LABEL: srem_i8: 1325; GFX90A: ; %bb.0: 1326; GFX90A-NEXT: 
s_load_dword s4, s[0:1], 0x2c 1327; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 1328; GFX90A-NEXT: v_mov_b32_e32 v0, 0 1329; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 1330; GFX90A-NEXT: s_bfe_i32 s0, s4, 0x80008 1331; GFX90A-NEXT: v_cvt_f32_i32_e32 v1, s0 1332; GFX90A-NEXT: s_sext_i32_i8 s1, s4 1333; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s1 1334; GFX90A-NEXT: s_xor_b32 s0, s1, s0 1335; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v1 1336; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 1337; GFX90A-NEXT: s_lshr_b32 s5, s4, 8 1338; GFX90A-NEXT: s_or_b32 s6, s0, 1 1339; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 1340; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 1341; GFX90A-NEXT: v_mad_f32 v2, -v3, v1, v2 1342; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 1343; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v1| 1344; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 1345; GFX90A-NEXT: s_cselect_b32 s0, s6, 0 1346; GFX90A-NEXT: v_add_u32_e32 v1, s0, v3 1347; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s5 1348; GFX90A-NEXT: v_sub_u32_e32 v1, s4, v1 1349; GFX90A-NEXT: global_store_byte v0, v1, s[2:3] 1350; GFX90A-NEXT: s_endpgm 1351 %r = srem i8 %x, %y 1352 store i8 %r, i8 addrspace(1)* %out 1353 ret void 1354} 1355 1356define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 1357; CHECK-LABEL: @udiv_v4i32( 1358; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 1359; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 1360; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 1361; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 1362; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 1363; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 1364; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 1365; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 1366; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 1367; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 1368; CHECK-NEXT: [[TMP11:%.*]] 
= mul i64 [[TMP9]], [[TMP10]] 1369; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 1370; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 1371; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 1372; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 1373; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 1374; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 1375; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 1376; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 1377; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 1378; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 1379; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 1380; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 1381; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 1382; CHECK-NEXT: [[TMP25:%.*]] = add i32 [[TMP21]], 1 1383; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]] 1384; CHECK-NEXT: [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]] 1385; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]] 1386; CHECK-NEXT: [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]] 1387; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP26]], 1 1388; CHECK-NEXT: [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]] 1389; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> undef, i32 [[TMP31]], i64 0 1390; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i32> [[X]], i64 1 1391; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[Y]], i64 1 1392; CHECK-NEXT: [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float 1393; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 1394; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000 1395; CHECK-NEXT: [[TMP38:%.*]] = fptoui float [[TMP37]] to i32 1396; CHECK-NEXT: [[TMP39:%.*]] = sub i32 0, [[TMP34]] 1397; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]] 1398; CHECK-NEXT: [[TMP41:%.*]] = zext 
i32 [[TMP38]] to i64 1399; CHECK-NEXT: [[TMP42:%.*]] = zext i32 [[TMP40]] to i64 1400; CHECK-NEXT: [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]] 1401; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 1402; CHECK-NEXT: [[TMP45:%.*]] = lshr i64 [[TMP43]], 32 1403; CHECK-NEXT: [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32 1404; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]] 1405; CHECK-NEXT: [[TMP48:%.*]] = zext i32 [[TMP33]] to i64 1406; CHECK-NEXT: [[TMP49:%.*]] = zext i32 [[TMP47]] to i64 1407; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]] 1408; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 1409; CHECK-NEXT: [[TMP52:%.*]] = lshr i64 [[TMP50]], 32 1410; CHECK-NEXT: [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32 1411; CHECK-NEXT: [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]] 1412; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]] 1413; CHECK-NEXT: [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]] 1414; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP53]], 1 1415; CHECK-NEXT: [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]] 1416; CHECK-NEXT: [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]] 1417; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]] 1418; CHECK-NEXT: [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]] 1419; CHECK-NEXT: [[TMP62:%.*]] = add i32 [[TMP58]], 1 1420; CHECK-NEXT: [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]] 1421; CHECK-NEXT: [[TMP64:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP63]], i64 1 1422; CHECK-NEXT: [[TMP65:%.*]] = extractelement <4 x i32> [[X]], i64 2 1423; CHECK-NEXT: [[TMP66:%.*]] = extractelement <4 x i32> [[Y]], i64 2 1424; CHECK-NEXT: [[TMP67:%.*]] = uitofp i32 [[TMP66]] to float 1425; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP67]]) 1426; CHECK-NEXT: [[TMP69:%.*]] = fmul fast float [[TMP68]], 0x41EFFFFFC0000000 1427; CHECK-NEXT: [[TMP70:%.*]] = fptoui float [[TMP69]] to i32 1428; CHECK-NEXT: [[TMP71:%.*]] = 
sub i32 0, [[TMP66]] 1429; CHECK-NEXT: [[TMP72:%.*]] = mul i32 [[TMP71]], [[TMP70]] 1430; CHECK-NEXT: [[TMP73:%.*]] = zext i32 [[TMP70]] to i64 1431; CHECK-NEXT: [[TMP74:%.*]] = zext i32 [[TMP72]] to i64 1432; CHECK-NEXT: [[TMP75:%.*]] = mul i64 [[TMP73]], [[TMP74]] 1433; CHECK-NEXT: [[TMP76:%.*]] = trunc i64 [[TMP75]] to i32 1434; CHECK-NEXT: [[TMP77:%.*]] = lshr i64 [[TMP75]], 32 1435; CHECK-NEXT: [[TMP78:%.*]] = trunc i64 [[TMP77]] to i32 1436; CHECK-NEXT: [[TMP79:%.*]] = add i32 [[TMP70]], [[TMP78]] 1437; CHECK-NEXT: [[TMP80:%.*]] = zext i32 [[TMP65]] to i64 1438; CHECK-NEXT: [[TMP81:%.*]] = zext i32 [[TMP79]] to i64 1439; CHECK-NEXT: [[TMP82:%.*]] = mul i64 [[TMP80]], [[TMP81]] 1440; CHECK-NEXT: [[TMP83:%.*]] = trunc i64 [[TMP82]] to i32 1441; CHECK-NEXT: [[TMP84:%.*]] = lshr i64 [[TMP82]], 32 1442; CHECK-NEXT: [[TMP85:%.*]] = trunc i64 [[TMP84]] to i32 1443; CHECK-NEXT: [[TMP86:%.*]] = mul i32 [[TMP85]], [[TMP66]] 1444; CHECK-NEXT: [[TMP87:%.*]] = sub i32 [[TMP65]], [[TMP86]] 1445; CHECK-NEXT: [[TMP88:%.*]] = icmp uge i32 [[TMP87]], [[TMP66]] 1446; CHECK-NEXT: [[TMP89:%.*]] = add i32 [[TMP85]], 1 1447; CHECK-NEXT: [[TMP90:%.*]] = select i1 [[TMP88]], i32 [[TMP89]], i32 [[TMP85]] 1448; CHECK-NEXT: [[TMP91:%.*]] = sub i32 [[TMP87]], [[TMP66]] 1449; CHECK-NEXT: [[TMP92:%.*]] = select i1 [[TMP88]], i32 [[TMP91]], i32 [[TMP87]] 1450; CHECK-NEXT: [[TMP93:%.*]] = icmp uge i32 [[TMP92]], [[TMP66]] 1451; CHECK-NEXT: [[TMP94:%.*]] = add i32 [[TMP90]], 1 1452; CHECK-NEXT: [[TMP95:%.*]] = select i1 [[TMP93]], i32 [[TMP94]], i32 [[TMP90]] 1453; CHECK-NEXT: [[TMP96:%.*]] = insertelement <4 x i32> [[TMP64]], i32 [[TMP95]], i64 2 1454; CHECK-NEXT: [[TMP97:%.*]] = extractelement <4 x i32> [[X]], i64 3 1455; CHECK-NEXT: [[TMP98:%.*]] = extractelement <4 x i32> [[Y]], i64 3 1456; CHECK-NEXT: [[TMP99:%.*]] = uitofp i32 [[TMP98]] to float 1457; CHECK-NEXT: [[TMP100:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP99]]) 1458; CHECK-NEXT: [[TMP101:%.*]] = fmul fast float 
[[TMP100]], 0x41EFFFFFC0000000 1459; CHECK-NEXT: [[TMP102:%.*]] = fptoui float [[TMP101]] to i32 1460; CHECK-NEXT: [[TMP103:%.*]] = sub i32 0, [[TMP98]] 1461; CHECK-NEXT: [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP102]] 1462; CHECK-NEXT: [[TMP105:%.*]] = zext i32 [[TMP102]] to i64 1463; CHECK-NEXT: [[TMP106:%.*]] = zext i32 [[TMP104]] to i64 1464; CHECK-NEXT: [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]] 1465; CHECK-NEXT: [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32 1466; CHECK-NEXT: [[TMP109:%.*]] = lshr i64 [[TMP107]], 32 1467; CHECK-NEXT: [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32 1468; CHECK-NEXT: [[TMP111:%.*]] = add i32 [[TMP102]], [[TMP110]] 1469; CHECK-NEXT: [[TMP112:%.*]] = zext i32 [[TMP97]] to i64 1470; CHECK-NEXT: [[TMP113:%.*]] = zext i32 [[TMP111]] to i64 1471; CHECK-NEXT: [[TMP114:%.*]] = mul i64 [[TMP112]], [[TMP113]] 1472; CHECK-NEXT: [[TMP115:%.*]] = trunc i64 [[TMP114]] to i32 1473; CHECK-NEXT: [[TMP116:%.*]] = lshr i64 [[TMP114]], 32 1474; CHECK-NEXT: [[TMP117:%.*]] = trunc i64 [[TMP116]] to i32 1475; CHECK-NEXT: [[TMP118:%.*]] = mul i32 [[TMP117]], [[TMP98]] 1476; CHECK-NEXT: [[TMP119:%.*]] = sub i32 [[TMP97]], [[TMP118]] 1477; CHECK-NEXT: [[TMP120:%.*]] = icmp uge i32 [[TMP119]], [[TMP98]] 1478; CHECK-NEXT: [[TMP121:%.*]] = add i32 [[TMP117]], 1 1479; CHECK-NEXT: [[TMP122:%.*]] = select i1 [[TMP120]], i32 [[TMP121]], i32 [[TMP117]] 1480; CHECK-NEXT: [[TMP123:%.*]] = sub i32 [[TMP119]], [[TMP98]] 1481; CHECK-NEXT: [[TMP124:%.*]] = select i1 [[TMP120]], i32 [[TMP123]], i32 [[TMP119]] 1482; CHECK-NEXT: [[TMP125:%.*]] = icmp uge i32 [[TMP124]], [[TMP98]] 1483; CHECK-NEXT: [[TMP126:%.*]] = add i32 [[TMP122]], 1 1484; CHECK-NEXT: [[TMP127:%.*]] = select i1 [[TMP125]], i32 [[TMP126]], i32 [[TMP122]] 1485; CHECK-NEXT: [[TMP128:%.*]] = insertelement <4 x i32> [[TMP96]], i32 [[TMP127]], i64 3 1486; CHECK-NEXT: store <4 x i32> [[TMP128]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 1487; CHECK-NEXT: ret void 1488; 1489; GFX6-LABEL: udiv_v4i32: 1490; 
GFX6: ; %bb.0: 1491; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 1492; GFX6-NEXT: s_mov_b32 s3, 0x4f7ffffe 1493; GFX6-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 1494; GFX6-NEXT: s_mov_b32 s15, 0xf000 1495; GFX6-NEXT: s_mov_b32 s14, -1 1496; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1497; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 1498; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 1499; GFX6-NEXT: s_sub_i32 s2, 0, s8 1500; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s10 1501; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 1502; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 1503; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s11 1504; GFX6-NEXT: v_mul_f32_e32 v0, s3, v0 1505; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 1506; GFX6-NEXT: v_mul_f32_e32 v1, s3, v1 1507; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 1508; GFX6-NEXT: v_mul_lo_u32 v2, s2, v0 1509; GFX6-NEXT: s_sub_i32 s2, 0, s9 1510; GFX6-NEXT: v_mul_lo_u32 v3, s2, v1 1511; GFX6-NEXT: s_sub_i32 s2, 0, s10 1512; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 1513; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 1514; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 1515; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 1516; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 1517; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 1518; GFX6-NEXT: v_mul_lo_u32 v2, v0, s8 1519; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 1520; GFX6-NEXT: v_mul_lo_u32 v5, v1, s9 1521; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s4, v2 1522; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v2 1523; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] 1524; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v2 1525; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 1526; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 1527; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 1528; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v4 1529; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1530; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s5, v5 1531; GFX6-NEXT: v_mul_f32_e32 v2, s3, v2 1532; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 1533; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v1 1534; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v3 1535; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v4, 
s[0:1] 1536; GFX6-NEXT: v_mul_lo_u32 v4, s2, v2 1537; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s9, v3 1538; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 1539; GFX6-NEXT: v_mul_hi_u32 v4, v2, v4 1540; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v1 1541; GFX6-NEXT: s_sub_i32 s0, 0, s11 1542; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 1543; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v6 1544; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 1545; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 1546; GFX6-NEXT: v_mul_hi_u32 v2, s6, v2 1547; GFX6-NEXT: v_mul_f32_e32 v4, s3, v4 1548; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 1549; GFX6-NEXT: v_mul_lo_u32 v3, v2, s10 1550; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v2 1551; GFX6-NEXT: v_mul_lo_u32 v5, s0, v4 1552; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s6, v3 1553; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v3 1554; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5 1555; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] 1556; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, s10, v3 1557; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 1558; GFX6-NEXT: v_mul_hi_u32 v4, s7, v4 1559; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] 1560; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v2 1561; GFX6-NEXT: v_mul_lo_u32 v6, v4, s11 1562; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 1563; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 1564; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 1565; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s7, v6 1566; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v3 1567; GFX6-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] 1568; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s11, v3 1569; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 1570; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 1571; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1572; GFX6-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 1573; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 1574; GFX6-NEXT: s_endpgm 1575; 1576; GFX9-LABEL: udiv_v4i32: 1577; GFX9: ; %bb.0: 1578; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 1579; GFX9-NEXT: s_mov_b32 s12, 0x4f7ffffe 1580; GFX9-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 1581; GFX9-NEXT: v_mov_b32_e32 v4, 0 1582; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1583; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 1584; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 1585; GFX9-NEXT: s_sub_i32 s2, 0, s8 1586; GFX9-NEXT: s_sub_i32 s3, 0, s9 1587; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 1588; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 1589; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s10 1590; GFX9-NEXT: v_mul_f32_e32 v0, s12, v0 1591; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 1592; GFX9-NEXT: v_mul_f32_e32 v1, s12, v1 1593; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 1594; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v5 1595; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 1596; GFX9-NEXT: s_sub_i32 s2, 0, s10 1597; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1 1598; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 1599; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 1600; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 1601; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 1602; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 1603; GFX9-NEXT: v_mul_f32_e32 v3, s12, v5 1604; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 1605; GFX9-NEXT: v_mul_lo_u32 v5, v0, s8 1606; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s11 1607; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 1608; GFX9-NEXT: v_add_u32_e32 v7, 1, v0 1609; GFX9-NEXT: v_sub_u32_e32 v5, s4, v5 1610; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v5 1611; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc 1612; GFX9-NEXT: v_subrev_u32_e32 v7, s8, v5 1613; GFX9-NEXT: v_mul_lo_u32 v6, v1, s9 1614; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc 1615; GFX9-NEXT: v_add_u32_e32 v7, 1, v0 1616; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v5 1617; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 1618; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc 1619; GFX9-NEXT: v_mul_lo_u32 v7, s2, v3 1620; GFX9-NEXT: v_sub_u32_e32 v6, s5, v6 1621; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 1622; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v6 1623; GFX9-NEXT: v_mul_f32_e32 v2, s12, v2 1624; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 1625; GFX9-NEXT: v_mul_hi_u32 v5, v3, v7 1626; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 
1627; GFX9-NEXT: s_sub_i32 s2, 0, s11 1628; GFX9-NEXT: v_subrev_u32_e32 v7, s9, v6 1629; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 1630; GFX9-NEXT: v_mul_lo_u32 v5, s2, v2 1631; GFX9-NEXT: v_mul_hi_u32 v3, s6, v3 1632; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc 1633; GFX9-NEXT: v_add_u32_e32 v7, 1, v1 1634; GFX9-NEXT: v_mul_hi_u32 v5, v2, v5 1635; GFX9-NEXT: v_mul_lo_u32 v8, v3, s10 1636; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v6 1637; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc 1638; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 1639; GFX9-NEXT: v_mul_hi_u32 v5, s7, v2 1640; GFX9-NEXT: v_sub_u32_e32 v6, s6, v8 1641; GFX9-NEXT: v_add_u32_e32 v7, 1, v3 1642; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v6 1643; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc 1644; GFX9-NEXT: v_subrev_u32_e32 v3, s10, v6 1645; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc 1646; GFX9-NEXT: v_mul_lo_u32 v6, v5, s11 1647; GFX9-NEXT: v_add_u32_e32 v7, 1, v2 1648; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 1649; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc 1650; GFX9-NEXT: v_sub_u32_e32 v3, s7, v6 1651; GFX9-NEXT: v_add_u32_e32 v6, 1, v5 1652; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1653; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 1654; GFX9-NEXT: v_subrev_u32_e32 v6, s11, v3 1655; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 1656; GFX9-NEXT: v_add_u32_e32 v6, 1, v5 1657; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1658; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc 1659; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 1660; GFX9-NEXT: s_endpgm 1661; 1662; GFX90A-LABEL: udiv_v4i32: 1663; GFX90A: ; %bb.0: 1664; GFX90A-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 1665; GFX90A-NEXT: s_mov_b32 s3, 0x4f7ffffe 1666; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1667; GFX90A-NEXT: v_mov_b32_e32 v4, 0 1668; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 1669; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s8 1670; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s9 1671; GFX90A-NEXT: s_sub_i32 s2, 0, s8 1672; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 1673; 
GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v1 1674; GFX90A-NEXT: v_mul_f32_e32 v0, s3, v0 1675; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 1676; GFX90A-NEXT: v_mul_f32_e32 v1, s3, v1 1677; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 1678; GFX90A-NEXT: v_mul_lo_u32 v2, s2, v0 1679; GFX90A-NEXT: v_mul_hi_u32 v2, v0, v2 1680; GFX90A-NEXT: v_add_u32_e32 v0, v0, v2 1681; GFX90A-NEXT: v_mul_hi_u32 v0, s4, v0 1682; GFX90A-NEXT: v_mul_lo_u32 v2, v0, s8 1683; GFX90A-NEXT: v_sub_u32_e32 v2, s4, v2 1684; GFX90A-NEXT: v_add_u32_e32 v3, 1, v0 1685; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 1686; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1687; GFX90A-NEXT: v_subrev_u32_e32 v3, s8, v2 1688; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc 1689; GFX90A-NEXT: s_sub_i32 s2, 0, s9 1690; GFX90A-NEXT: v_add_u32_e32 v3, 1, v0 1691; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s8, v2 1692; GFX90A-NEXT: v_mul_lo_u32 v2, s2, v1 1693; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1694; GFX90A-NEXT: v_mul_hi_u32 v2, v1, v2 1695; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, s10 1696; GFX90A-NEXT: v_add_u32_e32 v1, v1, v2 1697; GFX90A-NEXT: v_mul_hi_u32 v1, s5, v1 1698; GFX90A-NEXT: v_mul_lo_u32 v2, v1, s9 1699; GFX90A-NEXT: v_sub_u32_e32 v2, s5, v2 1700; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v3 1701; GFX90A-NEXT: v_add_u32_e32 v5, 1, v1 1702; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 1703; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 1704; GFX90A-NEXT: v_subrev_u32_e32 v5, s9, v2 1705; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 1706; GFX90A-NEXT: v_add_u32_e32 v5, 1, v1 1707; GFX90A-NEXT: v_mul_f32_e32 v3, s3, v3 1708; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 1709; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 1710; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 1711; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s11 1712; GFX90A-NEXT: s_sub_i32 s2, 0, s10 1713; GFX90A-NEXT: v_mul_lo_u32 v2, s2, v3 1714; GFX90A-NEXT: v_mul_hi_u32 v2, v3, v2 1715; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v5 1716; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 1717; 
GFX90A-NEXT: v_mul_hi_u32 v2, s6, v2 1718; GFX90A-NEXT: v_mul_lo_u32 v3, v2, s10 1719; GFX90A-NEXT: v_mul_f32_e32 v5, s3, v5 1720; GFX90A-NEXT: v_sub_u32_e32 v3, s6, v3 1721; GFX90A-NEXT: v_cvt_u32_f32_e32 v5, v5 1722; GFX90A-NEXT: v_add_u32_e32 v6, 1, v2 1723; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 1724; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 1725; GFX90A-NEXT: v_subrev_u32_e32 v6, s10, v3 1726; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 1727; GFX90A-NEXT: s_sub_i32 s2, 0, s11 1728; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 1729; GFX90A-NEXT: v_mul_lo_u32 v3, s2, v5 1730; GFX90A-NEXT: v_mul_hi_u32 v3, v5, v3 1731; GFX90A-NEXT: v_add_u32_e32 v3, v5, v3 1732; GFX90A-NEXT: v_mul_hi_u32 v3, s7, v3 1733; GFX90A-NEXT: v_mul_lo_u32 v5, v3, s11 1734; GFX90A-NEXT: v_add_u32_e32 v6, 1, v2 1735; GFX90A-NEXT: v_sub_u32_e32 v5, s7, v5 1736; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 1737; GFX90A-NEXT: v_add_u32_e32 v6, 1, v3 1738; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s11, v5 1739; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 1740; GFX90A-NEXT: v_subrev_u32_e32 v6, s11, v5 1741; GFX90A-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 1742; GFX90A-NEXT: v_add_u32_e32 v6, 1, v3 1743; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s11, v5 1744; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 1745; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 1746; GFX90A-NEXT: s_endpgm 1747 %r = udiv <4 x i32> %x, %y 1748 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 1749 ret void 1750} 1751 ; Test urem_v4i32 -- unsigned remainder of two <4 x i32> kernel arguments, stored to %out. The CHECK lines (opt -amdgpu-codegenprepare) expect the vector urem to be expanded one element at a time (extractelement / insertelement) into a reciprocal-based sequence -- uitofp of the divisor, llvm.amdgcn.rcp.f32, scale by 0x41EFFFFFC0000000, fptoui, one correction of that estimate using the high 32 bits of a 64-bit multiply, then remainder = x - q*y followed by two conditional subtractions of the divisor (icmp uge + select). The GFX6, GFX9 and GFX90A lines pin the llc output for tahiti, gfx900 and gfx90a respectively. All assertions here are autogenerated (see the NOTE at the top of the file) -- regenerate them with utils/update_llc_test_checks.py instead of editing by hand. 1752define amdgpu_kernel void @urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 1753; CHECK-LABEL: @urem_v4i32( 1754; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 1755; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 1756; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 1757; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 1758; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 
0x41EFFFFFC0000000 1759; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 1760; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 1761; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 1762; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 1763; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 1764; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 1765; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 1766; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 1767; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 1768; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 1769; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 1770; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 1771; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 1772; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 1773; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 1774; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 1775; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 1776; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 1777; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 1778; CHECK-NEXT: [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]] 1779; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]] 1780; CHECK-NEXT: [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]] 1781; CHECK-NEXT: [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]] 1782; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]] 1783; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x i32> undef, i32 [[TMP29]], i64 0 1784; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[X]], i64 1 1785; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[Y]], i64 1 1786; CHECK-NEXT: [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float 1787; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 1788; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000 1789; 
CHECK-NEXT: [[TMP36:%.*]] = fptoui float [[TMP35]] to i32 1790; CHECK-NEXT: [[TMP37:%.*]] = sub i32 0, [[TMP32]] 1791; CHECK-NEXT: [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]] 1792; CHECK-NEXT: [[TMP39:%.*]] = zext i32 [[TMP36]] to i64 1793; CHECK-NEXT: [[TMP40:%.*]] = zext i32 [[TMP38]] to i64 1794; CHECK-NEXT: [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]] 1795; CHECK-NEXT: [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32 1796; CHECK-NEXT: [[TMP43:%.*]] = lshr i64 [[TMP41]], 32 1797; CHECK-NEXT: [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32 1798; CHECK-NEXT: [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]] 1799; CHECK-NEXT: [[TMP46:%.*]] = zext i32 [[TMP31]] to i64 1800; CHECK-NEXT: [[TMP47:%.*]] = zext i32 [[TMP45]] to i64 1801; CHECK-NEXT: [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]] 1802; CHECK-NEXT: [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32 1803; CHECK-NEXT: [[TMP50:%.*]] = lshr i64 [[TMP48]], 32 1804; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 1805; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]] 1806; CHECK-NEXT: [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]] 1807; CHECK-NEXT: [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]] 1808; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]] 1809; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]] 1810; CHECK-NEXT: [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]] 1811; CHECK-NEXT: [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]] 1812; CHECK-NEXT: [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]] 1813; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP59]], i64 1 1814; CHECK-NEXT: [[TMP61:%.*]] = extractelement <4 x i32> [[X]], i64 2 1815; CHECK-NEXT: [[TMP62:%.*]] = extractelement <4 x i32> [[Y]], i64 2 1816; CHECK-NEXT: [[TMP63:%.*]] = uitofp i32 [[TMP62]] to float 1817; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP63]]) 1818; CHECK-NEXT: [[TMP65:%.*]] = fmul fast float [[TMP64]], 0x41EFFFFFC0000000 1819; 
CHECK-NEXT: [[TMP66:%.*]] = fptoui float [[TMP65]] to i32 1820; CHECK-NEXT: [[TMP67:%.*]] = sub i32 0, [[TMP62]] 1821; CHECK-NEXT: [[TMP68:%.*]] = mul i32 [[TMP67]], [[TMP66]] 1822; CHECK-NEXT: [[TMP69:%.*]] = zext i32 [[TMP66]] to i64 1823; CHECK-NEXT: [[TMP70:%.*]] = zext i32 [[TMP68]] to i64 1824; CHECK-NEXT: [[TMP71:%.*]] = mul i64 [[TMP69]], [[TMP70]] 1825; CHECK-NEXT: [[TMP72:%.*]] = trunc i64 [[TMP71]] to i32 1826; CHECK-NEXT: [[TMP73:%.*]] = lshr i64 [[TMP71]], 32 1827; CHECK-NEXT: [[TMP74:%.*]] = trunc i64 [[TMP73]] to i32 1828; CHECK-NEXT: [[TMP75:%.*]] = add i32 [[TMP66]], [[TMP74]] 1829; CHECK-NEXT: [[TMP76:%.*]] = zext i32 [[TMP61]] to i64 1830; CHECK-NEXT: [[TMP77:%.*]] = zext i32 [[TMP75]] to i64 1831; CHECK-NEXT: [[TMP78:%.*]] = mul i64 [[TMP76]], [[TMP77]] 1832; CHECK-NEXT: [[TMP79:%.*]] = trunc i64 [[TMP78]] to i32 1833; CHECK-NEXT: [[TMP80:%.*]] = lshr i64 [[TMP78]], 32 1834; CHECK-NEXT: [[TMP81:%.*]] = trunc i64 [[TMP80]] to i32 1835; CHECK-NEXT: [[TMP82:%.*]] = mul i32 [[TMP81]], [[TMP62]] 1836; CHECK-NEXT: [[TMP83:%.*]] = sub i32 [[TMP61]], [[TMP82]] 1837; CHECK-NEXT: [[TMP84:%.*]] = icmp uge i32 [[TMP83]], [[TMP62]] 1838; CHECK-NEXT: [[TMP85:%.*]] = sub i32 [[TMP83]], [[TMP62]] 1839; CHECK-NEXT: [[TMP86:%.*]] = select i1 [[TMP84]], i32 [[TMP85]], i32 [[TMP83]] 1840; CHECK-NEXT: [[TMP87:%.*]] = icmp uge i32 [[TMP86]], [[TMP62]] 1841; CHECK-NEXT: [[TMP88:%.*]] = sub i32 [[TMP86]], [[TMP62]] 1842; CHECK-NEXT: [[TMP89:%.*]] = select i1 [[TMP87]], i32 [[TMP88]], i32 [[TMP86]] 1843; CHECK-NEXT: [[TMP90:%.*]] = insertelement <4 x i32> [[TMP60]], i32 [[TMP89]], i64 2 1844; CHECK-NEXT: [[TMP91:%.*]] = extractelement <4 x i32> [[X]], i64 3 1845; CHECK-NEXT: [[TMP92:%.*]] = extractelement <4 x i32> [[Y]], i64 3 1846; CHECK-NEXT: [[TMP93:%.*]] = uitofp i32 [[TMP92]] to float 1847; CHECK-NEXT: [[TMP94:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP93]]) 1848; CHECK-NEXT: [[TMP95:%.*]] = fmul fast float [[TMP94]], 0x41EFFFFFC0000000 1849; 
CHECK-NEXT: [[TMP96:%.*]] = fptoui float [[TMP95]] to i32 1850; CHECK-NEXT: [[TMP97:%.*]] = sub i32 0, [[TMP92]] 1851; CHECK-NEXT: [[TMP98:%.*]] = mul i32 [[TMP97]], [[TMP96]] 1852; CHECK-NEXT: [[TMP99:%.*]] = zext i32 [[TMP96]] to i64 1853; CHECK-NEXT: [[TMP100:%.*]] = zext i32 [[TMP98]] to i64 1854; CHECK-NEXT: [[TMP101:%.*]] = mul i64 [[TMP99]], [[TMP100]] 1855; CHECK-NEXT: [[TMP102:%.*]] = trunc i64 [[TMP101]] to i32 1856; CHECK-NEXT: [[TMP103:%.*]] = lshr i64 [[TMP101]], 32 1857; CHECK-NEXT: [[TMP104:%.*]] = trunc i64 [[TMP103]] to i32 1858; CHECK-NEXT: [[TMP105:%.*]] = add i32 [[TMP96]], [[TMP104]] 1859; CHECK-NEXT: [[TMP106:%.*]] = zext i32 [[TMP91]] to i64 1860; CHECK-NEXT: [[TMP107:%.*]] = zext i32 [[TMP105]] to i64 1861; CHECK-NEXT: [[TMP108:%.*]] = mul i64 [[TMP106]], [[TMP107]] 1862; CHECK-NEXT: [[TMP109:%.*]] = trunc i64 [[TMP108]] to i32 1863; CHECK-NEXT: [[TMP110:%.*]] = lshr i64 [[TMP108]], 32 1864; CHECK-NEXT: [[TMP111:%.*]] = trunc i64 [[TMP110]] to i32 1865; CHECK-NEXT: [[TMP112:%.*]] = mul i32 [[TMP111]], [[TMP92]] 1866; CHECK-NEXT: [[TMP113:%.*]] = sub i32 [[TMP91]], [[TMP112]] 1867; CHECK-NEXT: [[TMP114:%.*]] = icmp uge i32 [[TMP113]], [[TMP92]] 1868; CHECK-NEXT: [[TMP115:%.*]] = sub i32 [[TMP113]], [[TMP92]] 1869; CHECK-NEXT: [[TMP116:%.*]] = select i1 [[TMP114]], i32 [[TMP115]], i32 [[TMP113]] 1870; CHECK-NEXT: [[TMP117:%.*]] = icmp uge i32 [[TMP116]], [[TMP92]] 1871; CHECK-NEXT: [[TMP118:%.*]] = sub i32 [[TMP116]], [[TMP92]] 1872; CHECK-NEXT: [[TMP119:%.*]] = select i1 [[TMP117]], i32 [[TMP118]], i32 [[TMP116]] 1873; CHECK-NEXT: [[TMP120:%.*]] = insertelement <4 x i32> [[TMP90]], i32 [[TMP119]], i64 3 1874; CHECK-NEXT: store <4 x i32> [[TMP120]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 1875; CHECK-NEXT: ret void 1876; 1877; GFX6-LABEL: urem_v4i32: 1878; GFX6: ; %bb.0: 1879; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 1880; GFX6-NEXT: s_mov_b32 s13, 0x4f7ffffe 1881; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 1882; GFX6-NEXT: 
s_mov_b32 s3, 0xf000 1883; GFX6-NEXT: s_waitcnt lgkmcnt(0) 1884; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 1885; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 1886; GFX6-NEXT: s_sub_i32 s2, 0, s8 1887; GFX6-NEXT: s_sub_i32 s12, 0, s9 1888; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 1889; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 1890; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s10 1891; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s11 1892; GFX6-NEXT: v_mul_f32_e32 v0, s13, v0 1893; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 1894; GFX6-NEXT: v_mul_f32_e32 v1, s13, v1 1895; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 1896; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 1897; GFX6-NEXT: v_mul_lo_u32 v2, s2, v0 1898; GFX6-NEXT: s_mov_b32 s2, -1 1899; GFX6-NEXT: v_mul_lo_u32 v4, s12, v1 1900; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 1901; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 1902; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 1903; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 1904; GFX6-NEXT: v_add_i32_e32 v1, vcc, v4, v1 1905; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 1906; GFX6-NEXT: v_mul_lo_u32 v0, v0, s8 1907; GFX6-NEXT: v_mul_f32_e32 v2, s13, v3 1908; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 1909; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9 1910; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 1911; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 1912; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 1913; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1914; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 1915; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 1916; GFX6-NEXT: s_sub_i32 s4, 0, s10 1917; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 1918; GFX6-NEXT: v_mul_lo_u32 v3, s4, v2 1919; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 1920; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s9, v1 1921; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 1922; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 1923; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 1924; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 1925; GFX6-NEXT: s_sub_i32 s4, 0, s11 1926; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 1927; GFX6-NEXT: v_mul_f32_e32 v3, s13, v4 1928; GFX6-NEXT: 
v_cvt_u32_f32_e32 v3, v3 1929; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s9, v1 1930; GFX6-NEXT: v_mul_hi_u32 v2, s6, v2 1931; GFX6-NEXT: v_mul_lo_u32 v5, s4, v3 1932; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 1933; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 1934; GFX6-NEXT: v_mul_lo_u32 v2, v2, s10 1935; GFX6-NEXT: v_mul_hi_u32 v4, v3, v5 1936; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 1937; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 1938; GFX6-NEXT: v_mul_hi_u32 v3, s7, v3 1939; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s10, v2 1940; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 1941; GFX6-NEXT: v_mul_lo_u32 v3, v3, s11 1942; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 1943; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s10, v2 1944; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 1945; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 1946; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s7, v3 1947; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s11, v3 1948; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1949; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 1950; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s11, v3 1951; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 1952; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 1953; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 1954; GFX6-NEXT: s_endpgm 1955; 1956; GFX9-LABEL: urem_v4i32: 1957; GFX9: ; %bb.0: 1958; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 1959; GFX9-NEXT: s_mov_b32 s12, 0x4f7ffffe 1960; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 1961; GFX9-NEXT: v_mov_b32_e32 v4, 0 1962; GFX9-NEXT: s_waitcnt lgkmcnt(0) 1963; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 1964; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 1965; GFX9-NEXT: s_sub_i32 s2, 0, s8 1966; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s10 1967; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 1968; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 1969; GFX9-NEXT: s_sub_i32 s3, 0, s9 1970; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v5 1971; GFX9-NEXT: v_mul_f32_e32 v0, s12, v0 1972; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 1973; GFX9-NEXT: v_mul_f32_e32 v1, s12, v1 1974; 
GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 1975; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s11 1976; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 1977; GFX9-NEXT: s_sub_i32 s2, 0, s10 1978; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1 1979; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 1980; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 1981; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 1982; GFX9-NEXT: v_mul_f32_e32 v2, s12, v5 1983; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 1984; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 1985; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v6 1986; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 1987; GFX9-NEXT: v_mul_lo_u32 v5, s2, v2 1988; GFX9-NEXT: s_sub_i32 s2, 0, s11 1989; GFX9-NEXT: v_mul_f32_e32 v3, s12, v3 1990; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 1991; GFX9-NEXT: v_mul_hi_u32 v5, v2, v5 1992; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 1993; GFX9-NEXT: v_mul_lo_u32 v0, v0, s8 1994; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 1995; GFX9-NEXT: v_mul_lo_u32 v5, s2, v3 1996; GFX9-NEXT: v_mul_hi_u32 v2, s6, v2 1997; GFX9-NEXT: v_mul_lo_u32 v1, v1, s9 1998; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 1999; GFX9-NEXT: v_mul_hi_u32 v5, v3, v5 2000; GFX9-NEXT: v_subrev_u32_e32 v6, s8, v0 2001; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 2002; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc 2003; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 2004; GFX9-NEXT: v_mul_hi_u32 v3, s7, v3 2005; GFX9-NEXT: v_mul_lo_u32 v2, v2, s10 2006; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 2007; GFX9-NEXT: v_subrev_u32_e32 v6, s8, v0 2008; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 2009; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc 2010; GFX9-NEXT: v_subrev_u32_e32 v6, s9, v1 2011; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 2012; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 2013; GFX9-NEXT: v_mul_lo_u32 v3, v3, s11 2014; GFX9-NEXT: v_subrev_u32_e32 v6, s9, v1 2015; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 2016; GFX9-NEXT: v_sub_u32_e32 v2, s6, v2 2017; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 2018; GFX9-NEXT: v_subrev_u32_e32 v5, s10, v2 2019; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 2020; GFX9-NEXT: 
v_cndmask_b32_e32 v2, v2, v5, vcc 2021; GFX9-NEXT: v_subrev_u32_e32 v5, s10, v2 2022; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 2023; GFX9-NEXT: v_sub_u32_e32 v3, s7, v3 2024; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 2025; GFX9-NEXT: v_subrev_u32_e32 v5, s11, v3 2026; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 2027; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 2028; GFX9-NEXT: v_subrev_u32_e32 v5, s11, v3 2029; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 2030; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 2031; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 2032; GFX9-NEXT: s_endpgm 2033; 2034; GFX90A-LABEL: urem_v4i32: 2035; GFX90A: ; %bb.0: 2036; GFX90A-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 2037; GFX90A-NEXT: s_mov_b32 s12, 0x4f7ffffe 2038; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2039; GFX90A-NEXT: v_mov_b32_e32 v4, 0 2040; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 2041; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s8 2042; GFX90A-NEXT: s_sub_i32 s2, 0, s8 2043; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s9 2044; GFX90A-NEXT: s_sub_i32 s3, 0, s9 2045; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 2046; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v1 2047; GFX90A-NEXT: v_mul_f32_e32 v0, s12, v0 2048; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 2049; GFX90A-NEXT: v_mul_f32_e32 v1, s12, v1 2050; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 2051; GFX90A-NEXT: v_mul_lo_u32 v2, s2, v0 2052; GFX90A-NEXT: v_mul_hi_u32 v2, v0, v2 2053; GFX90A-NEXT: v_add_u32_e32 v0, v0, v2 2054; GFX90A-NEXT: v_mul_hi_u32 v0, s4, v0 2055; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s8 2056; GFX90A-NEXT: v_sub_u32_e32 v0, s4, v0 2057; GFX90A-NEXT: v_subrev_u32_e32 v2, s8, v0 2058; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 2059; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 2060; GFX90A-NEXT: v_subrev_u32_e32 v2, s8, v0 2061; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 2062; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 2063; GFX90A-NEXT: v_cvt_f32_u32_e32 v2, s10 2064; GFX90A-NEXT: v_mul_lo_u32 v3, s3, v1 2065; GFX90A-NEXT: v_mul_hi_u32 
v3, v1, v3 2066; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 2067; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v2 2068; GFX90A-NEXT: v_mul_hi_u32 v1, s5, v1 2069; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s9 2070; GFX90A-NEXT: v_sub_u32_e32 v1, s5, v1 2071; GFX90A-NEXT: v_mul_f32_e32 v2, s12, v2 2072; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 2073; GFX90A-NEXT: v_subrev_u32_e32 v3, s9, v1 2074; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 2075; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 2076; GFX90A-NEXT: v_subrev_u32_e32 v3, s9, v1 2077; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 2078; GFX90A-NEXT: s_sub_i32 s2, 0, s10 2079; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 2080; GFX90A-NEXT: v_mul_lo_u32 v3, s2, v2 2081; GFX90A-NEXT: v_mul_hi_u32 v3, v2, v3 2082; GFX90A-NEXT: v_add_u32_e32 v2, v2, v3 2083; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, s11 2084; GFX90A-NEXT: v_mul_hi_u32 v2, s6, v2 2085; GFX90A-NEXT: v_mul_lo_u32 v2, v2, s10 2086; GFX90A-NEXT: v_sub_u32_e32 v2, s6, v2 2087; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v3 2088; GFX90A-NEXT: v_subrev_u32_e32 v5, s10, v2 2089; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 2090; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 2091; GFX90A-NEXT: v_mul_f32_e32 v3, s12, v3 2092; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 2093; GFX90A-NEXT: v_subrev_u32_e32 v5, s10, v2 2094; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 2095; GFX90A-NEXT: s_sub_i32 s2, 0, s11 2096; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 2097; GFX90A-NEXT: v_mul_lo_u32 v5, s2, v3 2098; GFX90A-NEXT: v_mul_hi_u32 v5, v3, v5 2099; GFX90A-NEXT: v_add_u32_e32 v3, v3, v5 2100; GFX90A-NEXT: v_mul_hi_u32 v3, s7, v3 2101; GFX90A-NEXT: v_mul_lo_u32 v3, v3, s11 2102; GFX90A-NEXT: v_sub_u32_e32 v3, s7, v3 2103; GFX90A-NEXT: v_subrev_u32_e32 v5, s11, v3 2104; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 2105; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 2106; GFX90A-NEXT: v_subrev_u32_e32 v5, s11, v3 2107; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 2108; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 
2109; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 2110; GFX90A-NEXT: s_endpgm 2111 %r = urem <4 x i32> %x, %y 2112 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 2113 ret void 2114} 2115 2116define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 2117; CHECK-LABEL: @sdiv_v4i32( 2118; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 2119; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 2120; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 2121; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 2122; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 2123; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]] 2124; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]] 2125; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]] 2126; CHECK-NEXT: [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]] 2127; CHECK-NEXT: [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float 2128; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]]) 2129; CHECK-NEXT: [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000 2130; CHECK-NEXT: [[TMP13:%.*]] = fptoui float [[TMP12]] to i32 2131; CHECK-NEXT: [[TMP14:%.*]] = sub i32 0, [[TMP9]] 2132; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]] 2133; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP13]] to i64 2134; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 2135; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 2136; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 2137; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 2138; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 2139; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]] 2140; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP8]] to i64 2141; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[TMP22]] to i64 2142; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]] 2143; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32 2144; CHECK-NEXT: 
[[TMP27:%.*]] = lshr i64 [[TMP25]], 32 2145; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 2146; CHECK-NEXT: [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]] 2147; CHECK-NEXT: [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]] 2148; CHECK-NEXT: [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]] 2149; CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP28]], 1 2150; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]] 2151; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]] 2152; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]] 2153; CHECK-NEXT: [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]] 2154; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP33]], 1 2155; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]] 2156; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]] 2157; CHECK-NEXT: [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]] 2158; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x i32> undef, i32 [[TMP40]], i64 0 2159; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i32> [[X]], i64 1 2160; CHECK-NEXT: [[TMP43:%.*]] = extractelement <4 x i32> [[Y]], i64 1 2161; CHECK-NEXT: [[TMP44:%.*]] = ashr i32 [[TMP42]], 31 2162; CHECK-NEXT: [[TMP45:%.*]] = ashr i32 [[TMP43]], 31 2163; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]] 2164; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]] 2165; CHECK-NEXT: [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]] 2166; CHECK-NEXT: [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]] 2167; CHECK-NEXT: [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]] 2168; CHECK-NEXT: [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float 2169; CHECK-NEXT: [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]]) 2170; CHECK-NEXT: [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000 2171; CHECK-NEXT: [[TMP54:%.*]] = fptoui float [[TMP53]] to i32 2172; CHECK-NEXT: [[TMP55:%.*]] = sub i32 0, [[TMP50]] 2173; CHECK-NEXT: [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]] 2174; CHECK-NEXT: 
[[TMP57:%.*]] = zext i32 [[TMP54]] to i64 2175; CHECK-NEXT: [[TMP58:%.*]] = zext i32 [[TMP56]] to i64 2176; CHECK-NEXT: [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]] 2177; CHECK-NEXT: [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32 2178; CHECK-NEXT: [[TMP61:%.*]] = lshr i64 [[TMP59]], 32 2179; CHECK-NEXT: [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32 2180; CHECK-NEXT: [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]] 2181; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[TMP49]] to i64 2182; CHECK-NEXT: [[TMP65:%.*]] = zext i32 [[TMP63]] to i64 2183; CHECK-NEXT: [[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]] 2184; CHECK-NEXT: [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32 2185; CHECK-NEXT: [[TMP68:%.*]] = lshr i64 [[TMP66]], 32 2186; CHECK-NEXT: [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32 2187; CHECK-NEXT: [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]] 2188; CHECK-NEXT: [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]] 2189; CHECK-NEXT: [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]] 2190; CHECK-NEXT: [[TMP73:%.*]] = add i32 [[TMP69]], 1 2191; CHECK-NEXT: [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]] 2192; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]] 2193; CHECK-NEXT: [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]] 2194; CHECK-NEXT: [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]] 2195; CHECK-NEXT: [[TMP78:%.*]] = add i32 [[TMP74]], 1 2196; CHECK-NEXT: [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]] 2197; CHECK-NEXT: [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]] 2198; CHECK-NEXT: [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]] 2199; CHECK-NEXT: [[TMP82:%.*]] = insertelement <4 x i32> [[TMP41]], i32 [[TMP81]], i64 1 2200; CHECK-NEXT: [[TMP83:%.*]] = extractelement <4 x i32> [[X]], i64 2 2201; CHECK-NEXT: [[TMP84:%.*]] = extractelement <4 x i32> [[Y]], i64 2 2202; CHECK-NEXT: [[TMP85:%.*]] = ashr i32 [[TMP83]], 31 2203; CHECK-NEXT: [[TMP86:%.*]] = ashr i32 [[TMP84]], 31 2204; CHECK-NEXT: [[TMP87:%.*]] = xor i32 [[TMP85]], [[TMP86]] 2205; 
CHECK-NEXT: [[TMP88:%.*]] = add i32 [[TMP83]], [[TMP85]] 2206; CHECK-NEXT: [[TMP89:%.*]] = add i32 [[TMP84]], [[TMP86]] 2207; CHECK-NEXT: [[TMP90:%.*]] = xor i32 [[TMP88]], [[TMP85]] 2208; CHECK-NEXT: [[TMP91:%.*]] = xor i32 [[TMP89]], [[TMP86]] 2209; CHECK-NEXT: [[TMP92:%.*]] = uitofp i32 [[TMP91]] to float 2210; CHECK-NEXT: [[TMP93:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP92]]) 2211; CHECK-NEXT: [[TMP94:%.*]] = fmul fast float [[TMP93]], 0x41EFFFFFC0000000 2212; CHECK-NEXT: [[TMP95:%.*]] = fptoui float [[TMP94]] to i32 2213; CHECK-NEXT: [[TMP96:%.*]] = sub i32 0, [[TMP91]] 2214; CHECK-NEXT: [[TMP97:%.*]] = mul i32 [[TMP96]], [[TMP95]] 2215; CHECK-NEXT: [[TMP98:%.*]] = zext i32 [[TMP95]] to i64 2216; CHECK-NEXT: [[TMP99:%.*]] = zext i32 [[TMP97]] to i64 2217; CHECK-NEXT: [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]] 2218; CHECK-NEXT: [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32 2219; CHECK-NEXT: [[TMP102:%.*]] = lshr i64 [[TMP100]], 32 2220; CHECK-NEXT: [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32 2221; CHECK-NEXT: [[TMP104:%.*]] = add i32 [[TMP95]], [[TMP103]] 2222; CHECK-NEXT: [[TMP105:%.*]] = zext i32 [[TMP90]] to i64 2223; CHECK-NEXT: [[TMP106:%.*]] = zext i32 [[TMP104]] to i64 2224; CHECK-NEXT: [[TMP107:%.*]] = mul i64 [[TMP105]], [[TMP106]] 2225; CHECK-NEXT: [[TMP108:%.*]] = trunc i64 [[TMP107]] to i32 2226; CHECK-NEXT: [[TMP109:%.*]] = lshr i64 [[TMP107]], 32 2227; CHECK-NEXT: [[TMP110:%.*]] = trunc i64 [[TMP109]] to i32 2228; CHECK-NEXT: [[TMP111:%.*]] = mul i32 [[TMP110]], [[TMP91]] 2229; CHECK-NEXT: [[TMP112:%.*]] = sub i32 [[TMP90]], [[TMP111]] 2230; CHECK-NEXT: [[TMP113:%.*]] = icmp uge i32 [[TMP112]], [[TMP91]] 2231; CHECK-NEXT: [[TMP114:%.*]] = add i32 [[TMP110]], 1 2232; CHECK-NEXT: [[TMP115:%.*]] = select i1 [[TMP113]], i32 [[TMP114]], i32 [[TMP110]] 2233; CHECK-NEXT: [[TMP116:%.*]] = sub i32 [[TMP112]], [[TMP91]] 2234; CHECK-NEXT: [[TMP117:%.*]] = select i1 [[TMP113]], i32 [[TMP116]], i32 [[TMP112]] 2235; CHECK-NEXT: [[TMP118:%.*]] 
= icmp uge i32 [[TMP117]], [[TMP91]] 2236; CHECK-NEXT: [[TMP119:%.*]] = add i32 [[TMP115]], 1 2237; CHECK-NEXT: [[TMP120:%.*]] = select i1 [[TMP118]], i32 [[TMP119]], i32 [[TMP115]] 2238; CHECK-NEXT: [[TMP121:%.*]] = xor i32 [[TMP120]], [[TMP87]] 2239; CHECK-NEXT: [[TMP122:%.*]] = sub i32 [[TMP121]], [[TMP87]] 2240; CHECK-NEXT: [[TMP123:%.*]] = insertelement <4 x i32> [[TMP82]], i32 [[TMP122]], i64 2 2241; CHECK-NEXT: [[TMP124:%.*]] = extractelement <4 x i32> [[X]], i64 3 2242; CHECK-NEXT: [[TMP125:%.*]] = extractelement <4 x i32> [[Y]], i64 3 2243; CHECK-NEXT: [[TMP126:%.*]] = ashr i32 [[TMP124]], 31 2244; CHECK-NEXT: [[TMP127:%.*]] = ashr i32 [[TMP125]], 31 2245; CHECK-NEXT: [[TMP128:%.*]] = xor i32 [[TMP126]], [[TMP127]] 2246; CHECK-NEXT: [[TMP129:%.*]] = add i32 [[TMP124]], [[TMP126]] 2247; CHECK-NEXT: [[TMP130:%.*]] = add i32 [[TMP125]], [[TMP127]] 2248; CHECK-NEXT: [[TMP131:%.*]] = xor i32 [[TMP129]], [[TMP126]] 2249; CHECK-NEXT: [[TMP132:%.*]] = xor i32 [[TMP130]], [[TMP127]] 2250; CHECK-NEXT: [[TMP133:%.*]] = uitofp i32 [[TMP132]] to float 2251; CHECK-NEXT: [[TMP134:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP133]]) 2252; CHECK-NEXT: [[TMP135:%.*]] = fmul fast float [[TMP134]], 0x41EFFFFFC0000000 2253; CHECK-NEXT: [[TMP136:%.*]] = fptoui float [[TMP135]] to i32 2254; CHECK-NEXT: [[TMP137:%.*]] = sub i32 0, [[TMP132]] 2255; CHECK-NEXT: [[TMP138:%.*]] = mul i32 [[TMP137]], [[TMP136]] 2256; CHECK-NEXT: [[TMP139:%.*]] = zext i32 [[TMP136]] to i64 2257; CHECK-NEXT: [[TMP140:%.*]] = zext i32 [[TMP138]] to i64 2258; CHECK-NEXT: [[TMP141:%.*]] = mul i64 [[TMP139]], [[TMP140]] 2259; CHECK-NEXT: [[TMP142:%.*]] = trunc i64 [[TMP141]] to i32 2260; CHECK-NEXT: [[TMP143:%.*]] = lshr i64 [[TMP141]], 32 2261; CHECK-NEXT: [[TMP144:%.*]] = trunc i64 [[TMP143]] to i32 2262; CHECK-NEXT: [[TMP145:%.*]] = add i32 [[TMP136]], [[TMP144]] 2263; CHECK-NEXT: [[TMP146:%.*]] = zext i32 [[TMP131]] to i64 2264; CHECK-NEXT: [[TMP147:%.*]] = zext i32 [[TMP145]] to i64 2265; 
CHECK-NEXT: [[TMP148:%.*]] = mul i64 [[TMP146]], [[TMP147]] 2266; CHECK-NEXT: [[TMP149:%.*]] = trunc i64 [[TMP148]] to i32 2267; CHECK-NEXT: [[TMP150:%.*]] = lshr i64 [[TMP148]], 32 2268; CHECK-NEXT: [[TMP151:%.*]] = trunc i64 [[TMP150]] to i32 2269; CHECK-NEXT: [[TMP152:%.*]] = mul i32 [[TMP151]], [[TMP132]] 2270; CHECK-NEXT: [[TMP153:%.*]] = sub i32 [[TMP131]], [[TMP152]] 2271; CHECK-NEXT: [[TMP154:%.*]] = icmp uge i32 [[TMP153]], [[TMP132]] 2272; CHECK-NEXT: [[TMP155:%.*]] = add i32 [[TMP151]], 1 2273; CHECK-NEXT: [[TMP156:%.*]] = select i1 [[TMP154]], i32 [[TMP155]], i32 [[TMP151]] 2274; CHECK-NEXT: [[TMP157:%.*]] = sub i32 [[TMP153]], [[TMP132]] 2275; CHECK-NEXT: [[TMP158:%.*]] = select i1 [[TMP154]], i32 [[TMP157]], i32 [[TMP153]] 2276; CHECK-NEXT: [[TMP159:%.*]] = icmp uge i32 [[TMP158]], [[TMP132]] 2277; CHECK-NEXT: [[TMP160:%.*]] = add i32 [[TMP156]], 1 2278; CHECK-NEXT: [[TMP161:%.*]] = select i1 [[TMP159]], i32 [[TMP160]], i32 [[TMP156]] 2279; CHECK-NEXT: [[TMP162:%.*]] = xor i32 [[TMP161]], [[TMP128]] 2280; CHECK-NEXT: [[TMP163:%.*]] = sub i32 [[TMP162]], [[TMP128]] 2281; CHECK-NEXT: [[TMP164:%.*]] = insertelement <4 x i32> [[TMP123]], i32 [[TMP163]], i64 3 2282; CHECK-NEXT: store <4 x i32> [[TMP164]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 2283; CHECK-NEXT: ret void 2284; 2285; GFX6-LABEL: sdiv_v4i32: 2286; GFX6: ; %bb.0: 2287; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 2288; GFX6-NEXT: s_mov_b32 s16, 0x4f7ffffe 2289; GFX6-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x9 2290; GFX6-NEXT: s_mov_b32 s15, 0xf000 2291; GFX6-NEXT: s_mov_b32 s14, -1 2292; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2293; GFX6-NEXT: s_ashr_i32 s2, s8, 31 2294; GFX6-NEXT: s_add_i32 s3, s8, s2 2295; GFX6-NEXT: s_xor_b32 s3, s3, s2 2296; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 2297; GFX6-NEXT: s_ashr_i32 s8, s9, 31 2298; GFX6-NEXT: s_add_i32 s0, s9, s8 2299; GFX6-NEXT: s_xor_b32 s9, s0, s8 2300; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 2301; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 2302; 
GFX6-NEXT: s_sub_i32 s1, 0, s3 2303; GFX6-NEXT: s_ashr_i32 s0, s4, 31 2304; GFX6-NEXT: v_mul_f32_e32 v0, s16, v0 2305; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 2306; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 2307; GFX6-NEXT: s_xor_b32 s2, s0, s2 2308; GFX6-NEXT: v_mul_lo_u32 v2, s1, v0 2309; GFX6-NEXT: s_add_i32 s1, s4, s0 2310; GFX6-NEXT: v_mul_f32_e32 v1, s16, v1 2311; GFX6-NEXT: s_xor_b32 s1, s1, s0 2312; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 2313; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 2314; GFX6-NEXT: s_sub_i32 s0, 0, s9 2315; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 2316; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 2317; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 2318; GFX6-NEXT: v_mul_lo_u32 v3, v0, s3 2319; GFX6-NEXT: v_mul_hi_u32 v2, v1, v2 2320; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v0 2321; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s1, v3 2322; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v3 2323; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] 2324; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s3, v3 2325; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] 2326; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v0 2327; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 2328; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 2329; GFX6-NEXT: s_ashr_i32 s0, s5, 31 2330; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 2331; GFX6-NEXT: s_add_i32 s1, s5, s0 2332; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 2333; GFX6-NEXT: s_ashr_i32 s3, s10, 31 2334; GFX6-NEXT: s_xor_b32 s1, s1, s0 2335; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 2336; GFX6-NEXT: s_xor_b32 s2, s0, s8 2337; GFX6-NEXT: s_add_i32 s0, s10, s3 2338; GFX6-NEXT: s_xor_b32 s4, s0, s3 2339; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s4 2340; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 2341; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 2342; GFX6-NEXT: v_mul_lo_u32 v2, v1, s9 2343; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v1 2344; GFX6-NEXT: v_mul_f32_e32 v3, s16, v3 2345; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 2346; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 2347; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2 2348; GFX6-NEXT: 
v_cndmask_b32_e64 v1, v1, v4, s[0:1] 2349; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s9, v2 2350; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] 2351; GFX6-NEXT: s_sub_i32 s0, 0, s4 2352; GFX6-NEXT: v_mul_lo_u32 v5, s0, v3 2353; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v1 2354; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 2355; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 2356; GFX6-NEXT: v_mul_hi_u32 v2, v3, v5 2357; GFX6-NEXT: v_xor_b32_e32 v1, s2, v1 2358; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s2, v1 2359; GFX6-NEXT: s_ashr_i32 s2, s11, 31 2360; GFX6-NEXT: s_ashr_i32 s0, s6, 31 2361; GFX6-NEXT: s_add_i32 s5, s11, s2 2362; GFX6-NEXT: s_add_i32 s1, s6, s0 2363; GFX6-NEXT: s_xor_b32 s5, s5, s2 2364; GFX6-NEXT: s_xor_b32 s1, s1, s0 2365; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 2366; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s5 2367; GFX6-NEXT: v_mul_hi_u32 v2, s1, v2 2368; GFX6-NEXT: s_xor_b32 s3, s0, s3 2369; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v4 2370; GFX6-NEXT: v_mul_lo_u32 v3, v2, s4 2371; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v2 2372; GFX6-NEXT: v_mul_f32_e32 v4, s16, v4 2373; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s1, v3 2374; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 2375; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v3 2376; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] 2377; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s4, v3 2378; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 2379; GFX6-NEXT: s_sub_i32 s0, 0, s5 2380; GFX6-NEXT: v_mul_lo_u32 v5, s0, v4 2381; GFX6-NEXT: s_ashr_i32 s0, s7, 31 2382; GFX6-NEXT: s_add_i32 s1, s7, s0 2383; GFX6-NEXT: s_xor_b32 s1, s1, s0 2384; GFX6-NEXT: v_mul_hi_u32 v5, v4, v5 2385; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v2 2386; GFX6-NEXT: s_xor_b32 s2, s0, s2 2387; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 2388; GFX6-NEXT: v_mul_hi_u32 v4, s1, v4 2389; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v3 2390; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 2391; GFX6-NEXT: v_xor_b32_e32 v2, s3, v2 2392; GFX6-NEXT: v_mul_lo_u32 v3, v4, s5 2393; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 
2394; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v2 2395; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s1, v3 2396; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v3 2397; GFX6-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] 2398; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v3 2399; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[0:1] 2400; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 2401; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 2402; GFX6-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc 2403; GFX6-NEXT: v_xor_b32_e32 v3, s2, v3 2404; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v3 2405; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0 2406; GFX6-NEXT: s_endpgm 2407; 2408; GFX9-LABEL: sdiv_v4i32: 2409; GFX9: ; %bb.0: 2410; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 2411; GFX9-NEXT: s_mov_b32 s15, 0x4f7ffffe 2412; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2413; GFX9-NEXT: v_mov_b32_e32 v4, 0 2414; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2415; GFX9-NEXT: s_ashr_i32 s2, s8, 31 2416; GFX9-NEXT: s_add_i32 s3, s8, s2 2417; GFX9-NEXT: s_xor_b32 s3, s3, s2 2418; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 2419; GFX9-NEXT: s_ashr_i32 s12, s9, 31 2420; GFX9-NEXT: s_add_i32 s9, s9, s12 2421; GFX9-NEXT: s_xor_b32 s9, s9, s12 2422; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 2423; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 2424; GFX9-NEXT: s_sub_i32 s14, 0, s3 2425; GFX9-NEXT: s_ashr_i32 s8, s4, 31 2426; GFX9-NEXT: v_mul_f32_e32 v0, s15, v0 2427; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 2428; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 2429; GFX9-NEXT: s_add_i32 s4, s4, s8 2430; GFX9-NEXT: s_xor_b32 s4, s4, s8 2431; GFX9-NEXT: v_mul_lo_u32 v2, s14, v0 2432; GFX9-NEXT: v_mul_f32_e32 v1, s15, v1 2433; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 2434; GFX9-NEXT: s_sub_i32 s14, 0, s9 2435; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 2436; GFX9-NEXT: s_ashr_i32 s13, s5, 31 2437; GFX9-NEXT: v_mul_lo_u32 v3, s14, v1 2438; GFX9-NEXT: s_add_i32 s5, s5, s13 2439; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 2440; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 2441; GFX9-NEXT: v_mul_hi_u32 v2, 
v1, v3 2442; GFX9-NEXT: s_xor_b32 s5, s5, s13 2443; GFX9-NEXT: s_xor_b32 s2, s8, s2 2444; GFX9-NEXT: v_mul_lo_u32 v3, v0, s3 2445; GFX9-NEXT: v_add_u32_e32 v1, v1, v2 2446; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 2447; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 2448; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3 2449; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 2450; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 2451; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v3 2452; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc 2453; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 2454; GFX9-NEXT: s_ashr_i32 s3, s10, 31 2455; GFX9-NEXT: s_add_i32 s4, s10, s3 2456; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 2457; GFX9-NEXT: s_xor_b32 s4, s4, s3 2458; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 2459; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4 2460; GFX9-NEXT: v_mul_lo_u32 v2, v1, s9 2461; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 2462; GFX9-NEXT: s_ashr_i32 s8, s11, 31 2463; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 2464; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 2465; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 2466; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 2467; GFX9-NEXT: v_mul_f32_e32 v3, s15, v3 2468; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 2469; GFX9-NEXT: v_subrev_u32_e32 v5, s9, v2 2470; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 2471; GFX9-NEXT: s_sub_i32 s5, 0, s4 2472; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 2473; GFX9-NEXT: v_mul_lo_u32 v2, s5, v3 2474; GFX9-NEXT: s_add_i32 s9, s11, s8 2475; GFX9-NEXT: v_add_u32_e32 v5, 1, v1 2476; GFX9-NEXT: s_xor_b32 s9, s9, s8 2477; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 2478; GFX9-NEXT: v_mul_hi_u32 v2, v3, v2 2479; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s9 2480; GFX9-NEXT: s_ashr_i32 s5, s6, 31 2481; GFX9-NEXT: s_add_i32 s6, s6, s5 2482; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 2483; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v5 2484; GFX9-NEXT: s_xor_b32 s6, s6, s5 2485; GFX9-NEXT: v_mul_hi_u32 v2, s6, v2 2486; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 2487; GFX9-NEXT: v_mul_f32_e32 v3, s15, v3 2488; GFX9-NEXT: 
v_cvt_u32_f32_e32 v3, v3 2489; GFX9-NEXT: v_subrev_u32_e32 v0, s2, v0 2490; GFX9-NEXT: s_xor_b32 s2, s13, s12 2491; GFX9-NEXT: v_mul_lo_u32 v5, v2, s4 2492; GFX9-NEXT: v_xor_b32_e32 v1, s2, v1 2493; GFX9-NEXT: v_subrev_u32_e32 v1, s2, v1 2494; GFX9-NEXT: s_xor_b32 s2, s5, s3 2495; GFX9-NEXT: s_sub_i32 s3, 0, s9 2496; GFX9-NEXT: v_mul_lo_u32 v7, s3, v3 2497; GFX9-NEXT: v_sub_u32_e32 v5, s6, v5 2498; GFX9-NEXT: v_add_u32_e32 v6, 1, v2 2499; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v5 2500; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 2501; GFX9-NEXT: v_subrev_u32_e32 v6, s4, v5 2502; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 2503; GFX9-NEXT: v_mul_hi_u32 v6, v3, v7 2504; GFX9-NEXT: s_ashr_i32 s3, s7, 31 2505; GFX9-NEXT: s_add_i32 s5, s7, s3 2506; GFX9-NEXT: s_xor_b32 s5, s5, s3 2507; GFX9-NEXT: v_add_u32_e32 v3, v3, v6 2508; GFX9-NEXT: v_mul_hi_u32 v3, s5, v3 2509; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v5 2510; GFX9-NEXT: v_add_u32_e32 v6, 1, v2 2511; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 2512; GFX9-NEXT: v_mul_lo_u32 v5, v3, s9 2513; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 2514; GFX9-NEXT: v_xor_b32_e32 v2, s2, v2 2515; GFX9-NEXT: v_subrev_u32_e32 v2, s2, v2 2516; GFX9-NEXT: v_sub_u32_e32 v5, s5, v5 2517; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v5 2518; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 2519; GFX9-NEXT: v_subrev_u32_e32 v6, s9, v5 2520; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 2521; GFX9-NEXT: v_add_u32_e32 v6, 1, v3 2522; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v5 2523; GFX9-NEXT: s_xor_b32 s2, s3, s8 2524; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 2525; GFX9-NEXT: v_xor_b32_e32 v3, s2, v3 2526; GFX9-NEXT: v_subrev_u32_e32 v3, s2, v3 2527; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 2528; GFX9-NEXT: s_endpgm 2529; 2530; GFX90A-LABEL: sdiv_v4i32: 2531; GFX90A: ; %bb.0: 2532; GFX90A-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 2533; GFX90A-NEXT: s_mov_b32 s13, 0x4f7ffffe 2534; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2535; GFX90A-NEXT: 
v_mov_b32_e32 v4, 0 2536; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 2537; GFX90A-NEXT: s_ashr_i32 s2, s8, 31 2538; GFX90A-NEXT: s_add_i32 s3, s8, s2 2539; GFX90A-NEXT: s_xor_b32 s3, s3, s2 2540; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 2541; GFX90A-NEXT: s_ashr_i32 s8, s4, 31 2542; GFX90A-NEXT: s_add_i32 s4, s4, s8 2543; GFX90A-NEXT: s_xor_b32 s2, s8, s2 2544; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 2545; GFX90A-NEXT: s_xor_b32 s4, s4, s8 2546; GFX90A-NEXT: s_sub_i32 s8, 0, s3 2547; GFX90A-NEXT: s_ashr_i32 s12, s9, 31 2548; GFX90A-NEXT: v_mul_f32_e32 v0, s13, v0 2549; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 2550; GFX90A-NEXT: v_mul_lo_u32 v1, s8, v0 2551; GFX90A-NEXT: v_mul_hi_u32 v1, v0, v1 2552; GFX90A-NEXT: v_add_u32_e32 v0, v0, v1 2553; GFX90A-NEXT: v_mul_hi_u32 v0, s4, v0 2554; GFX90A-NEXT: v_mul_lo_u32 v1, v0, s3 2555; GFX90A-NEXT: v_sub_u32_e32 v1, s4, v1 2556; GFX90A-NEXT: s_add_i32 s4, s9, s12 2557; GFX90A-NEXT: s_xor_b32 s4, s4, s12 2558; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, s4 2559; GFX90A-NEXT: v_add_u32_e32 v2, 1, v0 2560; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 2561; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 2562; GFX90A-NEXT: v_subrev_u32_e32 v2, s3, v1 2563; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 2564; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 2565; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v3 2566; GFX90A-NEXT: v_add_u32_e32 v2, 1, v0 2567; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 2568; GFX90A-NEXT: v_xor_b32_e32 v0, s2, v0 2569; GFX90A-NEXT: v_mul_f32_e32 v1, s13, v1 2570; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 2571; GFX90A-NEXT: v_subrev_u32_e32 v0, s2, v0 2572; GFX90A-NEXT: s_ashr_i32 s2, s5, 31 2573; GFX90A-NEXT: s_add_i32 s5, s5, s2 2574; GFX90A-NEXT: s_xor_b32 s3, s2, s12 2575; GFX90A-NEXT: s_xor_b32 s2, s5, s2 2576; GFX90A-NEXT: s_sub_i32 s5, 0, s4 2577; GFX90A-NEXT: v_mul_lo_u32 v2, s5, v1 2578; GFX90A-NEXT: v_mul_hi_u32 v2, v1, v2 2579; GFX90A-NEXT: v_add_u32_e32 v1, v1, v2 2580; GFX90A-NEXT: v_mul_hi_u32 v1, s2, v1 2581; GFX90A-NEXT: 
v_mul_lo_u32 v2, v1, s4 2582; GFX90A-NEXT: v_sub_u32_e32 v2, s2, v2 2583; GFX90A-NEXT: s_ashr_i32 s2, s10, 31 2584; GFX90A-NEXT: s_add_i32 s5, s10, s2 2585; GFX90A-NEXT: s_xor_b32 s5, s5, s2 2586; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s5 2587; GFX90A-NEXT: v_add_u32_e32 v3, 1, v1 2588; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 2589; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 2590; GFX90A-NEXT: v_subrev_u32_e32 v3, s4, v2 2591; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc 2592; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 2593; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v5 2594; GFX90A-NEXT: v_add_u32_e32 v3, 1, v1 2595; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 2596; GFX90A-NEXT: v_xor_b32_e32 v1, s3, v1 2597; GFX90A-NEXT: v_mul_f32_e32 v2, s13, v2 2598; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 2599; GFX90A-NEXT: v_subrev_u32_e32 v1, s3, v1 2600; GFX90A-NEXT: s_ashr_i32 s3, s6, 31 2601; GFX90A-NEXT: s_add_i32 s4, s6, s3 2602; GFX90A-NEXT: s_xor_b32 s2, s3, s2 2603; GFX90A-NEXT: s_xor_b32 s3, s4, s3 2604; GFX90A-NEXT: s_sub_i32 s4, 0, s5 2605; GFX90A-NEXT: v_mul_lo_u32 v3, s4, v2 2606; GFX90A-NEXT: v_mul_hi_u32 v3, v2, v3 2607; GFX90A-NEXT: v_add_u32_e32 v2, v2, v3 2608; GFX90A-NEXT: v_mul_hi_u32 v2, s3, v2 2609; GFX90A-NEXT: v_mul_lo_u32 v3, v2, s5 2610; GFX90A-NEXT: v_sub_u32_e32 v3, s3, v3 2611; GFX90A-NEXT: s_ashr_i32 s3, s11, 31 2612; GFX90A-NEXT: s_add_i32 s4, s11, s3 2613; GFX90A-NEXT: s_xor_b32 s4, s4, s3 2614; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s4 2615; GFX90A-NEXT: v_add_u32_e32 v5, 1, v2 2616; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 2617; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 2618; GFX90A-NEXT: v_subrev_u32_e32 v5, s5, v3 2619; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 2620; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 2621; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v6 2622; GFX90A-NEXT: v_add_u32_e32 v5, 1, v2 2623; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 2624; GFX90A-NEXT: v_xor_b32_e32 v2, s2, v2 2625; GFX90A-NEXT: v_mul_f32_e32 v3, s13, v3 
2626; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 2627; GFX90A-NEXT: v_subrev_u32_e32 v2, s2, v2 2628; GFX90A-NEXT: s_ashr_i32 s2, s7, 31 2629; GFX90A-NEXT: s_add_i32 s5, s7, s2 2630; GFX90A-NEXT: s_xor_b32 s3, s2, s3 2631; GFX90A-NEXT: s_xor_b32 s2, s5, s2 2632; GFX90A-NEXT: s_sub_i32 s5, 0, s4 2633; GFX90A-NEXT: v_mul_lo_u32 v5, s5, v3 2634; GFX90A-NEXT: v_mul_hi_u32 v5, v3, v5 2635; GFX90A-NEXT: v_add_u32_e32 v3, v3, v5 2636; GFX90A-NEXT: v_mul_hi_u32 v3, s2, v3 2637; GFX90A-NEXT: v_mul_lo_u32 v5, v3, s4 2638; GFX90A-NEXT: v_sub_u32_e32 v5, s2, v5 2639; GFX90A-NEXT: v_add_u32_e32 v6, 1, v3 2640; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s4, v5 2641; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 2642; GFX90A-NEXT: v_subrev_u32_e32 v6, s4, v5 2643; GFX90A-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 2644; GFX90A-NEXT: v_add_u32_e32 v6, 1, v3 2645; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s4, v5 2646; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 2647; GFX90A-NEXT: v_xor_b32_e32 v3, s3, v3 2648; GFX90A-NEXT: v_subrev_u32_e32 v3, s3, v3 2649; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 2650; GFX90A-NEXT: s_endpgm 2651 %r = sdiv <4 x i32> %x, %y 2652 store <4 x i32> %r, <4 x i32> addrspace(1)* %out 2653 ret void 2654} 2655 2656define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { 2657; CHECK-LABEL: @srem_v4i32( 2658; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[X:%.*]], i64 0 2659; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[Y:%.*]], i64 0 2660; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 2661; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 2662; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]] 2663; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]] 2664; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]] 2665; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]] 2666; CHECK-NEXT: [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float 2667; CHECK-NEXT: [[TMP10:%.*]] = call fast float 
@llvm.amdgcn.rcp.f32(float [[TMP9]]) 2668; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000 2669; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP11]] to i32 2670; CHECK-NEXT: [[TMP13:%.*]] = sub i32 0, [[TMP8]] 2671; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]] 2672; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP12]] to i64 2673; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 2674; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]] 2675; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 2676; CHECK-NEXT: [[TMP19:%.*]] = lshr i64 [[TMP17]], 32 2677; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 2678; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]] 2679; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP7]] to i64 2680; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP21]] to i64 2681; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]] 2682; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32 2683; CHECK-NEXT: [[TMP26:%.*]] = lshr i64 [[TMP24]], 32 2684; CHECK-NEXT: [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32 2685; CHECK-NEXT: [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]] 2686; CHECK-NEXT: [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]] 2687; CHECK-NEXT: [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]] 2688; CHECK-NEXT: [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]] 2689; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP30]], i32 [[TMP31]], i32 [[TMP29]] 2690; CHECK-NEXT: [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]] 2691; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]] 2692; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]] 2693; CHECK-NEXT: [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]] 2694; CHECK-NEXT: [[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]] 2695; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i32> undef, i32 [[TMP37]], i64 0 2696; CHECK-NEXT: [[TMP39:%.*]] = extractelement <4 x i32> [[X]], i64 1 2697; CHECK-NEXT: [[TMP40:%.*]] = extractelement <4 x i32> [[Y]], i64 1 2698; 
CHECK-NEXT: [[TMP41:%.*]] = ashr i32 [[TMP39]], 31 2699; CHECK-NEXT: [[TMP42:%.*]] = ashr i32 [[TMP40]], 31 2700; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]] 2701; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]] 2702; CHECK-NEXT: [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]] 2703; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]] 2704; CHECK-NEXT: [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float 2705; CHECK-NEXT: [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]]) 2706; CHECK-NEXT: [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000 2707; CHECK-NEXT: [[TMP50:%.*]] = fptoui float [[TMP49]] to i32 2708; CHECK-NEXT: [[TMP51:%.*]] = sub i32 0, [[TMP46]] 2709; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]] 2710; CHECK-NEXT: [[TMP53:%.*]] = zext i32 [[TMP50]] to i64 2711; CHECK-NEXT: [[TMP54:%.*]] = zext i32 [[TMP52]] to i64 2712; CHECK-NEXT: [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]] 2713; CHECK-NEXT: [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32 2714; CHECK-NEXT: [[TMP57:%.*]] = lshr i64 [[TMP55]], 32 2715; CHECK-NEXT: [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32 2716; CHECK-NEXT: [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]] 2717; CHECK-NEXT: [[TMP60:%.*]] = zext i32 [[TMP45]] to i64 2718; CHECK-NEXT: [[TMP61:%.*]] = zext i32 [[TMP59]] to i64 2719; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]] 2720; CHECK-NEXT: [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32 2721; CHECK-NEXT: [[TMP64:%.*]] = lshr i64 [[TMP62]], 32 2722; CHECK-NEXT: [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32 2723; CHECK-NEXT: [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]] 2724; CHECK-NEXT: [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]] 2725; CHECK-NEXT: [[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]] 2726; CHECK-NEXT: [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]] 2727; CHECK-NEXT: [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]] 2728; CHECK-NEXT: [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]] 2729; CHECK-NEXT: 
[[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]] 2730; CHECK-NEXT: [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]] 2731; CHECK-NEXT: [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]] 2732; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]] 2733; CHECK-NEXT: [[TMP76:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP75]], i64 1 2734; CHECK-NEXT: [[TMP77:%.*]] = extractelement <4 x i32> [[X]], i64 2 2735; CHECK-NEXT: [[TMP78:%.*]] = extractelement <4 x i32> [[Y]], i64 2 2736; CHECK-NEXT: [[TMP79:%.*]] = ashr i32 [[TMP77]], 31 2737; CHECK-NEXT: [[TMP80:%.*]] = ashr i32 [[TMP78]], 31 2738; CHECK-NEXT: [[TMP81:%.*]] = add i32 [[TMP77]], [[TMP79]] 2739; CHECK-NEXT: [[TMP82:%.*]] = add i32 [[TMP78]], [[TMP80]] 2740; CHECK-NEXT: [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP79]] 2741; CHECK-NEXT: [[TMP84:%.*]] = xor i32 [[TMP82]], [[TMP80]] 2742; CHECK-NEXT: [[TMP85:%.*]] = uitofp i32 [[TMP84]] to float 2743; CHECK-NEXT: [[TMP86:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP85]]) 2744; CHECK-NEXT: [[TMP87:%.*]] = fmul fast float [[TMP86]], 0x41EFFFFFC0000000 2745; CHECK-NEXT: [[TMP88:%.*]] = fptoui float [[TMP87]] to i32 2746; CHECK-NEXT: [[TMP89:%.*]] = sub i32 0, [[TMP84]] 2747; CHECK-NEXT: [[TMP90:%.*]] = mul i32 [[TMP89]], [[TMP88]] 2748; CHECK-NEXT: [[TMP91:%.*]] = zext i32 [[TMP88]] to i64 2749; CHECK-NEXT: [[TMP92:%.*]] = zext i32 [[TMP90]] to i64 2750; CHECK-NEXT: [[TMP93:%.*]] = mul i64 [[TMP91]], [[TMP92]] 2751; CHECK-NEXT: [[TMP94:%.*]] = trunc i64 [[TMP93]] to i32 2752; CHECK-NEXT: [[TMP95:%.*]] = lshr i64 [[TMP93]], 32 2753; CHECK-NEXT: [[TMP96:%.*]] = trunc i64 [[TMP95]] to i32 2754; CHECK-NEXT: [[TMP97:%.*]] = add i32 [[TMP88]], [[TMP96]] 2755; CHECK-NEXT: [[TMP98:%.*]] = zext i32 [[TMP83]] to i64 2756; CHECK-NEXT: [[TMP99:%.*]] = zext i32 [[TMP97]] to i64 2757; CHECK-NEXT: [[TMP100:%.*]] = mul i64 [[TMP98]], [[TMP99]] 2758; CHECK-NEXT: [[TMP101:%.*]] = trunc i64 [[TMP100]] to i32 2759; CHECK-NEXT: [[TMP102:%.*]] = lshr i64 [[TMP100]], 32 
2760; CHECK-NEXT: [[TMP103:%.*]] = trunc i64 [[TMP102]] to i32 2761; CHECK-NEXT: [[TMP104:%.*]] = mul i32 [[TMP103]], [[TMP84]] 2762; CHECK-NEXT: [[TMP105:%.*]] = sub i32 [[TMP83]], [[TMP104]] 2763; CHECK-NEXT: [[TMP106:%.*]] = icmp uge i32 [[TMP105]], [[TMP84]] 2764; CHECK-NEXT: [[TMP107:%.*]] = sub i32 [[TMP105]], [[TMP84]] 2765; CHECK-NEXT: [[TMP108:%.*]] = select i1 [[TMP106]], i32 [[TMP107]], i32 [[TMP105]] 2766; CHECK-NEXT: [[TMP109:%.*]] = icmp uge i32 [[TMP108]], [[TMP84]] 2767; CHECK-NEXT: [[TMP110:%.*]] = sub i32 [[TMP108]], [[TMP84]] 2768; CHECK-NEXT: [[TMP111:%.*]] = select i1 [[TMP109]], i32 [[TMP110]], i32 [[TMP108]] 2769; CHECK-NEXT: [[TMP112:%.*]] = xor i32 [[TMP111]], [[TMP79]] 2770; CHECK-NEXT: [[TMP113:%.*]] = sub i32 [[TMP112]], [[TMP79]] 2771; CHECK-NEXT: [[TMP114:%.*]] = insertelement <4 x i32> [[TMP76]], i32 [[TMP113]], i64 2 2772; CHECK-NEXT: [[TMP115:%.*]] = extractelement <4 x i32> [[X]], i64 3 2773; CHECK-NEXT: [[TMP116:%.*]] = extractelement <4 x i32> [[Y]], i64 3 2774; CHECK-NEXT: [[TMP117:%.*]] = ashr i32 [[TMP115]], 31 2775; CHECK-NEXT: [[TMP118:%.*]] = ashr i32 [[TMP116]], 31 2776; CHECK-NEXT: [[TMP119:%.*]] = add i32 [[TMP115]], [[TMP117]] 2777; CHECK-NEXT: [[TMP120:%.*]] = add i32 [[TMP116]], [[TMP118]] 2778; CHECK-NEXT: [[TMP121:%.*]] = xor i32 [[TMP119]], [[TMP117]] 2779; CHECK-NEXT: [[TMP122:%.*]] = xor i32 [[TMP120]], [[TMP118]] 2780; CHECK-NEXT: [[TMP123:%.*]] = uitofp i32 [[TMP122]] to float 2781; CHECK-NEXT: [[TMP124:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP123]]) 2782; CHECK-NEXT: [[TMP125:%.*]] = fmul fast float [[TMP124]], 0x41EFFFFFC0000000 2783; CHECK-NEXT: [[TMP126:%.*]] = fptoui float [[TMP125]] to i32 2784; CHECK-NEXT: [[TMP127:%.*]] = sub i32 0, [[TMP122]] 2785; CHECK-NEXT: [[TMP128:%.*]] = mul i32 [[TMP127]], [[TMP126]] 2786; CHECK-NEXT: [[TMP129:%.*]] = zext i32 [[TMP126]] to i64 2787; CHECK-NEXT: [[TMP130:%.*]] = zext i32 [[TMP128]] to i64 2788; CHECK-NEXT: [[TMP131:%.*]] = mul i64 [[TMP129]], 
[[TMP130]] 2789; CHECK-NEXT: [[TMP132:%.*]] = trunc i64 [[TMP131]] to i32 2790; CHECK-NEXT: [[TMP133:%.*]] = lshr i64 [[TMP131]], 32 2791; CHECK-NEXT: [[TMP134:%.*]] = trunc i64 [[TMP133]] to i32 2792; CHECK-NEXT: [[TMP135:%.*]] = add i32 [[TMP126]], [[TMP134]] 2793; CHECK-NEXT: [[TMP136:%.*]] = zext i32 [[TMP121]] to i64 2794; CHECK-NEXT: [[TMP137:%.*]] = zext i32 [[TMP135]] to i64 2795; CHECK-NEXT: [[TMP138:%.*]] = mul i64 [[TMP136]], [[TMP137]] 2796; CHECK-NEXT: [[TMP139:%.*]] = trunc i64 [[TMP138]] to i32 2797; CHECK-NEXT: [[TMP140:%.*]] = lshr i64 [[TMP138]], 32 2798; CHECK-NEXT: [[TMP141:%.*]] = trunc i64 [[TMP140]] to i32 2799; CHECK-NEXT: [[TMP142:%.*]] = mul i32 [[TMP141]], [[TMP122]] 2800; CHECK-NEXT: [[TMP143:%.*]] = sub i32 [[TMP121]], [[TMP142]] 2801; CHECK-NEXT: [[TMP144:%.*]] = icmp uge i32 [[TMP143]], [[TMP122]] 2802; CHECK-NEXT: [[TMP145:%.*]] = sub i32 [[TMP143]], [[TMP122]] 2803; CHECK-NEXT: [[TMP146:%.*]] = select i1 [[TMP144]], i32 [[TMP145]], i32 [[TMP143]] 2804; CHECK-NEXT: [[TMP147:%.*]] = icmp uge i32 [[TMP146]], [[TMP122]] 2805; CHECK-NEXT: [[TMP148:%.*]] = sub i32 [[TMP146]], [[TMP122]] 2806; CHECK-NEXT: [[TMP149:%.*]] = select i1 [[TMP147]], i32 [[TMP148]], i32 [[TMP146]] 2807; CHECK-NEXT: [[TMP150:%.*]] = xor i32 [[TMP149]], [[TMP117]] 2808; CHECK-NEXT: [[TMP151:%.*]] = sub i32 [[TMP150]], [[TMP117]] 2809; CHECK-NEXT: [[TMP152:%.*]] = insertelement <4 x i32> [[TMP114]], i32 [[TMP151]], i64 3 2810; CHECK-NEXT: store <4 x i32> [[TMP152]], <4 x i32> addrspace(1)* [[OUT:%.*]], align 16 2811; CHECK-NEXT: ret void 2812; 2813; GFX6-LABEL: srem_v4i32: 2814; GFX6: ; %bb.0: 2815; GFX6-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0xd 2816; GFX6-NEXT: s_mov_b32 s14, 0x4f7ffffe 2817; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 2818; GFX6-NEXT: s_mov_b32 s3, 0xf000 2819; GFX6-NEXT: s_waitcnt lgkmcnt(0) 2820; GFX6-NEXT: s_ashr_i32 s2, s8, 31 2821; GFX6-NEXT: s_add_i32 s8, s8, s2 2822; GFX6-NEXT: s_xor_b32 s8, s8, s2 2823; GFX6-NEXT: v_cvt_f32_u32_e32 v0, 
s8 2824; GFX6-NEXT: s_ashr_i32 s12, s9, 31 2825; GFX6-NEXT: s_add_i32 s9, s9, s12 2826; GFX6-NEXT: s_xor_b32 s9, s9, s12 2827; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 2828; GFX6-NEXT: s_sub_i32 s13, 0, s8 2829; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 2830; GFX6-NEXT: s_ashr_i32 s12, s4, 31 2831; GFX6-NEXT: v_mul_f32_e32 v0, s14, v0 2832; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 2833; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 2834; GFX6-NEXT: s_add_i32 s4, s4, s12 2835; GFX6-NEXT: s_xor_b32 s4, s4, s12 2836; GFX6-NEXT: v_mul_lo_u32 v2, s13, v0 2837; GFX6-NEXT: v_mul_f32_e32 v1, s14, v1 2838; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 2839; GFX6-NEXT: s_sub_i32 s13, 0, s9 2840; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 2841; GFX6-NEXT: s_mov_b32 s2, -1 2842; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 2843; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 2844; GFX6-NEXT: v_mul_lo_u32 v2, s13, v1 2845; GFX6-NEXT: s_ashr_i32 s13, s5, 31 2846; GFX6-NEXT: s_add_i32 s5, s5, s13 2847; GFX6-NEXT: v_mul_lo_u32 v0, v0, s8 2848; GFX6-NEXT: v_mul_hi_u32 v2, v1, v2 2849; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 2850; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 2851; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 2852; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 2853; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 2854; GFX6-NEXT: s_xor_b32 s4, s5, s13 2855; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 2856; GFX6-NEXT: s_ashr_i32 s5, s10, 31 2857; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 2858; GFX6-NEXT: s_add_i32 s8, s10, s5 2859; GFX6-NEXT: s_xor_b32 s5, s8, s5 2860; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 2861; GFX6-NEXT: v_mul_hi_u32 v1, s4, v1 2862; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 2863; GFX6-NEXT: v_xor_b32_e32 v0, s12, v0 2864; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 2865; GFX6-NEXT: v_mul_lo_u32 v1, v1, s9 2866; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 2867; GFX6-NEXT: v_mul_f32_e32 v2, s14, v2 2868; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 2869; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v1 2870; GFX6-NEXT: v_subrev_i32_e32 v3, 
vcc, s9, v1 2871; GFX6-NEXT: s_sub_i32 s4, 0, s5 2872; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 2873; GFX6-NEXT: v_mul_lo_u32 v4, s4, v2 2874; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 2875; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s9, v1 2876; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 2877; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 2878; GFX6-NEXT: v_mul_hi_u32 v3, v2, v4 2879; GFX6-NEXT: s_ashr_i32 s8, s11, 31 2880; GFX6-NEXT: s_add_i32 s9, s11, s8 2881; GFX6-NEXT: s_ashr_i32 s4, s6, 31 2882; GFX6-NEXT: s_xor_b32 s8, s9, s8 2883; GFX6-NEXT: s_add_i32 s6, s6, s4 2884; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 2885; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s8 2886; GFX6-NEXT: s_xor_b32 s6, s6, s4 2887; GFX6-NEXT: v_mul_hi_u32 v2, s6, v2 2888; GFX6-NEXT: v_xor_b32_e32 v1, s13, v1 2889; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v3 2890; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s13, v1 2891; GFX6-NEXT: v_mul_lo_u32 v2, v2, s5 2892; GFX6-NEXT: v_mul_f32_e32 v3, s14, v3 2893; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 2894; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 2895; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s5, v2 2896; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 2897; GFX6-NEXT: s_sub_i32 s6, 0, s8 2898; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 2899; GFX6-NEXT: v_mul_lo_u32 v4, s6, v3 2900; GFX6-NEXT: s_ashr_i32 s6, s7, 31 2901; GFX6-NEXT: s_add_i32 s7, s7, s6 2902; GFX6-NEXT: s_xor_b32 s7, s7, s6 2903; GFX6-NEXT: v_mul_hi_u32 v4, v3, v4 2904; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s5, v2 2905; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 2906; GFX6-NEXT: v_mul_hi_u32 v3, s7, v3 2907; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s5, v2 2908; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 2909; GFX6-NEXT: v_xor_b32_e32 v2, s4, v2 2910; GFX6-NEXT: v_mul_lo_u32 v3, v3, s8 2911; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s4, v2 2912; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s7, v3 2913; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s8, v3 2914; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 2915; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 2916; 
GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s8, v3 2917; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 2918; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 2919; GFX6-NEXT: v_xor_b32_e32 v3, s6, v3 2920; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s6, v3 2921; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 2922; GFX6-NEXT: s_endpgm 2923; 2924; GFX9-LABEL: srem_v4i32: 2925; GFX9: ; %bb.0: 2926; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 2927; GFX9-NEXT: s_mov_b32 s13, 0x4f7ffffe 2928; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 2929; GFX9-NEXT: v_mov_b32_e32 v4, 0 2930; GFX9-NEXT: s_waitcnt lgkmcnt(0) 2931; GFX9-NEXT: s_ashr_i32 s2, s8, 31 2932; GFX9-NEXT: s_add_i32 s8, s8, s2 2933; GFX9-NEXT: s_xor_b32 s2, s8, s2 2934; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 2935; GFX9-NEXT: s_ashr_i32 s3, s9, 31 2936; GFX9-NEXT: s_sub_i32 s12, 0, s2 2937; GFX9-NEXT: s_add_i32 s8, s9, s3 2938; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 2939; GFX9-NEXT: s_xor_b32 s3, s8, s3 2940; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 2941; GFX9-NEXT: s_ashr_i32 s8, s4, 31 2942; GFX9-NEXT: v_mul_f32_e32 v0, s13, v0 2943; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 2944; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 2945; GFX9-NEXT: s_add_i32 s4, s4, s8 2946; GFX9-NEXT: s_xor_b32 s4, s4, s8 2947; GFX9-NEXT: v_mul_lo_u32 v2, s12, v0 2948; GFX9-NEXT: v_mul_f32_e32 v1, s13, v1 2949; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 2950; GFX9-NEXT: s_sub_i32 s12, 0, s3 2951; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 2952; GFX9-NEXT: s_ashr_i32 s9, s5, 31 2953; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1 2954; GFX9-NEXT: s_add_i32 s5, s5, s9 2955; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 2956; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0 2957; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 2958; GFX9-NEXT: s_xor_b32 s5, s5, s9 2959; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2 2960; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 2961; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1 2962; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 2963; GFX9-NEXT: v_subrev_u32_e32 v2, s2, v0 2964; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 2965; 
GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 2966; GFX9-NEXT: v_subrev_u32_e32 v2, s2, v0 2967; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 2968; GFX9-NEXT: s_ashr_i32 s2, s10, 31 2969; GFX9-NEXT: s_add_i32 s4, s10, s2 2970; GFX9-NEXT: s_xor_b32 s2, s4, s2 2971; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 2972; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2 2973; GFX9-NEXT: v_mul_lo_u32 v1, v1, s3 2974; GFX9-NEXT: v_xor_b32_e32 v0, s8, v0 2975; GFX9-NEXT: v_subrev_u32_e32 v0, s8, v0 2976; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 2977; GFX9-NEXT: v_sub_u32_e32 v1, s5, v1 2978; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v1 2979; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 2980; GFX9-NEXT: v_mul_f32_e32 v2, s13, v2 2981; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 2982; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 2983; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v1 2984; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 2985; GFX9-NEXT: s_sub_i32 s3, 0, s2 2986; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 2987; GFX9-NEXT: v_mul_lo_u32 v3, s3, v2 2988; GFX9-NEXT: s_ashr_i32 s3, s11, 31 2989; GFX9-NEXT: s_add_i32 s4, s11, s3 2990; GFX9-NEXT: s_xor_b32 s3, s4, s3 2991; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s3 2992; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 2993; GFX9-NEXT: s_ashr_i32 s4, s6, 31 2994; GFX9-NEXT: s_add_i32 s5, s6, s4 2995; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v5 2996; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 2997; GFX9-NEXT: s_xor_b32 s5, s5, s4 2998; GFX9-NEXT: v_mul_hi_u32 v2, s5, v2 2999; GFX9-NEXT: v_mul_f32_e32 v3, s13, v5 3000; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 3001; GFX9-NEXT: s_sub_i32 s6, 0, s3 3002; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2 3003; GFX9-NEXT: v_xor_b32_e32 v1, s9, v1 3004; GFX9-NEXT: v_mul_lo_u32 v5, s6, v3 3005; GFX9-NEXT: v_subrev_u32_e32 v1, s9, v1 3006; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 3007; GFX9-NEXT: s_ashr_i32 s5, s7, 31 3008; GFX9-NEXT: v_mul_hi_u32 v5, v3, v5 3009; GFX9-NEXT: s_add_i32 s6, s7, s5 3010; GFX9-NEXT: s_xor_b32 s6, s6, s5 3011; GFX9-NEXT: v_subrev_u32_e32 v6, s2, v2 3012; GFX9-NEXT: 
v_add_u32_e32 v3, v3, v5 3013; GFX9-NEXT: v_mul_hi_u32 v3, s6, v3 3014; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 3015; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 3016; GFX9-NEXT: v_subrev_u32_e32 v5, s2, v2 3017; GFX9-NEXT: v_mul_lo_u32 v3, v3, s3 3018; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 3019; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 3020; GFX9-NEXT: v_xor_b32_e32 v2, s4, v2 3021; GFX9-NEXT: v_sub_u32_e32 v3, s6, v3 3022; GFX9-NEXT: v_subrev_u32_e32 v5, s3, v3 3023; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 3024; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 3025; GFX9-NEXT: v_subrev_u32_e32 v5, s3, v3 3026; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 3027; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 3028; GFX9-NEXT: v_xor_b32_e32 v3, s5, v3 3029; GFX9-NEXT: v_subrev_u32_e32 v2, s4, v2 3030; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v3 3031; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 3032; GFX9-NEXT: s_endpgm 3033; 3034; GFX90A-LABEL: srem_v4i32: 3035; GFX90A: ; %bb.0: 3036; GFX90A-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34 3037; GFX90A-NEXT: s_mov_b32 s12, 0x4f7ffffe 3038; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 3039; GFX90A-NEXT: v_mov_b32_e32 v4, 0 3040; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 3041; GFX90A-NEXT: s_ashr_i32 s2, s8, 31 3042; GFX90A-NEXT: s_add_i32 s3, s8, s2 3043; GFX90A-NEXT: s_xor_b32 s2, s3, s2 3044; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s2 3045; GFX90A-NEXT: s_ashr_i32 s8, s9, 31 3046; GFX90A-NEXT: s_add_i32 s9, s9, s8 3047; GFX90A-NEXT: s_xor_b32 s8, s9, s8 3048; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 3049; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s8 3050; GFX90A-NEXT: s_sub_i32 s9, 0, s2 3051; GFX90A-NEXT: s_ashr_i32 s3, s4, 31 3052; GFX90A-NEXT: v_mul_f32_e32 v0, s12, v0 3053; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 3054; GFX90A-NEXT: s_add_i32 s4, s4, s3 3055; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v1 3056; GFX90A-NEXT: s_xor_b32 s4, s4, s3 3057; GFX90A-NEXT: v_mul_lo_u32 v2, s9, v0 3058; GFX90A-NEXT: v_mul_hi_u32 v2, v0, v2 3059; 
GFX90A-NEXT: v_add_u32_e32 v0, v0, v2 3060; GFX90A-NEXT: v_mul_hi_u32 v0, s4, v0 3061; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s2 3062; GFX90A-NEXT: v_sub_u32_e32 v0, s4, v0 3063; GFX90A-NEXT: v_mul_f32_e32 v1, s12, v1 3064; GFX90A-NEXT: v_subrev_u32_e32 v2, s2, v0 3065; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 3066; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 3067; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3068; GFX90A-NEXT: v_subrev_u32_e32 v2, s2, v0 3069; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 3070; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 3071; GFX90A-NEXT: s_sub_i32 s4, 0, s8 3072; GFX90A-NEXT: v_xor_b32_e32 v0, s3, v0 3073; GFX90A-NEXT: s_ashr_i32 s2, s5, 31 3074; GFX90A-NEXT: v_mul_lo_u32 v2, s4, v1 3075; GFX90A-NEXT: v_subrev_u32_e32 v0, s3, v0 3076; GFX90A-NEXT: s_add_i32 s3, s5, s2 3077; GFX90A-NEXT: v_mul_hi_u32 v2, v1, v2 3078; GFX90A-NEXT: s_xor_b32 s3, s3, s2 3079; GFX90A-NEXT: v_add_u32_e32 v1, v1, v2 3080; GFX90A-NEXT: v_mul_hi_u32 v1, s3, v1 3081; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s8 3082; GFX90A-NEXT: v_sub_u32_e32 v1, s3, v1 3083; GFX90A-NEXT: s_ashr_i32 s3, s10, 31 3084; GFX90A-NEXT: s_add_i32 s4, s10, s3 3085; GFX90A-NEXT: v_subrev_u32_e32 v2, s8, v1 3086; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s8, v1 3087; GFX90A-NEXT: s_xor_b32 s3, s4, s3 3088; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 3089; GFX90A-NEXT: v_cvt_f32_u32_e32 v2, s3 3090; GFX90A-NEXT: v_subrev_u32_e32 v3, s8, v1 3091; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s8, v1 3092; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 3093; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v2 3094; GFX90A-NEXT: v_xor_b32_e32 v1, s2, v1 3095; GFX90A-NEXT: s_sub_i32 s5, 0, s3 3096; GFX90A-NEXT: v_subrev_u32_e32 v1, s2, v1 3097; GFX90A-NEXT: v_mul_f32_e32 v2, s12, v2 3098; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 3099; GFX90A-NEXT: s_ashr_i32 s2, s6, 31 3100; GFX90A-NEXT: s_add_i32 s4, s6, s2 3101; GFX90A-NEXT: s_xor_b32 s4, s4, s2 3102; GFX90A-NEXT: v_mul_lo_u32 v3, s5, v2 3103; GFX90A-NEXT: v_mul_hi_u32 v3, v2, v3 
; GFX90A-NEXT: v_add_u32_e32 v2, v2, v3
; GFX90A-NEXT: v_mul_hi_u32 v2, s4, v2
; GFX90A-NEXT: v_mul_lo_u32 v2, v2, s3
; GFX90A-NEXT: v_sub_u32_e32 v2, s4, v2
; GFX90A-NEXT: s_ashr_i32 s4, s11, 31
; GFX90A-NEXT: s_add_i32 s5, s11, s4
; GFX90A-NEXT: v_subrev_u32_e32 v3, s3, v2
; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v2
; GFX90A-NEXT: s_xor_b32 s4, s5, s4
; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, s4
; GFX90A-NEXT: v_subrev_u32_e32 v5, s3, v2
; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v2
; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v3
; GFX90A-NEXT: v_xor_b32_e32 v2, s2, v2
; GFX90A-NEXT: s_sub_i32 s5, 0, s4
; GFX90A-NEXT: v_subrev_u32_e32 v2, s2, v2
; GFX90A-NEXT: v_mul_f32_e32 v3, s12, v3
; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX90A-NEXT: s_ashr_i32 s2, s7, 31
; GFX90A-NEXT: s_add_i32 s3, s7, s2
; GFX90A-NEXT: s_xor_b32 s3, s3, s2
; GFX90A-NEXT: v_mul_lo_u32 v5, s5, v3
; GFX90A-NEXT: v_mul_hi_u32 v5, v3, v5
; GFX90A-NEXT: v_add_u32_e32 v3, v3, v5
; GFX90A-NEXT: v_mul_hi_u32 v3, s3, v3
; GFX90A-NEXT: v_mul_lo_u32 v3, v3, s4
; GFX90A-NEXT: v_sub_u32_e32 v3, s3, v3
; GFX90A-NEXT: v_subrev_u32_e32 v5, s4, v3
; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s4, v3
; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX90A-NEXT: v_subrev_u32_e32 v5, s4, v3
; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s4, v3
; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX90A-NEXT: v_xor_b32_e32 v3, s2, v3
; GFX90A-NEXT: v_subrev_u32_e32 v3, s2, v3
; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX90A-NEXT: s_endpgm
  %r = srem <4 x i32> %x, %y
  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
  ret void
}

; udiv_v4i16 -- unsigned division of two <4 x i16> kernel arguments, with the
; quotient vector stored to %out.
;
; The opt assertions (prefix CHECK, from the -amdgpu-codegenprepare RUN line)
; pin a per-lane expansion: each lane of %x and %y is extracted, zero-extended
; to i32 and converted to float; the quotient estimate is trunc(x * rcp(y));
; one is added when |fmad(-q, y, x)| >= |y|; the result is masked with 65535,
; truncated to i16 and inserted back into the result vector.
;
; The GFX6, GFX9 and GFX90A assertions pin the llc output of the RUN lines for
; tahiti, gfx900 and gfx90a respectively.
;
; Every assertion line below is autogenerated by
; utils/update_llc_test_checks.py; regenerate them instead of editing by hand.
define amdgpu_kernel void @udiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
; CHECK-LABEL: @udiv_v4i16(
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0
; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float
; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float
; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]])
; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]])
; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]]
; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]])
; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32
; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]])
; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]])
; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]]
; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0
; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]]
; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 65535
; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i16> undef, i16 [[TMP19]], i64 0
; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i16> [[X]], i64 1
; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i16> [[Y]], i64 1
; CHECK-NEXT: [[TMP23:%.*]] = zext i16 [[TMP21]] to i32
; CHECK-NEXT: [[TMP24:%.*]] = zext i16 [[TMP22]] to i32
; CHECK-NEXT: [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float
; CHECK-NEXT: [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float
; CHECK-NEXT: [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]])
; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]]
; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]])
; CHECK-NEXT: [[TMP30:%.*]] = fneg fast float [[TMP29]]
; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]])
; CHECK-NEXT: [[TMP32:%.*]] = fptoui float [[TMP29]] to i32
; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]])
; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]])
; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]]
; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0
; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]]
; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 65535
; CHECK-NEXT: [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16
; CHECK-NEXT: [[TMP40:%.*]] = insertelement <4 x i16> [[TMP20]], i16 [[TMP39]], i64 1
; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x i16> [[X]], i64 2
; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x i16> [[Y]], i64 2
; CHECK-NEXT: [[TMP43:%.*]] = zext i16 [[TMP41]] to i32
; CHECK-NEXT: [[TMP44:%.*]] = zext i16 [[TMP42]] to i32
; CHECK-NEXT: [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float
; CHECK-NEXT: [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float
; CHECK-NEXT: [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]])
; CHECK-NEXT: [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]]
; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]])
; CHECK-NEXT: [[TMP50:%.*]] = fneg fast float [[TMP49]]
; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]])
; CHECK-NEXT: [[TMP52:%.*]] = fptoui float [[TMP49]] to i32
; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]])
; CHECK-NEXT: [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]])
; CHECK-NEXT: [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]]
; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0
; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]]
; CHECK-NEXT: [[TMP58:%.*]] = and i32 [[TMP57]], 65535
; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16
; CHECK-NEXT: [[TMP60:%.*]] = insertelement <4 x i16> [[TMP40]], i16 [[TMP59]], i64 2
; CHECK-NEXT: [[TMP61:%.*]] = extractelement <4 x i16> [[X]], i64 3
; CHECK-NEXT: [[TMP62:%.*]] = extractelement <4 x i16> [[Y]], i64 3
; CHECK-NEXT: [[TMP63:%.*]] = zext i16 [[TMP61]] to i32
; CHECK-NEXT: [[TMP64:%.*]] = zext i16 [[TMP62]] to i32
; CHECK-NEXT: [[TMP65:%.*]] = uitofp i32 [[TMP63]] to float
; CHECK-NEXT: [[TMP66:%.*]] = uitofp i32 [[TMP64]] to float
; CHECK-NEXT: [[TMP67:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP66]])
; CHECK-NEXT: [[TMP68:%.*]] = fmul fast float [[TMP65]], [[TMP67]]
; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.trunc.f32(float [[TMP68]])
; CHECK-NEXT: [[TMP70:%.*]] = fneg fast float [[TMP69]]
; CHECK-NEXT: [[TMP71:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP70]], float [[TMP66]], float [[TMP65]])
; CHECK-NEXT: [[TMP72:%.*]] = fptoui float [[TMP69]] to i32
; CHECK-NEXT: [[TMP73:%.*]] = call fast float @llvm.fabs.f32(float [[TMP71]])
; CHECK-NEXT: [[TMP74:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]])
; CHECK-NEXT: [[TMP75:%.*]] = fcmp fast oge float [[TMP73]], [[TMP74]]
; CHECK-NEXT: [[TMP76:%.*]] = select i1 [[TMP75]], i32 1, i32 0
; CHECK-NEXT: [[TMP77:%.*]] = add i32 [[TMP72]], [[TMP76]]
; CHECK-NEXT: [[TMP78:%.*]] = and i32 [[TMP77]], 65535
; CHECK-NEXT: [[TMP79:%.*]] = trunc i32 [[TMP78]] to i16
; CHECK-NEXT: [[TMP80:%.*]] = insertelement <4 x i16> [[TMP60]], i16 [[TMP79]], i64 3
; CHECK-NEXT: store <4 x i16> [[TMP80]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8
; CHECK-NEXT: ret void
;
; GFX6-LABEL: udiv_v4i16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT: s_mov_b32 s8, 0xffff
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_and_b32 s9, s6, s8
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9
; GFX6-NEXT: s_lshr_b32 s9, s4, 16
; GFX6-NEXT: s_and_b32 s4, s4, s8
; GFX6-NEXT: s_lshr_b32 s6, s6, 16
; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4
; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0
; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s6
; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s9
; GFX6-NEXT: s_and_b32 s6, s7, s8
; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2
; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3
; GFX6-NEXT: v_trunc_f32_e32 v2, v2
; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1
; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5
; GFX6-NEXT: v_trunc_f32_e32 v1, v1
; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc
; GFX6-NEXT: v_mad_f32 v2, -v1, v3, v4
; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s6
; GFX6-NEXT: s_lshr_b32 s4, s5, 16
; GFX6-NEXT: s_lshr_b32 s10, s7, 16
; GFX6-NEXT: s_and_b32 s5, s5, s8
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s5
; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4
; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3
; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s10
; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v1, vcc
; GFX6-NEXT: v_mul_f32_e32 v1, v5, v6
; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s4
; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v3
; GFX6-NEXT: v_trunc_f32_e32 v1, v1
; GFX6-NEXT: v_mad_f32 v5, -v1, v4, v5
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4
; GFX6-NEXT: v_mul_f32_e32 v4, v6, v7
; GFX6-NEXT: v_trunc_f32_e32 v4, v4
; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v4
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX6-NEXT: v_mad_f32 v4, -v4, v3, v6
; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v3
; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_and_b32_e32 v1, s8, v1
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_and_b32_e32 v0, s8, v0
; GFX6-NEXT: v_or_b32_e32 v1, v1, v3
; GFX6-NEXT: v_or_b32_e32 v0, v0, v2
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
; GFX9-LABEL: udiv_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s0, 0xffff
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s8, s6, s0
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8
; GFX9-NEXT: s_lshr_b32 s1, s4, 16
; GFX9-NEXT: s_and_b32 s4, s4, s0
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s4
; GFX9-NEXT: s_lshr_b32 s4, s6, 16
; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0
; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s4
; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s1
; GFX9-NEXT: s_and_b32 s1, s7, s0
; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3
; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4
; GFX9-NEXT: v_trunc_f32_e32 v3, v3
; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; GFX9-NEXT: v_mul_f32_e32 v1, v5, v6
; GFX9-NEXT: v_trunc_f32_e32 v1, v1
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
; GFX9-NEXT: v_mad_f32 v3, -v1, v4, v5
; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s1
; GFX9-NEXT: s_lshr_b32 s6, s7, 16
; GFX9-NEXT: s_and_b32 s0, s5, s0
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4
; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s6
; GFX9-NEXT: s_lshr_b32 s8, s5, 16
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX9-NEXT: v_mul_f32_e32 v1, v6, v7
; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s8
; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v4
; GFX9-NEXT: v_trunc_f32_e32 v1, v1
; GFX9-NEXT: v_mad_f32 v6, -v1, v5, v6
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5
; GFX9-NEXT: v_mul_f32_e32 v5, v7, v8
; GFX9-NEXT: v_trunc_f32_e32 v5, v5
; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v5
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: v_mad_f32 v5, -v5, v4, v7
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4
; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc
; GFX9-NEXT: v_and_b32_e32 v1, v5, v1
; GFX9-NEXT: v_and_b32_e32 v0, v5, v0
; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1
; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX90A-LABEL: udiv_v4i16:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX90A-NEXT: s_mov_b32 s0, 0xffff
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_and_b32 s8, s6, s0
; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s8
; GFX90A-NEXT: s_lshr_b32 s1, s4, 16
; GFX90A-NEXT: s_and_b32 s4, s4, s0
; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s4
; GFX90A-NEXT: s_lshr_b32 s4, s6, 16
; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0
; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s4
; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s1
; GFX90A-NEXT: s_and_b32 s1, s7, s0
; GFX90A-NEXT: v_mul_f32_e32 v3, v1, v3
; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v4
; GFX90A-NEXT: v_trunc_f32_e32 v3, v3
; GFX90A-NEXT: v_mad_f32 v1, -v3, v0, v1
; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3
; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; GFX90A-NEXT: v_mul_f32_e32 v1, v5, v6
; GFX90A-NEXT: v_trunc_f32_e32 v1, v1
; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
; GFX90A-NEXT: v_mad_f32 v3, -v1, v4, v5
; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s1
; GFX90A-NEXT: s_lshr_b32 s6, s7, 16
; GFX90A-NEXT: s_and_b32 s0, s5, s0
; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s0
; GFX90A-NEXT: v_rcp_iflag_f32_e32 v7, v5
; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4
; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s6
; GFX90A-NEXT: s_lshr_b32 s8, s5, 16
; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX90A-NEXT: v_mul_f32_e32 v1, v6, v7
; GFX90A-NEXT: v_cvt_f32_u32_e32 v7, s8
; GFX90A-NEXT: v_rcp_iflag_f32_e32 v8, v4
; GFX90A-NEXT: v_trunc_f32_e32 v1, v1
; GFX90A-NEXT: v_mad_f32 v6, -v1, v5, v6
; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5
; GFX90A-NEXT: v_mul_f32_e32 v5, v7, v8
; GFX90A-NEXT: v_trunc_f32_e32 v5, v5
; GFX90A-NEXT: v_cvt_u32_f32_e32 v6, v5
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: v_mad_f32 v5, -v5, v4, v7
; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4
; GFX90A-NEXT: v_mov_b32_e32 v5, 0xffff
; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc
; GFX90A-NEXT: v_and_b32_e32 v1, v5, v1
; GFX90A-NEXT: v_and_b32_e32 v0, v5, v0
; GFX90A-NEXT: v_lshl_or_b32 v1, v4, 16, v1
; GFX90A-NEXT: v_lshl_or_b32 v0, v3, 16, v0
; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX90A-NEXT: s_endpgm
  ; The single vector udiv under test; the assertions above describe what it becomes.
  %r = udiv <4 x i16> %x, %y
  store <4 x i16> %r, <4 x i16> addrspace(1)* %out
  ret void
}

define amdgpu_kernel void @urem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) {
; CHECK-LABEL: @urem_v4i16(
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16>
[[Y:%.*]], i64 0 3420; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 3421; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 3422; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 3423; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 3424; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 3425; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 3426; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 3427; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 3428; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 3429; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 3430; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 3431; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 3432; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 3433; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 3434; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 3435; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]] 3436; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]] 3437; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 65535 3438; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16 3439; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i16> undef, i16 [[TMP21]], i64 0 3440; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i16> [[X]], i64 1 3441; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i16> [[Y]], i64 1 3442; CHECK-NEXT: [[TMP25:%.*]] = zext i16 [[TMP23]] to i32 3443; CHECK-NEXT: [[TMP26:%.*]] = zext i16 [[TMP24]] to i32 3444; CHECK-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float 3445; CHECK-NEXT: [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float 3446; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]]) 3447; CHECK-NEXT: [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]] 3448; 
CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]]) 3449; CHECK-NEXT: [[TMP32:%.*]] = fneg fast float [[TMP31]] 3450; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]]) 3451; CHECK-NEXT: [[TMP34:%.*]] = fptoui float [[TMP31]] to i32 3452; CHECK-NEXT: [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 3453; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]]) 3454; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]] 3455; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0 3456; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]] 3457; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]] 3458; CHECK-NEXT: [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]] 3459; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], 65535 3460; CHECK-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16 3461; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i16> [[TMP22]], i16 [[TMP43]], i64 1 3462; CHECK-NEXT: [[TMP45:%.*]] = extractelement <4 x i16> [[X]], i64 2 3463; CHECK-NEXT: [[TMP46:%.*]] = extractelement <4 x i16> [[Y]], i64 2 3464; CHECK-NEXT: [[TMP47:%.*]] = zext i16 [[TMP45]] to i32 3465; CHECK-NEXT: [[TMP48:%.*]] = zext i16 [[TMP46]] to i32 3466; CHECK-NEXT: [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float 3467; CHECK-NEXT: [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float 3468; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]]) 3469; CHECK-NEXT: [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]] 3470; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]]) 3471; CHECK-NEXT: [[TMP54:%.*]] = fneg fast float [[TMP53]] 3472; CHECK-NEXT: [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]]) 3473; CHECK-NEXT: [[TMP56:%.*]] = fptoui float [[TMP53]] to i32 3474; CHECK-NEXT: [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float 
[[TMP55]]) 3475; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]]) 3476; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]] 3477; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0 3478; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]] 3479; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]] 3480; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]] 3481; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 65535 3482; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16 3483; CHECK-NEXT: [[TMP66:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[TMP65]], i64 2 3484; CHECK-NEXT: [[TMP67:%.*]] = extractelement <4 x i16> [[X]], i64 3 3485; CHECK-NEXT: [[TMP68:%.*]] = extractelement <4 x i16> [[Y]], i64 3 3486; CHECK-NEXT: [[TMP69:%.*]] = zext i16 [[TMP67]] to i32 3487; CHECK-NEXT: [[TMP70:%.*]] = zext i16 [[TMP68]] to i32 3488; CHECK-NEXT: [[TMP71:%.*]] = uitofp i32 [[TMP69]] to float 3489; CHECK-NEXT: [[TMP72:%.*]] = uitofp i32 [[TMP70]] to float 3490; CHECK-NEXT: [[TMP73:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP72]]) 3491; CHECK-NEXT: [[TMP74:%.*]] = fmul fast float [[TMP71]], [[TMP73]] 3492; CHECK-NEXT: [[TMP75:%.*]] = call fast float @llvm.trunc.f32(float [[TMP74]]) 3493; CHECK-NEXT: [[TMP76:%.*]] = fneg fast float [[TMP75]] 3494; CHECK-NEXT: [[TMP77:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP76]], float [[TMP72]], float [[TMP71]]) 3495; CHECK-NEXT: [[TMP78:%.*]] = fptoui float [[TMP75]] to i32 3496; CHECK-NEXT: [[TMP79:%.*]] = call fast float @llvm.fabs.f32(float [[TMP77]]) 3497; CHECK-NEXT: [[TMP80:%.*]] = call fast float @llvm.fabs.f32(float [[TMP72]]) 3498; CHECK-NEXT: [[TMP81:%.*]] = fcmp fast oge float [[TMP79]], [[TMP80]] 3499; CHECK-NEXT: [[TMP82:%.*]] = select i1 [[TMP81]], i32 1, i32 0 3500; CHECK-NEXT: [[TMP83:%.*]] = add i32 [[TMP78]], [[TMP82]] 3501; CHECK-NEXT: [[TMP84:%.*]] = mul i32 [[TMP83]], [[TMP70]] 3502; CHECK-NEXT: [[TMP85:%.*]] = sub i32 
[[TMP69]], [[TMP84]] 3503; CHECK-NEXT: [[TMP86:%.*]] = and i32 [[TMP85]], 65535 3504; CHECK-NEXT: [[TMP87:%.*]] = trunc i32 [[TMP86]] to i16 3505; CHECK-NEXT: [[TMP88:%.*]] = insertelement <4 x i16> [[TMP66]], i16 [[TMP87]], i64 3 3506; CHECK-NEXT: store <4 x i16> [[TMP88]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 3507; CHECK-NEXT: ret void 3508; 3509; GFX6-LABEL: urem_v4i16: 3510; GFX6: ; %bb.0: 3511; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb 3512; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3513; GFX6-NEXT: s_mov_b32 s8, 0xffff 3514; GFX6-NEXT: s_mov_b32 s3, 0xf000 3515; GFX6-NEXT: s_mov_b32 s2, -1 3516; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3517; GFX6-NEXT: s_and_b32 s9, s6, s8 3518; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s9 3519; GFX6-NEXT: s_and_b32 s10, s4, s8 3520; GFX6-NEXT: s_lshr_b32 s11, s6, 16 3521; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s10 3522; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 3523; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s11 3524; GFX6-NEXT: s_lshr_b32 s9, s4, 16 3525; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s9 3526; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 3527; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 3528; GFX6-NEXT: v_trunc_f32_e32 v2, v2 3529; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 3530; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 3531; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3532; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 3533; GFX6-NEXT: v_trunc_f32_e32 v1, v1 3534; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc 3535; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v1 3536; GFX6-NEXT: v_mad_f32 v1, -v1, v3, v4 3537; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 3538; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v3 3539; GFX6-NEXT: s_and_b32 s6, s7, s8 3540; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 3541; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 3542; GFX6-NEXT: s_and_b32 s6, s5, s8 3543; GFX6-NEXT: v_mul_lo_u32 v1, v1, s11 3544; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s6 3545; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 3546; GFX6-NEXT: s_lshr_b32 s12, s7, 16 3547; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s9, v1 3548; GFX6-NEXT: 
v_mul_f32_e32 v1, v3, v4 3549; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s12 3550; GFX6-NEXT: s_lshr_b32 s10, s5, 16 3551; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s10 3552; GFX6-NEXT: v_trunc_f32_e32 v1, v1 3553; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v4 3554; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 3555; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v3 3556; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 3557; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 3558; GFX6-NEXT: v_mul_f32_e32 v2, v6, v7 3559; GFX6-NEXT: v_trunc_f32_e32 v2, v2 3560; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 3561; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 3562; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v6 3563; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 3564; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc 3565; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 3566; GFX6-NEXT: v_mul_lo_u32 v2, v2, s12 3567; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 3568; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 3569; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s10, v2 3570; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 3571; GFX6-NEXT: v_and_b32_e32 v1, s8, v1 3572; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 3573; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 3574; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 3575; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3576; GFX6-NEXT: s_endpgm 3577; 3578; GFX9-LABEL: urem_v4i16: 3579; GFX9: ; %bb.0: 3580; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 3581; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 3582; GFX9-NEXT: s_mov_b32 s0, 0xffff 3583; GFX9-NEXT: v_mov_b32_e32 v2, 0 3584; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3585; GFX9-NEXT: s_and_b32 s8, s6, s0 3586; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 3587; GFX9-NEXT: s_and_b32 s9, s4, s0 3588; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 3589; GFX9-NEXT: s_lshr_b32 s9, s6, 16 3590; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 3591; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s9 3592; GFX9-NEXT: s_lshr_b32 s1, s4, 16 3593; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s1 3594; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 3595; GFX9-NEXT: v_trunc_f32_e32 v3, v3 3596; GFX9-NEXT: 
v_mad_f32 v1, -v3, v0, v1 3597; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 3598; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 3599; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3600; GFX9-NEXT: s_lshr_b32 s10, s7, 16 3601; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 3602; GFX9-NEXT: v_mul_f32_e32 v1, v5, v6 3603; GFX9-NEXT: v_mul_lo_u32 v0, v0, s6 3604; GFX9-NEXT: v_trunc_f32_e32 v1, v1 3605; GFX9-NEXT: s_and_b32 s6, s7, s0 3606; GFX9-NEXT: v_mad_f32 v3, -v1, v4, v5 3607; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s6 3608; GFX9-NEXT: s_and_b32 s0, s5, s0 3609; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 3610; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 3611; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 3612; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s10 3613; GFX9-NEXT: s_lshr_b32 s8, s5, 16 3614; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 3615; GFX9-NEXT: v_mul_f32_e32 v3, v6, v7 3616; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s8 3617; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v4 3618; GFX9-NEXT: v_trunc_f32_e32 v3, v3 3619; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 3620; GFX9-NEXT: v_mad_f32 v6, -v3, v5, v6 3621; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 3622; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 3623; GFX9-NEXT: v_mul_f32_e32 v5, v7, v8 3624; GFX9-NEXT: v_trunc_f32_e32 v5, v5 3625; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v5 3626; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 3627; GFX9-NEXT: v_mad_f32 v5, -v5, v4, v7 3628; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 3629; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc 3630; GFX9-NEXT: v_mul_lo_u32 v1, v1, s9 3631; GFX9-NEXT: v_mul_lo_u32 v3, v3, s7 3632; GFX9-NEXT: v_mul_lo_u32 v4, v4, s10 3633; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 3634; GFX9-NEXT: v_sub_u32_e32 v5, s1, v1 3635; GFX9-NEXT: v_sub_u32_e32 v1, s5, v3 3636; GFX9-NEXT: v_sub_u32_e32 v3, s8, v4 3637; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff 3638; GFX9-NEXT: v_and_b32_e32 v1, v4, v1 3639; GFX9-NEXT: v_and_b32_e32 v0, v4, v0 3640; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 3641; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0 3642; 
GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 3643; GFX9-NEXT: s_endpgm 3644; 3645; GFX90A-LABEL: urem_v4i16: 3646; GFX90A: ; %bb.0: 3647; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 3648; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 3649; GFX90A-NEXT: s_mov_b32 s0, 0xffff 3650; GFX90A-NEXT: v_mov_b32_e32 v2, 0 3651; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 3652; GFX90A-NEXT: s_and_b32 s8, s6, s0 3653; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s8 3654; GFX90A-NEXT: s_and_b32 s9, s4, s0 3655; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s9 3656; GFX90A-NEXT: s_lshr_b32 s9, s6, 16 3657; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 3658; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s9 3659; GFX90A-NEXT: s_lshr_b32 s1, s4, 16 3660; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s1 3661; GFX90A-NEXT: v_mul_f32_e32 v3, v1, v3 3662; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 3663; GFX90A-NEXT: v_mad_f32 v1, -v3, v0, v1 3664; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 3665; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v4 3666; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 3667; GFX90A-NEXT: s_lshr_b32 s10, s7, 16 3668; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 3669; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s6 3670; GFX90A-NEXT: v_mul_f32_e32 v1, v5, v6 3671; GFX90A-NEXT: v_sub_u32_e32 v0, s4, v0 3672; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 3673; GFX90A-NEXT: s_and_b32 s4, s7, s0 3674; GFX90A-NEXT: v_mad_f32 v3, -v1, v4, v5 3675; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s4 3676; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 3677; GFX90A-NEXT: s_and_b32 s0, s5, s0 3678; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s0 3679; GFX90A-NEXT: v_rcp_iflag_f32_e32 v7, v5 3680; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 3681; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s10 3682; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 3683; GFX90A-NEXT: s_lshr_b32 s8, s5, 16 3684; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s9 3685; GFX90A-NEXT: v_sub_u32_e32 v3, s1, v1 3686; GFX90A-NEXT: v_mul_f32_e32 v1, v6, v7 3687; GFX90A-NEXT: v_cvt_f32_u32_e32 v7, s8 3688; GFX90A-NEXT: 
v_rcp_iflag_f32_e32 v8, v4 3689; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 3690; GFX90A-NEXT: v_mad_f32 v6, -v1, v5, v6 3691; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 3692; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 3693; GFX90A-NEXT: v_mul_f32_e32 v5, v7, v8 3694; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 3695; GFX90A-NEXT: v_cvt_u32_f32_e32 v6, v5 3696; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 3697; GFX90A-NEXT: v_mad_f32 v5, -v5, v4, v7 3698; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 3699; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s7 3700; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc 3701; GFX90A-NEXT: v_sub_u32_e32 v1, s5, v1 3702; GFX90A-NEXT: v_mul_lo_u32 v4, v4, s10 3703; GFX90A-NEXT: v_mov_b32_e32 v5, 0xffff 3704; GFX90A-NEXT: v_sub_u32_e32 v4, s8, v4 3705; GFX90A-NEXT: v_and_b32_e32 v1, v5, v1 3706; GFX90A-NEXT: v_and_b32_e32 v0, v5, v0 3707; GFX90A-NEXT: v_lshl_or_b32 v1, v4, 16, v1 3708; GFX90A-NEXT: v_lshl_or_b32 v0, v3, 16, v0 3709; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 3710; GFX90A-NEXT: s_endpgm 3711 %r = urem <4 x i16> %x, %y 3712 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 3713 ret void 3714} 3715; Signed <4 x i16> division test. The opt checks expect each lane to be sign-extended to i32, divided through a float reciprocal with a sign-derived correction, and narrowed back to i16, while the llc checks pin the tahiti, gfx900 and gfx90a code. 3716define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 3717; CHECK-LABEL: @sdiv_v4i16( 3718; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 3719; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 3720; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 3721; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 3722; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 3723; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 3724; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 3725; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 3726; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 3727; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 3728; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]]) 
3729; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 3730; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 3731; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 3732; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 3733; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 3734; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 3735; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 3736; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 3737; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 3738; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 16 3739; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 16 3740; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16 3741; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x i16> undef, i16 [[TMP23]], i64 0 3742; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i16> [[X]], i64 1 3743; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i16> [[Y]], i64 1 3744; CHECK-NEXT: [[TMP27:%.*]] = sext i16 [[TMP25]] to i32 3745; CHECK-NEXT: [[TMP28:%.*]] = sext i16 [[TMP26]] to i32 3746; CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]] 3747; CHECK-NEXT: [[TMP30:%.*]] = ashr i32 [[TMP29]], 30 3748; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 1 3749; CHECK-NEXT: [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float 3750; CHECK-NEXT: [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float 3751; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 3752; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]] 3753; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]]) 3754; CHECK-NEXT: [[TMP37:%.*]] = fneg fast float [[TMP36]] 3755; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]]) 3756; CHECK-NEXT: [[TMP39:%.*]] = 
fptosi float [[TMP36]] to i32 3757; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]]) 3758; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 3759; CHECK-NEXT: [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]] 3760; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0 3761; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]] 3762; CHECK-NEXT: [[TMP45:%.*]] = shl i32 [[TMP44]], 16 3763; CHECK-NEXT: [[TMP46:%.*]] = ashr i32 [[TMP45]], 16 3764; CHECK-NEXT: [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16 3765; CHECK-NEXT: [[TMP48:%.*]] = insertelement <4 x i16> [[TMP24]], i16 [[TMP47]], i64 1 3766; CHECK-NEXT: [[TMP49:%.*]] = extractelement <4 x i16> [[X]], i64 2 3767; CHECK-NEXT: [[TMP50:%.*]] = extractelement <4 x i16> [[Y]], i64 2 3768; CHECK-NEXT: [[TMP51:%.*]] = sext i16 [[TMP49]] to i32 3769; CHECK-NEXT: [[TMP52:%.*]] = sext i16 [[TMP50]] to i32 3770; CHECK-NEXT: [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]] 3771; CHECK-NEXT: [[TMP54:%.*]] = ashr i32 [[TMP53]], 30 3772; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 1 3773; CHECK-NEXT: [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float 3774; CHECK-NEXT: [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float 3775; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]]) 3776; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]] 3777; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]]) 3778; CHECK-NEXT: [[TMP61:%.*]] = fneg fast float [[TMP60]] 3779; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]]) 3780; CHECK-NEXT: [[TMP63:%.*]] = fptosi float [[TMP60]] to i32 3781; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]]) 3782; CHECK-NEXT: [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]]) 3783; CHECK-NEXT: [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]] 3784; CHECK-NEXT: 
[[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0 3785; CHECK-NEXT: [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]] 3786; CHECK-NEXT: [[TMP69:%.*]] = shl i32 [[TMP68]], 16 3787; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 16 3788; CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16 3789; CHECK-NEXT: [[TMP72:%.*]] = insertelement <4 x i16> [[TMP48]], i16 [[TMP71]], i64 2 3790; CHECK-NEXT: [[TMP73:%.*]] = extractelement <4 x i16> [[X]], i64 3 3791; CHECK-NEXT: [[TMP74:%.*]] = extractelement <4 x i16> [[Y]], i64 3 3792; CHECK-NEXT: [[TMP75:%.*]] = sext i16 [[TMP73]] to i32 3793; CHECK-NEXT: [[TMP76:%.*]] = sext i16 [[TMP74]] to i32 3794; CHECK-NEXT: [[TMP77:%.*]] = xor i32 [[TMP75]], [[TMP76]] 3795; CHECK-NEXT: [[TMP78:%.*]] = ashr i32 [[TMP77]], 30 3796; CHECK-NEXT: [[TMP79:%.*]] = or i32 [[TMP78]], 1 3797; CHECK-NEXT: [[TMP80:%.*]] = sitofp i32 [[TMP75]] to float 3798; CHECK-NEXT: [[TMP81:%.*]] = sitofp i32 [[TMP76]] to float 3799; CHECK-NEXT: [[TMP82:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP81]]) 3800; CHECK-NEXT: [[TMP83:%.*]] = fmul fast float [[TMP80]], [[TMP82]] 3801; CHECK-NEXT: [[TMP84:%.*]] = call fast float @llvm.trunc.f32(float [[TMP83]]) 3802; CHECK-NEXT: [[TMP85:%.*]] = fneg fast float [[TMP84]] 3803; CHECK-NEXT: [[TMP86:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP85]], float [[TMP81]], float [[TMP80]]) 3804; CHECK-NEXT: [[TMP87:%.*]] = fptosi float [[TMP84]] to i32 3805; CHECK-NEXT: [[TMP88:%.*]] = call fast float @llvm.fabs.f32(float [[TMP86]]) 3806; CHECK-NEXT: [[TMP89:%.*]] = call fast float @llvm.fabs.f32(float [[TMP81]]) 3807; CHECK-NEXT: [[TMP90:%.*]] = fcmp fast oge float [[TMP88]], [[TMP89]] 3808; CHECK-NEXT: [[TMP91:%.*]] = select i1 [[TMP90]], i32 [[TMP79]], i32 0 3809; CHECK-NEXT: [[TMP92:%.*]] = add i32 [[TMP87]], [[TMP91]] 3810; CHECK-NEXT: [[TMP93:%.*]] = shl i32 [[TMP92]], 16 3811; CHECK-NEXT: [[TMP94:%.*]] = ashr i32 [[TMP93]], 16 3812; CHECK-NEXT: [[TMP95:%.*]] = trunc i32 [[TMP94]] to i16 
3813; CHECK-NEXT: [[TMP96:%.*]] = insertelement <4 x i16> [[TMP72]], i16 [[TMP95]], i64 3 3814; CHECK-NEXT: store <4 x i16> [[TMP96]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 3815; CHECK-NEXT: ret void 3816; 3817; GFX6-LABEL: sdiv_v4i16: 3818; GFX6: ; %bb.0: 3819; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb 3820; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 3821; GFX6-NEXT: s_mov_b32 s3, 0xf000 3822; GFX6-NEXT: s_mov_b32 s2, -1 3823; GFX6-NEXT: s_waitcnt lgkmcnt(0) 3824; GFX6-NEXT: s_sext_i32_i16 s8, s6 3825; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s8 3826; GFX6-NEXT: s_sext_i32_i16 s9, s4 3827; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9 3828; GFX6-NEXT: s_xor_b32 s8, s9, s8 3829; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 3830; GFX6-NEXT: s_ashr_i32 s6, s6, 16 3831; GFX6-NEXT: s_ashr_i32 s8, s8, 30 3832; GFX6-NEXT: s_or_b32 s8, s8, 1 3833; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 3834; GFX6-NEXT: v_trunc_f32_e32 v2, v2 3835; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 3836; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 3837; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 3838; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s6 3839; GFX6-NEXT: v_mov_b32_e32 v3, s8 3840; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 3841; GFX6-NEXT: s_ashr_i32 s4, s4, 16 3842; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 3843; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 3844; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 3845; GFX6-NEXT: s_xor_b32 s4, s4, s6 3846; GFX6-NEXT: s_ashr_i32 s4, s4, 30 3847; GFX6-NEXT: s_or_b32 s4, s4, 1 3848; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 3849; GFX6-NEXT: v_trunc_f32_e32 v3, v3 3850; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 3851; GFX6-NEXT: v_mov_b32_e32 v4, s4 3852; GFX6-NEXT: s_sext_i32_i16 s4, s7 3853; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 3854; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| 3855; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 3856; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc 3857; GFX6-NEXT: s_sext_i32_i16 s6, s5 3858; GFX6-NEXT: v_add_i32_e32 v3, vcc, v1, v3 3859; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s6 3860; GFX6-NEXT: 
v_rcp_iflag_f32_e32 v4, v2 3861; GFX6-NEXT: s_xor_b32 s4, s6, s4 3862; GFX6-NEXT: s_ashr_i32 s4, s4, 30 3863; GFX6-NEXT: s_or_b32 s4, s4, 1 3864; GFX6-NEXT: v_mul_f32_e32 v4, v1, v4 3865; GFX6-NEXT: v_trunc_f32_e32 v4, v4 3866; GFX6-NEXT: v_mad_f32 v1, -v4, v2, v1 3867; GFX6-NEXT: v_mov_b32_e32 v5, s4 3868; GFX6-NEXT: s_ashr_i32 s4, s7, 16 3869; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 3870; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2| 3871; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 3872; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc 3873; GFX6-NEXT: s_ashr_i32 s5, s5, 16 3874; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 3875; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s5 3876; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 3877; GFX6-NEXT: s_xor_b32 s4, s5, s4 3878; GFX6-NEXT: s_ashr_i32 s4, s4, 30 3879; GFX6-NEXT: s_or_b32 s4, s4, 1 3880; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 3881; GFX6-NEXT: v_trunc_f32_e32 v5, v5 3882; GFX6-NEXT: v_mad_f32 v4, -v5, v2, v4 3883; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 3884; GFX6-NEXT: v_mov_b32_e32 v6, s4 3885; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| 3886; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc 3887; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 3888; GFX6-NEXT: s_mov_b32 s4, 0xffff 3889; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 3890; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 3891; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 3892; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 3893; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 3894; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 3895; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 3896; GFX6-NEXT: s_endpgm 3897; 3898; GFX9-LABEL: sdiv_v4i16: 3899; GFX9: ; %bb.0: 3900; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 3901; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 3902; GFX9-NEXT: v_mov_b32_e32 v2, 0 3903; GFX9-NEXT: s_waitcnt lgkmcnt(0) 3904; GFX9-NEXT: s_sext_i32_i16 s0, s6 3905; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 3906; GFX9-NEXT: s_sext_i32_i16 s1, s4 3907; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s1 3908; GFX9-NEXT: s_xor_b32 s0, s1, s0 3909; 
GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 3910; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3911; GFX9-NEXT: s_or_b32 s8, s0, 1 3912; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 3913; GFX9-NEXT: v_trunc_f32_e32 v3, v3 3914; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 3915; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 3916; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 3917; GFX9-NEXT: s_cselect_b32 s0, s8, 0 3918; GFX9-NEXT: s_ashr_i32 s1, s6, 16 3919; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 3920; GFX9-NEXT: s_ashr_i32 s4, s4, 16 3921; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s4 3922; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 3923; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 3924; GFX9-NEXT: v_add_u32_e32 v3, s0, v3 3925; GFX9-NEXT: v_mul_f32_e32 v4, v1, v4 3926; GFX9-NEXT: s_xor_b32 s0, s4, s1 3927; GFX9-NEXT: v_trunc_f32_e32 v4, v4 3928; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3929; GFX9-NEXT: v_mad_f32 v1, -v4, v0, v1 3930; GFX9-NEXT: s_or_b32 s4, s0, 1 3931; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 3932; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 3933; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 3934; GFX9-NEXT: s_sext_i32_i16 s1, s7 3935; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 3936; GFX9-NEXT: s_cselect_b32 s0, s4, 0 3937; GFX9-NEXT: v_add_u32_e32 v4, s0, v4 3938; GFX9-NEXT: s_sext_i32_i16 s0, s5 3939; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s0 3940; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 3941; GFX9-NEXT: s_xor_b32 s0, s0, s1 3942; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3943; GFX9-NEXT: s_or_b32 s4, s0, 1 3944; GFX9-NEXT: v_mul_f32_e32 v5, v1, v5 3945; GFX9-NEXT: v_trunc_f32_e32 v5, v5 3946; GFX9-NEXT: v_mad_f32 v1, -v5, v0, v1 3947; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 3948; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 3949; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 3950; GFX9-NEXT: s_cselect_b32 s0, s4, 0 3951; GFX9-NEXT: s_ashr_i32 s1, s7, 16 3952; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 3953; GFX9-NEXT: v_add_u32_e32 v1, s0, v5 3954; GFX9-NEXT: s_ashr_i32 s0, s5, 16 3955; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 3956; GFX9-NEXT: 
v_rcp_iflag_f32_e32 v6, v0 3957; GFX9-NEXT: s_xor_b32 s0, s0, s1 3958; GFX9-NEXT: s_ashr_i32 s0, s0, 30 3959; GFX9-NEXT: s_or_b32 s4, s0, 1 3960; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 3961; GFX9-NEXT: v_trunc_f32_e32 v6, v6 3962; GFX9-NEXT: v_mad_f32 v5, -v6, v0, v5 3963; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 3964; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v0| 3965; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 3966; GFX9-NEXT: s_cselect_b32 s0, s4, 0 3967; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff 3968; GFX9-NEXT: v_add_u32_e32 v0, s0, v6 3969; GFX9-NEXT: v_and_b32_e32 v1, v5, v1 3970; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1 3971; GFX9-NEXT: v_and_b32_e32 v0, v5, v3 3972; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0 3973; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 3974; GFX9-NEXT: s_endpgm 3975; 3976; GFX90A-LABEL: sdiv_v4i16: 3977; GFX90A: ; %bb.0: 3978; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 3979; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 3980; GFX90A-NEXT: v_mov_b32_e32 v2, 0 3981; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 3982; GFX90A-NEXT: s_sext_i32_i16 s0, s6 3983; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s0 3984; GFX90A-NEXT: s_sext_i32_i16 s1, s4 3985; GFX90A-NEXT: v_cvt_f32_i32_e32 v1, s1 3986; GFX90A-NEXT: s_xor_b32 s0, s1, s0 3987; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 3988; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 3989; GFX90A-NEXT: s_or_b32 s8, s0, 1 3990; GFX90A-NEXT: v_mul_f32_e32 v3, v1, v3 3991; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 3992; GFX90A-NEXT: v_mad_f32 v1, -v3, v0, v1 3993; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 3994; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 3995; GFX90A-NEXT: s_cselect_b32 s0, s8, 0 3996; GFX90A-NEXT: s_ashr_i32 s1, s6, 16 3997; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s1 3998; GFX90A-NEXT: s_ashr_i32 s4, s4, 16 3999; GFX90A-NEXT: v_cvt_f32_i32_e32 v1, s4 4000; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 4001; GFX90A-NEXT: v_rcp_iflag_f32_e32 v4, v0 4002; GFX90A-NEXT: v_add_u32_e32 v3, s0, v3 4003; GFX90A-NEXT: v_mul_f32_e32 v4, 
v1, v4 4004; GFX90A-NEXT: s_xor_b32 s0, s4, s1 4005; GFX90A-NEXT: v_trunc_f32_e32 v4, v4 4006; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 4007; GFX90A-NEXT: v_mad_f32 v1, -v4, v0, v1 4008; GFX90A-NEXT: s_or_b32 s4, s0, 1 4009; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 4010; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 4011; GFX90A-NEXT: v_cvt_i32_f32_e32 v4, v4 4012; GFX90A-NEXT: s_sext_i32_i16 s1, s7 4013; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s1 4014; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 4015; GFX90A-NEXT: v_add_u32_e32 v4, s0, v4 4016; GFX90A-NEXT: s_sext_i32_i16 s0, s5 4017; GFX90A-NEXT: v_cvt_f32_i32_e32 v1, s0 4018; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v0 4019; GFX90A-NEXT: s_xor_b32 s0, s0, s1 4020; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 4021; GFX90A-NEXT: s_or_b32 s4, s0, 1 4022; GFX90A-NEXT: v_mul_f32_e32 v5, v1, v5 4023; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 4024; GFX90A-NEXT: v_mad_f32 v1, -v5, v0, v1 4025; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 4026; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 4027; GFX90A-NEXT: v_cvt_i32_f32_e32 v5, v5 4028; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 4029; GFX90A-NEXT: s_ashr_i32 s1, s7, 16 4030; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s1 4031; GFX90A-NEXT: v_add_u32_e32 v1, s0, v5 4032; GFX90A-NEXT: s_ashr_i32 s0, s5, 16 4033; GFX90A-NEXT: v_cvt_f32_i32_e32 v5, s0 4034; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v0 4035; GFX90A-NEXT: s_xor_b32 s0, s0, s1 4036; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 4037; GFX90A-NEXT: s_or_b32 s4, s0, 1 4038; GFX90A-NEXT: v_mul_f32_e32 v6, v5, v6 4039; GFX90A-NEXT: v_trunc_f32_e32 v6, v6 4040; GFX90A-NEXT: v_mad_f32 v5, -v6, v0, v5 4041; GFX90A-NEXT: v_cvt_i32_f32_e32 v6, v6 4042; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v0| 4043; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 4044; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 4045; GFX90A-NEXT: v_mov_b32_e32 v5, 0xffff 4046; GFX90A-NEXT: v_add_u32_e32 v0, s0, v6 4047; GFX90A-NEXT: v_and_b32_e32 v1, v5, v1 4048; GFX90A-NEXT: v_lshl_or_b32 v1, v0, 16, v1 4049; 
GFX90A-NEXT: v_and_b32_e32 v0, v5, v3 4050; GFX90A-NEXT: v_lshl_or_b32 v0, v4, 16, v0 4051; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 4052; GFX90A-NEXT: s_endpgm 4053 %r = sdiv <4 x i16> %x, %y 4054 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 4055 ret void 4056} 4057 4058define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x, <4 x i16> %y) { 4059; CHECK-LABEL: @srem_v4i16( 4060; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[X:%.*]], i64 0 4061; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i16> [[Y:%.*]], i64 0 4062; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 4063; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 4064; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 4065; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 4066; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 4067; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 4068; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 4069; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 4070; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 4071; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 4072; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 4073; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 4074; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 4075; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 4076; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 4077; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 4078; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 4079; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 4080; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]] 4081; CHECK-NEXT: [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]] 4082; CHECK-NEXT: [[TMP23:%.*]] = 
shl i32 [[TMP22]], 16 4083; CHECK-NEXT: [[TMP24:%.*]] = ashr i32 [[TMP23]], 16 4084; CHECK-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16 4085; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i16> undef, i16 [[TMP25]], i64 0 4086; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i16> [[X]], i64 1 4087; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i16> [[Y]], i64 1 4088; CHECK-NEXT: [[TMP29:%.*]] = sext i16 [[TMP27]] to i32 4089; CHECK-NEXT: [[TMP30:%.*]] = sext i16 [[TMP28]] to i32 4090; CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]] 4091; CHECK-NEXT: [[TMP32:%.*]] = ashr i32 [[TMP31]], 30 4092; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP32]], 1 4093; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float 4094; CHECK-NEXT: [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float 4095; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 4096; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]] 4097; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]]) 4098; CHECK-NEXT: [[TMP39:%.*]] = fneg fast float [[TMP38]] 4099; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]]) 4100; CHECK-NEXT: [[TMP41:%.*]] = fptosi float [[TMP38]] to i32 4101; CHECK-NEXT: [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]]) 4102; CHECK-NEXT: [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]]) 4103; CHECK-NEXT: [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]] 4104; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0 4105; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]] 4106; CHECK-NEXT: [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]] 4107; CHECK-NEXT: [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]] 4108; CHECK-NEXT: [[TMP49:%.*]] = shl i32 [[TMP48]], 16 4109; CHECK-NEXT: [[TMP50:%.*]] = ashr i32 [[TMP49]], 16 4110; CHECK-NEXT: [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16 4111; CHECK-NEXT: [[TMP52:%.*]] = 
insertelement <4 x i16> [[TMP26]], i16 [[TMP51]], i64 1 4112; CHECK-NEXT: [[TMP53:%.*]] = extractelement <4 x i16> [[X]], i64 2 4113; CHECK-NEXT: [[TMP54:%.*]] = extractelement <4 x i16> [[Y]], i64 2 4114; CHECK-NEXT: [[TMP55:%.*]] = sext i16 [[TMP53]] to i32 4115; CHECK-NEXT: [[TMP56:%.*]] = sext i16 [[TMP54]] to i32 4116; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]] 4117; CHECK-NEXT: [[TMP58:%.*]] = ashr i32 [[TMP57]], 30 4118; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 1 4119; CHECK-NEXT: [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float 4120; CHECK-NEXT: [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float 4121; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]]) 4122; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]] 4123; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]]) 4124; CHECK-NEXT: [[TMP65:%.*]] = fneg fast float [[TMP64]] 4125; CHECK-NEXT: [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]]) 4126; CHECK-NEXT: [[TMP67:%.*]] = fptosi float [[TMP64]] to i32 4127; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 4128; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]]) 4129; CHECK-NEXT: [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]] 4130; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0 4131; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]] 4132; CHECK-NEXT: [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]] 4133; CHECK-NEXT: [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]] 4134; CHECK-NEXT: [[TMP75:%.*]] = shl i32 [[TMP74]], 16 4135; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 [[TMP75]], 16 4136; CHECK-NEXT: [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16 4137; CHECK-NEXT: [[TMP78:%.*]] = insertelement <4 x i16> [[TMP52]], i16 [[TMP77]], i64 2 4138; CHECK-NEXT: [[TMP79:%.*]] = extractelement <4 x i16> [[X]], i64 3 4139; CHECK-NEXT: [[TMP80:%.*]] = 
extractelement <4 x i16> [[Y]], i64 3 4140; CHECK-NEXT: [[TMP81:%.*]] = sext i16 [[TMP79]] to i32 4141; CHECK-NEXT: [[TMP82:%.*]] = sext i16 [[TMP80]] to i32 4142; CHECK-NEXT: [[TMP83:%.*]] = xor i32 [[TMP81]], [[TMP82]] 4143; CHECK-NEXT: [[TMP84:%.*]] = ashr i32 [[TMP83]], 30 4144; CHECK-NEXT: [[TMP85:%.*]] = or i32 [[TMP84]], 1 4145; CHECK-NEXT: [[TMP86:%.*]] = sitofp i32 [[TMP81]] to float 4146; CHECK-NEXT: [[TMP87:%.*]] = sitofp i32 [[TMP82]] to float 4147; CHECK-NEXT: [[TMP88:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP87]]) 4148; CHECK-NEXT: [[TMP89:%.*]] = fmul fast float [[TMP86]], [[TMP88]] 4149; CHECK-NEXT: [[TMP90:%.*]] = call fast float @llvm.trunc.f32(float [[TMP89]]) 4150; CHECK-NEXT: [[TMP91:%.*]] = fneg fast float [[TMP90]] 4151; CHECK-NEXT: [[TMP92:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP91]], float [[TMP87]], float [[TMP86]]) 4152; CHECK-NEXT: [[TMP93:%.*]] = fptosi float [[TMP90]] to i32 4153; CHECK-NEXT: [[TMP94:%.*]] = call fast float @llvm.fabs.f32(float [[TMP92]]) 4154; CHECK-NEXT: [[TMP95:%.*]] = call fast float @llvm.fabs.f32(float [[TMP87]]) 4155; CHECK-NEXT: [[TMP96:%.*]] = fcmp fast oge float [[TMP94]], [[TMP95]] 4156; CHECK-NEXT: [[TMP97:%.*]] = select i1 [[TMP96]], i32 [[TMP85]], i32 0 4157; CHECK-NEXT: [[TMP98:%.*]] = add i32 [[TMP93]], [[TMP97]] 4158; CHECK-NEXT: [[TMP99:%.*]] = mul i32 [[TMP98]], [[TMP82]] 4159; CHECK-NEXT: [[TMP100:%.*]] = sub i32 [[TMP81]], [[TMP99]] 4160; CHECK-NEXT: [[TMP101:%.*]] = shl i32 [[TMP100]], 16 4161; CHECK-NEXT: [[TMP102:%.*]] = ashr i32 [[TMP101]], 16 4162; CHECK-NEXT: [[TMP103:%.*]] = trunc i32 [[TMP102]] to i16 4163; CHECK-NEXT: [[TMP104:%.*]] = insertelement <4 x i16> [[TMP78]], i16 [[TMP103]], i64 3 4164; CHECK-NEXT: store <4 x i16> [[TMP104]], <4 x i16> addrspace(1)* [[OUT:%.*]], align 8 4165; CHECK-NEXT: ret void 4166; 4167; GFX6-LABEL: srem_v4i16: 4168; GFX6: ; %bb.0: 4169; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb 4170; GFX6-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x9 4171; GFX6-NEXT: s_mov_b32 s3, 0xf000 4172; GFX6-NEXT: s_mov_b32 s2, -1 4173; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4174; GFX6-NEXT: s_sext_i32_i16 s8, s6 4175; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s8 4176; GFX6-NEXT: s_sext_i32_i16 s9, s4 4177; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9 4178; GFX6-NEXT: s_xor_b32 s8, s9, s8 4179; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 4180; GFX6-NEXT: s_ashr_i32 s8, s8, 30 4181; GFX6-NEXT: s_or_b32 s8, s8, 1 4182; GFX6-NEXT: v_mov_b32_e32 v3, s8 4183; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 4184; GFX6-NEXT: v_trunc_f32_e32 v2, v2 4185; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 4186; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 4187; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 4188; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 4189; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 4190; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 4191; GFX6-NEXT: s_ashr_i32 s6, s6, 16 4192; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s6 4193; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 4194; GFX6-NEXT: s_ashr_i32 s4, s4, 16 4195; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 4196; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 4197; GFX6-NEXT: s_xor_b32 s8, s4, s6 4198; GFX6-NEXT: s_ashr_i32 s8, s8, 30 4199; GFX6-NEXT: s_or_b32 s8, s8, 1 4200; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 4201; GFX6-NEXT: v_trunc_f32_e32 v3, v3 4202; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 4203; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 4204; GFX6-NEXT: v_mov_b32_e32 v4, s8 4205; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v1| 4206; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc 4207; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 4208; GFX6-NEXT: v_mul_lo_u32 v1, v1, s6 4209; GFX6-NEXT: s_sext_i32_i16 s6, s7 4210; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s6 4211; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s4, v1 4212; GFX6-NEXT: s_sext_i32_i16 s4, s5 4213; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 4214; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 4215; GFX6-NEXT: s_xor_b32 s4, s4, s6 4216; GFX6-NEXT: s_ashr_i32 s4, s4, 30 4217; GFX6-NEXT: s_or_b32 s4, s4, 1 4218; GFX6-NEXT: v_mul_f32_e32 v4, v1, 
v4 4219; GFX6-NEXT: v_trunc_f32_e32 v4, v4 4220; GFX6-NEXT: v_mad_f32 v1, -v4, v2, v1 4221; GFX6-NEXT: v_mov_b32_e32 v5, s4 4222; GFX6-NEXT: s_ashr_i32 s4, s7, 16 4223; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 4224; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v2| 4225; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 4226; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc 4227; GFX6-NEXT: s_ashr_i32 s6, s5, 16 4228; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v4 4229; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s6 4230; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 4231; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 4232; GFX6-NEXT: s_xor_b32 s7, s6, s4 4233; GFX6-NEXT: s_ashr_i32 s7, s7, 30 4234; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 4235; GFX6-NEXT: v_trunc_f32_e32 v5, v5 4236; GFX6-NEXT: v_mad_f32 v4, -v5, v2, v4 4237; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 4238; GFX6-NEXT: s_or_b32 s7, s7, 1 4239; GFX6-NEXT: v_mov_b32_e32 v6, s7 4240; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v2| 4241; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v6, vcc 4242; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 4243; GFX6-NEXT: v_mul_lo_u32 v2, v2, s4 4244; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 4245; GFX6-NEXT: s_mov_b32 s4, 0xffff 4246; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 4247; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 4248; GFX6-NEXT: v_and_b32_e32 v1, s4, v1 4249; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 4250; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 4251; GFX6-NEXT: v_and_b32_e32 v0, s4, v0 4252; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 4253; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 4254; GFX6-NEXT: s_endpgm 4255; 4256; GFX9-LABEL: srem_v4i16: 4257; GFX9: ; %bb.0: 4258; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 4259; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4260; GFX9-NEXT: v_mov_b32_e32 v2, 0 4261; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4262; GFX9-NEXT: s_sext_i32_i16 s0, s6 4263; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 4264; GFX9-NEXT: s_sext_i32_i16 s1, s4 4265; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s1 4266; GFX9-NEXT: s_xor_b32 s0, s1, s0 4267; GFX9-NEXT: 
v_rcp_iflag_f32_e32 v3, v0 4268; GFX9-NEXT: s_ashr_i32 s0, s0, 30 4269; GFX9-NEXT: s_or_b32 s8, s0, 1 4270; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 4271; GFX9-NEXT: v_trunc_f32_e32 v3, v3 4272; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 4273; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 4274; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 4275; GFX9-NEXT: s_cselect_b32 s0, s8, 0 4276; GFX9-NEXT: s_ashr_i32 s9, s6, 16 4277; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 4278; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s9 4279; GFX9-NEXT: s_ashr_i32 s8, s4, 16 4280; GFX9-NEXT: v_add_u32_e32 v1, s0, v3 4281; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s8 4282; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 4283; GFX9-NEXT: s_xor_b32 s0, s8, s9 4284; GFX9-NEXT: s_ashr_i32 s0, s0, 30 4285; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6 4286; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 4287; GFX9-NEXT: v_trunc_f32_e32 v4, v4 4288; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 4289; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 4290; GFX9-NEXT: s_or_b32 s6, s0, 1 4291; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v0| 4292; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 4293; GFX9-NEXT: s_cselect_b32 s0, s6, 0 4294; GFX9-NEXT: v_add_u32_e32 v0, s0, v4 4295; GFX9-NEXT: s_sext_i32_i16 s0, s7 4296; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 4297; GFX9-NEXT: s_sext_i32_i16 s1, s5 4298; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s1 4299; GFX9-NEXT: s_xor_b32 s0, s1, s0 4300; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 4301; GFX9-NEXT: s_ashr_i32 s0, s0, 30 4302; GFX9-NEXT: s_or_b32 s6, s0, 1 4303; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9 4304; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 4305; GFX9-NEXT: v_trunc_f32_e32 v5, v5 4306; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 4307; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| 4308; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 4309; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 4310; GFX9-NEXT: s_cselect_b32 s0, s6, 0 4311; GFX9-NEXT: s_ashr_i32 s6, s7, 16 4312; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s6 4313; GFX9-NEXT: v_add_u32_e32 v3, s0, v5 4314; GFX9-NEXT: v_mul_lo_u32 v3, v3, s7 
4315; GFX9-NEXT: s_ashr_i32 s7, s5, 16 4316; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s7 4317; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 4318; GFX9-NEXT: s_xor_b32 s0, s7, s6 4319; GFX9-NEXT: s_ashr_i32 s0, s0, 30 4320; GFX9-NEXT: s_or_b32 s9, s0, 1 4321; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 4322; GFX9-NEXT: v_trunc_f32_e32 v6, v6 4323; GFX9-NEXT: v_mad_f32 v5, -v6, v4, v5 4324; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 4325; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4| 4326; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 4327; GFX9-NEXT: s_cselect_b32 s0, s9, 0 4328; GFX9-NEXT: v_add_u32_e32 v4, s0, v6 4329; GFX9-NEXT: v_mul_lo_u32 v4, v4, s6 4330; GFX9-NEXT: v_sub_u32_e32 v5, s4, v1 4331; GFX9-NEXT: v_sub_u32_e32 v1, s5, v3 4332; GFX9-NEXT: v_sub_u32_e32 v0, s8, v0 4333; GFX9-NEXT: v_sub_u32_e32 v3, s7, v4 4334; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff 4335; GFX9-NEXT: v_and_b32_e32 v1, v4, v1 4336; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 4337; GFX9-NEXT: v_and_b32_e32 v3, v4, v5 4338; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v3 4339; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 4340; GFX9-NEXT: s_endpgm 4341; 4342; GFX90A-LABEL: srem_v4i16: 4343; GFX90A: ; %bb.0: 4344; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c 4345; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4346; GFX90A-NEXT: v_mov_b32_e32 v2, 0 4347; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 4348; GFX90A-NEXT: s_sext_i32_i16 s0, s6 4349; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s0 4350; GFX90A-NEXT: s_sext_i32_i16 s1, s4 4351; GFX90A-NEXT: v_cvt_f32_i32_e32 v1, s1 4352; GFX90A-NEXT: s_xor_b32 s0, s1, s0 4353; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 4354; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 4355; GFX90A-NEXT: s_or_b32 s8, s0, 1 4356; GFX90A-NEXT: v_mul_f32_e32 v3, v1, v3 4357; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 4358; GFX90A-NEXT: v_mad_f32 v1, -v3, v0, v1 4359; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 4360; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0| 4361; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 4362; GFX90A-NEXT: s_cselect_b32 
s0, s8, 0 4363; GFX90A-NEXT: s_ashr_i32 s8, s6, 16 4364; GFX90A-NEXT: v_cvt_f32_i32_e32 v1, s8 4365; GFX90A-NEXT: v_add_u32_e32 v0, s0, v3 4366; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s6 4367; GFX90A-NEXT: v_sub_u32_e32 v0, s4, v0 4368; GFX90A-NEXT: s_ashr_i32 s4, s4, 16 4369; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s4 4370; GFX90A-NEXT: v_rcp_iflag_f32_e32 v4, v1 4371; GFX90A-NEXT: s_xor_b32 s0, s4, s8 4372; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 4373; GFX90A-NEXT: s_or_b32 s6, s0, 1 4374; GFX90A-NEXT: v_mul_f32_e32 v4, v3, v4 4375; GFX90A-NEXT: v_trunc_f32_e32 v4, v4 4376; GFX90A-NEXT: v_mad_f32 v3, -v4, v1, v3 4377; GFX90A-NEXT: v_cvt_i32_f32_e32 v4, v4 4378; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v1| 4379; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 4380; GFX90A-NEXT: s_cselect_b32 s0, s6, 0 4381; GFX90A-NEXT: v_add_u32_e32 v1, s0, v4 4382; GFX90A-NEXT: s_sext_i32_i16 s0, s7 4383; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s0 4384; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s8 4385; GFX90A-NEXT: s_sext_i32_i16 s1, s5 4386; GFX90A-NEXT: v_sub_u32_e32 v4, s4, v1 4387; GFX90A-NEXT: v_cvt_f32_i32_e32 v1, s1 4388; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v3 4389; GFX90A-NEXT: s_xor_b32 s0, s1, s0 4390; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 4391; GFX90A-NEXT: s_or_b32 s4, s0, 1 4392; GFX90A-NEXT: v_mul_f32_e32 v5, v1, v5 4393; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 4394; GFX90A-NEXT: v_mad_f32 v1, -v5, v3, v1 4395; GFX90A-NEXT: v_cvt_i32_f32_e32 v5, v5 4396; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v3| 4397; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 4398; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 4399; GFX90A-NEXT: s_ashr_i32 s4, s7, 16 4400; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s4 4401; GFX90A-NEXT: v_add_u32_e32 v1, s0, v5 4402; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s7 4403; GFX90A-NEXT: v_sub_u32_e32 v1, s5, v1 4404; GFX90A-NEXT: s_ashr_i32 s5, s5, 16 4405; GFX90A-NEXT: v_cvt_f32_i32_e32 v5, s5 4406; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v3 4407; GFX90A-NEXT: s_xor_b32 s0, s5, s4 4408; GFX90A-NEXT: 
s_ashr_i32 s0, s0, 30 4409; GFX90A-NEXT: s_or_b32 s6, s0, 1 4410; GFX90A-NEXT: v_mul_f32_e32 v6, v5, v6 4411; GFX90A-NEXT: v_trunc_f32_e32 v6, v6 4412; GFX90A-NEXT: v_mad_f32 v5, -v6, v3, v5 4413; GFX90A-NEXT: v_cvt_i32_f32_e32 v6, v6 4414; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v3| 4415; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 4416; GFX90A-NEXT: s_cselect_b32 s0, s6, 0 4417; GFX90A-NEXT: v_add_u32_e32 v3, s0, v6 4418; GFX90A-NEXT: v_mul_lo_u32 v3, v3, s4 4419; GFX90A-NEXT: v_mov_b32_e32 v5, 0xffff 4420; GFX90A-NEXT: v_sub_u32_e32 v3, s5, v3 4421; GFX90A-NEXT: v_and_b32_e32 v1, v5, v1 4422; GFX90A-NEXT: v_and_b32_e32 v0, v5, v0 4423; GFX90A-NEXT: v_lshl_or_b32 v1, v3, 16, v1 4424; GFX90A-NEXT: v_lshl_or_b32 v0, v4, 16, v0 4425; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] 4426; GFX90A-NEXT: s_endpgm 4427 %r = srem <4 x i16> %x, %y 4428 store <4 x i16> %r, <4 x i16> addrspace(1)* %out 4429 ret void 4430} 4431 4432define amdgpu_kernel void @udiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 4433; CHECK-LABEL: @udiv_i3( 4434; CHECK-NEXT: [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32 4435; CHECK-NEXT: [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32 4436; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 4437; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 4438; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 4439; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 4440; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 4441; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 4442; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 4443; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 4444; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 4445; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float [[TMP4]]) 4446; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], 
[[TMP12]] 4447; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 4448; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 4449; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 7 4450; CHECK-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i3 4451; CHECK-NEXT: store i3 [[TMP17]], i3 addrspace(1)* [[OUT:%.*]], align 1 4452; CHECK-NEXT: ret void 4453; 4454; GFX6-LABEL: udiv_i3: 4455; GFX6: ; %bb.0: 4456; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 4457; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4458; GFX6-NEXT: s_mov_b32 s3, 0xf000 4459; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4460; GFX6-NEXT: s_bfe_u32 s2, s4, 0x30008 4461; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 4462; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 4463; GFX6-NEXT: s_and_b32 s4, s4, 7 4464; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 4465; GFX6-NEXT: s_mov_b32 s2, -1 4466; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 4467; GFX6-NEXT: v_trunc_f32_e32 v1, v1 4468; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 4469; GFX6-NEXT: v_mad_f32 v1, -v1, v0, v2 4470; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 4471; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 4472; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 4473; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 4474; GFX6-NEXT: s_endpgm 4475; 4476; GFX9-LABEL: udiv_i3: 4477; GFX9: ; %bb.0: 4478; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 4479; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4480; GFX9-NEXT: v_mov_b32_e32 v2, 0 4481; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4482; GFX9-NEXT: s_bfe_u32 s0, s4, 0x30008 4483; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 4484; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 4485; GFX9-NEXT: s_and_b32 s0, s4, 7 4486; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 4487; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 4488; GFX9-NEXT: v_trunc_f32_e32 v1, v1 4489; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v1 4490; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v3 4491; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 4492; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc 4493; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 4494; GFX9-NEXT: 
global_store_byte v2, v0, s[2:3] 4495; GFX9-NEXT: s_endpgm 4496; 4497; GFX90A-LABEL: udiv_i3: 4498; GFX90A: ; %bb.0: 4499; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 4500; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4501; GFX90A-NEXT: v_mov_b32_e32 v2, 0 4502; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 4503; GFX90A-NEXT: s_bfe_u32 s0, s4, 0x30008 4504; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 4505; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v0 4506; GFX90A-NEXT: s_and_b32 s0, s4, 7 4507; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v3, s0 4508; GFX90A-NEXT: v_mul_f32_e32 v1, v3, v1 4509; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 4510; GFX90A-NEXT: v_cvt_u32_f32_e32 v4, v1 4511; GFX90A-NEXT: v_mad_f32 v1, -v1, v0, v3 4512; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 4513; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc 4514; GFX90A-NEXT: v_and_b32_e32 v0, 7, v0 4515; GFX90A-NEXT: global_store_byte v2, v0, s[2:3] 4516; GFX90A-NEXT: s_endpgm 4517 %r = udiv i3 %x, %y 4518 store i3 %r, i3 addrspace(1)* %out 4519 ret void 4520} 4521 4522define amdgpu_kernel void @urem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 4523; CHECK-LABEL: @urem_i3( 4524; CHECK-NEXT: [[TMP1:%.*]] = zext i3 [[X:%.*]] to i32 4525; CHECK-NEXT: [[TMP2:%.*]] = zext i3 [[Y:%.*]] to i32 4526; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP1]] to float 4527; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[TMP2]] to float 4528; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP4]]) 4529; CHECK-NEXT: [[TMP6:%.*]] = fmul fast float [[TMP3]], [[TMP5]] 4530; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.trunc.f32(float [[TMP6]]) 4531; CHECK-NEXT: [[TMP8:%.*]] = fneg fast float [[TMP7]] 4532; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP8]], float [[TMP4]], float [[TMP3]]) 4533; CHECK-NEXT: [[TMP10:%.*]] = fptoui float [[TMP7]] to i32 4534; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 4535; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.fabs.f32(float 
[[TMP4]]) 4536; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast oge float [[TMP11]], [[TMP12]] 4537; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 1, i32 0 4538; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP10]], [[TMP14]] 4539; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP15]], [[TMP2]] 4540; CHECK-NEXT: [[TMP17:%.*]] = sub i32 [[TMP1]], [[TMP16]] 4541; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 7 4542; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i3 4543; CHECK-NEXT: store i3 [[TMP19]], i3 addrspace(1)* [[OUT:%.*]], align 1 4544; CHECK-NEXT: ret void 4545; 4546; GFX6-LABEL: urem_i3: 4547; GFX6: ; %bb.0: 4548; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 4549; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4550; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4551; GFX6-NEXT: s_bfe_u32 s2, s4, 0x30008 4552; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 4553; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 4554; GFX6-NEXT: s_and_b32 s3, s4, 7 4555; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s3 4556; GFX6-NEXT: s_lshr_b32 s2, s4, 8 4557; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 4558; GFX6-NEXT: v_trunc_f32_e32 v1, v1 4559; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 4560; GFX6-NEXT: v_mad_f32 v1, -v1, v0, v2 4561; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 4562; GFX6-NEXT: s_mov_b32 s3, 0xf000 4563; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc 4564; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 4565; GFX6-NEXT: s_mov_b32 s2, -1 4566; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 4567; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 4568; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 4569; GFX6-NEXT: s_endpgm 4570; 4571; GFX9-LABEL: urem_i3: 4572; GFX9: ; %bb.0: 4573; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c 4574; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4575; GFX9-NEXT: s_bfe_u32 s3, s2, 0x30008 4576; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 4577; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 4578; GFX9-NEXT: s_and_b32 s4, s2, 7 4579; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 4580; GFX9-NEXT: s_lshr_b32 s3, s2, 8 4581; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1 4582; GFX9-NEXT: 
v_trunc_f32_e32 v1, v1 4583; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v1 4584; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v2 4585; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 4586; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4587; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 4588; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 4589; GFX9-NEXT: v_mov_b32_e32 v1, 0 4590; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 4591; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 4592; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4593; GFX9-NEXT: global_store_byte v1, v0, s[0:1] 4594; GFX9-NEXT: s_endpgm 4595; 4596; GFX90A-LABEL: urem_i3: 4597; GFX90A: ; %bb.0: 4598; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 4599; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4600; GFX90A-NEXT: v_mov_b32_e32 v0, 0 4601; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 4602; GFX90A-NEXT: s_bfe_u32 s0, s4, 0x30008 4603; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s0 4604; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v1 4605; GFX90A-NEXT: s_and_b32 s1, s4, 7 4606; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v3, s1 4607; GFX90A-NEXT: s_lshr_b32 s0, s4, 8 4608; GFX90A-NEXT: v_mul_f32_e32 v2, v3, v2 4609; GFX90A-NEXT: v_trunc_f32_e32 v2, v2 4610; GFX90A-NEXT: v_cvt_u32_f32_e32 v4, v2 4611; GFX90A-NEXT: v_mad_f32 v2, -v2, v1, v3 4612; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v1 4613; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc 4614; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s0 4615; GFX90A-NEXT: v_sub_u32_e32 v1, s4, v1 4616; GFX90A-NEXT: v_and_b32_e32 v1, 7, v1 4617; GFX90A-NEXT: global_store_byte v0, v1, s[2:3] 4618; GFX90A-NEXT: s_endpgm 4619 %r = urem i3 %x, %y 4620 store i3 %r, i3 addrspace(1)* %out 4621 ret void 4622} 4623 4624define amdgpu_kernel void @sdiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 4625; CHECK-LABEL: @sdiv_i3( 4626; CHECK-NEXT: [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32 4627; CHECK-NEXT: [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32 4628; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 4629; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 4630; CHECK-NEXT: [[TMP5:%.*]] = or 
i32 [[TMP4]], 1 4631; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 4632; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 4633; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 4634; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 4635; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 4636; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 4637; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 4638; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 4639; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 4640; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 4641; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 4642; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 4643; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 4644; CHECK-NEXT: [[TMP19:%.*]] = shl i32 [[TMP18]], 29 4645; CHECK-NEXT: [[TMP20:%.*]] = ashr i32 [[TMP19]], 29 4646; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i3 4647; CHECK-NEXT: store i3 [[TMP21]], i3 addrspace(1)* [[OUT:%.*]], align 1 4648; CHECK-NEXT: ret void 4649; 4650; GFX6-LABEL: sdiv_i3: 4651; GFX6: ; %bb.0: 4652; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 4653; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4654; GFX6-NEXT: s_mov_b32 s3, 0xf000 4655; GFX6-NEXT: s_mov_b32 s2, -1 4656; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4657; GFX6-NEXT: s_bfe_i32 s5, s4, 0x30008 4658; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 4659; GFX6-NEXT: s_bfe_i32 s4, s4, 0x30000 4660; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 4661; GFX6-NEXT: s_xor_b32 s4, s4, s5 4662; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 4663; GFX6-NEXT: s_ashr_i32 s4, s4, 30 4664; GFX6-NEXT: s_or_b32 s4, s4, 1 4665; GFX6-NEXT: v_mov_b32_e32 v3, s4 4666; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 4667; GFX6-NEXT: v_trunc_f32_e32 v2, v2 4668; 
GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 4669; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 4670; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 4671; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 4672; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 4673; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 4674; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 4675; GFX6-NEXT: s_endpgm 4676; 4677; GFX9-LABEL: sdiv_i3: 4678; GFX9: ; %bb.0: 4679; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 4680; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4681; GFX9-NEXT: v_mov_b32_e32 v1, 0 4682; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4683; GFX9-NEXT: s_bfe_i32 s0, s4, 0x30008 4684; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 4685; GFX9-NEXT: s_bfe_i32 s1, s4, 0x30000 4686; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 4687; GFX9-NEXT: s_xor_b32 s0, s1, s0 4688; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 4689; GFX9-NEXT: s_ashr_i32 s0, s0, 30 4690; GFX9-NEXT: s_or_b32 s4, s0, 1 4691; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 4692; GFX9-NEXT: v_trunc_f32_e32 v3, v3 4693; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 4694; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 4695; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 4696; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 4697; GFX9-NEXT: s_cselect_b32 s0, s4, 0 4698; GFX9-NEXT: v_add_u32_e32 v0, s0, v3 4699; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 4700; GFX9-NEXT: global_store_byte v1, v0, s[2:3] 4701; GFX9-NEXT: s_endpgm 4702; 4703; GFX90A-LABEL: sdiv_i3: 4704; GFX90A: ; %bb.0: 4705; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 4706; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4707; GFX90A-NEXT: v_mov_b32_e32 v1, 0 4708; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 4709; GFX90A-NEXT: s_bfe_i32 s0, s4, 0x30008 4710; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s0 4711; GFX90A-NEXT: s_bfe_i32 s1, s4, 0x30000 4712; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s1 4713; GFX90A-NEXT: s_xor_b32 s0, s1, s0 4714; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 4715; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 4716; GFX90A-NEXT: s_or_b32 s4, s0, 1 4717; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 
4718; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 4719; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 4720; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 4721; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 4722; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 4723; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 4724; GFX90A-NEXT: v_add_u32_e32 v0, s0, v3 4725; GFX90A-NEXT: v_and_b32_e32 v0, 7, v0 4726; GFX90A-NEXT: global_store_byte v1, v0, s[2:3] 4727; GFX90A-NEXT: s_endpgm 4728 %r = sdiv i3 %x, %y 4729 store i3 %r, i3 addrspace(1)* %out 4730 ret void 4731} 4732 4733define amdgpu_kernel void @srem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) { 4734; CHECK-LABEL: @srem_i3( 4735; CHECK-NEXT: [[TMP1:%.*]] = sext i3 [[X:%.*]] to i32 4736; CHECK-NEXT: [[TMP2:%.*]] = sext i3 [[Y:%.*]] to i32 4737; CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], [[TMP2]] 4738; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP3]], 30 4739; CHECK-NEXT: [[TMP5:%.*]] = or i32 [[TMP4]], 1 4740; CHECK-NEXT: [[TMP6:%.*]] = sitofp i32 [[TMP1]] to float 4741; CHECK-NEXT: [[TMP7:%.*]] = sitofp i32 [[TMP2]] to float 4742; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP7]]) 4743; CHECK-NEXT: [[TMP9:%.*]] = fmul fast float [[TMP6]], [[TMP8]] 4744; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.trunc.f32(float [[TMP9]]) 4745; CHECK-NEXT: [[TMP11:%.*]] = fneg fast float [[TMP10]] 4746; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP11]], float [[TMP7]], float [[TMP6]]) 4747; CHECK-NEXT: [[TMP13:%.*]] = fptosi float [[TMP10]] to i32 4748; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP12]]) 4749; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.fabs.f32(float [[TMP7]]) 4750; CHECK-NEXT: [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]] 4751; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0 4752; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]] 4753; CHECK-NEXT: [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]] 4754; CHECK-NEXT: 
[[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]] 4755; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 29 4756; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 29 4757; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i3 4758; CHECK-NEXT: store i3 [[TMP23]], i3 addrspace(1)* [[OUT:%.*]], align 1 4759; CHECK-NEXT: ret void 4760; 4761; GFX6-LABEL: srem_i3: 4762; GFX6: ; %bb.0: 4763; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 4764; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 4765; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4766; GFX6-NEXT: s_bfe_i32 s2, s4, 0x30008 4767; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s2 4768; GFX6-NEXT: s_bfe_i32 s5, s4, 0x30000 4769; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 4770; GFX6-NEXT: s_xor_b32 s2, s5, s2 4771; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 4772; GFX6-NEXT: s_ashr_i32 s2, s2, 30 4773; GFX6-NEXT: s_or_b32 s2, s2, 1 4774; GFX6-NEXT: v_mov_b32_e32 v3, s2 4775; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 4776; GFX6-NEXT: v_trunc_f32_e32 v2, v2 4777; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 4778; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 4779; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 4780; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 4781; GFX6-NEXT: s_lshr_b32 s3, s4, 8 4782; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 4783; GFX6-NEXT: v_mul_lo_u32 v0, v0, s3 4784; GFX6-NEXT: s_mov_b32 s3, 0xf000 4785; GFX6-NEXT: s_mov_b32 s2, -1 4786; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 4787; GFX6-NEXT: v_and_b32_e32 v0, 7, v0 4788; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 4789; GFX6-NEXT: s_endpgm 4790; 4791; GFX9-LABEL: srem_i3: 4792; GFX9: ; %bb.0: 4793; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 4794; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4795; GFX9-NEXT: s_bfe_i32 s2, s4, 0x30008 4796; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 4797; GFX9-NEXT: s_bfe_i32 s3, s4, 0x30000 4798; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s3 4799; GFX9-NEXT: s_xor_b32 s2, s3, s2 4800; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 4801; GFX9-NEXT: s_ashr_i32 s2, s2, 30 4802; GFX9-NEXT: s_lshr_b32 s5, s4, 8 4803; GFX9-NEXT: 
s_or_b32 s6, s2, 1 4804; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 4805; GFX9-NEXT: v_trunc_f32_e32 v2, v2 4806; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 4807; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 4808; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0| 4809; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec 4810; GFX9-NEXT: s_cselect_b32 s2, s6, 0 4811; GFX9-NEXT: v_add_u32_e32 v0, s2, v2 4812; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 4813; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 4814; GFX9-NEXT: v_mov_b32_e32 v1, 0 4815; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 4816; GFX9-NEXT: v_and_b32_e32 v0, 7, v0 4817; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4818; GFX9-NEXT: global_store_byte v1, v0, s[0:1] 4819; GFX9-NEXT: s_endpgm 4820; 4821; GFX90A-LABEL: srem_i3: 4822; GFX90A: ; %bb.0: 4823; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 4824; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 4825; GFX90A-NEXT: v_mov_b32_e32 v0, 0 4826; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 4827; GFX90A-NEXT: s_bfe_i32 s0, s4, 0x30008 4828; GFX90A-NEXT: v_cvt_f32_i32_e32 v1, s0 4829; GFX90A-NEXT: s_bfe_i32 s1, s4, 0x30000 4830; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s1 4831; GFX90A-NEXT: s_xor_b32 s0, s1, s0 4832; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v1 4833; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 4834; GFX90A-NEXT: s_lshr_b32 s5, s4, 8 4835; GFX90A-NEXT: s_or_b32 s6, s0, 1 4836; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 4837; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 4838; GFX90A-NEXT: v_mad_f32 v2, -v3, v1, v2 4839; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 4840; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v1| 4841; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 4842; GFX90A-NEXT: s_cselect_b32 s0, s6, 0 4843; GFX90A-NEXT: v_add_u32_e32 v1, s0, v3 4844; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s5 4845; GFX90A-NEXT: v_sub_u32_e32 v1, s4, v1 4846; GFX90A-NEXT: v_and_b32_e32 v1, 7, v1 4847; GFX90A-NEXT: global_store_byte v0, v1, s[2:3] 4848; GFX90A-NEXT: s_endpgm 4849 %r = srem i3 %x, %y 4850 store i3 %r, i3 addrspace(1)* %out 4851 ret void 4852} 4853 4854define 
amdgpu_kernel void @udiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 4855; CHECK-LABEL: @udiv_v3i16( 4856; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 4857; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 4858; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 4859; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 4860; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 4861; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 4862; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 4863; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 4864; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 4865; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 4866; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 4867; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 4868; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 4869; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 4870; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 4871; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 4872; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 4873; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 65535 4874; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16 4875; CHECK-NEXT: [[TMP20:%.*]] = insertelement <3 x i16> undef, i16 [[TMP19]], i64 0 4876; CHECK-NEXT: [[TMP21:%.*]] = extractelement <3 x i16> [[X]], i64 1 4877; CHECK-NEXT: [[TMP22:%.*]] = extractelement <3 x i16> [[Y]], i64 1 4878; CHECK-NEXT: [[TMP23:%.*]] = zext i16 [[TMP21]] to i32 4879; CHECK-NEXT: [[TMP24:%.*]] = zext i16 [[TMP22]] to i32 4880; CHECK-NEXT: [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float 4881; CHECK-NEXT: [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float 4882; CHECK-NEXT: [[TMP27:%.*]] = 
call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]]) 4883; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]] 4884; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]]) 4885; CHECK-NEXT: [[TMP30:%.*]] = fneg fast float [[TMP29]] 4886; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]]) 4887; CHECK-NEXT: [[TMP32:%.*]] = fptoui float [[TMP29]] to i32 4888; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]]) 4889; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]]) 4890; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]] 4891; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0 4892; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]] 4893; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 65535 4894; CHECK-NEXT: [[TMP39:%.*]] = trunc i32 [[TMP38]] to i16 4895; CHECK-NEXT: [[TMP40:%.*]] = insertelement <3 x i16> [[TMP20]], i16 [[TMP39]], i64 1 4896; CHECK-NEXT: [[TMP41:%.*]] = extractelement <3 x i16> [[X]], i64 2 4897; CHECK-NEXT: [[TMP42:%.*]] = extractelement <3 x i16> [[Y]], i64 2 4898; CHECK-NEXT: [[TMP43:%.*]] = zext i16 [[TMP41]] to i32 4899; CHECK-NEXT: [[TMP44:%.*]] = zext i16 [[TMP42]] to i32 4900; CHECK-NEXT: [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float 4901; CHECK-NEXT: [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float 4902; CHECK-NEXT: [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]]) 4903; CHECK-NEXT: [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]] 4904; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]]) 4905; CHECK-NEXT: [[TMP50:%.*]] = fneg fast float [[TMP49]] 4906; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]]) 4907; CHECK-NEXT: [[TMP52:%.*]] = fptoui float [[TMP49]] to i32 4908; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float 
[[TMP51]]) 4909; CHECK-NEXT: [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]]) 4910; CHECK-NEXT: [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]] 4911; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0 4912; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]] 4913; CHECK-NEXT: [[TMP58:%.*]] = and i32 [[TMP57]], 65535 4914; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i16 4915; CHECK-NEXT: [[TMP60:%.*]] = insertelement <3 x i16> [[TMP40]], i16 [[TMP59]], i64 2 4916; CHECK-NEXT: store <3 x i16> [[TMP60]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 4917; CHECK-NEXT: ret void 4918; 4919; GFX6-LABEL: udiv_v3i16: 4920; GFX6: ; %bb.0: 4921; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 4922; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 4923; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 4924; GFX6-NEXT: s_mov_b32 s8, 0xffff 4925; GFX6-NEXT: s_mov_b32 s7, 0xf000 4926; GFX6-NEXT: s_waitcnt lgkmcnt(0) 4927; GFX6-NEXT: s_and_b32 s6, s2, s8 4928; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 4929; GFX6-NEXT: s_and_b32 s6, s0, s8 4930; GFX6-NEXT: s_lshr_b32 s2, s2, 16 4931; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s6 4932; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 4933; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s2 4934; GFX6-NEXT: s_lshr_b32 s0, s0, 16 4935; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s0 4936; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 4937; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 4938; GFX6-NEXT: v_trunc_f32_e32 v2, v2 4939; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 4940; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v2 4941; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 4942; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 4943; GFX6-NEXT: v_trunc_f32_e32 v1, v1 4944; GFX6-NEXT: s_and_b32 s0, s3, s8 4945; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc 4946; GFX6-NEXT: v_mad_f32 v2, -v1, v3, v4 4947; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s0 4948; GFX6-NEXT: s_and_b32 s0, s1, s8 4949; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s0 4950; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 4951; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, 
v4 4952; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 4953; GFX6-NEXT: s_mov_b32 s6, -1 4954; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 4955; GFX6-NEXT: v_mul_f32_e32 v2, v5, v6 4956; GFX6-NEXT: v_trunc_f32_e32 v2, v2 4957; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 4958; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v5 4959; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 4960; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 4961; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc 4962; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 4963; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 4964; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 4965; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 4966; GFX6-NEXT: s_endpgm 4967; 4968; GFX9-LABEL: udiv_v3i16: 4969; GFX9: ; %bb.0: 4970; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 4971; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 4972; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c 4973; GFX9-NEXT: s_mov_b32 s0, 0xffff 4974; GFX9-NEXT: v_mov_b32_e32 v1, 0 4975; GFX9-NEXT: s_waitcnt lgkmcnt(0) 4976; GFX9-NEXT: s_and_b32 s1, s2, s0 4977; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 4978; GFX9-NEXT: s_and_b32 s1, s6, s0 4979; GFX9-NEXT: s_lshr_b32 s2, s2, 16 4980; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s1 4981; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 4982; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2 4983; GFX9-NEXT: s_lshr_b32 s1, s6, 16 4984; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s1 4985; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 4986; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 4987; GFX9-NEXT: v_trunc_f32_e32 v3, v3 4988; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 4989; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 4990; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 4991; GFX9-NEXT: v_mul_f32_e32 v2, v5, v6 4992; GFX9-NEXT: v_trunc_f32_e32 v2, v2 4993; GFX9-NEXT: s_and_b32 s1, s3, s0 4994; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 4995; GFX9-NEXT: v_mad_f32 v3, -v2, v4, v5 4996; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s1 4997; GFX9-NEXT: s_and_b32 s0, s7, s0 4998; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 4999; GFX9-NEXT: 
v_cvt_u32_f32_e32 v2, v2 5000; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v5 5001; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 5002; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 5003; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc 5004; GFX9-NEXT: v_mul_f32_e32 v3, v6, v7 5005; GFX9-NEXT: v_trunc_f32_e32 v3, v3 5006; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3 5007; GFX9-NEXT: v_mad_f32 v3, -v3, v5, v6 5008; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 5009; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 5010; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 5011; GFX9-NEXT: global_store_short v1, v3, s[4:5] offset:4 5012; GFX9-NEXT: global_store_dword v1, v0, s[4:5] 5013; GFX9-NEXT: s_endpgm 5014; 5015; GFX90A-LABEL: udiv_v3i16: 5016; GFX90A: ; %bb.0: 5017; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 5018; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 5019; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c 5020; GFX90A-NEXT: s_mov_b32 s0, 0xffff 5021; GFX90A-NEXT: v_mov_b32_e32 v1, 0 5022; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 5023; GFX90A-NEXT: s_and_b32 s1, s2, s0 5024; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s1 5025; GFX90A-NEXT: s_and_b32 s1, s6, s0 5026; GFX90A-NEXT: s_lshr_b32 s2, s2, 16 5027; GFX90A-NEXT: v_cvt_f32_u32_e32 v2, s1 5028; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 5029; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s2 5030; GFX90A-NEXT: s_lshr_b32 s1, s6, 16 5031; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s1 5032; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 5033; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v4 5034; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 5035; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 5036; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 5037; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 5038; GFX90A-NEXT: v_mul_f32_e32 v2, v5, v6 5039; GFX90A-NEXT: v_trunc_f32_e32 v2, v2 5040; GFX90A-NEXT: s_and_b32 s1, s3, s0 5041; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 5042; GFX90A-NEXT: v_mad_f32 v3, -v2, v4, v5 5043; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s1 5044; GFX90A-NEXT: s_and_b32 s0, s7, s0 5045; 
GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s0 5046; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 5047; GFX90A-NEXT: v_rcp_iflag_f32_e32 v7, v5 5048; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 5049; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v0 5050; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc 5051; GFX90A-NEXT: v_mul_f32_e32 v3, v6, v7 5052; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 5053; GFX90A-NEXT: v_cvt_u32_f32_e32 v4, v3 5054; GFX90A-NEXT: v_mad_f32 v3, -v3, v5, v6 5055; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 5056; GFX90A-NEXT: v_lshl_or_b32 v0, v2, 16, v0 5057; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 5058; GFX90A-NEXT: global_store_short v1, v3, s[4:5] offset:4 5059; GFX90A-NEXT: global_store_dword v1, v0, s[4:5] 5060; GFX90A-NEXT: s_endpgm 5061 %r = udiv <3 x i16> %x, %y 5062 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 5063 ret void 5064} 5065 5066define amdgpu_kernel void @urem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 5067; CHECK-LABEL: @urem_v3i16( 5068; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 5069; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 5070; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32 5071; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32 5072; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 5073; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 5074; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 5075; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 5076; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 5077; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 5078; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 5079; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 5080; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 5081; CHECK-NEXT: [[TMP14:%.*]] = call fast 
float @llvm.fabs.f32(float [[TMP6]]) 5082; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 5083; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 5084; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 5085; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]] 5086; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]] 5087; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 65535 5088; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16 5089; CHECK-NEXT: [[TMP22:%.*]] = insertelement <3 x i16> undef, i16 [[TMP21]], i64 0 5090; CHECK-NEXT: [[TMP23:%.*]] = extractelement <3 x i16> [[X]], i64 1 5091; CHECK-NEXT: [[TMP24:%.*]] = extractelement <3 x i16> [[Y]], i64 1 5092; CHECK-NEXT: [[TMP25:%.*]] = zext i16 [[TMP23]] to i32 5093; CHECK-NEXT: [[TMP26:%.*]] = zext i16 [[TMP24]] to i32 5094; CHECK-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float 5095; CHECK-NEXT: [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float 5096; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]]) 5097; CHECK-NEXT: [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]] 5098; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]]) 5099; CHECK-NEXT: [[TMP32:%.*]] = fneg fast float [[TMP31]] 5100; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]]) 5101; CHECK-NEXT: [[TMP34:%.*]] = fptoui float [[TMP31]] to i32 5102; CHECK-NEXT: [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 5103; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]]) 5104; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]] 5105; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0 5106; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]] 5107; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]] 5108; CHECK-NEXT: [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]] 5109; CHECK-NEXT: [[TMP42:%.*]] = and i32 
[[TMP41]], 65535 5110; CHECK-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i16 5111; CHECK-NEXT: [[TMP44:%.*]] = insertelement <3 x i16> [[TMP22]], i16 [[TMP43]], i64 1 5112; CHECK-NEXT: [[TMP45:%.*]] = extractelement <3 x i16> [[X]], i64 2 5113; CHECK-NEXT: [[TMP46:%.*]] = extractelement <3 x i16> [[Y]], i64 2 5114; CHECK-NEXT: [[TMP47:%.*]] = zext i16 [[TMP45]] to i32 5115; CHECK-NEXT: [[TMP48:%.*]] = zext i16 [[TMP46]] to i32 5116; CHECK-NEXT: [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float 5117; CHECK-NEXT: [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float 5118; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]]) 5119; CHECK-NEXT: [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]] 5120; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]]) 5121; CHECK-NEXT: [[TMP54:%.*]] = fneg fast float [[TMP53]] 5122; CHECK-NEXT: [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]]) 5123; CHECK-NEXT: [[TMP56:%.*]] = fptoui float [[TMP53]] to i32 5124; CHECK-NEXT: [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]]) 5125; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]]) 5126; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]] 5127; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0 5128; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]] 5129; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]] 5130; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]] 5131; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 65535 5132; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i16 5133; CHECK-NEXT: [[TMP66:%.*]] = insertelement <3 x i16> [[TMP44]], i16 [[TMP65]], i64 2 5134; CHECK-NEXT: store <3 x i16> [[TMP66]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 5135; CHECK-NEXT: ret void 5136; 5137; GFX6-LABEL: urem_v3i16: 5138; GFX6: ; %bb.0: 5139; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 5140; 
GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5141; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 5142; GFX6-NEXT: s_mov_b32 s8, 0xffff 5143; GFX6-NEXT: s_mov_b32 s7, 0xf000 5144; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5145; GFX6-NEXT: s_and_b32 s6, s2, s8 5146; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 5147; GFX6-NEXT: s_and_b32 s6, s0, s8 5148; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 5149; GFX6-NEXT: v_mov_b32_e32 v4, s2 5150; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 5151; GFX6-NEXT: v_alignbit_b32 v4, s3, v4, 16 5152; GFX6-NEXT: v_and_b32_e32 v5, s8, v4 5153; GFX6-NEXT: v_mov_b32_e32 v1, s0 5154; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 5155; GFX6-NEXT: v_trunc_f32_e32 v3, v3 5156; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 5157; GFX6-NEXT: v_mad_f32 v2, -v3, v0, v2 5158; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 5159; GFX6-NEXT: v_cvt_f32_u32_e32 v2, v5 5160; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc 5161; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 5162; GFX6-NEXT: v_alignbit_b32 v1, s1, v1, 16 5163; GFX6-NEXT: v_and_b32_e32 v3, s8, v1 5164; GFX6-NEXT: v_cvt_f32_u32_e32 v3, v3 5165; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 5166; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 5167; GFX6-NEXT: s_and_b32 s0, s3, s8 5168; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s0 5169; GFX6-NEXT: v_mul_f32_e32 v5, v3, v5 5170; GFX6-NEXT: s_and_b32 s0, s1, s8 5171; GFX6-NEXT: v_trunc_f32_e32 v5, v5 5172; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s0 5173; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v6 5174; GFX6-NEXT: v_mad_f32 v3, -v5, v2, v3 5175; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v5 5176; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 5177; GFX6-NEXT: v_mul_f32_e32 v3, v7, v8 5178; GFX6-NEXT: v_trunc_f32_e32 v3, v3 5179; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc 5180; GFX6-NEXT: v_mul_lo_u32 v2, v2, v4 5181; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v3 5182; GFX6-NEXT: v_mad_f32 v3, -v3, v6, v7 5183; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v6 5184; GFX6-NEXT: s_mov_b32 s6, -1 5185; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 5186; GFX6-NEXT: v_mul_lo_u32 v3, 
v3, s3 5187; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 5188; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 5189; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s1, v3 5190; GFX6-NEXT: v_and_b32_e32 v0, s8, v0 5191; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 5192; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 5193; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 5194; GFX6-NEXT: s_endpgm 5195; 5196; GFX9-LABEL: urem_v3i16: 5197; GFX9: ; %bb.0: 5198; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 5199; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 5200; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 5201; GFX9-NEXT: s_mov_b32 s0, 0xffff 5202; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5203; GFX9-NEXT: s_and_b32 s1, s2, s0 5204; GFX9-NEXT: s_and_b32 s8, s4, s0 5205; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 5206; GFX9-NEXT: s_lshr_b32 s4, s4, 16 5207; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s4 5208; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s1 5209; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 5210; GFX9-NEXT: s_lshr_b32 s2, s2, 16 5211; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2 5212; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v2 5213; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 5214; GFX9-NEXT: v_trunc_f32_e32 v3, v3 5215; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 5216; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 5217; GFX9-NEXT: v_mul_f32_e32 v1, v4, v5 5218; GFX9-NEXT: v_trunc_f32_e32 v1, v1 5219; GFX9-NEXT: s_and_b32 s5, s5, s0 5220; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v3 5221; GFX9-NEXT: v_mad_f32 v3, -v1, v2, v4 5222; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s5 5223; GFX9-NEXT: s_and_b32 s0, s3, s0 5224; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc 5225; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 5226; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 5227; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 5228; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 5229; GFX9-NEXT: v_mul_lo_u32 v0, v0, s8 5230; GFX9-NEXT: v_mul_f32_e32 v2, v5, v6 5231; GFX9-NEXT: v_trunc_f32_e32 v2, v2 5232; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v2 5233; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 5234; 
GFX9-NEXT: v_mad_f32 v2, -v2, v4, v5 5235; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 5236; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc 5237; GFX9-NEXT: v_mul_lo_u32 v1, v1, s4 5238; GFX9-NEXT: v_mul_lo_u32 v2, v2, s5 5239; GFX9-NEXT: v_sub_u32_e32 v0, s1, v0 5240; GFX9-NEXT: v_mov_b32_e32 v3, 0 5241; GFX9-NEXT: v_sub_u32_e32 v1, s2, v1 5242; GFX9-NEXT: v_sub_u32_e32 v2, s0, v2 5243; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 5244; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0 5245; GFX9-NEXT: global_store_short v3, v2, s[6:7] offset:4 5246; GFX9-NEXT: global_store_dword v3, v0, s[6:7] 5247; GFX9-NEXT: s_endpgm 5248; 5249; GFX90A-LABEL: urem_v3i16: 5250; GFX90A: ; %bb.0: 5251; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 5252; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 5253; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c 5254; GFX90A-NEXT: s_mov_b32 s0, 0xffff 5255; GFX90A-NEXT: v_mov_b32_e32 v1, 0 5256; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 5257; GFX90A-NEXT: s_and_b32 s1, s2, s0 5258; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s1 5259; GFX90A-NEXT: s_and_b32 s8, s6, s0 5260; GFX90A-NEXT: v_cvt_f32_u32_e32 v2, s8 5261; GFX90A-NEXT: s_lshr_b32 s2, s2, 16 5262; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 5263; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s2 5264; GFX90A-NEXT: s_lshr_b32 s6, s6, 16 5265; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s6 5266; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 5267; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 5268; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 5269; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 5270; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v4 5271; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 5272; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc 5273; GFX90A-NEXT: v_mul_f32_e32 v2, v5, v6 5274; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s1 5275; GFX90A-NEXT: v_trunc_f32_e32 v2, v2 5276; GFX90A-NEXT: s_and_b32 s1, s3, s0 5277; GFX90A-NEXT: v_mad_f32 v3, -v2, v4, v5 5278; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, s1 5279; GFX90A-NEXT: s_and_b32 s0, s7, s0 5280; GFX90A-NEXT: 
v_cvt_f32_u32_e32 v6, s0 5281; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 5282; GFX90A-NEXT: v_rcp_iflag_f32_e32 v7, v5 5283; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 5284; GFX90A-NEXT: v_sub_u32_e32 v0, s8, v0 5285; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc 5286; GFX90A-NEXT: v_mul_f32_e32 v3, v6, v7 5287; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 5288; GFX90A-NEXT: v_cvt_u32_f32_e32 v4, v3 5289; GFX90A-NEXT: v_mad_f32 v3, -v3, v5, v6 5290; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 5291; GFX90A-NEXT: v_mul_lo_u32 v2, v2, s2 5292; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 5293; GFX90A-NEXT: v_mul_lo_u32 v3, v3, s1 5294; GFX90A-NEXT: v_sub_u32_e32 v2, s6, v2 5295; GFX90A-NEXT: v_sub_u32_e32 v3, s0, v3 5296; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v0 5297; GFX90A-NEXT: v_lshl_or_b32 v0, v2, 16, v0 5298; GFX90A-NEXT: global_store_short v1, v3, s[4:5] offset:4 5299; GFX90A-NEXT: global_store_dword v1, v0, s[4:5] 5300; GFX90A-NEXT: s_endpgm 5301 %r = urem <3 x i16> %x, %y 5302 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 5303 ret void 5304} 5305 5306define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 5307; CHECK-LABEL: @sdiv_v3i16( 5308; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 5309; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 5310; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 5311; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 5312; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 5313; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 5314; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 5315; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 5316; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 5317; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 5318; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 5319; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float 
[[TMP11]]) 5320; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 5321; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 5322; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 5323; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 5324; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 5325; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 5326; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 5327; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 5328; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 16 5329; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 16 5330; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16 5331; CHECK-NEXT: [[TMP24:%.*]] = insertelement <3 x i16> undef, i16 [[TMP23]], i64 0 5332; CHECK-NEXT: [[TMP25:%.*]] = extractelement <3 x i16> [[X]], i64 1 5333; CHECK-NEXT: [[TMP26:%.*]] = extractelement <3 x i16> [[Y]], i64 1 5334; CHECK-NEXT: [[TMP27:%.*]] = sext i16 [[TMP25]] to i32 5335; CHECK-NEXT: [[TMP28:%.*]] = sext i16 [[TMP26]] to i32 5336; CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]] 5337; CHECK-NEXT: [[TMP30:%.*]] = ashr i32 [[TMP29]], 30 5338; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 1 5339; CHECK-NEXT: [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float 5340; CHECK-NEXT: [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float 5341; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 5342; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]] 5343; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]]) 5344; CHECK-NEXT: [[TMP37:%.*]] = fneg fast float [[TMP36]] 5345; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]]) 5346; CHECK-NEXT: [[TMP39:%.*]] = fptosi float [[TMP36]] to i32 5347; CHECK-NEXT: [[TMP40:%.*]] = call fast 
float @llvm.fabs.f32(float [[TMP38]]) 5348; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 5349; CHECK-NEXT: [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]] 5350; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0 5351; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]] 5352; CHECK-NEXT: [[TMP45:%.*]] = shl i32 [[TMP44]], 16 5353; CHECK-NEXT: [[TMP46:%.*]] = ashr i32 [[TMP45]], 16 5354; CHECK-NEXT: [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16 5355; CHECK-NEXT: [[TMP48:%.*]] = insertelement <3 x i16> [[TMP24]], i16 [[TMP47]], i64 1 5356; CHECK-NEXT: [[TMP49:%.*]] = extractelement <3 x i16> [[X]], i64 2 5357; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x i16> [[Y]], i64 2 5358; CHECK-NEXT: [[TMP51:%.*]] = sext i16 [[TMP49]] to i32 5359; CHECK-NEXT: [[TMP52:%.*]] = sext i16 [[TMP50]] to i32 5360; CHECK-NEXT: [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]] 5361; CHECK-NEXT: [[TMP54:%.*]] = ashr i32 [[TMP53]], 30 5362; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 1 5363; CHECK-NEXT: [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float 5364; CHECK-NEXT: [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float 5365; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]]) 5366; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]] 5367; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]]) 5368; CHECK-NEXT: [[TMP61:%.*]] = fneg fast float [[TMP60]] 5369; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]]) 5370; CHECK-NEXT: [[TMP63:%.*]] = fptosi float [[TMP60]] to i32 5371; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]]) 5372; CHECK-NEXT: [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]]) 5373; CHECK-NEXT: [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]] 5374; CHECK-NEXT: [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0 5375; CHECK-NEXT: 
[[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]] 5376; CHECK-NEXT: [[TMP69:%.*]] = shl i32 [[TMP68]], 16 5377; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 16 5378; CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16 5379; CHECK-NEXT: [[TMP72:%.*]] = insertelement <3 x i16> [[TMP48]], i16 [[TMP71]], i64 2 5380; CHECK-NEXT: store <3 x i16> [[TMP72]], <3 x i16> addrspace(1)* [[OUT:%.*]], align 8 5381; CHECK-NEXT: ret void 5382; 5383; GFX6-LABEL: sdiv_v3i16: 5384; GFX6: ; %bb.0: 5385; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 5386; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5387; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 5388; GFX6-NEXT: s_mov_b32 s7, 0xf000 5389; GFX6-NEXT: s_mov_b32 s6, -1 5390; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5391; GFX6-NEXT: s_sext_i32_i16 s8, s2 5392; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s8 5393; GFX6-NEXT: s_sext_i32_i16 s9, s0 5394; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s9 5395; GFX6-NEXT: s_xor_b32 s8, s9, s8 5396; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 5397; GFX6-NEXT: s_ashr_i32 s2, s2, 16 5398; GFX6-NEXT: s_ashr_i32 s8, s8, 30 5399; GFX6-NEXT: s_or_b32 s8, s8, 1 5400; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 5401; GFX6-NEXT: v_trunc_f32_e32 v2, v2 5402; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 5403; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 5404; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 5405; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s2 5406; GFX6-NEXT: v_mov_b32_e32 v3, s8 5407; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 5408; GFX6-NEXT: s_ashr_i32 s0, s0, 16 5409; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 5410; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 5411; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 5412; GFX6-NEXT: s_xor_b32 s0, s0, s2 5413; GFX6-NEXT: s_ashr_i32 s0, s0, 30 5414; GFX6-NEXT: s_or_b32 s0, s0, 1 5415; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 5416; GFX6-NEXT: v_trunc_f32_e32 v3, v3 5417; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 5418; GFX6-NEXT: v_mov_b32_e32 v4, s0 5419; GFX6-NEXT: s_sext_i32_i16 s0, s3 5420; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 5421; GFX6-NEXT: 
v_cmp_ge_f32_e64 vcc, |v2|, |v1| 5422; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s0 5423; GFX6-NEXT: v_cndmask_b32_e32 v1, 0, v4, vcc 5424; GFX6-NEXT: s_sext_i32_i16 s1, s1 5425; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 5426; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s1 5427; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 5428; GFX6-NEXT: s_xor_b32 s0, s1, s0 5429; GFX6-NEXT: s_ashr_i32 s0, s0, 30 5430; GFX6-NEXT: s_or_b32 s0, s0, 1 5431; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 5432; GFX6-NEXT: v_trunc_f32_e32 v4, v4 5433; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 5434; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 5435; GFX6-NEXT: v_mov_b32_e32 v5, s0 5436; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| 5437; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc 5438; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 5439; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 5440; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 5441; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 5442; GFX6-NEXT: buffer_store_short v2, off, s[4:7], 0 offset:4 5443; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 5444; GFX6-NEXT: s_endpgm 5445; 5446; GFX9-LABEL: sdiv_v3i16: 5447; GFX9: ; %bb.0: 5448; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 5449; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 5450; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 5451; GFX9-NEXT: v_mov_b32_e32 v1, 0 5452; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5453; GFX9-NEXT: s_sext_i32_i16 s0, s2 5454; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s0 5455; GFX9-NEXT: s_sext_i32_i16 s1, s4 5456; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 5457; GFX9-NEXT: s_xor_b32 s0, s1, s0 5458; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 5459; GFX9-NEXT: s_ashr_i32 s0, s0, 30 5460; GFX9-NEXT: s_or_b32 s8, s0, 1 5461; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 5462; GFX9-NEXT: v_trunc_f32_e32 v3, v3 5463; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 5464; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 5465; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 5466; GFX9-NEXT: s_cselect_b32 s0, s8, 0 5467; GFX9-NEXT: s_ashr_i32 s1, s2, 16 5468; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 
5469; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 5470; GFX9-NEXT: s_ashr_i32 s2, s4, 16 5471; GFX9-NEXT: v_add_u32_e32 v2, s0, v3 5472; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s2 5473; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 5474; GFX9-NEXT: s_xor_b32 s0, s2, s1 5475; GFX9-NEXT: s_ashr_i32 s0, s0, 30 5476; GFX9-NEXT: s_or_b32 s2, s0, 1 5477; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 5478; GFX9-NEXT: v_trunc_f32_e32 v4, v4 5479; GFX9-NEXT: v_mad_f32 v3, -v4, v0, v3 5480; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v0| 5481; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 5482; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 5483; GFX9-NEXT: s_sext_i32_i16 s1, s3 5484; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s1 5485; GFX9-NEXT: s_cselect_b32 s0, s2, 0 5486; GFX9-NEXT: v_add_u32_e32 v3, s0, v4 5487; GFX9-NEXT: s_sext_i32_i16 s0, s5 5488; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s0 5489; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 5490; GFX9-NEXT: s_xor_b32 s0, s0, s1 5491; GFX9-NEXT: s_ashr_i32 s0, s0, 30 5492; GFX9-NEXT: s_or_b32 s2, s0, 1 5493; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 5494; GFX9-NEXT: v_trunc_f32_e32 v5, v5 5495; GFX9-NEXT: v_mad_f32 v4, -v5, v0, v4 5496; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 5497; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v0| 5498; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 5499; GFX9-NEXT: s_cselect_b32 s0, s2, 0 5500; GFX9-NEXT: v_add_u32_e32 v0, s0, v5 5501; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 5502; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2 5503; GFX9-NEXT: global_store_short v1, v0, s[6:7] offset:4 5504; GFX9-NEXT: global_store_dword v1, v2, s[6:7] 5505; GFX9-NEXT: s_endpgm 5506; 5507; GFX90A-LABEL: sdiv_v3i16: 5508; GFX90A: ; %bb.0: 5509; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 5510; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 5511; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 5512; GFX90A-NEXT: v_mov_b32_e32 v1, 0 5513; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 5514; GFX90A-NEXT: s_sext_i32_i16 s0, s2 5515; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s0 5516; GFX90A-NEXT: s_sext_i32_i16 s1, 
s4 5517; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s1 5518; GFX90A-NEXT: s_xor_b32 s0, s1, s0 5519; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 5520; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 5521; GFX90A-NEXT: s_or_b32 s8, s0, 1 5522; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 5523; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 5524; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 5525; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 5526; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 5527; GFX90A-NEXT: s_cselect_b32 s0, s8, 0 5528; GFX90A-NEXT: s_ashr_i32 s1, s2, 16 5529; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 5530; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s1 5531; GFX90A-NEXT: s_ashr_i32 s2, s4, 16 5532; GFX90A-NEXT: v_add_u32_e32 v2, s0, v3 5533; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s2 5534; GFX90A-NEXT: v_rcp_iflag_f32_e32 v4, v0 5535; GFX90A-NEXT: s_xor_b32 s0, s2, s1 5536; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 5537; GFX90A-NEXT: s_or_b32 s2, s0, 1 5538; GFX90A-NEXT: v_mul_f32_e32 v4, v3, v4 5539; GFX90A-NEXT: v_trunc_f32_e32 v4, v4 5540; GFX90A-NEXT: v_mad_f32 v3, -v4, v0, v3 5541; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v0| 5542; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 5543; GFX90A-NEXT: v_cvt_i32_f32_e32 v4, v4 5544; GFX90A-NEXT: s_sext_i32_i16 s1, s3 5545; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s1 5546; GFX90A-NEXT: s_cselect_b32 s0, s2, 0 5547; GFX90A-NEXT: v_add_u32_e32 v3, s0, v4 5548; GFX90A-NEXT: s_sext_i32_i16 s0, s5 5549; GFX90A-NEXT: v_cvt_f32_i32_e32 v4, s0 5550; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v0 5551; GFX90A-NEXT: s_xor_b32 s0, s0, s1 5552; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 5553; GFX90A-NEXT: s_or_b32 s2, s0, 1 5554; GFX90A-NEXT: v_mul_f32_e32 v5, v4, v5 5555; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 5556; GFX90A-NEXT: v_mad_f32 v4, -v5, v0, v4 5557; GFX90A-NEXT: v_cvt_i32_f32_e32 v5, v5 5558; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v0| 5559; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 5560; GFX90A-NEXT: s_cselect_b32 s0, s2, 0 5561; GFX90A-NEXT: v_add_u32_e32 v0, s0, v5 5562; 
GFX90A-NEXT: v_and_b32_e32 v2, 0xffff, v2 5563; GFX90A-NEXT: v_lshl_or_b32 v2, v3, 16, v2 5564; GFX90A-NEXT: global_store_short v1, v0, s[6:7] offset:4 5565; GFX90A-NEXT: global_store_dword v1, v2, s[6:7] 5566; GFX90A-NEXT: s_endpgm 5567 %r = sdiv <3 x i16> %x, %y 5568 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 5569 ret void 5570} 5571 5572define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x, <3 x i16> %y) { 5573; CHECK-LABEL: @srem_v3i16( 5574; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i16> [[X:%.*]], i64 0 5575; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i16> [[Y:%.*]], i64 0 5576; CHECK-NEXT: [[TMP3:%.*]] = sext i16 [[TMP1]] to i32 5577; CHECK-NEXT: [[TMP4:%.*]] = sext i16 [[TMP2]] to i32 5578; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 5579; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 5580; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 5581; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 5582; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 5583; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 5584; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 5585; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 5586; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 5587; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 5588; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 5589; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 5590; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 5591; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 5592; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 5593; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 5594; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]] 5595; CHECK-NEXT: [[TMP22:%.*]] = sub 
i32 [[TMP3]], [[TMP21]] 5596; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 16 5597; CHECK-NEXT: [[TMP24:%.*]] = ashr i32 [[TMP23]], 16 5598; CHECK-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16 5599; CHECK-NEXT: [[TMP26:%.*]] = insertelement <3 x i16> undef, i16 [[TMP25]], i64 0 5600; CHECK-NEXT: [[TMP27:%.*]] = extractelement <3 x i16> [[X]], i64 1 5601; CHECK-NEXT: [[TMP28:%.*]] = extractelement <3 x i16> [[Y]], i64 1 5602; CHECK-NEXT: [[TMP29:%.*]] = sext i16 [[TMP27]] to i32 5603; CHECK-NEXT: [[TMP30:%.*]] = sext i16 [[TMP28]] to i32 5604; CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]] 5605; CHECK-NEXT: [[TMP32:%.*]] = ashr i32 [[TMP31]], 30 5606; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP32]], 1 5607; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float 5608; CHECK-NEXT: [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float 5609; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 5610; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]] 5611; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]]) 5612; CHECK-NEXT: [[TMP39:%.*]] = fneg fast float [[TMP38]] 5613; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]]) 5614; CHECK-NEXT: [[TMP41:%.*]] = fptosi float [[TMP38]] to i32 5615; CHECK-NEXT: [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]]) 5616; CHECK-NEXT: [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]]) 5617; CHECK-NEXT: [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]] 5618; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0 5619; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]] 5620; CHECK-NEXT: [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]] 5621; CHECK-NEXT: [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]] 5622; CHECK-NEXT: [[TMP49:%.*]] = shl i32 [[TMP48]], 16 5623; CHECK-NEXT: [[TMP50:%.*]] = ashr i32 [[TMP49]], 16 5624; CHECK-NEXT: [[TMP51:%.*]] = trunc 
i32 [[TMP50]] to i16 5625; CHECK-NEXT: [[TMP52:%.*]] = insertelement <3 x i16> [[TMP26]], i16 [[TMP51]], i64 1 5626; CHECK-NEXT: [[TMP53:%.*]] = extractelement <3 x i16> [[X]], i64 2 5627; CHECK-NEXT: [[TMP54:%.*]] = extractelement <3 x i16> [[Y]], i64 2 5628; CHECK-NEXT: [[TMP55:%.*]] = sext i16 [[TMP53]] to i32 5629; CHECK-NEXT: [[TMP56:%.*]] = sext i16 [[TMP54]] to i32 5630; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]] 5631; CHECK-NEXT: [[TMP58:%.*]] = ashr i32 [[TMP57]], 30 5632; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 1 5633; CHECK-NEXT: [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float 5634; CHECK-NEXT: [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float 5635; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]]) 5636; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]] 5637; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]]) 5638; CHECK-NEXT: [[TMP65:%.*]] = fneg fast float [[TMP64]] 5639; CHECK-NEXT: [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]]) 5640; CHECK-NEXT: [[TMP67:%.*]] = fptosi float [[TMP64]] to i32 5641; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 5642; CHECK-NEXT: [[TMP69:%.*]] = call fast float @llvm.fabs.f32(float [[TMP61]]) 5643; CHECK-NEXT: [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]] 5644; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0 5645; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]] 5646; CHECK-NEXT: [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]] 5647; CHECK-NEXT: [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]] 5648; CHECK-NEXT: [[TMP75:%.*]] = shl i32 [[TMP74]], 16 5649; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 [[TMP75]], 16 5650; CHECK-NEXT: [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16 5651; CHECK-NEXT: [[TMP78:%.*]] = insertelement <3 x i16> [[TMP52]], i16 [[TMP77]], i64 2 5652; CHECK-NEXT: store <3 x i16> [[TMP78]], <3 x i16> 
addrspace(1)* [[OUT:%.*]], align 8 5653; CHECK-NEXT: ret void 5654; 5655; GFX6-LABEL: srem_v3i16: 5656; GFX6: ; %bb.0: 5657; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd 5658; GFX6-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0xb 5659; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 5660; GFX6-NEXT: s_mov_b32 s3, 0xf000 5661; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5662; GFX6-NEXT: s_sext_i32_i16 s2, s4 5663; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s2 5664; GFX6-NEXT: s_sext_i32_i16 s8, s6 5665; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s8 5666; GFX6-NEXT: s_xor_b32 s2, s8, s2 5667; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 5668; GFX6-NEXT: s_ashr_i32 s2, s2, 30 5669; GFX6-NEXT: s_or_b32 s2, s2, 1 5670; GFX6-NEXT: v_mov_b32_e32 v3, s2 5671; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 5672; GFX6-NEXT: v_trunc_f32_e32 v2, v2 5673; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 5674; GFX6-NEXT: v_cvt_i32_f32_e32 v2, v2 5675; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v0| 5676; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc 5677; GFX6-NEXT: v_mov_b32_e32 v1, s6 5678; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 5679; GFX6-NEXT: v_mov_b32_e32 v2, s4 5680; GFX6-NEXT: v_alignbit_b32 v2, s5, v2, 16 5681; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16 5682; GFX6-NEXT: v_cvt_f32_i32_e32 v4, v3 5683; GFX6-NEXT: v_alignbit_b32 v1, s7, v1, 16 5684; GFX6-NEXT: v_bfe_i32 v5, v1, 0, 16 5685; GFX6-NEXT: v_cvt_f32_i32_e32 v6, v5 5686; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v4 5687; GFX6-NEXT: v_mul_lo_u32 v0, v0, s4 5688; GFX6-NEXT: v_xor_b32_e32 v3, v5, v3 5689; GFX6-NEXT: s_sext_i32_i16 s4, s5 5690; GFX6-NEXT: v_mul_f32_e32 v5, v6, v7 5691; GFX6-NEXT: v_trunc_f32_e32 v5, v5 5692; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 5693; GFX6-NEXT: v_mad_f32 v6, -v5, v4, v6 5694; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 5695; GFX6-NEXT: v_ashrrev_i32_e32 v3, 30, v3 5696; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, |v4| 5697; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s4 5698; GFX6-NEXT: v_or_b32_e32 v3, 1, v3 5699; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc 5700; GFX6-NEXT: v_add_i32_e32 
v3, vcc, v3, v5 5701; GFX6-NEXT: s_sext_i32_i16 s6, s7 5702; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 5703; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s6 5704; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v4 5705; GFX6-NEXT: s_xor_b32 s4, s6, s4 5706; GFX6-NEXT: s_ashr_i32 s4, s4, 30 5707; GFX6-NEXT: s_or_b32 s4, s4, 1 5708; GFX6-NEXT: v_mul_f32_e32 v5, v3, v5 5709; GFX6-NEXT: v_trunc_f32_e32 v5, v5 5710; GFX6-NEXT: v_mad_f32 v3, -v5, v4, v3 5711; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 5712; GFX6-NEXT: v_mov_b32_e32 v6, s4 5713; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v4| 5714; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc 5715; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 5716; GFX6-NEXT: v_mul_lo_u32 v3, v3, s5 5717; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 5718; GFX6-NEXT: s_mov_b32 s2, -1 5719; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s7, v3 5720; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 5721; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 5722; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 5723; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 5724; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 5725; GFX6-NEXT: s_endpgm 5726; 5727; GFX9-LABEL: srem_v3i16: 5728; GFX9: ; %bb.0: 5729; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 5730; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 5731; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5732; GFX9-NEXT: s_sext_i32_i16 s8, s2 5733; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s8 5734; GFX9-NEXT: s_sext_i32_i16 s9, s4 5735; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s9 5736; GFX9-NEXT: s_xor_b32 s6, s9, s8 5737; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 5738; GFX9-NEXT: s_ashr_i32 s6, s6, 30 5739; GFX9-NEXT: s_or_b32 s10, s6, 1 5740; GFX9-NEXT: s_sext_i32_i16 s5, s5 5741; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 5742; GFX9-NEXT: v_trunc_f32_e32 v2, v2 5743; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1 5744; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, |v0| 5745; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec 5746; GFX9-NEXT: s_cselect_b32 s6, s10, 0 5747; GFX9-NEXT: s_ashr_i32 s2, s2, 16 5748; GFX9-NEXT: v_cvt_i32_f32_e32 v2, 
v2 5749; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 5750; GFX9-NEXT: s_ashr_i32 s4, s4, 16 5751; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 5752; GFX9-NEXT: v_add_u32_e32 v1, s6, v2 5753; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s4 5754; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 5755; GFX9-NEXT: s_xor_b32 s6, s4, s2 5756; GFX9-NEXT: s_ashr_i32 s6, s6, 30 5757; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8 5758; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 5759; GFX9-NEXT: v_trunc_f32_e32 v3, v3 5760; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2 5761; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 5762; GFX9-NEXT: s_or_b32 s8, s6, 1 5763; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v2|, |v0| 5764; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec 5765; GFX9-NEXT: s_cselect_b32 s6, s8, 0 5766; GFX9-NEXT: v_add_u32_e32 v0, s6, v3 5767; GFX9-NEXT: s_sext_i32_i16 s6, s3 5768; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s6 5769; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s5 5770; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2 5771; GFX9-NEXT: s_xor_b32 s2, s5, s6 5772; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2 5773; GFX9-NEXT: s_ashr_i32 s2, s2, 30 5774; GFX9-NEXT: s_or_b32 s7, s2, 1 5775; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 5776; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 5777; GFX9-NEXT: v_trunc_f32_e32 v4, v4 5778; GFX9-NEXT: v_mad_f32 v3, -v4, v2, v3 5779; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 5780; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v3|, |v2| 5781; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec 5782; GFX9-NEXT: s_cselect_b32 s2, s7, 0 5783; GFX9-NEXT: v_add_u32_e32 v2, s2, v4 5784; GFX9-NEXT: v_mul_lo_u32 v2, v2, s6 5785; GFX9-NEXT: v_mov_b32_e32 v3, 0 5786; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 5787; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 5788; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2 5789; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 5790; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5791; GFX9-NEXT: global_store_short v3, v2, s[0:1] offset:4 5792; GFX9-NEXT: global_store_dword v3, v0, s[0:1] 5793; GFX9-NEXT: s_endpgm 5794; 5795; GFX90A-LABEL: srem_v3i16: 5796; GFX90A: ; %bb.0: 5797; GFX90A-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x34 5798; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c 5799; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 5800; GFX90A-NEXT: v_mov_b32_e32 v1, 0 5801; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 5802; GFX90A-NEXT: s_sext_i32_i16 s8, s2 5803; GFX90A-NEXT: v_cvt_f32_i32_e32 v0, s8 5804; GFX90A-NEXT: s_sext_i32_i16 s9, s4 5805; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s9 5806; GFX90A-NEXT: s_xor_b32 s0, s9, s8 5807; GFX90A-NEXT: v_rcp_iflag_f32_e32 v3, v0 5808; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 5809; GFX90A-NEXT: s_or_b32 s10, s0, 1 5810; GFX90A-NEXT: v_mul_f32_e32 v3, v2, v3 5811; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 5812; GFX90A-NEXT: v_mad_f32 v2, -v3, v0, v2 5813; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0| 5814; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 5815; GFX90A-NEXT: s_cselect_b32 s0, s10, 0 5816; GFX90A-NEXT: s_ashr_i32 s2, s2, 16 5817; GFX90A-NEXT: v_cvt_i32_f32_e32 v3, v3 5818; GFX90A-NEXT: v_cvt_f32_i32_e32 v2, s2 5819; GFX90A-NEXT: s_ashr_i32 s4, s4, 16 5820; GFX90A-NEXT: v_add_u32_e32 v0, s0, v3 5821; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s4 5822; GFX90A-NEXT: v_rcp_iflag_f32_e32 v4, v2 5823; GFX90A-NEXT: s_xor_b32 s0, s4, s2 5824; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 5825; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s8 5826; GFX90A-NEXT: v_mul_f32_e32 v4, v3, v4 5827; GFX90A-NEXT: v_trunc_f32_e32 v4, v4 5828; GFX90A-NEXT: v_mad_f32 v3, -v4, v2, v3 5829; GFX90A-NEXT: v_cvt_i32_f32_e32 v4, v4 5830; GFX90A-NEXT: s_or_b32 s8, s0, 1 5831; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v2| 5832; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 5833; GFX90A-NEXT: s_cselect_b32 s0, s8, 0 5834; GFX90A-NEXT: v_add_u32_e32 v2, s0, v4 5835; GFX90A-NEXT: v_mul_lo_u32 v2, v2, s2 5836; GFX90A-NEXT: s_sext_i32_i16 s2, s3 5837; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s2 5838; GFX90A-NEXT: s_sext_i32_i16 s3, s5 5839; GFX90A-NEXT: v_cvt_f32_i32_e32 v4, s3 5840; GFX90A-NEXT: s_xor_b32 s0, s3, s2 5841; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v3 5842; GFX90A-NEXT: 
s_ashr_i32 s0, s0, 30 5843; GFX90A-NEXT: v_sub_u32_e32 v2, s4, v2 5844; GFX90A-NEXT: s_or_b32 s4, s0, 1 5845; GFX90A-NEXT: v_mul_f32_e32 v5, v4, v5 5846; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 5847; GFX90A-NEXT: v_mad_f32 v4, -v5, v3, v4 5848; GFX90A-NEXT: v_cvt_i32_f32_e32 v5, v5 5849; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| 5850; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 5851; GFX90A-NEXT: s_cselect_b32 s0, s4, 0 5852; GFX90A-NEXT: v_add_u32_e32 v3, s0, v5 5853; GFX90A-NEXT: v_sub_u32_e32 v0, s9, v0 5854; GFX90A-NEXT: v_mul_lo_u32 v3, v3, s2 5855; GFX90A-NEXT: v_sub_u32_e32 v3, s3, v3 5856; GFX90A-NEXT: v_and_b32_e32 v0, 0xffff, v0 5857; GFX90A-NEXT: v_lshl_or_b32 v0, v2, 16, v0 5858; GFX90A-NEXT: global_store_short v1, v3, s[6:7] offset:4 5859; GFX90A-NEXT: global_store_dword v1, v0, s[6:7] 5860; GFX90A-NEXT: s_endpgm 5861 %r = srem <3 x i16> %x, %y 5862 store <3 x i16> %r, <3 x i16> addrspace(1)* %out 5863 ret void 5864} 5865 5866define amdgpu_kernel void @udiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 5867; CHECK-LABEL: @udiv_v3i15( 5868; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 5869; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 5870; CHECK-NEXT: [[TMP3:%.*]] = zext i15 [[TMP1]] to i32 5871; CHECK-NEXT: [[TMP4:%.*]] = zext i15 [[TMP2]] to i32 5872; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 5873; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 5874; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 5875; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 5876; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 5877; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 5878; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 5879; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP9]] to i32 5880; CHECK-NEXT: [[TMP13:%.*]] = call 
fast float @llvm.fabs.f32(float [[TMP11]]) 5881; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 5882; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 5883; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 5884; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 5885; CHECK-NEXT: [[TMP18:%.*]] = and i32 [[TMP17]], 32767 5886; CHECK-NEXT: [[TMP19:%.*]] = trunc i32 [[TMP18]] to i15 5887; CHECK-NEXT: [[TMP20:%.*]] = insertelement <3 x i15> undef, i15 [[TMP19]], i64 0 5888; CHECK-NEXT: [[TMP21:%.*]] = extractelement <3 x i15> [[X]], i64 1 5889; CHECK-NEXT: [[TMP22:%.*]] = extractelement <3 x i15> [[Y]], i64 1 5890; CHECK-NEXT: [[TMP23:%.*]] = zext i15 [[TMP21]] to i32 5891; CHECK-NEXT: [[TMP24:%.*]] = zext i15 [[TMP22]] to i32 5892; CHECK-NEXT: [[TMP25:%.*]] = uitofp i32 [[TMP23]] to float 5893; CHECK-NEXT: [[TMP26:%.*]] = uitofp i32 [[TMP24]] to float 5894; CHECK-NEXT: [[TMP27:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP26]]) 5895; CHECK-NEXT: [[TMP28:%.*]] = fmul fast float [[TMP25]], [[TMP27]] 5896; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.trunc.f32(float [[TMP28]]) 5897; CHECK-NEXT: [[TMP30:%.*]] = fneg fast float [[TMP29]] 5898; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP30]], float [[TMP26]], float [[TMP25]]) 5899; CHECK-NEXT: [[TMP32:%.*]] = fptoui float [[TMP29]] to i32 5900; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.fabs.f32(float [[TMP31]]) 5901; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.fabs.f32(float [[TMP26]]) 5902; CHECK-NEXT: [[TMP35:%.*]] = fcmp fast oge float [[TMP33]], [[TMP34]] 5903; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 1, i32 0 5904; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP32]], [[TMP36]] 5905; CHECK-NEXT: [[TMP38:%.*]] = and i32 [[TMP37]], 32767 5906; CHECK-NEXT: [[TMP39:%.*]] = trunc i32 [[TMP38]] to i15 5907; CHECK-NEXT: [[TMP40:%.*]] = insertelement <3 x i15> [[TMP20]], i15 [[TMP39]], i64 
1 5908; CHECK-NEXT: [[TMP41:%.*]] = extractelement <3 x i15> [[X]], i64 2 5909; CHECK-NEXT: [[TMP42:%.*]] = extractelement <3 x i15> [[Y]], i64 2 5910; CHECK-NEXT: [[TMP43:%.*]] = zext i15 [[TMP41]] to i32 5911; CHECK-NEXT: [[TMP44:%.*]] = zext i15 [[TMP42]] to i32 5912; CHECK-NEXT: [[TMP45:%.*]] = uitofp i32 [[TMP43]] to float 5913; CHECK-NEXT: [[TMP46:%.*]] = uitofp i32 [[TMP44]] to float 5914; CHECK-NEXT: [[TMP47:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP46]]) 5915; CHECK-NEXT: [[TMP48:%.*]] = fmul fast float [[TMP45]], [[TMP47]] 5916; CHECK-NEXT: [[TMP49:%.*]] = call fast float @llvm.trunc.f32(float [[TMP48]]) 5917; CHECK-NEXT: [[TMP50:%.*]] = fneg fast float [[TMP49]] 5918; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP50]], float [[TMP46]], float [[TMP45]]) 5919; CHECK-NEXT: [[TMP52:%.*]] = fptoui float [[TMP49]] to i32 5920; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.fabs.f32(float [[TMP51]]) 5921; CHECK-NEXT: [[TMP54:%.*]] = call fast float @llvm.fabs.f32(float [[TMP46]]) 5922; CHECK-NEXT: [[TMP55:%.*]] = fcmp fast oge float [[TMP53]], [[TMP54]] 5923; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP55]], i32 1, i32 0 5924; CHECK-NEXT: [[TMP57:%.*]] = add i32 [[TMP52]], [[TMP56]] 5925; CHECK-NEXT: [[TMP58:%.*]] = and i32 [[TMP57]], 32767 5926; CHECK-NEXT: [[TMP59:%.*]] = trunc i32 [[TMP58]] to i15 5927; CHECK-NEXT: [[TMP60:%.*]] = insertelement <3 x i15> [[TMP40]], i15 [[TMP59]], i64 2 5928; CHECK-NEXT: store <3 x i15> [[TMP60]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 5929; CHECK-NEXT: ret void 5930; 5931; GFX6-LABEL: udiv_v3i15: 5932; GFX6: ; %bb.0: 5933; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 5934; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 5935; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 5936; GFX6-NEXT: s_mov_b32 s7, 0xf000 5937; GFX6-NEXT: s_mov_b32 s6, -1 5938; GFX6-NEXT: s_waitcnt lgkmcnt(0) 5939; GFX6-NEXT: v_mov_b32_e32 v0, s2 5940; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 5941; 
GFX6-NEXT: s_movk_i32 s3, 0x7fff 5942; GFX6-NEXT: s_and_b32 s9, s0, s3 5943; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 5944; GFX6-NEXT: s_and_b32 s8, s2, s3 5945; GFX6-NEXT: v_mov_b32_e32 v2, s0 5946; GFX6-NEXT: s_bfe_u32 s0, s0, 0xf000f 5947; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s8 5948; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 5949; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s0 5950; GFX6-NEXT: s_bfe_u32 s2, s2, 0xf000f 5951; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 5952; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 5953; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s2 5954; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 5955; GFX6-NEXT: v_and_b32_e32 v2, s3, v2 5956; GFX6-NEXT: v_trunc_f32_e32 v4, v4 5957; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 5958; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 5959; GFX6-NEXT: v_cvt_f32_u32_e32 v2, v2 5960; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 5961; GFX6-NEXT: v_mul_f32_e32 v1, v6, v7 5962; GFX6-NEXT: v_and_b32_e32 v0, s3, v0 5963; GFX6-NEXT: v_trunc_f32_e32 v1, v1 5964; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 5965; GFX6-NEXT: v_mad_f32 v4, -v1, v5, v6 5966; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 5967; GFX6-NEXT: v_cvt_f32_u32_e32 v0, v0 5968; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v2 5969; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v5 5970; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc 5971; GFX6-NEXT: v_mul_f32_e32 v1, v0, v6 5972; GFX6-NEXT: v_trunc_f32_e32 v1, v1 5973; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v1 5974; GFX6-NEXT: v_mad_f32 v0, -v1, v2, v0 5975; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v2 5976; GFX6-NEXT: v_and_b32_e32 v2, s3, v3 5977; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v5, vcc 5978; GFX6-NEXT: v_and_b32_e32 v3, s3, v4 5979; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 5980; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 5981; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 5982; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 5983; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 5984; GFX6-NEXT: s_waitcnt expcnt(0) 5985; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 5986; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 
0 offset:4 5987; GFX6-NEXT: s_endpgm 5988; 5989; GFX9-LABEL: udiv_v3i15: 5990; GFX9: ; %bb.0: 5991; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 5992; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 5993; GFX9-NEXT: s_movk_i32 s6, 0x7fff 5994; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 5995; GFX9-NEXT: v_mov_b32_e32 v2, 0 5996; GFX9-NEXT: s_waitcnt lgkmcnt(0) 5997; GFX9-NEXT: v_mov_b32_e32 v0, s2 5998; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 5999; GFX9-NEXT: s_and_b32 s3, s2, s6 6000; GFX9-NEXT: s_and_b32 s7, s0, s6 6001; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7 6002; GFX9-NEXT: v_mov_b32_e32 v3, s0 6003; GFX9-NEXT: s_bfe_u32 s0, s0, 0xf000f 6004; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s3 6005; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 6006; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s0 6007; GFX9-NEXT: s_bfe_u32 s2, s2, 0xf000f 6008; GFX9-NEXT: v_alignbit_b32 v3, s1, v3, 30 6009; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 6010; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s2 6011; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6 6012; GFX9-NEXT: v_and_b32_e32 v3, s6, v3 6013; GFX9-NEXT: v_trunc_f32_e32 v5, v5 6014; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4 6015; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 6016; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v3 6017; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 6018; GFX9-NEXT: v_mul_f32_e32 v1, v7, v8 6019; GFX9-NEXT: v_and_b32_e32 v0, s6, v0 6020; GFX9-NEXT: v_trunc_f32_e32 v1, v1 6021; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc 6022; GFX9-NEXT: v_mad_f32 v5, -v1, v6, v7 6023; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 6024; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0 6025; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v3 6026; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v6 6027; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc 6028; GFX9-NEXT: v_mul_f32_e32 v1, v0, v7 6029; GFX9-NEXT: v_trunc_f32_e32 v1, v1 6030; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v1 6031; GFX9-NEXT: v_mad_f32 v0, -v1, v3, v0 6032; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v3 6033; GFX9-NEXT: v_and_b32_e32 v3, s6, v4 6034; GFX9-NEXT: v_addc_co_u32_e32 
v0, vcc, 0, v6, vcc 6035; GFX9-NEXT: v_and_b32_e32 v4, s6, v5 6036; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 6037; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 6038; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 6039; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 6040; GFX9-NEXT: global_store_dword v2, v0, s[4:5] 6041; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 6042; GFX9-NEXT: global_store_short v2, v0, s[4:5] offset:4 6043; GFX9-NEXT: s_endpgm 6044; 6045; GFX90A-LABEL: udiv_v3i15: 6046; GFX90A: ; %bb.0: 6047; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 6048; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 6049; GFX90A-NEXT: s_movk_i32 s6, 0x7fff 6050; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 6051; GFX90A-NEXT: v_mov_b32_e32 v2, 0 6052; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 6053; GFX90A-NEXT: v_mov_b32_e32 v0, s2 6054; GFX90A-NEXT: v_alignbit_b32 v0, s3, v0, 30 6055; GFX90A-NEXT: s_and_b32 s3, s2, s6 6056; GFX90A-NEXT: s_and_b32 s7, s0, s6 6057; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s7 6058; GFX90A-NEXT: v_mov_b32_e32 v3, s0 6059; GFX90A-NEXT: s_bfe_u32 s0, s0, 0xf000f 6060; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s3 6061; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v1 6062; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s0 6063; GFX90A-NEXT: s_bfe_u32 s2, s2, 0xf000f 6064; GFX90A-NEXT: v_alignbit_b32 v3, s1, v3, 30 6065; GFX90A-NEXT: v_mul_f32_e32 v5, v4, v5 6066; GFX90A-NEXT: v_cvt_f32_u32_e32 v7, s2 6067; GFX90A-NEXT: v_rcp_iflag_f32_e32 v8, v6 6068; GFX90A-NEXT: v_and_b32_e32 v3, s6, v3 6069; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 6070; GFX90A-NEXT: v_mad_f32 v4, -v5, v1, v4 6071; GFX90A-NEXT: v_cvt_u32_f32_e32 v5, v5 6072; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, v3 6073; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 6074; GFX90A-NEXT: v_mul_f32_e32 v1, v7, v8 6075; GFX90A-NEXT: v_and_b32_e32 v0, s6, v0 6076; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 6077; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc 6078; GFX90A-NEXT: v_mad_f32 v5, -v1, v6, v7 6079; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 6080; GFX90A-NEXT: 
v_cvt_f32_u32_e32 v0, v0 6081; GFX90A-NEXT: v_rcp_iflag_f32_e32 v7, v3 6082; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v6 6083; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc 6084; GFX90A-NEXT: v_mul_f32_e32 v1, v0, v7 6085; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 6086; GFX90A-NEXT: v_cvt_u32_f32_e32 v6, v1 6087; GFX90A-NEXT: v_mad_f32 v0, -v1, v3, v0 6088; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, v3 6089; GFX90A-NEXT: v_and_b32_e32 v3, s6, v4 6090; GFX90A-NEXT: v_and_b32_e32 v4, s6, v5 6091; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc 6092; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 15, v4 6093; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 6094; GFX90A-NEXT: v_or_b32_e32 v3, v3, v4 6095; GFX90A-NEXT: v_or_b32_e32 v0, v3, v0 6096; GFX90A-NEXT: global_store_dword v2, v0, s[4:5] 6097; GFX90A-NEXT: v_and_b32_e32 v0, 0x1fff, v1 6098; GFX90A-NEXT: global_store_short v2, v0, s[4:5] offset:4 6099; GFX90A-NEXT: s_endpgm 6100 %r = udiv <3 x i15> %x, %y 6101 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 6102 ret void 6103} 6104 6105define amdgpu_kernel void @urem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 6106; CHECK-LABEL: @urem_v3i15( 6107; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 6108; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 6109; CHECK-NEXT: [[TMP3:%.*]] = zext i15 [[TMP1]] to i32 6110; CHECK-NEXT: [[TMP4:%.*]] = zext i15 [[TMP2]] to i32 6111; CHECK-NEXT: [[TMP5:%.*]] = uitofp i32 [[TMP3]] to float 6112; CHECK-NEXT: [[TMP6:%.*]] = uitofp i32 [[TMP4]] to float 6113; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP6]]) 6114; CHECK-NEXT: [[TMP8:%.*]] = fmul fast float [[TMP5]], [[TMP7]] 6115; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.trunc.f32(float [[TMP8]]) 6116; CHECK-NEXT: [[TMP10:%.*]] = fneg fast float [[TMP9]] 6117; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP10]], float [[TMP6]], float [[TMP5]]) 6118; CHECK-NEXT: 
[[TMP12:%.*]] = fptoui float [[TMP9]] to i32 6119; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.fabs.f32(float [[TMP11]]) 6120; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.fabs.f32(float [[TMP6]]) 6121; CHECK-NEXT: [[TMP15:%.*]] = fcmp fast oge float [[TMP13]], [[TMP14]] 6122; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], i32 1, i32 0 6123; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP12]], [[TMP16]] 6124; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP17]], [[TMP4]] 6125; CHECK-NEXT: [[TMP19:%.*]] = sub i32 [[TMP3]], [[TMP18]] 6126; CHECK-NEXT: [[TMP20:%.*]] = and i32 [[TMP19]], 32767 6127; CHECK-NEXT: [[TMP21:%.*]] = trunc i32 [[TMP20]] to i15 6128; CHECK-NEXT: [[TMP22:%.*]] = insertelement <3 x i15> undef, i15 [[TMP21]], i64 0 6129; CHECK-NEXT: [[TMP23:%.*]] = extractelement <3 x i15> [[X]], i64 1 6130; CHECK-NEXT: [[TMP24:%.*]] = extractelement <3 x i15> [[Y]], i64 1 6131; CHECK-NEXT: [[TMP25:%.*]] = zext i15 [[TMP23]] to i32 6132; CHECK-NEXT: [[TMP26:%.*]] = zext i15 [[TMP24]] to i32 6133; CHECK-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP25]] to float 6134; CHECK-NEXT: [[TMP28:%.*]] = uitofp i32 [[TMP26]] to float 6135; CHECK-NEXT: [[TMP29:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP28]]) 6136; CHECK-NEXT: [[TMP30:%.*]] = fmul fast float [[TMP27]], [[TMP29]] 6137; CHECK-NEXT: [[TMP31:%.*]] = call fast float @llvm.trunc.f32(float [[TMP30]]) 6138; CHECK-NEXT: [[TMP32:%.*]] = fneg fast float [[TMP31]] 6139; CHECK-NEXT: [[TMP33:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP32]], float [[TMP28]], float [[TMP27]]) 6140; CHECK-NEXT: [[TMP34:%.*]] = fptoui float [[TMP31]] to i32 6141; CHECK-NEXT: [[TMP35:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 6142; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.fabs.f32(float [[TMP28]]) 6143; CHECK-NEXT: [[TMP37:%.*]] = fcmp fast oge float [[TMP35]], [[TMP36]] 6144; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP37]], i32 1, i32 0 6145; CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP34]], [[TMP38]] 
6146; CHECK-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP26]] 6147; CHECK-NEXT: [[TMP41:%.*]] = sub i32 [[TMP25]], [[TMP40]] 6148; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], 32767 6149; CHECK-NEXT: [[TMP43:%.*]] = trunc i32 [[TMP42]] to i15 6150; CHECK-NEXT: [[TMP44:%.*]] = insertelement <3 x i15> [[TMP22]], i15 [[TMP43]], i64 1 6151; CHECK-NEXT: [[TMP45:%.*]] = extractelement <3 x i15> [[X]], i64 2 6152; CHECK-NEXT: [[TMP46:%.*]] = extractelement <3 x i15> [[Y]], i64 2 6153; CHECK-NEXT: [[TMP47:%.*]] = zext i15 [[TMP45]] to i32 6154; CHECK-NEXT: [[TMP48:%.*]] = zext i15 [[TMP46]] to i32 6155; CHECK-NEXT: [[TMP49:%.*]] = uitofp i32 [[TMP47]] to float 6156; CHECK-NEXT: [[TMP50:%.*]] = uitofp i32 [[TMP48]] to float 6157; CHECK-NEXT: [[TMP51:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP50]]) 6158; CHECK-NEXT: [[TMP52:%.*]] = fmul fast float [[TMP49]], [[TMP51]] 6159; CHECK-NEXT: [[TMP53:%.*]] = call fast float @llvm.trunc.f32(float [[TMP52]]) 6160; CHECK-NEXT: [[TMP54:%.*]] = fneg fast float [[TMP53]] 6161; CHECK-NEXT: [[TMP55:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP54]], float [[TMP50]], float [[TMP49]]) 6162; CHECK-NEXT: [[TMP56:%.*]] = fptoui float [[TMP53]] to i32 6163; CHECK-NEXT: [[TMP57:%.*]] = call fast float @llvm.fabs.f32(float [[TMP55]]) 6164; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.fabs.f32(float [[TMP50]]) 6165; CHECK-NEXT: [[TMP59:%.*]] = fcmp fast oge float [[TMP57]], [[TMP58]] 6166; CHECK-NEXT: [[TMP60:%.*]] = select i1 [[TMP59]], i32 1, i32 0 6167; CHECK-NEXT: [[TMP61:%.*]] = add i32 [[TMP56]], [[TMP60]] 6168; CHECK-NEXT: [[TMP62:%.*]] = mul i32 [[TMP61]], [[TMP48]] 6169; CHECK-NEXT: [[TMP63:%.*]] = sub i32 [[TMP47]], [[TMP62]] 6170; CHECK-NEXT: [[TMP64:%.*]] = and i32 [[TMP63]], 32767 6171; CHECK-NEXT: [[TMP65:%.*]] = trunc i32 [[TMP64]] to i15 6172; CHECK-NEXT: [[TMP66:%.*]] = insertelement <3 x i15> [[TMP44]], i15 [[TMP65]], i64 2 6173; CHECK-NEXT: store <3 x i15> [[TMP66]], <3 x i15> addrspace(1)* 
[[OUT:%.*]], align 8 6174; CHECK-NEXT: ret void 6175; 6176; GFX6-LABEL: urem_v3i15: 6177; GFX6: ; %bb.0: 6178; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 6179; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 6180; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 6181; GFX6-NEXT: s_mov_b32 s7, 0xf000 6182; GFX6-NEXT: s_mov_b32 s6, -1 6183; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6184; GFX6-NEXT: v_mov_b32_e32 v0, s2 6185; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 6186; GFX6-NEXT: s_movk_i32 s3, 0x7fff 6187; GFX6-NEXT: s_and_b32 s10, s0, s3 6188; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s10 6189; GFX6-NEXT: s_and_b32 s9, s2, s3 6190; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s9 6191; GFX6-NEXT: v_mov_b32_e32 v2, s0 6192; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 6193; GFX6-NEXT: v_alignbit_b32 v2, s1, v2, 30 6194; GFX6-NEXT: s_bfe_u32 s1, s0, 0xf000f 6195; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s1 6196; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 6197; GFX6-NEXT: v_trunc_f32_e32 v4, v4 6198; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 6199; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 6200; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 6201; GFX6-NEXT: s_bfe_u32 s10, s2, 0xf000f 6202; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s10 6203; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc 6204; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 6205; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 6206; GFX6-NEXT: v_and_b32_e32 v2, s3, v2 6207; GFX6-NEXT: v_and_b32_e32 v0, s3, v0 6208; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s2, v1 6209; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 6210; GFX6-NEXT: v_cvt_f32_u32_e32 v4, v2 6211; GFX6-NEXT: v_cvt_f32_u32_e32 v7, v0 6212; GFX6-NEXT: v_trunc_f32_e32 v1, v1 6213; GFX6-NEXT: v_mad_f32 v3, -v1, v5, v3 6214; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v4 6215; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 6216; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 6217; GFX6-NEXT: s_lshr_b32 s0, s0, 15 6218; GFX6-NEXT: v_mul_f32_e32 v3, v7, v8 6219; GFX6-NEXT: v_trunc_f32_e32 v3, v3 6220; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v3 6221; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc 
6222; GFX6-NEXT: v_mad_f32 v3, -v3, v4, v7 6223; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 6224; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 6225; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 6226; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 6227; GFX6-NEXT: s_lshr_b32 s8, s2, 15 6228; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s8, v1 6229; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, v2, v0 6230; GFX6-NEXT: v_and_b32_e32 v3, s3, v3 6231; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 6232; GFX6-NEXT: v_and_b32_e32 v2, s3, v6 6233; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 6234; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 6235; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 6236; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 6237; GFX6-NEXT: s_waitcnt expcnt(0) 6238; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 6239; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 6240; GFX6-NEXT: s_endpgm 6241; 6242; GFX9-LABEL: urem_v3i15: 6243; GFX9: ; %bb.0: 6244; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 6245; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 6246; GFX9-NEXT: s_movk_i32 s6, 0x7fff 6247; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 6248; GFX9-NEXT: v_mov_b32_e32 v2, 0 6249; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6250; GFX9-NEXT: v_mov_b32_e32 v0, s2 6251; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 6252; GFX9-NEXT: s_and_b32 s3, s2, s6 6253; GFX9-NEXT: s_and_b32 s8, s0, s6 6254; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s8 6255; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s3 6256; GFX9-NEXT: s_bfe_u32 s3, s0, 0xf000f 6257; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s3 6258; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 6259; GFX9-NEXT: v_mov_b32_e32 v3, s0 6260; GFX9-NEXT: v_alignbit_b32 v3, s1, v3, 30 6261; GFX9-NEXT: s_bfe_u32 s7, s2, 0xf000f 6262; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 6263; GFX9-NEXT: v_trunc_f32_e32 v5, v5 6264; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4 6265; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 6266; GFX9-NEXT: v_and_b32_e32 v3, s6, v3 6267; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 6268; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s7 6269; GFX9-NEXT: 
v_rcp_iflag_f32_e32 v8, v6 6270; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc 6271; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v3 6272; GFX9-NEXT: v_and_b32_e32 v0, s6, v0 6273; GFX9-NEXT: v_mul_f32_e32 v4, v7, v8 6274; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v0 6275; GFX9-NEXT: v_rcp_iflag_f32_e32 v9, v5 6276; GFX9-NEXT: v_trunc_f32_e32 v4, v4 6277; GFX9-NEXT: v_mad_f32 v7, -v4, v6, v7 6278; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 6279; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, v6 6280; GFX9-NEXT: v_mul_f32_e32 v6, v8, v9 6281; GFX9-NEXT: v_trunc_f32_e32 v6, v6 6282; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v6 6283; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc 6284; GFX9-NEXT: v_mad_f32 v6, -v6, v5, v8 6285; GFX9-NEXT: s_lshr_b32 s1, s0, 15 6286; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5 6287; GFX9-NEXT: v_mul_lo_u32 v4, v4, s1 6288; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc 6289; GFX9-NEXT: v_mul_lo_u32 v1, v1, s0 6290; GFX9-NEXT: v_mul_lo_u32 v3, v5, v3 6291; GFX9-NEXT: s_lshr_b32 s0, s2, 15 6292; GFX9-NEXT: v_sub_u32_e32 v4, s0, v4 6293; GFX9-NEXT: v_sub_u32_e32 v5, s2, v1 6294; GFX9-NEXT: v_sub_u32_e32 v0, v0, v3 6295; GFX9-NEXT: v_and_b32_e32 v4, s6, v4 6296; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 6297; GFX9-NEXT: v_and_b32_e32 v3, s6, v5 6298; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 6299; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 6300; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 6301; GFX9-NEXT: global_store_dword v2, v0, s[4:5] 6302; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 6303; GFX9-NEXT: global_store_short v2, v0, s[4:5] offset:4 6304; GFX9-NEXT: s_endpgm 6305; 6306; GFX90A-LABEL: urem_v3i15: 6307; GFX90A: ; %bb.0: 6308; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 6309; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 6310; GFX90A-NEXT: s_movk_i32 s6, 0x7fff 6311; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 6312; GFX90A-NEXT: v_mov_b32_e32 v2, 0 6313; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 6314; GFX90A-NEXT: s_and_b32 s7, s2, s6 6315; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s7 6316; 
GFX90A-NEXT: s_bfe_u32 s8, s2, 0xf000f 6317; GFX90A-NEXT: s_and_b32 s9, s0, s6 6318; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s9 6319; GFX90A-NEXT: s_bfe_u32 s7, s0, 0xf000f 6320; GFX90A-NEXT: v_cvt_f32_u32_e32 v6, s7 6321; GFX90A-NEXT: v_cvt_f32_u32_e32 v7, s8 6322; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v1 6323; GFX90A-NEXT: v_mov_b32_e32 v3, s0 6324; GFX90A-NEXT: v_rcp_iflag_f32_e32 v8, v6 6325; GFX90A-NEXT: v_alignbit_b32 v3, s1, v3, 30 6326; GFX90A-NEXT: v_mul_f32_e32 v5, v4, v5 6327; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 6328; GFX90A-NEXT: v_mad_f32 v4, -v5, v1, v4 6329; GFX90A-NEXT: v_cvt_u32_f32_e32 v5, v5 6330; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1 6331; GFX90A-NEXT: v_and_b32_e32 v3, s6, v3 6332; GFX90A-NEXT: v_mov_b32_e32 v0, s2 6333; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc 6334; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s0 6335; GFX90A-NEXT: v_sub_u32_e32 v4, s2, v1 6336; GFX90A-NEXT: v_mul_f32_e32 v1, v7, v8 6337; GFX90A-NEXT: v_cvt_f32_u32_e32 v5, v3 6338; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 6339; GFX90A-NEXT: v_alignbit_b32 v0, s3, v0, 30 6340; GFX90A-NEXT: v_mad_f32 v7, -v1, v6, v7 6341; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 6342; GFX90A-NEXT: v_and_b32_e32 v0, s6, v0 6343; GFX90A-NEXT: v_cvt_f32_u32_e32 v8, v0 6344; GFX90A-NEXT: v_rcp_iflag_f32_e32 v9, v5 6345; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, v6 6346; GFX90A-NEXT: s_lshr_b32 s1, s0, 15 6347; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 6348; GFX90A-NEXT: s_lshr_b32 s3, s2, 15 6349; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s1 6350; GFX90A-NEXT: v_sub_u32_e32 v6, s3, v1 6351; GFX90A-NEXT: v_mul_f32_e32 v1, v8, v9 6352; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 6353; GFX90A-NEXT: v_cvt_u32_f32_e32 v7, v1 6354; GFX90A-NEXT: v_mad_f32 v1, -v1, v5, v8 6355; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v5 6356; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc 6357; GFX90A-NEXT: v_mul_lo_u32 v1, v1, v3 6358; GFX90A-NEXT: v_and_b32_e32 v3, s6, v4 6359; GFX90A-NEXT: v_and_b32_e32 v4, s6, v6 6360; GFX90A-NEXT: 
v_sub_u32_e32 v0, v0, v1 6361; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 15, v4 6362; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 6363; GFX90A-NEXT: v_or_b32_e32 v3, v3, v4 6364; GFX90A-NEXT: v_or_b32_e32 v0, v3, v0 6365; GFX90A-NEXT: global_store_dword v2, v0, s[4:5] 6366; GFX90A-NEXT: v_and_b32_e32 v0, 0x1fff, v1 6367; GFX90A-NEXT: global_store_short v2, v0, s[4:5] offset:4 6368; GFX90A-NEXT: s_endpgm 6369 %r = urem <3 x i15> %x, %y 6370 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 6371 ret void 6372} 6373 6374define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 6375; CHECK-LABEL: @sdiv_v3i15( 6376; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 6377; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 6378; CHECK-NEXT: [[TMP3:%.*]] = sext i15 [[TMP1]] to i32 6379; CHECK-NEXT: [[TMP4:%.*]] = sext i15 [[TMP2]] to i32 6380; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 6381; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 6382; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 6383; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 6384; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 6385; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 6386; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 6387; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 6388; CHECK-NEXT: [[TMP13:%.*]] = fneg fast float [[TMP12]] 6389; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 6390; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 6391; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 6392; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 6393; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 6394; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], 
i32 0 6395; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 6396; CHECK-NEXT: [[TMP21:%.*]] = shl i32 [[TMP20]], 17 6397; CHECK-NEXT: [[TMP22:%.*]] = ashr i32 [[TMP21]], 17 6398; CHECK-NEXT: [[TMP23:%.*]] = trunc i32 [[TMP22]] to i15 6399; CHECK-NEXT: [[TMP24:%.*]] = insertelement <3 x i15> undef, i15 [[TMP23]], i64 0 6400; CHECK-NEXT: [[TMP25:%.*]] = extractelement <3 x i15> [[X]], i64 1 6401; CHECK-NEXT: [[TMP26:%.*]] = extractelement <3 x i15> [[Y]], i64 1 6402; CHECK-NEXT: [[TMP27:%.*]] = sext i15 [[TMP25]] to i32 6403; CHECK-NEXT: [[TMP28:%.*]] = sext i15 [[TMP26]] to i32 6404; CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP27]], [[TMP28]] 6405; CHECK-NEXT: [[TMP30:%.*]] = ashr i32 [[TMP29]], 30 6406; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 1 6407; CHECK-NEXT: [[TMP32:%.*]] = sitofp i32 [[TMP27]] to float 6408; CHECK-NEXT: [[TMP33:%.*]] = sitofp i32 [[TMP28]] to float 6409; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 6410; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP32]], [[TMP34]] 6411; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.trunc.f32(float [[TMP35]]) 6412; CHECK-NEXT: [[TMP37:%.*]] = fneg fast float [[TMP36]] 6413; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP37]], float [[TMP33]], float [[TMP32]]) 6414; CHECK-NEXT: [[TMP39:%.*]] = fptosi float [[TMP36]] to i32 6415; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.fabs.f32(float [[TMP38]]) 6416; CHECK-NEXT: [[TMP41:%.*]] = call fast float @llvm.fabs.f32(float [[TMP33]]) 6417; CHECK-NEXT: [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]] 6418; CHECK-NEXT: [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0 6419; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]] 6420; CHECK-NEXT: [[TMP45:%.*]] = shl i32 [[TMP44]], 17 6421; CHECK-NEXT: [[TMP46:%.*]] = ashr i32 [[TMP45]], 17 6422; CHECK-NEXT: [[TMP47:%.*]] = trunc i32 [[TMP46]] to i15 6423; CHECK-NEXT: [[TMP48:%.*]] = insertelement <3 x i15> 
[[TMP24]], i15 [[TMP47]], i64 1 6424; CHECK-NEXT: [[TMP49:%.*]] = extractelement <3 x i15> [[X]], i64 2 6425; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x i15> [[Y]], i64 2 6426; CHECK-NEXT: [[TMP51:%.*]] = sext i15 [[TMP49]] to i32 6427; CHECK-NEXT: [[TMP52:%.*]] = sext i15 [[TMP50]] to i32 6428; CHECK-NEXT: [[TMP53:%.*]] = xor i32 [[TMP51]], [[TMP52]] 6429; CHECK-NEXT: [[TMP54:%.*]] = ashr i32 [[TMP53]], 30 6430; CHECK-NEXT: [[TMP55:%.*]] = or i32 [[TMP54]], 1 6431; CHECK-NEXT: [[TMP56:%.*]] = sitofp i32 [[TMP51]] to float 6432; CHECK-NEXT: [[TMP57:%.*]] = sitofp i32 [[TMP52]] to float 6433; CHECK-NEXT: [[TMP58:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP57]]) 6434; CHECK-NEXT: [[TMP59:%.*]] = fmul fast float [[TMP56]], [[TMP58]] 6435; CHECK-NEXT: [[TMP60:%.*]] = call fast float @llvm.trunc.f32(float [[TMP59]]) 6436; CHECK-NEXT: [[TMP61:%.*]] = fneg fast float [[TMP60]] 6437; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP61]], float [[TMP57]], float [[TMP56]]) 6438; CHECK-NEXT: [[TMP63:%.*]] = fptosi float [[TMP60]] to i32 6439; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.fabs.f32(float [[TMP62]]) 6440; CHECK-NEXT: [[TMP65:%.*]] = call fast float @llvm.fabs.f32(float [[TMP57]]) 6441; CHECK-NEXT: [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]] 6442; CHECK-NEXT: [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0 6443; CHECK-NEXT: [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]] 6444; CHECK-NEXT: [[TMP69:%.*]] = shl i32 [[TMP68]], 17 6445; CHECK-NEXT: [[TMP70:%.*]] = ashr i32 [[TMP69]], 17 6446; CHECK-NEXT: [[TMP71:%.*]] = trunc i32 [[TMP70]] to i15 6447; CHECK-NEXT: [[TMP72:%.*]] = insertelement <3 x i15> [[TMP48]], i15 [[TMP71]], i64 2 6448; CHECK-NEXT: store <3 x i15> [[TMP72]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 6449; CHECK-NEXT: ret void 6450; 6451; GFX6-LABEL: sdiv_v3i15: 6452; GFX6: ; %bb.0: 6453; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 6454; GFX6-NEXT: s_load_dwordx2 s[4:5], 
s[0:1], 0x9 6455; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 6456; GFX6-NEXT: s_mov_b32 s7, 0xf000 6457; GFX6-NEXT: s_mov_b32 s6, -1 6458; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6459; GFX6-NEXT: v_mov_b32_e32 v0, s2 6460; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 6461; GFX6-NEXT: s_bfe_i32 s3, s0, 0xf0000 6462; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s3 6463; GFX6-NEXT: v_mov_b32_e32 v1, s0 6464; GFX6-NEXT: v_alignbit_b32 v1, s1, v1, 30 6465; GFX6-NEXT: s_bfe_i32 s1, s2, 0xf0000 6466; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s1 6467; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 6468; GFX6-NEXT: s_xor_b32 s1, s1, s3 6469; GFX6-NEXT: s_bfe_i32 s0, s0, 0xf000f 6470; GFX6-NEXT: s_ashr_i32 s1, s1, 30 6471; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 6472; GFX6-NEXT: v_trunc_f32_e32 v4, v4 6473; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 6474; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 6475; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| 6476; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s0 6477; GFX6-NEXT: s_or_b32 s1, s1, 1 6478; GFX6-NEXT: v_mov_b32_e32 v5, s1 6479; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc 6480; GFX6-NEXT: s_bfe_i32 s1, s2, 0xf000f 6481; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 6482; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s1 6483; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 6484; GFX6-NEXT: s_xor_b32 s0, s1, s0 6485; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 15 6486; GFX6-NEXT: s_ashr_i32 s0, s0, 30 6487; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 6488; GFX6-NEXT: v_trunc_f32_e32 v5, v5 6489; GFX6-NEXT: v_mad_f32 v4, -v5, v3, v4 6490; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 6491; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v3| 6492; GFX6-NEXT: v_cvt_f32_i32_e32 v4, v1 6493; GFX6-NEXT: s_or_b32 s0, s0, 1 6494; GFX6-NEXT: v_mov_b32_e32 v6, s0 6495; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc 6496; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 15 6497; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 6498; GFX6-NEXT: v_cvt_f32_i32_e32 v5, v0 6499; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 6500; GFX6-NEXT: v_xor_b32_e32 v0, v0, v1 6501; GFX6-NEXT: v_ashrrev_i32_e32 v0, 30, v0 
6502; GFX6-NEXT: v_or_b32_e32 v0, 1, v0 6503; GFX6-NEXT: v_mul_f32_e32 v1, v5, v6 6504; GFX6-NEXT: v_trunc_f32_e32 v1, v1 6505; GFX6-NEXT: v_mad_f32 v5, -v1, v4, v5 6506; GFX6-NEXT: v_cvt_i32_f32_e32 v1, v1 6507; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, |v4| 6508; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 6509; GFX6-NEXT: s_movk_i32 s0, 0x7fff 6510; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 6511; GFX6-NEXT: v_and_b32_e32 v3, s0, v3 6512; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 6513; GFX6-NEXT: v_and_b32_e32 v2, s0, v2 6514; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 6515; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 6516; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 6517; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 6518; GFX6-NEXT: s_waitcnt expcnt(0) 6519; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 6520; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 6521; GFX6-NEXT: s_endpgm 6522; 6523; GFX9-LABEL: sdiv_v3i15: 6524; GFX9: ; %bb.0: 6525; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 6526; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 6527; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 6528; GFX9-NEXT: v_mov_b32_e32 v2, 0 6529; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6530; GFX9-NEXT: s_bfe_i32 s1, s2, 0xf0000 6531; GFX9-NEXT: s_bfe_i32 s0, s4, 0xf0000 6532; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 6533; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s1 6534; GFX9-NEXT: s_xor_b32 s0, s1, s0 6535; GFX9-NEXT: v_mov_b32_e32 v0, s2 6536; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 6537; GFX9-NEXT: s_ashr_i32 s0, s0, 30 6538; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 6539; GFX9-NEXT: s_or_b32 s3, s0, 1 6540; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 6541; GFX9-NEXT: v_trunc_f32_e32 v5, v5 6542; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 6543; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| 6544; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 6545; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 6546; GFX9-NEXT: s_cselect_b32 s0, s3, 0 6547; GFX9-NEXT: s_bfe_i32 s1, s4, 0xf000f 6548; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s1 6549; GFX9-NEXT: 
v_add_u32_e32 v4, s0, v5 6550; GFX9-NEXT: s_bfe_i32 s0, s2, 0xf000f 6551; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 6552; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 6553; GFX9-NEXT: v_mov_b32_e32 v1, s4 6554; GFX9-NEXT: v_alignbit_b32 v1, s5, v1, 30 6555; GFX9-NEXT: s_xor_b32 s0, s0, s1 6556; GFX9-NEXT: v_mul_f32_e32 v6, v5, v6 6557; GFX9-NEXT: v_trunc_f32_e32 v6, v6 6558; GFX9-NEXT: s_ashr_i32 s0, s0, 30 6559; GFX9-NEXT: v_mad_f32 v5, -v6, v3, v5 6560; GFX9-NEXT: v_bfe_i32 v1, v1, 0, 15 6561; GFX9-NEXT: s_or_b32 s2, s0, 1 6562; GFX9-NEXT: v_cvt_i32_f32_e32 v6, v6 6563; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v3| 6564; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v1 6565; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 6566; GFX9-NEXT: s_cselect_b32 s0, s2, 0 6567; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 15 6568; GFX9-NEXT: v_add_u32_e32 v5, s0, v6 6569; GFX9-NEXT: v_cvt_f32_i32_e32 v6, v0 6570; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v3 6571; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1 6572; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v0 6573; GFX9-NEXT: v_or_b32_e32 v0, 1, v0 6574; GFX9-NEXT: v_mul_f32_e32 v1, v6, v7 6575; GFX9-NEXT: v_trunc_f32_e32 v1, v1 6576; GFX9-NEXT: v_cvt_i32_f32_e32 v7, v1 6577; GFX9-NEXT: v_mad_f32 v1, -v1, v3, v6 6578; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v3| 6579; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 6580; GFX9-NEXT: s_movk_i32 s0, 0x7fff 6581; GFX9-NEXT: v_add_u32_e32 v0, v7, v0 6582; GFX9-NEXT: v_and_b32_e32 v3, s0, v4 6583; GFX9-NEXT: v_and_b32_e32 v4, s0, v5 6584; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 6585; GFX9-NEXT: v_lshlrev_b32_e32 v4, 15, v4 6586; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 6587; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 6588; GFX9-NEXT: global_store_dword v2, v0, s[6:7] 6589; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 6590; GFX9-NEXT: global_store_short v2, v0, s[6:7] offset:4 6591; GFX9-NEXT: s_endpgm 6592; 6593; GFX90A-LABEL: sdiv_v3i15: 6594; GFX90A: ; %bb.0: 6595; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 6596; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 
0x34 6597; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 6598; GFX90A-NEXT: v_mov_b32_e32 v2, 0 6599; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 6600; GFX90A-NEXT: s_bfe_i32 s1, s2, 0xf0000 6601; GFX90A-NEXT: s_bfe_i32 s0, s4, 0xf0000 6602; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s0 6603; GFX90A-NEXT: v_cvt_f32_i32_e32 v4, s1 6604; GFX90A-NEXT: s_xor_b32 s0, s1, s0 6605; GFX90A-NEXT: v_mov_b32_e32 v0, s2 6606; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v3 6607; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 6608; GFX90A-NEXT: v_alignbit_b32 v0, s3, v0, 30 6609; GFX90A-NEXT: s_or_b32 s3, s0, 1 6610; GFX90A-NEXT: v_mul_f32_e32 v5, v4, v5 6611; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 6612; GFX90A-NEXT: v_mad_f32 v4, -v5, v3, v4 6613; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| 6614; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 6615; GFX90A-NEXT: v_cvt_i32_f32_e32 v5, v5 6616; GFX90A-NEXT: s_cselect_b32 s0, s3, 0 6617; GFX90A-NEXT: s_bfe_i32 s1, s4, 0xf000f 6618; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s1 6619; GFX90A-NEXT: v_add_u32_e32 v4, s0, v5 6620; GFX90A-NEXT: s_bfe_i32 s0, s2, 0xf000f 6621; GFX90A-NEXT: v_cvt_f32_i32_e32 v5, s0 6622; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v3 6623; GFX90A-NEXT: v_mov_b32_e32 v1, s4 6624; GFX90A-NEXT: v_alignbit_b32 v1, s5, v1, 30 6625; GFX90A-NEXT: s_xor_b32 s0, s0, s1 6626; GFX90A-NEXT: v_mul_f32_e32 v6, v5, v6 6627; GFX90A-NEXT: v_trunc_f32_e32 v6, v6 6628; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 6629; GFX90A-NEXT: v_mad_f32 v5, -v6, v3, v5 6630; GFX90A-NEXT: v_bfe_i32 v1, v1, 0, 15 6631; GFX90A-NEXT: s_or_b32 s2, s0, 1 6632; GFX90A-NEXT: v_cvt_i32_f32_e32 v6, v6 6633; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v3| 6634; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, v1 6635; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 6636; GFX90A-NEXT: s_cselect_b32 s0, s2, 0 6637; GFX90A-NEXT: v_bfe_i32 v0, v0, 0, 15 6638; GFX90A-NEXT: v_add_u32_e32 v5, s0, v6 6639; GFX90A-NEXT: v_cvt_f32_i32_e32 v6, v0 6640; GFX90A-NEXT: v_rcp_iflag_f32_e32 v7, v3 6641; GFX90A-NEXT: v_xor_b32_e32 v0, v0, 
v1 6642; GFX90A-NEXT: v_ashrrev_i32_e32 v0, 30, v0 6643; GFX90A-NEXT: v_or_b32_e32 v0, 1, v0 6644; GFX90A-NEXT: v_mul_f32_e32 v1, v6, v7 6645; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 6646; GFX90A-NEXT: v_cvt_i32_f32_e32 v7, v1 6647; GFX90A-NEXT: v_mad_f32 v1, -v1, v3, v6 6648; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, |v3| 6649; GFX90A-NEXT: s_movk_i32 s0, 0x7fff 6650; GFX90A-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc 6651; GFX90A-NEXT: v_and_b32_e32 v3, s0, v4 6652; GFX90A-NEXT: v_and_b32_e32 v4, s0, v5 6653; GFX90A-NEXT: v_add_u32_e32 v0, v7, v0 6654; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 15, v4 6655; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 6656; GFX90A-NEXT: v_or_b32_e32 v3, v3, v4 6657; GFX90A-NEXT: v_or_b32_e32 v0, v3, v0 6658; GFX90A-NEXT: global_store_dword v2, v0, s[6:7] 6659; GFX90A-NEXT: v_and_b32_e32 v0, 0x1fff, v1 6660; GFX90A-NEXT: global_store_short v2, v0, s[6:7] offset:4 6661; GFX90A-NEXT: s_endpgm 6662 %r = sdiv <3 x i15> %x, %y 6663 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 6664 ret void 6665} 6666 6667define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x, <3 x i15> %y) { 6668; CHECK-LABEL: @srem_v3i15( 6669; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i15> [[X:%.*]], i64 0 6670; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i15> [[Y:%.*]], i64 0 6671; CHECK-NEXT: [[TMP3:%.*]] = sext i15 [[TMP1]] to i32 6672; CHECK-NEXT: [[TMP4:%.*]] = sext i15 [[TMP2]] to i32 6673; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 6674; CHECK-NEXT: [[TMP6:%.*]] = ashr i32 [[TMP5]], 30 6675; CHECK-NEXT: [[TMP7:%.*]] = or i32 [[TMP6]], 1 6676; CHECK-NEXT: [[TMP8:%.*]] = sitofp i32 [[TMP3]] to float 6677; CHECK-NEXT: [[TMP9:%.*]] = sitofp i32 [[TMP4]] to float 6678; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 6679; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP8]], [[TMP10]] 6680; CHECK-NEXT: [[TMP12:%.*]] = call fast float @llvm.trunc.f32(float [[TMP11]]) 6681; CHECK-NEXT: [[TMP13:%.*]] = 
fneg fast float [[TMP12]] 6682; CHECK-NEXT: [[TMP14:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP13]], float [[TMP9]], float [[TMP8]]) 6683; CHECK-NEXT: [[TMP15:%.*]] = fptosi float [[TMP12]] to i32 6684; CHECK-NEXT: [[TMP16:%.*]] = call fast float @llvm.fabs.f32(float [[TMP14]]) 6685; CHECK-NEXT: [[TMP17:%.*]] = call fast float @llvm.fabs.f32(float [[TMP9]]) 6686; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]] 6687; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0 6688; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]] 6689; CHECK-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]] 6690; CHECK-NEXT: [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]] 6691; CHECK-NEXT: [[TMP23:%.*]] = shl i32 [[TMP22]], 17 6692; CHECK-NEXT: [[TMP24:%.*]] = ashr i32 [[TMP23]], 17 6693; CHECK-NEXT: [[TMP25:%.*]] = trunc i32 [[TMP24]] to i15 6694; CHECK-NEXT: [[TMP26:%.*]] = insertelement <3 x i15> undef, i15 [[TMP25]], i64 0 6695; CHECK-NEXT: [[TMP27:%.*]] = extractelement <3 x i15> [[X]], i64 1 6696; CHECK-NEXT: [[TMP28:%.*]] = extractelement <3 x i15> [[Y]], i64 1 6697; CHECK-NEXT: [[TMP29:%.*]] = sext i15 [[TMP27]] to i32 6698; CHECK-NEXT: [[TMP30:%.*]] = sext i15 [[TMP28]] to i32 6699; CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP29]], [[TMP30]] 6700; CHECK-NEXT: [[TMP32:%.*]] = ashr i32 [[TMP31]], 30 6701; CHECK-NEXT: [[TMP33:%.*]] = or i32 [[TMP32]], 1 6702; CHECK-NEXT: [[TMP34:%.*]] = sitofp i32 [[TMP29]] to float 6703; CHECK-NEXT: [[TMP35:%.*]] = sitofp i32 [[TMP30]] to float 6704; CHECK-NEXT: [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]]) 6705; CHECK-NEXT: [[TMP37:%.*]] = fmul fast float [[TMP34]], [[TMP36]] 6706; CHECK-NEXT: [[TMP38:%.*]] = call fast float @llvm.trunc.f32(float [[TMP37]]) 6707; CHECK-NEXT: [[TMP39:%.*]] = fneg fast float [[TMP38]] 6708; CHECK-NEXT: [[TMP40:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP39]], float [[TMP35]], float [[TMP34]]) 6709; CHECK-NEXT: 
[[TMP41:%.*]] = fptosi float [[TMP38]] to i32 6710; CHECK-NEXT: [[TMP42:%.*]] = call fast float @llvm.fabs.f32(float [[TMP40]]) 6711; CHECK-NEXT: [[TMP43:%.*]] = call fast float @llvm.fabs.f32(float [[TMP35]]) 6712; CHECK-NEXT: [[TMP44:%.*]] = fcmp fast oge float [[TMP42]], [[TMP43]] 6713; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 [[TMP33]], i32 0 6714; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]] 6715; CHECK-NEXT: [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]] 6716; CHECK-NEXT: [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]] 6717; CHECK-NEXT: [[TMP49:%.*]] = shl i32 [[TMP48]], 17 6718; CHECK-NEXT: [[TMP50:%.*]] = ashr i32 [[TMP49]], 17 6719; CHECK-NEXT: [[TMP51:%.*]] = trunc i32 [[TMP50]] to i15 6720; CHECK-NEXT: [[TMP52:%.*]] = insertelement <3 x i15> [[TMP26]], i15 [[TMP51]], i64 1 6721; CHECK-NEXT: [[TMP53:%.*]] = extractelement <3 x i15> [[X]], i64 2 6722; CHECK-NEXT: [[TMP54:%.*]] = extractelement <3 x i15> [[Y]], i64 2 6723; CHECK-NEXT: [[TMP55:%.*]] = sext i15 [[TMP53]] to i32 6724; CHECK-NEXT: [[TMP56:%.*]] = sext i15 [[TMP54]] to i32 6725; CHECK-NEXT: [[TMP57:%.*]] = xor i32 [[TMP55]], [[TMP56]] 6726; CHECK-NEXT: [[TMP58:%.*]] = ashr i32 [[TMP57]], 30 6727; CHECK-NEXT: [[TMP59:%.*]] = or i32 [[TMP58]], 1 6728; CHECK-NEXT: [[TMP60:%.*]] = sitofp i32 [[TMP55]] to float 6729; CHECK-NEXT: [[TMP61:%.*]] = sitofp i32 [[TMP56]] to float 6730; CHECK-NEXT: [[TMP62:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP61]]) 6731; CHECK-NEXT: [[TMP63:%.*]] = fmul fast float [[TMP60]], [[TMP62]] 6732; CHECK-NEXT: [[TMP64:%.*]] = call fast float @llvm.trunc.f32(float [[TMP63]]) 6733; CHECK-NEXT: [[TMP65:%.*]] = fneg fast float [[TMP64]] 6734; CHECK-NEXT: [[TMP66:%.*]] = call fast float @llvm.amdgcn.fmad.ftz.f32(float [[TMP65]], float [[TMP61]], float [[TMP60]]) 6735; CHECK-NEXT: [[TMP67:%.*]] = fptosi float [[TMP64]] to i32 6736; CHECK-NEXT: [[TMP68:%.*]] = call fast float @llvm.fabs.f32(float [[TMP66]]) 6737; CHECK-NEXT: [[TMP69:%.*]] = call fast 
float @llvm.fabs.f32(float [[TMP61]]) 6738; CHECK-NEXT: [[TMP70:%.*]] = fcmp fast oge float [[TMP68]], [[TMP69]] 6739; CHECK-NEXT: [[TMP71:%.*]] = select i1 [[TMP70]], i32 [[TMP59]], i32 0 6740; CHECK-NEXT: [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]] 6741; CHECK-NEXT: [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]] 6742; CHECK-NEXT: [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]] 6743; CHECK-NEXT: [[TMP75:%.*]] = shl i32 [[TMP74]], 17 6744; CHECK-NEXT: [[TMP76:%.*]] = ashr i32 [[TMP75]], 17 6745; CHECK-NEXT: [[TMP77:%.*]] = trunc i32 [[TMP76]] to i15 6746; CHECK-NEXT: [[TMP78:%.*]] = insertelement <3 x i15> [[TMP52]], i15 [[TMP77]], i64 2 6747; CHECK-NEXT: store <3 x i15> [[TMP78]], <3 x i15> addrspace(1)* [[OUT:%.*]], align 8 6748; CHECK-NEXT: ret void 6749; 6750; GFX6-LABEL: srem_v3i15: 6751; GFX6: ; %bb.0: 6752; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 6753; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 6754; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd 6755; GFX6-NEXT: s_mov_b32 s7, 0xf000 6756; GFX6-NEXT: s_mov_b32 s6, -1 6757; GFX6-NEXT: s_waitcnt lgkmcnt(0) 6758; GFX6-NEXT: v_mov_b32_e32 v0, s2 6759; GFX6-NEXT: v_alignbit_b32 v0, s3, v0, 30 6760; GFX6-NEXT: s_movk_i32 s3, 0x7fff 6761; GFX6-NEXT: s_and_b32 s11, s0, s3 6762; GFX6-NEXT: s_bfe_i32 s11, s11, 0xf0000 6763; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s11 6764; GFX6-NEXT: s_and_b32 s9, s2, s3 6765; GFX6-NEXT: s_bfe_i32 s9, s9, 0xf0000 6766; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s9 6767; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 6768; GFX6-NEXT: s_xor_b32 s9, s9, s11 6769; GFX6-NEXT: s_ashr_i32 s9, s9, 30 6770; GFX6-NEXT: s_or_b32 s9, s9, 1 6771; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 6772; GFX6-NEXT: v_trunc_f32_e32 v4, v4 6773; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 6774; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 6775; GFX6-NEXT: v_mov_b32_e32 v5, s9 6776; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, |v2| 6777; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v5, vcc 6778; GFX6-NEXT: v_mov_b32_e32 v1, s0 6779; GFX6-NEXT: s_bfe_u32 s12, s0, 
0xf000f 6780; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 6781; GFX6-NEXT: v_alignbit_b32 v1, s1, v1, 30 6782; GFX6-NEXT: s_lshr_b32 s1, s0, 15 6783; GFX6-NEXT: v_mul_lo_u32 v2, v2, s0 6784; GFX6-NEXT: s_bfe_i32 s0, s12, 0xf0000 6785; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s0 6786; GFX6-NEXT: s_bfe_u32 s10, s2, 0xf000f 6787; GFX6-NEXT: s_lshr_b32 s8, s2, 15 6788; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s2, v2 6789; GFX6-NEXT: s_bfe_i32 s2, s10, 0xf0000 6790; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s2 6791; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 6792; GFX6-NEXT: s_xor_b32 s0, s2, s0 6793; GFX6-NEXT: s_ashr_i32 s0, s0, 30 6794; GFX6-NEXT: s_or_b32 s0, s0, 1 6795; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 6796; GFX6-NEXT: v_trunc_f32_e32 v5, v5 6797; GFX6-NEXT: v_mad_f32 v4, -v5, v3, v4 6798; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 6799; GFX6-NEXT: v_and_b32_e32 v1, s3, v1 6800; GFX6-NEXT: v_mov_b32_e32 v6, s0 6801; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, |v3| 6802; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc 6803; GFX6-NEXT: v_bfe_i32 v4, v1, 0, 15 6804; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 6805; GFX6-NEXT: v_cvt_f32_i32_e32 v5, v4 6806; GFX6-NEXT: v_and_b32_e32 v0, s3, v0 6807; GFX6-NEXT: v_bfe_i32 v6, v0, 0, 15 6808; GFX6-NEXT: v_cvt_f32_i32_e32 v7, v6 6809; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v5 6810; GFX6-NEXT: v_xor_b32_e32 v4, v6, v4 6811; GFX6-NEXT: v_ashrrev_i32_e32 v4, 30, v4 6812; GFX6-NEXT: v_or_b32_e32 v4, 1, v4 6813; GFX6-NEXT: v_mul_f32_e32 v6, v7, v8 6814; GFX6-NEXT: v_trunc_f32_e32 v6, v6 6815; GFX6-NEXT: v_mad_f32 v7, -v6, v5, v7 6816; GFX6-NEXT: v_cvt_i32_f32_e32 v6, v6 6817; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v5| 6818; GFX6-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc 6819; GFX6-NEXT: v_mul_lo_u32 v3, v3, s1 6820; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 6821; GFX6-NEXT: v_mul_lo_u32 v1, v4, v1 6822; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s8, v3 6823; GFX6-NEXT: v_and_b32_e32 v3, s3, v3 6824; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 6825; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 30 
6826; GFX6-NEXT: v_and_b32_e32 v2, s3, v2 6827; GFX6-NEXT: v_lshlrev_b32_e32 v3, 15, v3 6828; GFX6-NEXT: v_or_b32_e32 v2, v2, v3 6829; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 6830; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 6831; GFX6-NEXT: s_waitcnt expcnt(0) 6832; GFX6-NEXT: v_and_b32_e32 v0, 0x1fff, v1 6833; GFX6-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 6834; GFX6-NEXT: s_endpgm 6835; 6836; GFX9-LABEL: srem_v3i15: 6837; GFX9: ; %bb.0: 6838; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 6839; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 6840; GFX9-NEXT: s_movk_i32 s8, 0x7fff 6841; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 6842; GFX9-NEXT: s_waitcnt lgkmcnt(0) 6843; GFX9-NEXT: v_mov_b32_e32 v0, s2 6844; GFX9-NEXT: v_alignbit_b32 v0, s3, v0, 30 6845; GFX9-NEXT: s_and_b32 s3, s2, s8 6846; GFX9-NEXT: v_mov_b32_e32 v1, s0 6847; GFX9-NEXT: v_alignbit_b32 v1, s1, v1, 30 6848; GFX9-NEXT: s_and_b32 s1, s0, s8 6849; GFX9-NEXT: s_bfe_i32 s1, s1, 0xf0000 6850; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s1 6851; GFX9-NEXT: s_bfe_i32 s3, s3, 0xf0000 6852; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s3 6853; GFX9-NEXT: s_xor_b32 s1, s3, s1 6854; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2 6855; GFX9-NEXT: s_ashr_i32 s1, s1, 30 6856; GFX9-NEXT: s_lshr_b32 s9, s2, 15 6857; GFX9-NEXT: s_bfe_u32 s10, s2, 0xf000f 6858; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4 6859; GFX9-NEXT: v_trunc_f32_e32 v4, v4 6860; GFX9-NEXT: v_mad_f32 v3, -v4, v2, v3 6861; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4 6862; GFX9-NEXT: s_lshr_b32 s11, s0, 15 6863; GFX9-NEXT: s_bfe_u32 s12, s0, 0xf000f 6864; GFX9-NEXT: s_or_b32 s1, s1, 1 6865; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v3|, |v2| 6866; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec 6867; GFX9-NEXT: s_cselect_b32 s1, s1, 0 6868; GFX9-NEXT: v_add_u32_e32 v2, s1, v4 6869; GFX9-NEXT: s_bfe_i32 s1, s12, 0xf0000 6870; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s1 6871; GFX9-NEXT: v_mul_lo_u32 v2, v2, s0 6872; GFX9-NEXT: s_bfe_i32 s0, s10, 0xf0000 6873; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s0 
6874; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 6875; GFX9-NEXT: s_xor_b32 s0, s0, s1 6876; GFX9-NEXT: s_ashr_i32 s0, s0, 30 6877; GFX9-NEXT: s_or_b32 s3, s0, 1 6878; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5 6879; GFX9-NEXT: v_trunc_f32_e32 v5, v5 6880; GFX9-NEXT: v_mad_f32 v4, -v5, v3, v4 6881; GFX9-NEXT: v_cvt_i32_f32_e32 v5, v5 6882; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, |v3| 6883; GFX9-NEXT: v_and_b32_e32 v1, s8, v1 6884; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec 6885; GFX9-NEXT: s_cselect_b32 s0, s3, 0 6886; GFX9-NEXT: v_bfe_i32 v4, v1, 0, 15 6887; GFX9-NEXT: v_add_u32_e32 v3, s0, v5 6888; GFX9-NEXT: v_cvt_f32_i32_e32 v5, v4 6889; GFX9-NEXT: v_and_b32_e32 v0, s8, v0 6890; GFX9-NEXT: v_bfe_i32 v6, v0, 0, 15 6891; GFX9-NEXT: v_cvt_f32_i32_e32 v7, v6 6892; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5 6893; GFX9-NEXT: v_xor_b32_e32 v4, v6, v4 6894; GFX9-NEXT: v_ashrrev_i32_e32 v4, 30, v4 6895; GFX9-NEXT: v_or_b32_e32 v4, 1, v4 6896; GFX9-NEXT: v_mul_f32_e32 v6, v7, v8 6897; GFX9-NEXT: v_trunc_f32_e32 v6, v6 6898; GFX9-NEXT: v_cvt_i32_f32_e32 v8, v6 6899; GFX9-NEXT: v_mad_f32 v6, -v6, v5, v7 6900; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, |v5| 6901; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc 6902; GFX9-NEXT: v_mul_lo_u32 v3, v3, s11 6903; GFX9-NEXT: v_add_u32_e32 v4, v8, v4 6904; GFX9-NEXT: v_mul_lo_u32 v1, v4, v1 6905; GFX9-NEXT: v_sub_u32_e32 v2, s2, v2 6906; GFX9-NEXT: v_sub_u32_e32 v3, s9, v3 6907; GFX9-NEXT: v_and_b32_e32 v3, s8, v3 6908; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1 6909; GFX9-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 6910; GFX9-NEXT: v_and_b32_e32 v2, s8, v2 6911; GFX9-NEXT: v_lshlrev_b32_e32 v3, 15, v3 6912; GFX9-NEXT: v_or_b32_e32 v2, v2, v3 6913; GFX9-NEXT: v_mov_b32_e32 v4, 0 6914; GFX9-NEXT: v_or_b32_e32 v0, v2, v0 6915; GFX9-NEXT: global_store_dword v4, v0, s[4:5] 6916; GFX9-NEXT: v_and_b32_e32 v0, 0x1fff, v1 6917; GFX9-NEXT: global_store_short v4, v0, s[4:5] offset:4 6918; GFX9-NEXT: s_endpgm 6919; 6920; GFX90A-LABEL: srem_v3i15: 6921; GFX90A: ; %bb.0: 6922; 
GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 6923; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 6924; GFX90A-NEXT: s_movk_i32 s8, 0x7fff 6925; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 6926; GFX90A-NEXT: v_mov_b32_e32 v2, 0 6927; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 6928; GFX90A-NEXT: s_and_b32 s6, s2, s8 6929; GFX90A-NEXT: s_bfe_i32 s6, s6, 0xf0000 6930; GFX90A-NEXT: v_cvt_f32_i32_e32 v4, s6 6931; GFX90A-NEXT: v_mov_b32_e32 v1, s0 6932; GFX90A-NEXT: v_alignbit_b32 v1, s1, v1, 30 6933; GFX90A-NEXT: s_and_b32 s1, s0, s8 6934; GFX90A-NEXT: s_bfe_i32 s1, s1, 0xf0000 6935; GFX90A-NEXT: v_cvt_f32_i32_e32 v3, s1 6936; GFX90A-NEXT: s_xor_b32 s1, s6, s1 6937; GFX90A-NEXT: v_mov_b32_e32 v0, s2 6938; GFX90A-NEXT: s_ashr_i32 s1, s1, 30 6939; GFX90A-NEXT: v_rcp_iflag_f32_e32 v5, v3 6940; GFX90A-NEXT: v_alignbit_b32 v0, s3, v0, 30 6941; GFX90A-NEXT: s_lshr_b32 s3, s2, 15 6942; GFX90A-NEXT: s_bfe_u32 s9, s2, 0xf000f 6943; GFX90A-NEXT: v_mul_f32_e32 v5, v4, v5 6944; GFX90A-NEXT: v_trunc_f32_e32 v5, v5 6945; GFX90A-NEXT: v_mad_f32 v4, -v5, v3, v4 6946; GFX90A-NEXT: v_cvt_i32_f32_e32 v5, v5 6947; GFX90A-NEXT: s_lshr_b32 s10, s0, 15 6948; GFX90A-NEXT: s_bfe_u32 s11, s0, 0xf000f 6949; GFX90A-NEXT: s_or_b32 s1, s1, 1 6950; GFX90A-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, |v3| 6951; GFX90A-NEXT: s_and_b64 s[6:7], s[6:7], exec 6952; GFX90A-NEXT: s_cselect_b32 s1, s1, 0 6953; GFX90A-NEXT: v_add_u32_e32 v3, s1, v5 6954; GFX90A-NEXT: v_mul_lo_u32 v3, v3, s0 6955; GFX90A-NEXT: s_bfe_i32 s0, s11, 0xf0000 6956; GFX90A-NEXT: v_cvt_f32_i32_e32 v4, s0 6957; GFX90A-NEXT: s_bfe_i32 s1, s9, 0xf0000 6958; GFX90A-NEXT: v_cvt_f32_i32_e32 v5, s1 6959; GFX90A-NEXT: s_xor_b32 s0, s1, s0 6960; GFX90A-NEXT: v_rcp_iflag_f32_e32 v6, v4 6961; GFX90A-NEXT: s_ashr_i32 s0, s0, 30 6962; GFX90A-NEXT: v_sub_u32_e32 v3, s2, v3 6963; GFX90A-NEXT: s_or_b32 s2, s0, 1 6964; GFX90A-NEXT: v_mul_f32_e32 v6, v5, v6 6965; GFX90A-NEXT: v_trunc_f32_e32 v6, v6 6966; GFX90A-NEXT: v_mad_f32 v5, -v6, v4, v5 6967; 
GFX90A-NEXT: v_cvt_i32_f32_e32 v6, v6 6968; GFX90A-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, |v4| 6969; GFX90A-NEXT: v_and_b32_e32 v1, s8, v1 6970; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec 6971; GFX90A-NEXT: s_cselect_b32 s0, s2, 0 6972; GFX90A-NEXT: v_bfe_i32 v5, v1, 0, 15 6973; GFX90A-NEXT: v_add_u32_e32 v4, s0, v6 6974; GFX90A-NEXT: v_cvt_f32_i32_e32 v6, v5 6975; GFX90A-NEXT: v_and_b32_e32 v0, s8, v0 6976; GFX90A-NEXT: v_bfe_i32 v7, v0, 0, 15 6977; GFX90A-NEXT: v_cvt_f32_i32_e32 v8, v7 6978; GFX90A-NEXT: v_rcp_iflag_f32_e32 v9, v6 6979; GFX90A-NEXT: v_xor_b32_e32 v5, v7, v5 6980; GFX90A-NEXT: v_ashrrev_i32_e32 v5, 30, v5 6981; GFX90A-NEXT: v_or_b32_e32 v5, 1, v5 6982; GFX90A-NEXT: v_mul_f32_e32 v7, v8, v9 6983; GFX90A-NEXT: v_trunc_f32_e32 v7, v7 6984; GFX90A-NEXT: v_cvt_i32_f32_e32 v9, v7 6985; GFX90A-NEXT: v_mad_f32 v7, -v7, v6, v8 6986; GFX90A-NEXT: v_cmp_ge_f32_e64 vcc, |v7|, |v6| 6987; GFX90A-NEXT: v_mul_lo_u32 v4, v4, s10 6988; GFX90A-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc 6989; GFX90A-NEXT: v_sub_u32_e32 v4, s3, v4 6990; GFX90A-NEXT: v_add_u32_e32 v5, v9, v5 6991; GFX90A-NEXT: v_mul_lo_u32 v1, v5, v1 6992; GFX90A-NEXT: v_and_b32_e32 v4, s8, v4 6993; GFX90A-NEXT: v_sub_u32_e32 v0, v0, v1 6994; GFX90A-NEXT: v_and_b32_e32 v3, s8, v3 6995; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 15, v4 6996; GFX90A-NEXT: v_lshlrev_b64 v[0:1], 30, v[0:1] 6997; GFX90A-NEXT: v_or_b32_e32 v3, v3, v4 6998; GFX90A-NEXT: v_or_b32_e32 v0, v3, v0 6999; GFX90A-NEXT: global_store_dword v2, v0, s[4:5] 7000; GFX90A-NEXT: v_and_b32_e32 v0, 0x1fff, v1 7001; GFX90A-NEXT: global_store_short v2, v0, s[4:5] offset:4 7002; GFX90A-NEXT: s_endpgm 7003 %r = srem <3 x i15> %x, %y 7004 store <3 x i15> %r, <3 x i15> addrspace(1)* %out 7005 ret void 7006} 7007

; Unsigned i32 divide by the odd constant 1235195 (0x12d8fb).
; The opt expectations keep the udiv instruction unchanged. The llc
; expectations for all three targets show a multiply-high by 0xb2a50881
; followed by a sub / shift-right-1 / add / shift-right-20 sequence.
define amdgpu_kernel void @udiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
; CHECK-LABEL: @udiv_i32_oddk_denom(
; CHECK-NEXT:    [[R:%.*]] = udiv i32 [[X:%.*]], 1235195
; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
; GFX6-LABEL: udiv_i32_oddk_denom:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    v_mov_b32_e32 v0, 0xb2a50881
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s4, v0
; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 20, v0
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX9-LABEL: udiv_i32_oddk_denom:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mul_hi_u32 s0, s4, 0xb2a50881
; GFX9-NEXT:    s_sub_i32 s1, s4, s0
; GFX9-NEXT:    s_lshr_b32 s1, s1, 1
; GFX9-NEXT:    s_add_i32 s1, s1, s0
; GFX9-NEXT:    s_lshr_b32 s0, s1, 20
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-NEXT:    s_endpgm
;
; GFX90A-LABEL: udiv_i32_oddk_denom:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    s_mul_hi_u32 s0, s4, 0xb2a50881
; GFX90A-NEXT:    s_sub_i32 s1, s4, s0
; GFX90A-NEXT:    s_lshr_b32 s1, s1, 1
; GFX90A-NEXT:    s_add_i32 s1, s1, s0
; GFX90A-NEXT:    s_lshr_b32 s0, s1, 20
; GFX90A-NEXT:    v_mov_b32_e32 v1, s0
; GFX90A-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX90A-NEXT:    s_endpgm
  %r = udiv i32 %x, 1235195
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; Unsigned i32 divide by the power-of-two constant 4096.
; The opt expectations keep the udiv unchanged; the llc expectations
; show a single scalar logical shift right by 12.
define amdgpu_kernel void @udiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
; CHECK-LABEL: @udiv_i32_pow2k_denom(
; CHECK-NEXT:    [[R:%.*]] = udiv i32 [[X:%.*]], 4096
; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
; GFX6-LABEL: udiv_i32_pow2k_denom:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_lshr_b32 s4, s4, 12
; GFX6-NEXT:    v_mov_b32_e32 v0, s4
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX9-LABEL: udiv_i32_pow2k_denom:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_lshr_b32 s0, s4, 12
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-NEXT:    s_endpgm
;
; GFX90A-LABEL: udiv_i32_pow2k_denom:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    s_lshr_b32 s0, s4, 12
; GFX90A-NEXT:    v_mov_b32_e32 v1, s0
; GFX90A-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX90A-NEXT:    s_endpgm
  %r = udiv i32 %x, 4096
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; Unsigned i32 divide by (4096 << %y), a divisor built by shifting a
; power-of-two constant. The opt expectations keep the shl and udiv
; unchanged; the llc expectations add 12 to the shift amount and perform
; one logical shift right.
define amdgpu_kernel void @udiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
; CHECK-LABEL: @udiv_i32_pow2_shl_denom(
; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
; CHECK-NEXT:    [[R:%.*]] = udiv i32 [[X:%.*]], [[SHL_Y]]
; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
; GFX6-LABEL: udiv_i32_pow2_shl_denom:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_add_i32 s5, s5, 12
; GFX6-NEXT:    s_lshr_b32 s4, s4, s5
; GFX6-NEXT:    v_mov_b32_e32 v0, s4
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX9-LABEL: udiv_i32_pow2_shl_denom:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_add_i32 s0, s3, 12
; GFX9-NEXT:    s_lshr_b32 s0, s2, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX90A-LABEL: udiv_i32_pow2_shl_denom:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    s_add_i32 s0, s3, 12
; GFX90A-NEXT:    s_lshr_b32 s0, s2, s0
; GFX90A-NEXT:    v_mov_b32_e32 v1, s0
; GFX90A-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX90A-NEXT:    s_endpgm
  %shl.y = shl i32 4096, %y
  %r = udiv i32 %x, %shl.y
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; <2 x i32> unsigned divide where both lanes divide by 4096.
; The opt expectations show the vector udiv split into one scalar udiv
; per lane; the llc expectations show one shift right by 12 per lane.
define amdgpu_kernel void @udiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
; CHECK-LABEL: @udiv_v2i32_pow2k_denom(
; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
; CHECK-NEXT:    [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
; CHECK-NEXT:    [[TMP5:%.*]] = udiv i32 [[TMP4]], 4096
; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
; CHECK-NEXT:    ret void
;
; GFX6-LABEL: udiv_v2i32_pow2k_denom:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_lshr_b32 s4, s4, 12
; GFX6-NEXT:    s_lshr_b32 s5, s5, 12
; GFX6-NEXT:    v_mov_b32_e32 v0, s4
; GFX6-NEXT:    v_mov_b32_e32 v1, s5
; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX9-LABEL: udiv_v2i32_pow2k_denom:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_lshr_b32 s0, s2, 12
; GFX9-NEXT:    s_lshr_b32 s1, s3, 12
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX90A-LABEL: udiv_v2i32_pow2k_denom:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    s_lshr_b32 s0, s2, 12
; GFX90A-NEXT:    s_lshr_b32 s1, s3, 12
; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX90A-NEXT:    s_endpgm
  %r = udiv <2 x i32> %x, <i32 4096, i32 4096>
  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
  ret void
}

; <2 x i32> unsigned divide with a mixed constant divisor: lane 0 divides
; by 4096 (power of two) and lane 1 by 4095. The llc expectations show a
; shift right by 12 for lane 0 and a multiply-high by 0x100101 plus a
; sub / shift / add / shift-right-11 sequence for lane 1.
define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
; CHECK-LABEL: @udiv_v2i32_mixed_pow2k_denom(
; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
; CHECK-NEXT:    [[TMP2:%.*]] = udiv i32 [[TMP1]], 4096
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
; CHECK-NEXT:    [[TMP5:%.*]] = udiv i32 [[TMP4]], 4095
; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
; CHECK-NEXT:    ret void
;
; GFX6-LABEL: udiv_v2i32_mixed_pow2k_denom:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    v_mov_b32_e32 v0, 0x100101
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mul_hi_u32 v0, s5, v0
; GFX6-NEXT:    s_lshr_b32 s4, s4, 12
; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s5, v0
; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 11, v0
; GFX6-NEXT:    v_mov_b32_e32 v0, s4
; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX9-LABEL: udiv_v2i32_mixed_pow2k_denom:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mul_hi_u32 s1, s3, 0x100101
; GFX9-NEXT:    s_lshr_b32 s0, s2, 12
; GFX9-NEXT:    s_sub_i32 s2, s3, s1
; GFX9-NEXT:    s_lshr_b32 s2, s2, 1
; GFX9-NEXT:    s_add_i32 s2, s2, s1
; GFX9-NEXT:    s_lshr_b32 s1, s2, 11
; GFX9-NEXT:    v_mov_b32_e32 v0, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s1
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX90A-LABEL: udiv_v2i32_mixed_pow2k_denom:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    s_mul_hi_u32 s1, s3, 0x100101
; GFX90A-NEXT:    s_lshr_b32 s0, s2, 12
; GFX90A-NEXT:    s_sub_i32 s2, s3, s1
; GFX90A-NEXT:    s_lshr_b32 s2, s2, 1
; GFX90A-NEXT:    s_add_i32 s2, s2, s1
; GFX90A-NEXT:    s_lshr_b32 s1, s2, 11
; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
; GFX90A-NEXT:    v_mov_b32_e32 v1, s1
; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX90A-NEXT:    s_endpgm
  %r = udiv <2 x i32> %x, <i32 4096, i32 4095>
  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
  ret void
}

; <2 x i32> unsigned divide by (<4096, 4096> << %y).
; Unlike the scalar shl-denominator case, the opt expectations show each
; lane expanded into the reciprocal-based sequence (uitofp, rcp, fmul,
; fptoui, multiply-high refinement, then two compare/select correction
; steps), and the llc expectations show the matching
; v_rcp_iflag_f32 / v_mul_hi_u32 code for both lanes.
define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
; CHECK-LABEL: @udiv_v2i32_pow2_shl_denom(
; CHECK-NEXT:    [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]]
; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0
; CHECK-NEXT:    [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float
; CHECK-NEXT:    [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]])
; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000
; CHECK-NEXT:    [[TMP6:%.*]] = fptoui float [[TMP5]] to i32
; CHECK-NEXT:    [[TMP7:%.*]] = sub i32 0, [[TMP2]]
; CHECK-NEXT:    [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]]
; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
; CHECK-NEXT:    [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]]
; CHECK-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
; CHECK-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP11]], 32
; CHECK-NEXT:    [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32
; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]]
; CHECK-NEXT:    [[TMP16:%.*]] = zext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP17:%.*]] = zext i32 [[TMP15]] to i64
; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]]
; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
; CHECK-NEXT:    [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
; CHECK-NEXT:    [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]]
; CHECK-NEXT:    [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]]
; CHECK-NEXT:    [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]]
; CHECK-NEXT:    [[TMP25:%.*]] = add i32 [[TMP21]], 1
; CHECK-NEXT:    [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP21]]
; CHECK-NEXT:    [[TMP27:%.*]] = sub i32 [[TMP23]], [[TMP2]]
; CHECK-NEXT:    [[TMP28:%.*]] = select i1 [[TMP24]], i32 [[TMP27]], i32 [[TMP23]]
; CHECK-NEXT:    [[TMP29:%.*]] = icmp uge i32 [[TMP28]], [[TMP2]]
; CHECK-NEXT:    [[TMP30:%.*]] = add i32 [[TMP26]], 1
; CHECK-NEXT:    [[TMP31:%.*]] = select i1 [[TMP29]], i32 [[TMP30]], i32 [[TMP26]]
; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <2 x i32> undef, i32 [[TMP31]], i64 0
; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <2 x i32> [[X]], i64 1
; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1
; CHECK-NEXT:    [[TMP35:%.*]] = uitofp i32 [[TMP34]] to float
; CHECK-NEXT:    [[TMP36:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP35]])
; CHECK-NEXT:    [[TMP37:%.*]] = fmul fast float [[TMP36]], 0x41EFFFFFC0000000
; CHECK-NEXT:    [[TMP38:%.*]] = fptoui float [[TMP37]] to i32
; CHECK-NEXT:    [[TMP39:%.*]] = sub i32 0, [[TMP34]]
; CHECK-NEXT:    [[TMP40:%.*]] = mul i32 [[TMP39]], [[TMP38]]
; CHECK-NEXT:    [[TMP41:%.*]] = zext i32 [[TMP38]] to i64
; CHECK-NEXT:    [[TMP42:%.*]] = zext i32 [[TMP40]] to i64
; CHECK-NEXT:    [[TMP43:%.*]] = mul i64 [[TMP41]], [[TMP42]]
; CHECK-NEXT:    [[TMP44:%.*]] = trunc i64 [[TMP43]] to i32
; CHECK-NEXT:    [[TMP45:%.*]] = lshr i64 [[TMP43]], 32
; CHECK-NEXT:    [[TMP46:%.*]] = trunc i64 [[TMP45]] to i32
; CHECK-NEXT:    [[TMP47:%.*]] = add i32 [[TMP38]], [[TMP46]]
; CHECK-NEXT:    [[TMP48:%.*]] = zext i32 [[TMP33]] to i64
; CHECK-NEXT:    [[TMP49:%.*]] = zext i32 [[TMP47]] to i64
; CHECK-NEXT:    [[TMP50:%.*]] = mul i64 [[TMP48]], [[TMP49]]
; CHECK-NEXT:    [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32
; CHECK-NEXT:    [[TMP52:%.*]] = lshr i64 [[TMP50]], 32
; CHECK-NEXT:    [[TMP53:%.*]] = trunc i64 [[TMP52]] to i32
; CHECK-NEXT:    [[TMP54:%.*]] = mul i32 [[TMP53]], [[TMP34]]
; CHECK-NEXT:    [[TMP55:%.*]] = sub i32 [[TMP33]], [[TMP54]]
; CHECK-NEXT:    [[TMP56:%.*]] = icmp uge i32 [[TMP55]], [[TMP34]]
; CHECK-NEXT:    [[TMP57:%.*]] = add i32 [[TMP53]], 1
; CHECK-NEXT:    [[TMP58:%.*]] = select i1 [[TMP56]], i32 [[TMP57]], i32 [[TMP53]]
; CHECK-NEXT:    [[TMP59:%.*]] = sub i32 [[TMP55]], [[TMP34]]
; CHECK-NEXT:    [[TMP60:%.*]] = select i1 [[TMP56]], i32 [[TMP59]], i32 [[TMP55]]
; CHECK-NEXT:    [[TMP61:%.*]] = icmp uge i32 [[TMP60]], [[TMP34]]
; CHECK-NEXT:    [[TMP62:%.*]] = add i32 [[TMP58]], 1
; CHECK-NEXT:    [[TMP63:%.*]] = select i1 [[TMP61]], i32 [[TMP62]], i32 [[TMP58]]
; CHECK-NEXT:    [[TMP64:%.*]] = insertelement <2 x i32> [[TMP32]], i32 [[TMP63]], i64 1
; CHECK-NEXT:    store <2 x i32> [[TMP64]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
; CHECK-NEXT:    ret void
;
; GFX6-LABEL: udiv_v2i32_pow2_shl_denom:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
; GFX6-NEXT:    s_movk_i32 s4, 0x1000
; GFX6-NEXT:    s_mov_b32 s7, 0xf000
; GFX6-NEXT:    s_mov_b32 s6, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_lshl_b32 s8, s4, s2
; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
; GFX6-NEXT:    s_lshl_b32 s9, s4, s3
; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xb
; GFX6-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX6-NEXT:    s_mov_b32 s0, 0x4f7ffffe
; GFX6-NEXT:    v_rcp_iflag_f32_e32 v1, v1
; GFX6-NEXT:    v_mul_f32_e32 v0, s0, v0
; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT:    v_mul_f32_e32 v1, s0, v1
; GFX6-NEXT:    s_sub_i32 s0, 0, s8
; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v0
; GFX6-NEXT:    s_sub_i32 s0, 0, s9
; GFX6-NEXT:    v_mul_lo_u32 v3, s0, v1
; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
; GFX6-NEXT:    v_mul_hi_u32 v1, s3, v1
; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s8
; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s9
; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s2, v2
; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v2
; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s8, v2
; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s3, v4
; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v2
; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s9, v2
; GFX6-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX6-NEXT:    s_endpgm
;
; GFX9-LABEL: udiv_v2i32_pow2_shl_denom:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT:    s_movk_i32 s4, 0x1000
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_lshl_b32 s5, s4, s3
; GFX9-NEXT:    s_lshl_b32 s4, s4, s2
; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s4
; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s5
; GFX9-NEXT:    s_mov_b32 s2, 0x4f7ffffe
; GFX9-NEXT:    s_sub_i32 s3, 0, s5
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
; GFX9-NEXT:    v_mul_f32_e32 v0, s2, v0
; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT:    v_mul_f32_e32 v1, s2, v1
; GFX9-NEXT:    s_sub_i32 s2, 0, s4
; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v0
; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v1
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
; GFX9-NEXT:    v_mul_hi_u32 v1, s3, v1
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s4
; GFX9-NEXT:    v_add_u32_e32 v5, 1, v0
; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s5
; GFX9-NEXT:    v_add_u32_e32 v6, 1, v1
; GFX9-NEXT:    v_sub_u32_e32 v3, s2, v3
; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
; GFX9-NEXT:    v_subrev_u32_e32 v5, s4, v3
; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX9-NEXT:    v_sub_u32_e32 v4, s3, v4
; GFX9-NEXT:    v_add_u32_e32 v5, 1, v0
; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s4, v3
; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v4
; GFX9-NEXT:    v_subrev_u32_e32 v3, s5, v4
; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-NEXT:    s_endpgm
;
; GFX90A-LABEL: udiv_v2i32_pow2_shl_denom:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX90A-NEXT:    s_movk_i32 s8, 0x1000
; GFX90A-NEXT:    s_mov_b32 s9, 0x4f7ffffe
; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX90A-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x2c
; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    s_lshl_b32 s2, s8, s2
; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s2
; GFX90A-NEXT:    s_lshl_b32 s0, s8, s3
; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s0
; GFX90A-NEXT:    s_sub_i32 s1, 0, s2
; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v0, v0
; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v1, v1
; GFX90A-NEXT:    v_mul_f32_e32 v0, s9, v0
; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
; GFX90A-NEXT:    v_mul_f32_e32 v1, s9, v1
; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
; GFX90A-NEXT:    v_mul_lo_u32 v3, s1, v0
; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, v3
; GFX90A-NEXT:    v_add_u32_e32 v0, v0, v3
; GFX90A-NEXT:    v_mul_hi_u32 v0, s6, v0
; GFX90A-NEXT:    v_mul_lo_u32 v3, v0, s2
; GFX90A-NEXT:    v_sub_u32_e32 v3, s6, v3
; GFX90A-NEXT:    v_add_u32_e32 v4, 1, v0
; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s2, v3
; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
; GFX90A-NEXT:    v_subrev_u32_e32 v4, s2, v3
; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX90A-NEXT:    s_sub_i32 s1, 0, s0
; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s2, v3
; GFX90A-NEXT:    v_mul_lo_u32 v3, s1, v1
; GFX90A-NEXT:    v_mul_hi_u32 v3, v1, v3
; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v3
; GFX90A-NEXT:    v_mul_hi_u32 v1, s7, v1
; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, s0
; GFX90A-NEXT:    v_add_u32_e32 v4, 1, v0
; GFX90A-NEXT:    v_sub_u32_e32 v3, s7, v3
; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
; GFX90A-NEXT:    v_add_u32_e32 v4, 1, v1
; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s0, v3
; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
; GFX90A-NEXT:    v_subrev_u32_e32 v4, s0, v3
; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX90A-NEXT:    v_add_u32_e32 v4, 1, v1
; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s0, v3
; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX90A-NEXT:    s_endpgm
  %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
  %r = udiv <2 x i32> %x, %shl.y
  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
  ret void
}

; Unsigned i32 remainder by the odd constant 1235195 (0x12d8fb).
; The opt expectations keep the urem unchanged. The llc expectations
; compute the quotient with the same 0xb2a50881 multiply-high sequence
; used for the udiv case, multiply it by 0x12d8fb and subtract from %x.
define amdgpu_kernel void @urem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) {
; CHECK-LABEL: @urem_i32_oddk_denom(
; CHECK-NEXT:    [[R:%.*]] = urem i32 [[X:%.*]], 1235195
; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
; GFX6-LABEL: urem_i32_oddk_denom:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
; GFX6-NEXT:    v_mov_b32_e32 v0, 0xb2a50881
; GFX6-NEXT:    s_mov_b32 s2, 0x12d8fb
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    v_mul_hi_u32 v0, s4, v0
; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s4, v0
; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 20, v0
; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s2
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX9-LABEL: urem_i32_oddk_denom:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mul_hi_u32 s0, s4, 0xb2a50881
; GFX9-NEXT:    s_sub_i32 s1, s4, s0
; GFX9-NEXT:    s_lshr_b32 s1, s1, 1
; GFX9-NEXT:    s_add_i32 s1, s1, s0
; GFX9-NEXT:    s_lshr_b32 s0, s1, 20
; GFX9-NEXT:    s_mul_i32 s0, s0, 0x12d8fb
; GFX9-NEXT:    s_sub_i32 s0, s4, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-NEXT:    s_endpgm
;
; GFX90A-LABEL: urem_i32_oddk_denom:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    s_mul_hi_u32 s0, s4, 0xb2a50881
; GFX90A-NEXT:    s_sub_i32 s1, s4, s0
; GFX90A-NEXT:    s_lshr_b32 s1, s1, 1
; GFX90A-NEXT:    s_add_i32 s1, s1, s0
; GFX90A-NEXT:    s_lshr_b32 s0, s1, 20
; GFX90A-NEXT:    s_mul_i32 s0, s0, 0x12d8fb
; GFX90A-NEXT:    s_sub_i32 s0, s4, s0
; GFX90A-NEXT:    v_mov_b32_e32 v1, s0
; GFX90A-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX90A-NEXT:    s_endpgm
  %r = urem i32 %x, 1235195
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; Unsigned i32 remainder by the power-of-two constant 4096.
; The opt expectations keep the urem unchanged; the llc expectations
; show a single scalar AND with 0xfff.
define amdgpu_kernel void @urem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) {
; CHECK-LABEL: @urem_i32_pow2k_denom(
; CHECK-NEXT:    [[R:%.*]] = urem i32 [[X:%.*]], 4096
; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
; GFX6-LABEL: urem_i32_pow2k_denom:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xb
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_and_b32 s4, s4, 0xfff
; GFX6-NEXT:    v_mov_b32_e32 v0, s4
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX9-LABEL: urem_i32_pow2k_denom:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dword s4, s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_and_b32 s0, s4, 0xfff
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX9-NEXT:    s_endpgm
;
; GFX90A-LABEL: urem_i32_pow2k_denom:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_load_dword s4, s[0:1], 0x2c
; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    s_and_b32 s0, s4, 0xfff
; GFX90A-NEXT:    v_mov_b32_e32 v1, s0
; GFX90A-NEXT:    global_store_dword v0, v1, s[2:3]
; GFX90A-NEXT:    s_endpgm
  %r = urem i32 %x, 4096
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; Unsigned i32 remainder by (4096 << %y).
; The opt expectations keep the shl and urem unchanged; the llc
; expectations build the mask (0x1000 << %y) - 1 and AND it with %x.
define amdgpu_kernel void @urem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) {
; CHECK-LABEL: @urem_i32_pow2_shl_denom(
; CHECK-NEXT:    [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]]
; CHECK-NEXT:    [[R:%.*]] = urem i32 [[X:%.*]], [[SHL_Y]]
; CHECK-NEXT:    store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
; GFX6-LABEL: urem_i32_pow2_shl_denom:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_lshl_b32 s5, 0x1000, s5
; GFX6-NEXT:    s_add_i32 s5, s5, -1
; GFX6-NEXT:    s_and_b32 s4, s4, s5
; GFX6-NEXT:    v_mov_b32_e32 v0, s4
; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX9-LABEL: urem_i32_pow2_shl_denom:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_lshl_b32 s0, 0x1000, s3
; GFX9-NEXT:    s_add_i32 s0, s0, -1
; GFX9-NEXT:    s_and_b32 s0, s2, s0
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX90A-LABEL: urem_i32_pow2_shl_denom:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    s_lshl_b32 s0, 0x1000, s3
; GFX90A-NEXT:    s_add_i32 s0, s0, -1
; GFX90A-NEXT:    s_and_b32 s0, s2, s0
; GFX90A-NEXT:    v_mov_b32_e32 v1, s0
; GFX90A-NEXT:    global_store_dword v0, v1, s[4:5]
; GFX90A-NEXT:    s_endpgm
  %shl.y = shl i32 4096, %y
  %r = urem i32 %x, %shl.y
  store i32 %r, i32 addrspace(1)* %out
  ret void
}

; <2 x i32> unsigned remainder where both lanes use the divisor 4096.
; The opt expectations show the vector urem split into one scalar urem
; per lane; the llc expectations show one AND with 0xfff per lane.
define amdgpu_kernel void @urem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) {
; CHECK-LABEL: @urem_v2i32_pow2k_denom(
; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0
; CHECK-NEXT:    [[TMP2:%.*]] = urem i32 [[TMP1]], 4096
; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0
; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1
; CHECK-NEXT:    [[TMP5:%.*]] = urem i32 [[TMP4]], 4096
; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1
; CHECK-NEXT:    store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8
; CHECK-NEXT:    ret void
;
; GFX6-LABEL: urem_v2i32_pow2k_denom:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; GFX6-NEXT:    s_movk_i32 s6, 0xfff
; GFX6-NEXT:    s_mov_b32 s3, 0xf000
; GFX6-NEXT:    s_mov_b32 s2, -1
; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
; GFX6-NEXT:    s_and_b32 s4, s4, s6
; GFX6-NEXT:    s_and_b32 s5, s5, s6
; GFX6-NEXT:    v_mov_b32_e32 v0, s4
; GFX6-NEXT:    v_mov_b32_e32 v1, s5
; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT:    s_endpgm
;
; GFX9-LABEL: urem_v2i32_pow2k_denom:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT:    s_movk_i32 s0, 0xfff
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_and_b32 s1, s2, s0
; GFX9-NEXT:    s_and_b32 s0, s3, s0
; GFX9-NEXT:    v_mov_b32_e32 v0, s1
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT:    s_endpgm
;
; GFX90A-LABEL: urem_v2i32_pow2k_denom:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX90A-NEXT:    s_movk_i32 s0, 0xfff
; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
; GFX90A-NEXT:    s_and_b32 s1, s2, s0
; GFX90A-NEXT:    s_and_b32 s0, s3, s0
; GFX90A-NEXT:    v_mov_b32_e32 v0, s1
; GFX90A-NEXT:    v_mov_b32_e32 v1, s0
; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX90A-NEXT:    s_endpgm
  %r = urem <2 x i32> %x, <i32 4096, i32 4096>
  store <2 x i32> %r, <2 x i32> addrspace(1)* %out
  ret void
}

7730define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 7731; CHECK-LABEL: @urem_v2i32_pow2_shl_denom( 7732; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 7733; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 7734; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 7735; CHECK-NEXT: [[TMP3:%.*]] = uitofp i32 [[TMP2]] to float 7736; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP3]]) 7737; CHECK-NEXT: [[TMP5:%.*]] = fmul fast float [[TMP4]], 0x41EFFFFFC0000000 7738; CHECK-NEXT: [[TMP6:%.*]] = fptoui float [[TMP5]] to i32 7739; CHECK-NEXT: [[TMP7:%.*]] = sub i32 0, [[TMP2]] 7740; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], [[TMP6]] 7741; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64 7742; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP8]] to i64 7743; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP9]], [[TMP10]] 7744; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 7745; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP11]], 32 7746; CHECK-NEXT:
[[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 7747; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP6]], [[TMP14]] 7748; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP1]] to i64 7749; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 7750; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 7751; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 7752; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 7753; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 7754; CHECK-NEXT: [[TMP22:%.*]] = mul i32 [[TMP21]], [[TMP2]] 7755; CHECK-NEXT: [[TMP23:%.*]] = sub i32 [[TMP1]], [[TMP22]] 7756; CHECK-NEXT: [[TMP24:%.*]] = icmp uge i32 [[TMP23]], [[TMP2]] 7757; CHECK-NEXT: [[TMP25:%.*]] = sub i32 [[TMP23]], [[TMP2]] 7758; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP24]], i32 [[TMP25]], i32 [[TMP23]] 7759; CHECK-NEXT: [[TMP27:%.*]] = icmp uge i32 [[TMP26]], [[TMP2]] 7760; CHECK-NEXT: [[TMP28:%.*]] = sub i32 [[TMP26]], [[TMP2]] 7761; CHECK-NEXT: [[TMP29:%.*]] = select i1 [[TMP27]], i32 [[TMP28]], i32 [[TMP26]] 7762; CHECK-NEXT: [[TMP30:%.*]] = insertelement <2 x i32> undef, i32 [[TMP29]], i64 0 7763; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x i32> [[X]], i64 1 7764; CHECK-NEXT: [[TMP32:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 7765; CHECK-NEXT: [[TMP33:%.*]] = uitofp i32 [[TMP32]] to float 7766; CHECK-NEXT: [[TMP34:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP33]]) 7767; CHECK-NEXT: [[TMP35:%.*]] = fmul fast float [[TMP34]], 0x41EFFFFFC0000000 7768; CHECK-NEXT: [[TMP36:%.*]] = fptoui float [[TMP35]] to i32 7769; CHECK-NEXT: [[TMP37:%.*]] = sub i32 0, [[TMP32]] 7770; CHECK-NEXT: [[TMP38:%.*]] = mul i32 [[TMP37]], [[TMP36]] 7771; CHECK-NEXT: [[TMP39:%.*]] = zext i32 [[TMP36]] to i64 7772; CHECK-NEXT: [[TMP40:%.*]] = zext i32 [[TMP38]] to i64 7773; CHECK-NEXT: [[TMP41:%.*]] = mul i64 [[TMP39]], [[TMP40]] 7774; CHECK-NEXT: [[TMP42:%.*]] = trunc i64 [[TMP41]] to i32 7775; CHECK-NEXT: [[TMP43:%.*]] = lshr i64 [[TMP41]], 32 7776; CHECK-NEXT: [[TMP44:%.*]] = 
trunc i64 [[TMP43]] to i32 7777; CHECK-NEXT: [[TMP45:%.*]] = add i32 [[TMP36]], [[TMP44]] 7778; CHECK-NEXT: [[TMP46:%.*]] = zext i32 [[TMP31]] to i64 7779; CHECK-NEXT: [[TMP47:%.*]] = zext i32 [[TMP45]] to i64 7780; CHECK-NEXT: [[TMP48:%.*]] = mul i64 [[TMP46]], [[TMP47]] 7781; CHECK-NEXT: [[TMP49:%.*]] = trunc i64 [[TMP48]] to i32 7782; CHECK-NEXT: [[TMP50:%.*]] = lshr i64 [[TMP48]], 32 7783; CHECK-NEXT: [[TMP51:%.*]] = trunc i64 [[TMP50]] to i32 7784; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP32]] 7785; CHECK-NEXT: [[TMP53:%.*]] = sub i32 [[TMP31]], [[TMP52]] 7786; CHECK-NEXT: [[TMP54:%.*]] = icmp uge i32 [[TMP53]], [[TMP32]] 7787; CHECK-NEXT: [[TMP55:%.*]] = sub i32 [[TMP53]], [[TMP32]] 7788; CHECK-NEXT: [[TMP56:%.*]] = select i1 [[TMP54]], i32 [[TMP55]], i32 [[TMP53]] 7789; CHECK-NEXT: [[TMP57:%.*]] = icmp uge i32 [[TMP56]], [[TMP32]] 7790; CHECK-NEXT: [[TMP58:%.*]] = sub i32 [[TMP56]], [[TMP32]] 7791; CHECK-NEXT: [[TMP59:%.*]] = select i1 [[TMP57]], i32 [[TMP58]], i32 [[TMP56]] 7792; CHECK-NEXT: [[TMP60:%.*]] = insertelement <2 x i32> [[TMP30]], i32 [[TMP59]], i64 1 7793; CHECK-NEXT: store <2 x i32> [[TMP60]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 7794; CHECK-NEXT: ret void 7795; 7796; GFX6-LABEL: urem_v2i32_pow2_shl_denom: 7797; GFX6: ; %bb.0: 7798; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 7799; GFX6-NEXT: s_movk_i32 s4, 0x1000 7800; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7801; GFX6-NEXT: s_lshl_b32 s6, s4, s2 7802; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 7803; GFX6-NEXT: s_lshl_b32 s7, s4, s3 7804; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7 7805; GFX6-NEXT: s_mov_b32 s2, 0x4f7ffffe 7806; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 7807; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 7808; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v1 7809; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 7810; GFX6-NEXT: v_mul_f32_e32 v0, s2, v0 7811; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 7812; GFX6-NEXT: v_mul_f32_e32 v1, s2, v1 7813; GFX6-NEXT: s_sub_i32 s2, 0, s6 7814; GFX6-NEXT: 
v_cvt_u32_f32_e32 v1, v1 7815; GFX6-NEXT: v_mul_lo_u32 v2, s2, v0 7816; GFX6-NEXT: s_sub_i32 s2, 0, s7 7817; GFX6-NEXT: s_mov_b32 s3, 0xf000 7818; GFX6-NEXT: v_mul_lo_u32 v3, s2, v1 7819; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 7820; GFX6-NEXT: s_mov_b32 s2, -1 7821; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 7822; GFX6-NEXT: v_add_i32_e32 v0, vcc, v2, v0 7823; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7824; GFX6-NEXT: v_mul_hi_u32 v0, s4, v0 7825; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 7826; GFX6-NEXT: v_mul_hi_u32 v1, s5, v1 7827; GFX6-NEXT: v_mul_lo_u32 v0, v0, s6 7828; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 7829; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 7830; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s6, v0 7831; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 7832; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 7833; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s6, v0 7834; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s6, v0 7835; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 7836; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s5, v1 7837; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s7, v1 7838; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 7839; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 7840; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s7, v1 7841; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 7842; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 7843; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 7844; GFX6-NEXT: s_endpgm 7845; 7846; GFX9-LABEL: urem_v2i32_pow2_shl_denom: 7847; GFX9: ; %bb.0: 7848; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 7849; GFX9-NEXT: s_movk_i32 s4, 0x1000 7850; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7851; GFX9-NEXT: s_lshl_b32 s5, s4, s3 7852; GFX9-NEXT: s_lshl_b32 s4, s4, s2 7853; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4 7854; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 7855; GFX9-NEXT: s_mov_b32 s2, 0x4f7ffffe 7856; GFX9-NEXT: s_sub_i32 s3, 0, s5 7857; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 7858; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 7859; GFX9-NEXT: v_mul_f32_e32 v0, s2, v0 7860; GFX9-NEXT: v_mul_f32_e32 v1, s2, v1 7861; GFX9-NEXT: 
v_cvt_u32_f32_e32 v0, v0 7862; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 7863; GFX9-NEXT: s_sub_i32 s2, 0, s4 7864; GFX9-NEXT: v_mul_lo_u32 v2, s2, v0 7865; GFX9-NEXT: v_mul_lo_u32 v3, s3, v1 7866; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 7867; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2 7868; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 7869; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 7870; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 7871; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 7872; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7873; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 7874; GFX9-NEXT: v_mul_hi_u32 v1, s3, v1 7875; GFX9-NEXT: v_mov_b32_e32 v2, 0 7876; GFX9-NEXT: v_mul_lo_u32 v0, v0, s4 7877; GFX9-NEXT: v_mul_lo_u32 v1, v1, s5 7878; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 7879; GFX9-NEXT: v_sub_u32_e32 v1, s3, v1 7880; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v0 7881; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 7882; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v1 7883; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 7884; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 7885; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 7886; GFX9-NEXT: v_subrev_u32_e32 v3, s4, v0 7887; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 7888; GFX9-NEXT: v_subrev_u32_e32 v4, s5, v1 7889; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 7890; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s5, v1 7891; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 7892; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 7893; GFX9-NEXT: s_endpgm 7894; 7895; GFX90A-LABEL: urem_v2i32_pow2_shl_denom: 7896; GFX90A: ; %bb.0: 7897; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 7898; GFX90A-NEXT: s_movk_i32 s8, 0x1000 7899; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 7900; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c 7901; GFX90A-NEXT: v_mov_b32_e32 v2, 0 7902; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 7903; GFX90A-NEXT: s_lshl_b32 s2, s8, s2 7904; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s2 7905; GFX90A-NEXT: s_lshl_b32 s0, s8, s3 7906; GFX90A-NEXT: s_mov_b32 s3, 0x4f7ffffe 7907; GFX90A-NEXT: v_cvt_f32_u32_e32 
v1, s0 7908; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 7909; GFX90A-NEXT: s_sub_i32 s1, 0, s2 7910; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v1 7911; GFX90A-NEXT: v_mul_f32_e32 v0, s3, v0 7912; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 7913; GFX90A-NEXT: v_mul_f32_e32 v1, s3, v1 7914; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 7915; GFX90A-NEXT: v_mul_lo_u32 v3, s1, v0 7916; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v3 7917; GFX90A-NEXT: v_add_u32_e32 v0, v0, v3 7918; GFX90A-NEXT: v_mul_hi_u32 v0, s6, v0 7919; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s2 7920; GFX90A-NEXT: v_sub_u32_e32 v0, s6, v0 7921; GFX90A-NEXT: v_subrev_u32_e32 v3, s2, v0 7922; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 7923; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 7924; GFX90A-NEXT: v_subrev_u32_e32 v3, s2, v0 7925; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 7926; GFX90A-NEXT: s_sub_i32 s1, 0, s0 7927; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 7928; GFX90A-NEXT: v_mul_lo_u32 v3, s1, v1 7929; GFX90A-NEXT: v_mul_hi_u32 v3, v1, v3 7930; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 7931; GFX90A-NEXT: v_mul_hi_u32 v1, s7, v1 7932; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s0 7933; GFX90A-NEXT: v_sub_u32_e32 v1, s7, v1 7934; GFX90A-NEXT: v_subrev_u32_e32 v3, s0, v1 7935; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v1 7936; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 7937; GFX90A-NEXT: v_subrev_u32_e32 v3, s0, v1 7938; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v1 7939; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 7940; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 7941; GFX90A-NEXT: s_endpgm 7942 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 7943 %r = urem <2 x i32> %x, %shl.y 7944 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 7945 ret void 7946} 7947 7948define amdgpu_kernel void @sdiv_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 7949; CHECK-LABEL: @sdiv_i32_oddk_denom( 7950; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], 1235195 7951; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 7952; CHECK-NEXT: ret 
void 7953; 7954; GFX6-LABEL: sdiv_i32_oddk_denom: 7955; GFX6: ; %bb.0: 7956; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 7957; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 7958; GFX6-NEXT: v_mov_b32_e32 v0, 0xd9528441 7959; GFX6-NEXT: s_mov_b32 s3, 0xf000 7960; GFX6-NEXT: s_mov_b32 s2, -1 7961; GFX6-NEXT: s_waitcnt lgkmcnt(0) 7962; GFX6-NEXT: v_mul_hi_i32 v0, s4, v0 7963; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0 7964; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0 7965; GFX6-NEXT: v_ashrrev_i32_e32 v0, 20, v0 7966; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 7967; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 7968; GFX6-NEXT: s_endpgm 7969; 7970; GFX9-LABEL: sdiv_i32_oddk_denom: 7971; GFX9: ; %bb.0: 7972; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 7973; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7974; GFX9-NEXT: v_mov_b32_e32 v0, 0 7975; GFX9-NEXT: s_waitcnt lgkmcnt(0) 7976; GFX9-NEXT: s_mul_hi_i32 s0, s4, 0xd9528441 7977; GFX9-NEXT: s_add_i32 s0, s0, s4 7978; GFX9-NEXT: s_lshr_b32 s1, s0, 31 7979; GFX9-NEXT: s_ashr_i32 s0, s0, 20 7980; GFX9-NEXT: s_add_i32 s0, s0, s1 7981; GFX9-NEXT: v_mov_b32_e32 v1, s0 7982; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 7983; GFX9-NEXT: s_endpgm 7984; 7985; GFX90A-LABEL: sdiv_i32_oddk_denom: 7986; GFX90A: ; %bb.0: 7987; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 7988; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 7989; GFX90A-NEXT: v_mov_b32_e32 v0, 0 7990; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 7991; GFX90A-NEXT: s_mul_hi_i32 s0, s4, 0xd9528441 7992; GFX90A-NEXT: s_add_i32 s0, s0, s4 7993; GFX90A-NEXT: s_lshr_b32 s1, s0, 31 7994; GFX90A-NEXT: s_ashr_i32 s0, s0, 20 7995; GFX90A-NEXT: s_add_i32 s0, s0, s1 7996; GFX90A-NEXT: v_mov_b32_e32 v1, s0 7997; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] 7998; GFX90A-NEXT: s_endpgm 7999 %r = sdiv i32 %x, 1235195 8000 store i32 %r, i32 addrspace(1)* %out 8001 ret void 8002} 8003 8004define amdgpu_kernel void @sdiv_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 8005; CHECK-LABEL: 
@sdiv_i32_pow2k_denom( 8006; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], 4096 8007; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 8008; CHECK-NEXT: ret void 8009; 8010; GFX6-LABEL: sdiv_i32_pow2k_denom: 8011; GFX6: ; %bb.0: 8012; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 8013; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 8014; GFX6-NEXT: s_mov_b32 s3, 0xf000 8015; GFX6-NEXT: s_mov_b32 s2, -1 8016; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8017; GFX6-NEXT: s_ashr_i32 s5, s4, 31 8018; GFX6-NEXT: s_lshr_b32 s5, s5, 20 8019; GFX6-NEXT: s_add_i32 s4, s4, s5 8020; GFX6-NEXT: s_ashr_i32 s4, s4, 12 8021; GFX6-NEXT: v_mov_b32_e32 v0, s4 8022; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 8023; GFX6-NEXT: s_endpgm 8024; 8025; GFX9-LABEL: sdiv_i32_pow2k_denom: 8026; GFX9: ; %bb.0: 8027; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 8028; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8029; GFX9-NEXT: v_mov_b32_e32 v0, 0 8030; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8031; GFX9-NEXT: s_ashr_i32 s0, s4, 31 8032; GFX9-NEXT: s_lshr_b32 s0, s0, 20 8033; GFX9-NEXT: s_add_i32 s4, s4, s0 8034; GFX9-NEXT: s_ashr_i32 s0, s4, 12 8035; GFX9-NEXT: v_mov_b32_e32 v1, s0 8036; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 8037; GFX9-NEXT: s_endpgm 8038; 8039; GFX90A-LABEL: sdiv_i32_pow2k_denom: 8040; GFX90A: ; %bb.0: 8041; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 8042; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8043; GFX90A-NEXT: v_mov_b32_e32 v0, 0 8044; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 8045; GFX90A-NEXT: s_ashr_i32 s0, s4, 31 8046; GFX90A-NEXT: s_lshr_b32 s0, s0, 20 8047; GFX90A-NEXT: s_add_i32 s4, s4, s0 8048; GFX90A-NEXT: s_ashr_i32 s0, s4, 12 8049; GFX90A-NEXT: v_mov_b32_e32 v1, s0 8050; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] 8051; GFX90A-NEXT: s_endpgm 8052 %r = sdiv i32 %x, 4096 8053 store i32 %r, i32 addrspace(1)* %out 8054 ret void 8055} 8056 8057define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 8058; CHECK-LABEL: 
@sdiv_i32_pow2_shl_denom( 8059; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 8060; CHECK-NEXT: [[R:%.*]] = sdiv i32 [[X:%.*]], [[SHL_Y]] 8061; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 8062; CHECK-NEXT: ret void 8063; 8064; GFX6-LABEL: sdiv_i32_pow2_shl_denom: 8065; GFX6: ; %bb.0: 8066; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 8067; GFX6-NEXT: s_mov_b32 s7, 0xf000 8068; GFX6-NEXT: s_mov_b32 s6, -1 8069; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8070; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 8071; GFX6-NEXT: s_ashr_i32 s8, s3, 31 8072; GFX6-NEXT: s_add_i32 s3, s3, s8 8073; GFX6-NEXT: s_xor_b32 s3, s3, s8 8074; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s3 8075; GFX6-NEXT: s_sub_i32 s4, 0, s3 8076; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 8077; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 8078; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 8079; GFX6-NEXT: v_mul_lo_u32 v1, s4, v0 8080; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 8081; GFX6-NEXT: s_ashr_i32 s0, s2, 31 8082; GFX6-NEXT: s_add_i32 s1, s2, s0 8083; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 8084; GFX6-NEXT: s_xor_b32 s1, s1, s0 8085; GFX6-NEXT: s_xor_b32 s2, s0, s8 8086; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 8087; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 8088; GFX6-NEXT: v_mul_lo_u32 v1, v0, s3 8089; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0 8090; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 8091; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v1 8092; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 8093; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 8094; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 8095; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] 8096; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 8097; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 8098; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0 8099; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0 8100; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8101; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0 8102; GFX6-NEXT: s_endpgm 8103; 8104; GFX9-LABEL: sdiv_i32_pow2_shl_denom: 8105; GFX9: ; %bb.0: 8106; 
GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 8107; GFX9-NEXT: v_mov_b32_e32 v2, 0 8108; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 8109; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8110; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 8111; GFX9-NEXT: s_ashr_i32 s4, s3, 31 8112; GFX9-NEXT: s_add_i32 s3, s3, s4 8113; GFX9-NEXT: s_xor_b32 s3, s3, s4 8114; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 8115; GFX9-NEXT: s_sub_i32 s5, 0, s3 8116; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 8117; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 8118; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 8119; GFX9-NEXT: v_mul_lo_u32 v1, s5, v0 8120; GFX9-NEXT: s_ashr_i32 s5, s2, 31 8121; GFX9-NEXT: s_add_i32 s2, s2, s5 8122; GFX9-NEXT: s_xor_b32 s2, s2, s5 8123; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 8124; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 8125; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 8126; GFX9-NEXT: v_mul_lo_u32 v1, v0, s3 8127; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 8128; GFX9-NEXT: v_sub_u32_e32 v1, s2, v1 8129; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 8130; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 8131; GFX9-NEXT: v_subrev_u32_e32 v3, s3, v1 8132; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 8133; GFX9-NEXT: v_add_u32_e32 v4, 1, v0 8134; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 8135; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 8136; GFX9-NEXT: s_xor_b32 s2, s5, s4 8137; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 8138; GFX9-NEXT: v_subrev_u32_e32 v0, s2, v0 8139; GFX9-NEXT: global_store_dword v2, v0, s[0:1] 8140; GFX9-NEXT: s_endpgm 8141; 8142; GFX90A-LABEL: sdiv_i32_pow2_shl_denom: 8143; GFX90A: ; %bb.0: 8144; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 8145; GFX90A-NEXT: v_mov_b32_e32 v1, 0 8146; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 8147; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 8148; GFX90A-NEXT: s_lshl_b32 s3, 0x1000, s3 8149; GFX90A-NEXT: s_ashr_i32 s4, s3, 31 8150; GFX90A-NEXT: s_add_i32 s3, s3, s4 8151; GFX90A-NEXT: s_xor_b32 s3, s3, s4 8152; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 8153; GFX90A-NEXT: s_sub_i32 s6, 0, s3 
8154; GFX90A-NEXT: s_ashr_i32 s5, s2, 31 8155; GFX90A-NEXT: s_add_i32 s2, s2, s5 8156; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 8157; GFX90A-NEXT: s_xor_b32 s2, s2, s5 8158; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 8159; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 8160; GFX90A-NEXT: v_mul_lo_u32 v2, s6, v0 8161; GFX90A-NEXT: v_mul_hi_u32 v2, v0, v2 8162; GFX90A-NEXT: v_add_u32_e32 v0, v0, v2 8163; GFX90A-NEXT: v_mul_hi_u32 v0, s2, v0 8164; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s3 8165; GFX90A-NEXT: v_sub_u32_e32 v3, s2, v3 8166; GFX90A-NEXT: v_add_u32_e32 v2, 1, v0 8167; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v3 8168; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 8169; GFX90A-NEXT: v_subrev_u32_e32 v2, s3, v3 8170; GFX90A-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc 8171; GFX90A-NEXT: v_add_u32_e32 v4, 1, v0 8172; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v2 8173; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 8174; GFX90A-NEXT: s_xor_b32 s2, s5, s4 8175; GFX90A-NEXT: v_xor_b32_e32 v0, s2, v0 8176; GFX90A-NEXT: v_subrev_u32_e32 v0, s2, v0 8177; GFX90A-NEXT: global_store_dword v1, v0, s[0:1] 8178; GFX90A-NEXT: s_endpgm 8179 %shl.y = shl i32 4096, %y 8180 %r = sdiv i32 %x, %shl.y 8181 store i32 %r, i32 addrspace(1)* %out 8182 ret void 8183} 8184 8185define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 8186; CHECK-LABEL: @sdiv_v2i32_pow2k_denom( 8187; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 8188; CHECK-NEXT: [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096 8189; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 8190; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 8191; CHECK-NEXT: [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4096 8192; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 8193; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 8194; CHECK-NEXT: ret void 8195; 8196; GFX6-LABEL: sdiv_v2i32_pow2k_denom: 8197; 
GFX6: ; %bb.0: 8198; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 8199; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 8200; GFX6-NEXT: s_mov_b32 s3, 0xf000 8201; GFX6-NEXT: s_mov_b32 s2, -1 8202; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8203; GFX6-NEXT: s_ashr_i32 s6, s4, 31 8204; GFX6-NEXT: s_ashr_i32 s7, s5, 31 8205; GFX6-NEXT: s_lshr_b32 s6, s6, 20 8206; GFX6-NEXT: s_add_i32 s4, s4, s6 8207; GFX6-NEXT: s_lshr_b32 s6, s7, 20 8208; GFX6-NEXT: s_add_i32 s5, s5, s6 8209; GFX6-NEXT: s_ashr_i32 s4, s4, 12 8210; GFX6-NEXT: s_ashr_i32 s5, s5, 12 8211; GFX6-NEXT: v_mov_b32_e32 v0, s4 8212; GFX6-NEXT: v_mov_b32_e32 v1, s5 8213; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 8214; GFX6-NEXT: s_endpgm 8215; 8216; GFX9-LABEL: sdiv_v2i32_pow2k_denom: 8217; GFX9: ; %bb.0: 8218; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 8219; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 8220; GFX9-NEXT: v_mov_b32_e32 v2, 0 8221; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8222; GFX9-NEXT: s_ashr_i32 s0, s2, 31 8223; GFX9-NEXT: s_ashr_i32 s1, s3, 31 8224; GFX9-NEXT: s_lshr_b32 s0, s0, 20 8225; GFX9-NEXT: s_lshr_b32 s1, s1, 20 8226; GFX9-NEXT: s_add_i32 s0, s2, s0 8227; GFX9-NEXT: s_add_i32 s1, s3, s1 8228; GFX9-NEXT: s_ashr_i32 s0, s0, 12 8229; GFX9-NEXT: s_ashr_i32 s1, s1, 12 8230; GFX9-NEXT: v_mov_b32_e32 v0, s0 8231; GFX9-NEXT: v_mov_b32_e32 v1, s1 8232; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 8233; GFX9-NEXT: s_endpgm 8234; 8235; GFX90A-LABEL: sdiv_v2i32_pow2k_denom: 8236; GFX90A: ; %bb.0: 8237; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 8238; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 8239; GFX90A-NEXT: v_mov_b32_e32 v2, 0 8240; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 8241; GFX90A-NEXT: s_ashr_i32 s0, s2, 31 8242; GFX90A-NEXT: s_ashr_i32 s1, s3, 31 8243; GFX90A-NEXT: s_lshr_b32 s0, s0, 20 8244; GFX90A-NEXT: s_lshr_b32 s1, s1, 20 8245; GFX90A-NEXT: s_add_i32 s0, s2, s0 8246; GFX90A-NEXT: s_add_i32 s1, s3, s1 8247; GFX90A-NEXT: s_ashr_i32 s0, s0, 12 8248; GFX90A-NEXT: s_ashr_i32 
s1, s1, 12 8249; GFX90A-NEXT: v_mov_b32_e32 v0, s0 8250; GFX90A-NEXT: v_mov_b32_e32 v1, s1 8251; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 8252; GFX90A-NEXT: s_endpgm 8253 %r = sdiv <2 x i32> %x, <i32 4096, i32 4096> 8254 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 8255 ret void 8256} 8257 8258define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 8259; CHECK-LABEL: @ssdiv_v2i32_mixed_pow2k_denom( 8260; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 8261; CHECK-NEXT: [[TMP2:%.*]] = sdiv i32 [[TMP1]], 4096 8262; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 8263; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 8264; CHECK-NEXT: [[TMP5:%.*]] = sdiv i32 [[TMP4]], 4095 8265; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 8266; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 8267; CHECK-NEXT: ret void 8268; 8269; GFX6-LABEL: ssdiv_v2i32_mixed_pow2k_denom: 8270; GFX6: ; %bb.0: 8271; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 8272; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 8273; GFX6-NEXT: v_mov_b32_e32 v0, 0x80080081 8274; GFX6-NEXT: s_mov_b32 s3, 0xf000 8275; GFX6-NEXT: s_mov_b32 s2, -1 8276; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8277; GFX6-NEXT: v_mul_hi_i32 v0, s5, v0 8278; GFX6-NEXT: s_ashr_i32 s6, s4, 31 8279; GFX6-NEXT: s_lshr_b32 s6, s6, 20 8280; GFX6-NEXT: s_add_i32 s4, s4, s6 8281; GFX6-NEXT: v_add_i32_e32 v0, vcc, s5, v0 8282; GFX6-NEXT: s_ashr_i32 s4, s4, 12 8283; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0 8284; GFX6-NEXT: v_ashrrev_i32_e32 v0, 11, v0 8285; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v0 8286; GFX6-NEXT: v_mov_b32_e32 v0, s4 8287; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 8288; GFX6-NEXT: s_endpgm 8289; 8290; GFX9-LABEL: ssdiv_v2i32_mixed_pow2k_denom: 8291; GFX9: ; %bb.0: 8292; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 8293; 
GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 8294; GFX9-NEXT: v_mov_b32_e32 v2, 0 8295; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8296; GFX9-NEXT: s_ashr_i32 s0, s2, 31 8297; GFX9-NEXT: s_mul_hi_i32 s1, s3, 0x80080081 8298; GFX9-NEXT: s_lshr_b32 s0, s0, 20 8299; GFX9-NEXT: s_add_i32 s1, s1, s3 8300; GFX9-NEXT: s_add_i32 s0, s2, s0 8301; GFX9-NEXT: s_lshr_b32 s2, s1, 31 8302; GFX9-NEXT: s_ashr_i32 s1, s1, 11 8303; GFX9-NEXT: s_ashr_i32 s0, s0, 12 8304; GFX9-NEXT: s_add_i32 s1, s1, s2 8305; GFX9-NEXT: v_mov_b32_e32 v0, s0 8306; GFX9-NEXT: v_mov_b32_e32 v1, s1 8307; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 8308; GFX9-NEXT: s_endpgm 8309; 8310; GFX90A-LABEL: ssdiv_v2i32_mixed_pow2k_denom: 8311; GFX90A: ; %bb.0: 8312; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 8313; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 8314; GFX90A-NEXT: v_mov_b32_e32 v2, 0 8315; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 8316; GFX90A-NEXT: s_ashr_i32 s0, s2, 31 8317; GFX90A-NEXT: s_mul_hi_i32 s1, s3, 0x80080081 8318; GFX90A-NEXT: s_lshr_b32 s0, s0, 20 8319; GFX90A-NEXT: s_add_i32 s1, s1, s3 8320; GFX90A-NEXT: s_add_i32 s0, s2, s0 8321; GFX90A-NEXT: s_lshr_b32 s2, s1, 31 8322; GFX90A-NEXT: s_ashr_i32 s1, s1, 11 8323; GFX90A-NEXT: s_ashr_i32 s0, s0, 12 8324; GFX90A-NEXT: s_add_i32 s1, s1, s2 8325; GFX90A-NEXT: v_mov_b32_e32 v0, s0 8326; GFX90A-NEXT: v_mov_b32_e32 v1, s1 8327; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 8328; GFX90A-NEXT: s_endpgm 8329 %r = sdiv <2 x i32> %x, <i32 4096, i32 4095> 8330 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 8331 ret void 8332} 8333 8334define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 8335; CHECK-LABEL: @sdiv_v2i32_pow2_shl_denom( 8336; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 8337; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 8338; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 8339; CHECK-NEXT: 
[[TMP3:%.*]] = ashr i32 [[TMP1]], 31 8340; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 8341; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[TMP3]], [[TMP4]] 8342; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP1]], [[TMP3]] 8343; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP2]], [[TMP4]] 8344; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP3]] 8345; CHECK-NEXT: [[TMP9:%.*]] = xor i32 [[TMP7]], [[TMP4]] 8346; CHECK-NEXT: [[TMP10:%.*]] = uitofp i32 [[TMP9]] to float 8347; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP10]]) 8348; CHECK-NEXT: [[TMP12:%.*]] = fmul fast float [[TMP11]], 0x41EFFFFFC0000000 8349; CHECK-NEXT: [[TMP13:%.*]] = fptoui float [[TMP12]] to i32 8350; CHECK-NEXT: [[TMP14:%.*]] = sub i32 0, [[TMP9]] 8351; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], [[TMP13]] 8352; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP13]] to i64 8353; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[TMP15]] to i64 8354; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP16]], [[TMP17]] 8355; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 8356; CHECK-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32 8357; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32 8358; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP13]], [[TMP21]] 8359; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP8]] to i64 8360; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[TMP22]] to i64 8361; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP23]], [[TMP24]] 8362; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP25]] to i32 8363; CHECK-NEXT: [[TMP27:%.*]] = lshr i64 [[TMP25]], 32 8364; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 8365; CHECK-NEXT: [[TMP29:%.*]] = mul i32 [[TMP28]], [[TMP9]] 8366; CHECK-NEXT: [[TMP30:%.*]] = sub i32 [[TMP8]], [[TMP29]] 8367; CHECK-NEXT: [[TMP31:%.*]] = icmp uge i32 [[TMP30]], [[TMP9]] 8368; CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP28]], 1 8369; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP31]], i32 [[TMP32]], i32 [[TMP28]] 8370; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP30]], [[TMP9]] 8371; CHECK-NEXT: 
[[TMP35:%.*]] = select i1 [[TMP31]], i32 [[TMP34]], i32 [[TMP30]] 8372; CHECK-NEXT: [[TMP36:%.*]] = icmp uge i32 [[TMP35]], [[TMP9]] 8373; CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP33]], 1 8374; CHECK-NEXT: [[TMP38:%.*]] = select i1 [[TMP36]], i32 [[TMP37]], i32 [[TMP33]] 8375; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP38]], [[TMP5]] 8376; CHECK-NEXT: [[TMP40:%.*]] = sub i32 [[TMP39]], [[TMP5]] 8377; CHECK-NEXT: [[TMP41:%.*]] = insertelement <2 x i32> undef, i32 [[TMP40]], i64 0 8378; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x i32> [[X]], i64 1 8379; CHECK-NEXT: [[TMP43:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 8380; CHECK-NEXT: [[TMP44:%.*]] = ashr i32 [[TMP42]], 31 8381; CHECK-NEXT: [[TMP45:%.*]] = ashr i32 [[TMP43]], 31 8382; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP45]] 8383; CHECK-NEXT: [[TMP47:%.*]] = add i32 [[TMP42]], [[TMP44]] 8384; CHECK-NEXT: [[TMP48:%.*]] = add i32 [[TMP43]], [[TMP45]] 8385; CHECK-NEXT: [[TMP49:%.*]] = xor i32 [[TMP47]], [[TMP44]] 8386; CHECK-NEXT: [[TMP50:%.*]] = xor i32 [[TMP48]], [[TMP45]] 8387; CHECK-NEXT: [[TMP51:%.*]] = uitofp i32 [[TMP50]] to float 8388; CHECK-NEXT: [[TMP52:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP51]]) 8389; CHECK-NEXT: [[TMP53:%.*]] = fmul fast float [[TMP52]], 0x41EFFFFFC0000000 8390; CHECK-NEXT: [[TMP54:%.*]] = fptoui float [[TMP53]] to i32 8391; CHECK-NEXT: [[TMP55:%.*]] = sub i32 0, [[TMP50]] 8392; CHECK-NEXT: [[TMP56:%.*]] = mul i32 [[TMP55]], [[TMP54]] 8393; CHECK-NEXT: [[TMP57:%.*]] = zext i32 [[TMP54]] to i64 8394; CHECK-NEXT: [[TMP58:%.*]] = zext i32 [[TMP56]] to i64 8395; CHECK-NEXT: [[TMP59:%.*]] = mul i64 [[TMP57]], [[TMP58]] 8396; CHECK-NEXT: [[TMP60:%.*]] = trunc i64 [[TMP59]] to i32 8397; CHECK-NEXT: [[TMP61:%.*]] = lshr i64 [[TMP59]], 32 8398; CHECK-NEXT: [[TMP62:%.*]] = trunc i64 [[TMP61]] to i32 8399; CHECK-NEXT: [[TMP63:%.*]] = add i32 [[TMP54]], [[TMP62]] 8400; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[TMP49]] to i64 8401; CHECK-NEXT: [[TMP65:%.*]] = zext 
i32 [[TMP63]] to i64 8402; CHECK-NEXT: [[TMP66:%.*]] = mul i64 [[TMP64]], [[TMP65]] 8403; CHECK-NEXT: [[TMP67:%.*]] = trunc i64 [[TMP66]] to i32 8404; CHECK-NEXT: [[TMP68:%.*]] = lshr i64 [[TMP66]], 32 8405; CHECK-NEXT: [[TMP69:%.*]] = trunc i64 [[TMP68]] to i32 8406; CHECK-NEXT: [[TMP70:%.*]] = mul i32 [[TMP69]], [[TMP50]] 8407; CHECK-NEXT: [[TMP71:%.*]] = sub i32 [[TMP49]], [[TMP70]] 8408; CHECK-NEXT: [[TMP72:%.*]] = icmp uge i32 [[TMP71]], [[TMP50]] 8409; CHECK-NEXT: [[TMP73:%.*]] = add i32 [[TMP69]], 1 8410; CHECK-NEXT: [[TMP74:%.*]] = select i1 [[TMP72]], i32 [[TMP73]], i32 [[TMP69]] 8411; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP71]], [[TMP50]] 8412; CHECK-NEXT: [[TMP76:%.*]] = select i1 [[TMP72]], i32 [[TMP75]], i32 [[TMP71]] 8413; CHECK-NEXT: [[TMP77:%.*]] = icmp uge i32 [[TMP76]], [[TMP50]] 8414; CHECK-NEXT: [[TMP78:%.*]] = add i32 [[TMP74]], 1 8415; CHECK-NEXT: [[TMP79:%.*]] = select i1 [[TMP77]], i32 [[TMP78]], i32 [[TMP74]] 8416; CHECK-NEXT: [[TMP80:%.*]] = xor i32 [[TMP79]], [[TMP46]] 8417; CHECK-NEXT: [[TMP81:%.*]] = sub i32 [[TMP80]], [[TMP46]] 8418; CHECK-NEXT: [[TMP82:%.*]] = insertelement <2 x i32> [[TMP41]], i32 [[TMP81]], i64 1 8419; CHECK-NEXT: store <2 x i32> [[TMP82]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 8420; CHECK-NEXT: ret void 8421; 8422; GFX6-LABEL: sdiv_v2i32_pow2_shl_denom: 8423; GFX6: ; %bb.0: 8424; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 8425; GFX6-NEXT: s_movk_i32 s10, 0x1000 8426; GFX6-NEXT: s_mov_b32 s12, 0x4f7ffffe 8427; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 8428; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb 8429; GFX6-NEXT: s_mov_b32 s7, 0xf000 8430; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8431; GFX6-NEXT: s_lshl_b32 s2, s10, s2 8432; GFX6-NEXT: s_ashr_i32 s11, s2, 31 8433; GFX6-NEXT: s_add_i32 s2, s2, s11 8434; GFX6-NEXT: s_xor_b32 s2, s2, s11 8435; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 8436; GFX6-NEXT: s_lshl_b32 s0, s10, s3 8437; GFX6-NEXT: s_sub_i32 s10, 0, s2 8438; GFX6-NEXT: s_ashr_i32 s3, s0, 31 8439; 
GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 8440; GFX6-NEXT: s_add_i32 s0, s0, s3 8441; GFX6-NEXT: s_ashr_i32 s1, s8, 31 8442; GFX6-NEXT: s_mov_b32 s6, -1 8443; GFX6-NEXT: v_mul_f32_e32 v0, s12, v0 8444; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 8445; GFX6-NEXT: v_mul_lo_u32 v1, s10, v0 8446; GFX6-NEXT: s_xor_b32 s10, s0, s3 8447; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s10 8448; GFX6-NEXT: s_add_i32 s0, s8, s1 8449; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 8450; GFX6-NEXT: s_xor_b32 s0, s0, s1 8451; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 8452; GFX6-NEXT: s_xor_b32 s8, s1, s11 8453; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 8454; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 8455; GFX6-NEXT: v_mul_f32_e32 v1, s12, v2 8456; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 8457; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 8458; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v0 8459; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s0, v2 8460; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 8461; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[0:1] 8462; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v2 8463; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] 8464; GFX6-NEXT: s_sub_i32 s0, 0, s10 8465; GFX6-NEXT: v_mul_lo_u32 v3, s0, v1 8466; GFX6-NEXT: s_ashr_i32 s0, s9, 31 8467; GFX6-NEXT: s_add_i32 s1, s9, s0 8468; GFX6-NEXT: s_xor_b32 s1, s1, s0 8469; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 8470; GFX6-NEXT: v_add_i32_e32 v4, vcc, 1, v0 8471; GFX6-NEXT: v_add_i32_e32 v1, vcc, v3, v1 8472; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 8473; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v2 8474; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 8475; GFX6-NEXT: s_xor_b32 s2, s0, s3 8476; GFX6-NEXT: v_mul_lo_u32 v2, v1, s10 8477; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 8478; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 8479; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s1, v2 8480; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2 8481; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 8482; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s10, v2 8483; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 8484; GFX6-NEXT: v_cndmask_b32_e64 v2, 
v2, v3, s[0:1] 8485; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v1 8486; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v2 8487; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 8488; GFX6-NEXT: v_xor_b32_e32 v1, s2, v1 8489; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s2, v1 8490; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 8491; GFX6-NEXT: s_endpgm 8492; 8493; GFX9-LABEL: sdiv_v2i32_pow2_shl_denom: 8494; GFX9: ; %bb.0: 8495; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 8496; GFX9-NEXT: s_movk_i32 s8, 0x1000 8497; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 8498; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c 8499; GFX9-NEXT: s_mov_b32 s10, 0x4f7ffffe 8500; GFX9-NEXT: v_mov_b32_e32 v2, 0 8501; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8502; GFX9-NEXT: s_lshl_b32 s2, s8, s2 8503; GFX9-NEXT: s_ashr_i32 s9, s2, 31 8504; GFX9-NEXT: s_add_i32 s2, s2, s9 8505; GFX9-NEXT: s_xor_b32 s2, s2, s9 8506; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 8507; GFX9-NEXT: s_lshl_b32 s0, s8, s3 8508; GFX9-NEXT: s_ashr_i32 s1, s0, 31 8509; GFX9-NEXT: s_add_i32 s0, s0, s1 8510; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 8511; GFX9-NEXT: s_xor_b32 s0, s0, s1 8512; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s0 8513; GFX9-NEXT: s_sub_i32 s3, 0, s2 8514; GFX9-NEXT: v_mul_f32_e32 v0, s10, v0 8515; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 8516; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 8517; GFX9-NEXT: s_sub_i32 s8, 0, s0 8518; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 8519; GFX9-NEXT: v_mul_f32_e32 v1, s10, v1 8520; GFX9-NEXT: s_ashr_i32 s3, s6, 31 8521; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 8522; GFX9-NEXT: v_mul_hi_u32 v3, v0, v3 8523; GFX9-NEXT: s_add_i32 s6, s6, s3 8524; GFX9-NEXT: s_xor_b32 s6, s6, s3 8525; GFX9-NEXT: s_xor_b32 s3, s3, s9 8526; GFX9-NEXT: v_add_u32_e32 v0, v0, v3 8527; GFX9-NEXT: v_mul_hi_u32 v0, s6, v0 8528; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1 8529; GFX9-NEXT: s_ashr_i32 s8, s7, 31 8530; GFX9-NEXT: s_xor_b32 s1, s8, s1 8531; GFX9-NEXT: v_mul_lo_u32 v4, v0, s2 8532; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 8533; GFX9-NEXT: v_add_u32_e32 
v5, 1, v0 8534; GFX9-NEXT: v_sub_u32_e32 v4, s6, v4 8535; GFX9-NEXT: s_add_i32 s6, s7, s8 8536; GFX9-NEXT: s_xor_b32 s6, s6, s8 8537; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 8538; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v4 8539; GFX9-NEXT: v_mul_hi_u32 v1, s6, v1 8540; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc 8541; GFX9-NEXT: v_subrev_u32_e32 v5, s2, v4 8542; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc 8543; GFX9-NEXT: v_add_u32_e32 v3, 1, v0 8544; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v4 8545; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 8546; GFX9-NEXT: v_mul_lo_u32 v3, v1, s0 8547; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 8548; GFX9-NEXT: v_xor_b32_e32 v0, s3, v0 8549; GFX9-NEXT: v_subrev_u32_e32 v0, s3, v0 8550; GFX9-NEXT: v_sub_u32_e32 v3, s6, v3 8551; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v3 8552; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 8553; GFX9-NEXT: v_subrev_u32_e32 v4, s0, v3 8554; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 8555; GFX9-NEXT: v_add_u32_e32 v4, 1, v1 8556; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v3 8557; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 8558; GFX9-NEXT: v_xor_b32_e32 v1, s1, v1 8559; GFX9-NEXT: v_subrev_u32_e32 v1, s1, v1 8560; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 8561; GFX9-NEXT: s_endpgm 8562; 8563; GFX90A-LABEL: sdiv_v2i32_pow2_shl_denom: 8564; GFX90A: ; %bb.0: 8565; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 8566; GFX90A-NEXT: s_movk_i32 s8, 0x1000 8567; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 8568; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c 8569; GFX90A-NEXT: s_mov_b32 s10, 0x4f7ffffe 8570; GFX90A-NEXT: v_mov_b32_e32 v2, 0 8571; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 8572; GFX90A-NEXT: s_lshl_b32 s2, s8, s2 8573; GFX90A-NEXT: s_ashr_i32 s9, s2, 31 8574; GFX90A-NEXT: s_add_i32 s2, s2, s9 8575; GFX90A-NEXT: s_xor_b32 s2, s2, s9 8576; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s2 8577; GFX90A-NEXT: s_ashr_i32 s1, s6, 31 8578; GFX90A-NEXT: s_lshl_b32 s0, s8, s3 8579; GFX90A-NEXT: s_add_i32 s3, s6, s1 8580; 
GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 8581; GFX90A-NEXT: s_xor_b32 s6, s1, s9 8582; GFX90A-NEXT: s_xor_b32 s1, s3, s1 8583; GFX90A-NEXT: s_sub_i32 s3, 0, s2 8584; GFX90A-NEXT: v_mul_f32_e32 v0, s10, v0 8585; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 8586; GFX90A-NEXT: v_mul_lo_u32 v1, s3, v0 8587; GFX90A-NEXT: v_mul_hi_u32 v1, v0, v1 8588; GFX90A-NEXT: v_add_u32_e32 v0, v0, v1 8589; GFX90A-NEXT: v_mul_hi_u32 v0, s1, v0 8590; GFX90A-NEXT: v_mul_lo_u32 v1, v0, s2 8591; GFX90A-NEXT: v_sub_u32_e32 v1, s1, v1 8592; GFX90A-NEXT: s_ashr_i32 s1, s0, 31 8593; GFX90A-NEXT: s_add_i32 s0, s0, s1 8594; GFX90A-NEXT: s_xor_b32 s0, s0, s1 8595; GFX90A-NEXT: v_cvt_f32_u32_e32 v4, s0 8596; GFX90A-NEXT: v_add_u32_e32 v3, 1, v0 8597; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 8598; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 8599; GFX90A-NEXT: v_subrev_u32_e32 v3, s2, v1 8600; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 8601; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s2, v1 8602; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v4 8603; GFX90A-NEXT: s_ashr_i32 s2, s7, 31 8604; GFX90A-NEXT: s_add_i32 s3, s7, s2 8605; GFX90A-NEXT: v_add_u32_e32 v3, 1, v0 8606; GFX90A-NEXT: v_mul_f32_e32 v1, s10, v1 8607; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 8608; GFX90A-NEXT: s_xor_b32 s1, s2, s1 8609; GFX90A-NEXT: s_xor_b32 s2, s3, s2 8610; GFX90A-NEXT: s_sub_i32 s3, 0, s0 8611; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 8612; GFX90A-NEXT: v_mul_lo_u32 v3, s3, v1 8613; GFX90A-NEXT: v_mul_hi_u32 v3, v1, v3 8614; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 8615; GFX90A-NEXT: v_mul_hi_u32 v1, s2, v1 8616; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s0 8617; GFX90A-NEXT: v_sub_u32_e32 v3, s2, v3 8618; GFX90A-NEXT: v_add_u32_e32 v4, 1, v1 8619; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v3 8620; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 8621; GFX90A-NEXT: v_subrev_u32_e32 v4, s0, v3 8622; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 8623; GFX90A-NEXT: v_add_u32_e32 v4, 1, v1 8624; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v3 8625; 
GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 8626; GFX90A-NEXT: v_xor_b32_e32 v0, s6, v0 8627; GFX90A-NEXT: v_xor_b32_e32 v1, s1, v1 8628; GFX90A-NEXT: v_subrev_u32_e32 v0, s6, v0 8629; GFX90A-NEXT: v_subrev_u32_e32 v1, s1, v1 8630; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 8631; GFX90A-NEXT: s_endpgm 8632 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 8633 %r = sdiv <2 x i32> %x, %shl.y 8634 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 8635 ret void 8636} 8637 8638define amdgpu_kernel void @srem_i32_oddk_denom(i32 addrspace(1)* %out, i32 %x) { 8639; CHECK-LABEL: @srem_i32_oddk_denom( 8640; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], 1235195 8641; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 8642; CHECK-NEXT: ret void 8643; 8644; GFX6-LABEL: srem_i32_oddk_denom: 8645; GFX6: ; %bb.0: 8646; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 8647; GFX6-NEXT: v_mov_b32_e32 v0, 0xd9528441 8648; GFX6-NEXT: s_mov_b32 s2, 0x12d8fb 8649; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 8650; GFX6-NEXT: s_mov_b32 s3, 0xf000 8651; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8652; GFX6-NEXT: v_mul_hi_i32 v0, s4, v0 8653; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v0 8654; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v0 8655; GFX6-NEXT: v_ashrrev_i32_e32 v0, 20, v0 8656; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 8657; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 8658; GFX6-NEXT: s_mov_b32 s2, -1 8659; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 8660; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 8661; GFX6-NEXT: s_endpgm 8662; 8663; GFX9-LABEL: srem_i32_oddk_denom: 8664; GFX9: ; %bb.0: 8665; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 8666; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8667; GFX9-NEXT: v_mov_b32_e32 v0, 0 8668; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8669; GFX9-NEXT: s_mul_hi_i32 s0, s4, 0xd9528441 8670; GFX9-NEXT: s_add_i32 s0, s0, s4 8671; GFX9-NEXT: s_lshr_b32 s1, s0, 31 8672; GFX9-NEXT: s_ashr_i32 s0, s0, 20 8673; GFX9-NEXT: s_add_i32 s0, s0, s1 8674; GFX9-NEXT: s_mul_i32 s0, 
s0, 0x12d8fb 8675; GFX9-NEXT: s_sub_i32 s0, s4, s0 8676; GFX9-NEXT: v_mov_b32_e32 v1, s0 8677; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 8678; GFX9-NEXT: s_endpgm 8679; 8680; GFX90A-LABEL: srem_i32_oddk_denom: 8681; GFX90A: ; %bb.0: 8682; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 8683; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8684; GFX90A-NEXT: v_mov_b32_e32 v0, 0 8685; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 8686; GFX90A-NEXT: s_mul_hi_i32 s0, s4, 0xd9528441 8687; GFX90A-NEXT: s_add_i32 s0, s0, s4 8688; GFX90A-NEXT: s_lshr_b32 s1, s0, 31 8689; GFX90A-NEXT: s_ashr_i32 s0, s0, 20 8690; GFX90A-NEXT: s_add_i32 s0, s0, s1 8691; GFX90A-NEXT: s_mul_i32 s0, s0, 0x12d8fb 8692; GFX90A-NEXT: s_sub_i32 s0, s4, s0 8693; GFX90A-NEXT: v_mov_b32_e32 v1, s0 8694; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] 8695; GFX90A-NEXT: s_endpgm 8696 %r = srem i32 %x, 1235195 8697 store i32 %r, i32 addrspace(1)* %out 8698 ret void 8699} 8700 8701define amdgpu_kernel void @srem_i32_pow2k_denom(i32 addrspace(1)* %out, i32 %x) { 8702; CHECK-LABEL: @srem_i32_pow2k_denom( 8703; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], 4096 8704; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 8705; CHECK-NEXT: ret void 8706; 8707; GFX6-LABEL: srem_i32_pow2k_denom: 8708; GFX6: ; %bb.0: 8709; GFX6-NEXT: s_load_dword s4, s[0:1], 0xb 8710; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 8711; GFX6-NEXT: s_mov_b32 s3, 0xf000 8712; GFX6-NEXT: s_mov_b32 s2, -1 8713; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8714; GFX6-NEXT: s_ashr_i32 s5, s4, 31 8715; GFX6-NEXT: s_lshr_b32 s5, s5, 20 8716; GFX6-NEXT: s_add_i32 s5, s4, s5 8717; GFX6-NEXT: s_and_b32 s5, s5, 0xfffff000 8718; GFX6-NEXT: s_sub_i32 s4, s4, s5 8719; GFX6-NEXT: v_mov_b32_e32 v0, s4 8720; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 8721; GFX6-NEXT: s_endpgm 8722; 8723; GFX9-LABEL: srem_i32_pow2k_denom: 8724; GFX9: ; %bb.0: 8725; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c 8726; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8727; GFX9-NEXT: 
v_mov_b32_e32 v0, 0 8728; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8729; GFX9-NEXT: s_ashr_i32 s0, s4, 31 8730; GFX9-NEXT: s_lshr_b32 s0, s0, 20 8731; GFX9-NEXT: s_add_i32 s0, s4, s0 8732; GFX9-NEXT: s_and_b32 s0, s0, 0xfffff000 8733; GFX9-NEXT: s_sub_i32 s0, s4, s0 8734; GFX9-NEXT: v_mov_b32_e32 v1, s0 8735; GFX9-NEXT: global_store_dword v0, v1, s[2:3] 8736; GFX9-NEXT: s_endpgm 8737; 8738; GFX90A-LABEL: srem_i32_pow2k_denom: 8739; GFX90A: ; %bb.0: 8740; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x2c 8741; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 8742; GFX90A-NEXT: v_mov_b32_e32 v0, 0 8743; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 8744; GFX90A-NEXT: s_ashr_i32 s0, s4, 31 8745; GFX90A-NEXT: s_lshr_b32 s0, s0, 20 8746; GFX90A-NEXT: s_add_i32 s0, s4, s0 8747; GFX90A-NEXT: s_and_b32 s0, s0, 0xfffff000 8748; GFX90A-NEXT: s_sub_i32 s0, s4, s0 8749; GFX90A-NEXT: v_mov_b32_e32 v1, s0 8750; GFX90A-NEXT: global_store_dword v0, v1, s[2:3] 8751; GFX90A-NEXT: s_endpgm 8752 %r = srem i32 %x, 4096 8753 store i32 %r, i32 addrspace(1)* %out 8754 ret void 8755} 8756 8757define amdgpu_kernel void @srem_i32_pow2_shl_denom(i32 addrspace(1)* %out, i32 %x, i32 %y) { 8758; CHECK-LABEL: @srem_i32_pow2_shl_denom( 8759; CHECK-NEXT: [[SHL_Y:%.*]] = shl i32 4096, [[Y:%.*]] 8760; CHECK-NEXT: [[R:%.*]] = srem i32 [[X:%.*]], [[SHL_Y]] 8761; CHECK-NEXT: store i32 [[R]], i32 addrspace(1)* [[OUT:%.*]], align 4 8762; CHECK-NEXT: ret void 8763; 8764; GFX6-LABEL: srem_i32_pow2_shl_denom: 8765; GFX6: ; %bb.0: 8766; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb 8767; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 8768; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8769; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 8770; GFX6-NEXT: s_ashr_i32 s4, s3, 31 8771; GFX6-NEXT: s_add_i32 s3, s3, s4 8772; GFX6-NEXT: s_xor_b32 s4, s3, s4 8773; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s4 8774; GFX6-NEXT: s_sub_i32 s3, 0, s4 8775; GFX6-NEXT: s_ashr_i32 s5, s2, 31 8776; GFX6-NEXT: s_add_i32 s2, s2, s5 8777; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 8778; 
GFX6-NEXT: s_xor_b32 s6, s2, s5 8779; GFX6-NEXT: s_mov_b32 s2, -1 8780; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 8781; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 8782; GFX6-NEXT: v_mul_lo_u32 v1, s3, v0 8783; GFX6-NEXT: s_mov_b32 s3, 0xf000 8784; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 8785; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 8786; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0 8787; GFX6-NEXT: v_mul_lo_u32 v0, v0, s4 8788; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 8789; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0 8790; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 8791; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 8792; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s4, v0 8793; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s4, v0 8794; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 8795; GFX6-NEXT: v_xor_b32_e32 v0, s5, v0 8796; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s5, v0 8797; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 8798; GFX6-NEXT: s_endpgm 8799; 8800; GFX9-LABEL: srem_i32_pow2_shl_denom: 8801; GFX9: ; %bb.0: 8802; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 8803; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8804; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 8805; GFX9-NEXT: s_ashr_i32 s4, s3, 31 8806; GFX9-NEXT: s_add_i32 s3, s3, s4 8807; GFX9-NEXT: s_xor_b32 s3, s3, s4 8808; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 8809; GFX9-NEXT: s_sub_i32 s4, 0, s3 8810; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 8811; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 8812; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 8813; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 8814; GFX9-NEXT: v_mul_lo_u32 v1, s4, v0 8815; GFX9-NEXT: s_ashr_i32 s4, s2, 31 8816; GFX9-NEXT: s_add_i32 s2, s2, s4 8817; GFX9-NEXT: s_xor_b32 s2, s2, s4 8818; GFX9-NEXT: v_mul_hi_u32 v1, v0, v1 8819; GFX9-NEXT: v_add_u32_e32 v0, v0, v1 8820; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 8821; GFX9-NEXT: v_mov_b32_e32 v1, 0 8822; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3 8823; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 8824; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v0 8825; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 
8826; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 8827; GFX9-NEXT: v_subrev_u32_e32 v2, s3, v0 8828; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 8829; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 8830; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 8831; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0 8832; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8833; GFX9-NEXT: global_store_dword v1, v0, s[0:1] 8834; GFX9-NEXT: s_endpgm 8835; 8836; GFX90A-LABEL: srem_i32_pow2_shl_denom: 8837; GFX90A: ; %bb.0: 8838; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 8839; GFX90A-NEXT: v_mov_b32_e32 v1, 0 8840; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 8841; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 8842; GFX90A-NEXT: s_lshl_b32 s3, 0x1000, s3 8843; GFX90A-NEXT: s_ashr_i32 s4, s3, 31 8844; GFX90A-NEXT: s_add_i32 s3, s3, s4 8845; GFX90A-NEXT: s_xor_b32 s3, s3, s4 8846; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 8847; GFX90A-NEXT: s_sub_i32 s5, 0, s3 8848; GFX90A-NEXT: s_ashr_i32 s4, s2, 31 8849; GFX90A-NEXT: s_add_i32 s2, s2, s4 8850; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 8851; GFX90A-NEXT: s_xor_b32 s2, s2, s4 8852; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 8853; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 8854; GFX90A-NEXT: v_mul_lo_u32 v2, s5, v0 8855; GFX90A-NEXT: v_mul_hi_u32 v2, v0, v2 8856; GFX90A-NEXT: v_add_u32_e32 v0, v0, v2 8857; GFX90A-NEXT: v_mul_hi_u32 v0, s2, v0 8858; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s3 8859; GFX90A-NEXT: v_sub_u32_e32 v0, s2, v0 8860; GFX90A-NEXT: v_subrev_u32_e32 v2, s3, v0 8861; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 8862; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 8863; GFX90A-NEXT: v_subrev_u32_e32 v2, s3, v0 8864; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s3, v0 8865; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 8866; GFX90A-NEXT: v_xor_b32_e32 v0, s4, v0 8867; GFX90A-NEXT: v_subrev_u32_e32 v0, s4, v0 8868; GFX90A-NEXT: global_store_dword v1, v0, s[0:1] 8869; GFX90A-NEXT: s_endpgm 8870 %shl.y = shl i32 4096, %y 8871 %r = srem i32 %x, %shl.y 8872 store i32 %r, i32 addrspace(1)* 
%out 8873 ret void 8874} 8875 8876define amdgpu_kernel void @srem_v2i32_pow2k_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x) { 8877; CHECK-LABEL: @srem_v2i32_pow2k_denom( 8878; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 8879; CHECK-NEXT: [[TMP2:%.*]] = srem i32 [[TMP1]], 4096 8880; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i64 0 8881; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[X]], i64 1 8882; CHECK-NEXT: [[TMP5:%.*]] = srem i32 [[TMP4]], 4096 8883; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 1 8884; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 8885; CHECK-NEXT: ret void 8886; 8887; GFX6-LABEL: srem_v2i32_pow2k_denom: 8888; GFX6: ; %bb.0: 8889; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb 8890; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 8891; GFX6-NEXT: s_movk_i32 s6, 0xf000 8892; GFX6-NEXT: s_mov_b32 s3, 0xf000 8893; GFX6-NEXT: s_mov_b32 s2, -1 8894; GFX6-NEXT: s_waitcnt lgkmcnt(0) 8895; GFX6-NEXT: s_ashr_i32 s7, s4, 31 8896; GFX6-NEXT: s_lshr_b32 s7, s7, 20 8897; GFX6-NEXT: s_add_i32 s7, s4, s7 8898; GFX6-NEXT: s_and_b32 s7, s7, s6 8899; GFX6-NEXT: s_sub_i32 s4, s4, s7 8900; GFX6-NEXT: s_ashr_i32 s7, s5, 31 8901; GFX6-NEXT: s_lshr_b32 s7, s7, 20 8902; GFX6-NEXT: s_add_i32 s7, s5, s7 8903; GFX6-NEXT: s_and_b32 s6, s7, s6 8904; GFX6-NEXT: s_sub_i32 s5, s5, s6 8905; GFX6-NEXT: v_mov_b32_e32 v0, s4 8906; GFX6-NEXT: v_mov_b32_e32 v1, s5 8907; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 8908; GFX6-NEXT: s_endpgm 8909; 8910; GFX9-LABEL: srem_v2i32_pow2k_denom: 8911; GFX9: ; %bb.0: 8912; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 8913; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 8914; GFX9-NEXT: s_movk_i32 s0, 0xf000 8915; GFX9-NEXT: v_mov_b32_e32 v2, 0 8916; GFX9-NEXT: s_waitcnt lgkmcnt(0) 8917; GFX9-NEXT: s_ashr_i32 s1, s2, 31 8918; GFX9-NEXT: s_lshr_b32 s1, s1, 20 8919; GFX9-NEXT: s_add_i32 s1, s2, s1 
8920; GFX9-NEXT: s_ashr_i32 s6, s3, 31 8921; GFX9-NEXT: s_and_b32 s1, s1, s0 8922; GFX9-NEXT: s_sub_i32 s1, s2, s1 8923; GFX9-NEXT: s_lshr_b32 s2, s6, 20 8924; GFX9-NEXT: s_add_i32 s2, s3, s2 8925; GFX9-NEXT: s_and_b32 s0, s2, s0 8926; GFX9-NEXT: s_sub_i32 s0, s3, s0 8927; GFX9-NEXT: v_mov_b32_e32 v0, s1 8928; GFX9-NEXT: v_mov_b32_e32 v1, s0 8929; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 8930; GFX9-NEXT: s_endpgm 8931; 8932; GFX90A-LABEL: srem_v2i32_pow2k_denom: 8933; GFX90A: ; %bb.0: 8934; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c 8935; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 8936; GFX90A-NEXT: s_movk_i32 s0, 0xf000 8937; GFX90A-NEXT: v_mov_b32_e32 v2, 0 8938; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 8939; GFX90A-NEXT: s_ashr_i32 s1, s2, 31 8940; GFX90A-NEXT: s_lshr_b32 s1, s1, 20 8941; GFX90A-NEXT: s_add_i32 s1, s2, s1 8942; GFX90A-NEXT: s_ashr_i32 s6, s3, 31 8943; GFX90A-NEXT: s_and_b32 s1, s1, s0 8944; GFX90A-NEXT: s_sub_i32 s1, s2, s1 8945; GFX90A-NEXT: s_lshr_b32 s2, s6, 20 8946; GFX90A-NEXT: s_add_i32 s2, s3, s2 8947; GFX90A-NEXT: s_and_b32 s0, s2, s0 8948; GFX90A-NEXT: s_sub_i32 s0, s3, s0 8949; GFX90A-NEXT: v_mov_b32_e32 v0, s1 8950; GFX90A-NEXT: v_mov_b32_e32 v1, s0 8951; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 8952; GFX90A-NEXT: s_endpgm 8953 %r = srem <2 x i32> %x, <i32 4096, i32 4096> 8954 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 8955 ret void 8956} 8957 8958define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { 8959; CHECK-LABEL: @srem_v2i32_pow2_shl_denom( 8960; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i32> <i32 4096, i32 4096>, [[Y:%.*]] 8961; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[X:%.*]], i64 0 8962; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 0 8963; CHECK-NEXT: [[TMP3:%.*]] = ashr i32 [[TMP1]], 31 8964; CHECK-NEXT: [[TMP4:%.*]] = ashr i32 [[TMP2]], 31 8965; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP1]], [[TMP3]] 8966; 
CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP2]], [[TMP4]] 8967; CHECK-NEXT: [[TMP7:%.*]] = xor i32 [[TMP5]], [[TMP3]] 8968; CHECK-NEXT: [[TMP8:%.*]] = xor i32 [[TMP6]], [[TMP4]] 8969; CHECK-NEXT: [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float 8970; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP9]]) 8971; CHECK-NEXT: [[TMP11:%.*]] = fmul fast float [[TMP10]], 0x41EFFFFFC0000000 8972; CHECK-NEXT: [[TMP12:%.*]] = fptoui float [[TMP11]] to i32 8973; CHECK-NEXT: [[TMP13:%.*]] = sub i32 0, [[TMP8]] 8974; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], [[TMP12]] 8975; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP12]] to i64 8976; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP14]] to i64 8977; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP15]], [[TMP16]] 8978; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32 8979; CHECK-NEXT: [[TMP19:%.*]] = lshr i64 [[TMP17]], 32 8980; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i32 8981; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]] 8982; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP7]] to i64 8983; CHECK-NEXT: [[TMP23:%.*]] = zext i32 [[TMP21]] to i64 8984; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[TMP22]], [[TMP23]] 8985; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32 8986; CHECK-NEXT: [[TMP26:%.*]] = lshr i64 [[TMP24]], 32 8987; CHECK-NEXT: [[TMP27:%.*]] = trunc i64 [[TMP26]] to i32 8988; CHECK-NEXT: [[TMP28:%.*]] = mul i32 [[TMP27]], [[TMP8]] 8989; CHECK-NEXT: [[TMP29:%.*]] = sub i32 [[TMP7]], [[TMP28]] 8990; CHECK-NEXT: [[TMP30:%.*]] = icmp uge i32 [[TMP29]], [[TMP8]] 8991; CHECK-NEXT: [[TMP31:%.*]] = sub i32 [[TMP29]], [[TMP8]] 8992; CHECK-NEXT: [[TMP32:%.*]] = select i1 [[TMP30]], i32 [[TMP31]], i32 [[TMP29]] 8993; CHECK-NEXT: [[TMP33:%.*]] = icmp uge i32 [[TMP32]], [[TMP8]] 8994; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP32]], [[TMP8]] 8995; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i32 [[TMP34]], i32 [[TMP32]] 8996; CHECK-NEXT: [[TMP36:%.*]] = xor i32 [[TMP35]], [[TMP3]] 8997; CHECK-NEXT: 
[[TMP37:%.*]] = sub i32 [[TMP36]], [[TMP3]] 8998; CHECK-NEXT: [[TMP38:%.*]] = insertelement <2 x i32> undef, i32 [[TMP37]], i64 0 8999; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i32> [[X]], i64 1 9000; CHECK-NEXT: [[TMP40:%.*]] = extractelement <2 x i32> [[SHL_Y]], i64 1 9001; CHECK-NEXT: [[TMP41:%.*]] = ashr i32 [[TMP39]], 31 9002; CHECK-NEXT: [[TMP42:%.*]] = ashr i32 [[TMP40]], 31 9003; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP39]], [[TMP41]] 9004; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP40]], [[TMP42]] 9005; CHECK-NEXT: [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP41]] 9006; CHECK-NEXT: [[TMP46:%.*]] = xor i32 [[TMP44]], [[TMP42]] 9007; CHECK-NEXT: [[TMP47:%.*]] = uitofp i32 [[TMP46]] to float 9008; CHECK-NEXT: [[TMP48:%.*]] = call fast float @llvm.amdgcn.rcp.f32(float [[TMP47]]) 9009; CHECK-NEXT: [[TMP49:%.*]] = fmul fast float [[TMP48]], 0x41EFFFFFC0000000 9010; CHECK-NEXT: [[TMP50:%.*]] = fptoui float [[TMP49]] to i32 9011; CHECK-NEXT: [[TMP51:%.*]] = sub i32 0, [[TMP46]] 9012; CHECK-NEXT: [[TMP52:%.*]] = mul i32 [[TMP51]], [[TMP50]] 9013; CHECK-NEXT: [[TMP53:%.*]] = zext i32 [[TMP50]] to i64 9014; CHECK-NEXT: [[TMP54:%.*]] = zext i32 [[TMP52]] to i64 9015; CHECK-NEXT: [[TMP55:%.*]] = mul i64 [[TMP53]], [[TMP54]] 9016; CHECK-NEXT: [[TMP56:%.*]] = trunc i64 [[TMP55]] to i32 9017; CHECK-NEXT: [[TMP57:%.*]] = lshr i64 [[TMP55]], 32 9018; CHECK-NEXT: [[TMP58:%.*]] = trunc i64 [[TMP57]] to i32 9019; CHECK-NEXT: [[TMP59:%.*]] = add i32 [[TMP50]], [[TMP58]] 9020; CHECK-NEXT: [[TMP60:%.*]] = zext i32 [[TMP45]] to i64 9021; CHECK-NEXT: [[TMP61:%.*]] = zext i32 [[TMP59]] to i64 9022; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP60]], [[TMP61]] 9023; CHECK-NEXT: [[TMP63:%.*]] = trunc i64 [[TMP62]] to i32 9024; CHECK-NEXT: [[TMP64:%.*]] = lshr i64 [[TMP62]], 32 9025; CHECK-NEXT: [[TMP65:%.*]] = trunc i64 [[TMP64]] to i32 9026; CHECK-NEXT: [[TMP66:%.*]] = mul i32 [[TMP65]], [[TMP46]] 9027; CHECK-NEXT: [[TMP67:%.*]] = sub i32 [[TMP45]], [[TMP66]] 9028; CHECK-NEXT: 
[[TMP68:%.*]] = icmp uge i32 [[TMP67]], [[TMP46]] 9029; CHECK-NEXT: [[TMP69:%.*]] = sub i32 [[TMP67]], [[TMP46]] 9030; CHECK-NEXT: [[TMP70:%.*]] = select i1 [[TMP68]], i32 [[TMP69]], i32 [[TMP67]] 9031; CHECK-NEXT: [[TMP71:%.*]] = icmp uge i32 [[TMP70]], [[TMP46]] 9032; CHECK-NEXT: [[TMP72:%.*]] = sub i32 [[TMP70]], [[TMP46]] 9033; CHECK-NEXT: [[TMP73:%.*]] = select i1 [[TMP71]], i32 [[TMP72]], i32 [[TMP70]] 9034; CHECK-NEXT: [[TMP74:%.*]] = xor i32 [[TMP73]], [[TMP41]] 9035; CHECK-NEXT: [[TMP75:%.*]] = sub i32 [[TMP74]], [[TMP41]] 9036; CHECK-NEXT: [[TMP76:%.*]] = insertelement <2 x i32> [[TMP38]], i32 [[TMP75]], i64 1 9037; CHECK-NEXT: store <2 x i32> [[TMP76]], <2 x i32> addrspace(1)* [[OUT:%.*]], align 8 9038; CHECK-NEXT: ret void 9039; 9040; GFX6-LABEL: srem_v2i32_pow2_shl_denom: 9041; GFX6: ; %bb.0: 9042; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd 9043; GFX6-NEXT: s_movk_i32 s6, 0x1000 9044; GFX6-NEXT: s_mov_b32 s10, 0x4f7ffffe 9045; GFX6-NEXT: s_mov_b32 s7, 0xf000 9046; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9047; GFX6-NEXT: s_lshl_b32 s2, s6, s2 9048; GFX6-NEXT: s_ashr_i32 s4, s2, 31 9049; GFX6-NEXT: s_add_i32 s2, s2, s4 9050; GFX6-NEXT: s_xor_b32 s2, s2, s4 9051; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 9052; GFX6-NEXT: s_lshl_b32 s3, s6, s3 9053; GFX6-NEXT: s_ashr_i32 s6, s3, 31 9054; GFX6-NEXT: s_add_i32 s3, s3, s6 9055; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 9056; GFX6-NEXT: s_sub_i32 s9, 0, s2 9057; GFX6-NEXT: s_xor_b32 s3, s3, s6 9058; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s3 9059; GFX6-NEXT: v_mul_f32_e32 v0, s10, v0 9060; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 9061; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 9062; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb 9063; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 9064; GFX6-NEXT: s_mov_b32 s6, -1 9065; GFX6-NEXT: v_mul_lo_u32 v1, s9, v0 9066; GFX6-NEXT: s_sub_i32 s9, 0, s3 9067; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9068; GFX6-NEXT: s_ashr_i32 s8, s0, 31 9069; GFX6-NEXT: s_add_i32 s0, s0, s8 9070; GFX6-NEXT: v_mul_hi_u32 v1, v0, 
v1 9071; GFX6-NEXT: s_xor_b32 s0, s0, s8 9072; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 9073; GFX6-NEXT: v_mul_f32_e32 v1, s10, v2 9074; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 9075; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0 9076; GFX6-NEXT: v_mul_lo_u32 v2, s9, v1 9077; GFX6-NEXT: v_mul_lo_u32 v0, v0, s2 9078; GFX6-NEXT: v_mul_hi_u32 v2, v1, v2 9079; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 9080; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v0 9081; GFX6-NEXT: s_ashr_i32 s0, s1, 31 9082; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 9083; GFX6-NEXT: s_add_i32 s1, s1, s0 9084; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 9085; GFX6-NEXT: s_xor_b32 s1, s1, s0 9086; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 9087; GFX6-NEXT: v_mul_hi_u32 v1, s1, v1 9088; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, s2, v0 9089; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s2, v0 9090; GFX6-NEXT: v_mul_lo_u32 v1, v1, s3 9091; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 9092; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 9093; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 9094; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s1, v1 9095; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 9096; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 9097; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 9098; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s3, v1 9099; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s3, v1 9100; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 9101; GFX6-NEXT: v_xor_b32_e32 v1, s0, v1 9102; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, s0, v1 9103; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 9104; GFX6-NEXT: s_endpgm 9105; 9106; GFX9-LABEL: srem_v2i32_pow2_shl_denom: 9107; GFX9: ; %bb.0: 9108; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 9109; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 9110; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c 9111; GFX9-NEXT: s_movk_i32 s0, 0x1000 9112; GFX9-NEXT: s_mov_b32 s8, 0x4f7ffffe 9113; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9114; GFX9-NEXT: s_lshl_b32 s1, s0, s2 9115; GFX9-NEXT: s_ashr_i32 s2, s1, 31 9116; GFX9-NEXT: s_add_i32 s1, s1, s2 
9117; GFX9-NEXT: s_xor_b32 s1, s1, s2 9118; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1 9119; GFX9-NEXT: s_lshl_b32 s0, s0, s3 9120; GFX9-NEXT: s_ashr_i32 s2, s0, 31 9121; GFX9-NEXT: s_add_i32 s0, s0, s2 9122; GFX9-NEXT: s_xor_b32 s0, s0, s2 9123; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 9124; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s0 9125; GFX9-NEXT: s_sub_i32 s3, 0, s1 9126; GFX9-NEXT: s_ashr_i32 s2, s6, 31 9127; GFX9-NEXT: v_mul_f32_e32 v0, s8, v0 9128; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 9129; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 9130; GFX9-NEXT: v_mov_b32_e32 v2, 0 9131; GFX9-NEXT: v_mul_f32_e32 v1, s8, v1 9132; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 9133; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 9134; GFX9-NEXT: s_add_i32 s3, s6, s2 9135; GFX9-NEXT: s_sub_i32 s6, 0, s0 9136; GFX9-NEXT: v_mul_hi_u32 v3, v0, v3 9137; GFX9-NEXT: v_mul_lo_u32 v4, s6, v1 9138; GFX9-NEXT: s_xor_b32 s3, s3, s2 9139; GFX9-NEXT: s_ashr_i32 s6, s7, 31 9140; GFX9-NEXT: v_add_u32_e32 v0, v0, v3 9141; GFX9-NEXT: v_mul_hi_u32 v3, v1, v4 9142; GFX9-NEXT: v_mul_hi_u32 v0, s3, v0 9143; GFX9-NEXT: s_add_i32 s7, s7, s6 9144; GFX9-NEXT: s_xor_b32 s7, s7, s6 9145; GFX9-NEXT: v_add_u32_e32 v1, v1, v3 9146; GFX9-NEXT: v_mul_hi_u32 v1, s7, v1 9147; GFX9-NEXT: v_mul_lo_u32 v0, v0, s1 9148; GFX9-NEXT: v_mul_lo_u32 v1, v1, s0 9149; GFX9-NEXT: v_sub_u32_e32 v0, s3, v0 9150; GFX9-NEXT: v_subrev_u32_e32 v3, s1, v0 9151; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v0 9152; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 9153; GFX9-NEXT: v_subrev_u32_e32 v3, s1, v0 9154; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v0 9155; GFX9-NEXT: v_sub_u32_e32 v1, s7, v1 9156; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 9157; GFX9-NEXT: v_subrev_u32_e32 v3, s0, v1 9158; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v1 9159; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 9160; GFX9-NEXT: v_subrev_u32_e32 v3, s0, v1 9161; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s0, v1 9162; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 9163; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 9164; GFX9-NEXT: 
v_xor_b32_e32 v1, s6, v1 9165; GFX9-NEXT: v_subrev_u32_e32 v0, s2, v0 9166; GFX9-NEXT: v_subrev_u32_e32 v1, s6, v1 9167; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 9168; GFX9-NEXT: s_endpgm 9169; 9170; GFX90A-LABEL: srem_v2i32_pow2_shl_denom: 9171; GFX90A: ; %bb.0: 9172; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 9173; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 9174; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x2c 9175; GFX90A-NEXT: s_movk_i32 s0, 0x1000 9176; GFX90A-NEXT: s_mov_b32 s9, 0x4f7ffffe 9177; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 9178; GFX90A-NEXT: s_lshl_b32 s1, s0, s2 9179; GFX90A-NEXT: s_ashr_i32 s2, s1, 31 9180; GFX90A-NEXT: s_add_i32 s1, s1, s2 9181; GFX90A-NEXT: s_xor_b32 s1, s1, s2 9182; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s1 9183; GFX90A-NEXT: s_sub_i32 s8, 0, s1 9184; GFX90A-NEXT: s_ashr_i32 s2, s6, 31 9185; GFX90A-NEXT: s_lshl_b32 s0, s0, s3 9186; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 9187; GFX90A-NEXT: s_add_i32 s3, s6, s2 9188; GFX90A-NEXT: s_xor_b32 s3, s3, s2 9189; GFX90A-NEXT: s_ashr_i32 s6, s0, 31 9190; GFX90A-NEXT: v_mul_f32_e32 v0, s9, v0 9191; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 9192; GFX90A-NEXT: s_add_i32 s0, s0, s6 9193; GFX90A-NEXT: s_xor_b32 s0, s0, s6 9194; GFX90A-NEXT: v_mov_b32_e32 v2, 0 9195; GFX90A-NEXT: v_mul_lo_u32 v1, s8, v0 9196; GFX90A-NEXT: v_mul_hi_u32 v1, v0, v1 9197; GFX90A-NEXT: v_add_u32_e32 v0, v0, v1 9198; GFX90A-NEXT: v_mul_hi_u32 v0, s3, v0 9199; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s1 9200; GFX90A-NEXT: v_sub_u32_e32 v0, s3, v0 9201; GFX90A-NEXT: v_subrev_u32_e32 v1, s1, v0 9202; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s1, v0 9203; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc 9204; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s0 9205; GFX90A-NEXT: v_subrev_u32_e32 v3, s1, v0 9206; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s1, v0 9207; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 9208; GFX90A-NEXT: v_rcp_iflag_f32_e32 v1, v1 9209; GFX90A-NEXT: s_sub_i32 s3, 0, s0 9210; GFX90A-NEXT: v_xor_b32_e32 v0, s2, v0 9211; 
GFX90A-NEXT: s_ashr_i32 s1, s7, 31 9212; GFX90A-NEXT: v_mul_f32_e32 v1, s9, v1 9213; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 9214; GFX90A-NEXT: v_subrev_u32_e32 v0, s2, v0 9215; GFX90A-NEXT: s_add_i32 s2, s7, s1 9216; GFX90A-NEXT: s_xor_b32 s2, s2, s1 9217; GFX90A-NEXT: v_mul_lo_u32 v3, s3, v1 9218; GFX90A-NEXT: v_mul_hi_u32 v3, v1, v3 9219; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 9220; GFX90A-NEXT: v_mul_hi_u32 v1, s2, v1 9221; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s0 9222; GFX90A-NEXT: v_sub_u32_e32 v1, s2, v1 9223; GFX90A-NEXT: v_subrev_u32_e32 v3, s0, v1 9224; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v1 9225; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 9226; GFX90A-NEXT: v_subrev_u32_e32 v3, s0, v1 9227; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s0, v1 9228; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 9229; GFX90A-NEXT: v_xor_b32_e32 v1, s1, v1 9230; GFX90A-NEXT: v_subrev_u32_e32 v1, s1, v1 9231; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 9232; GFX90A-NEXT: s_endpgm 9233 %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y 9234 %r = srem <2 x i32> %x, %shl.y 9235 store <2 x i32> %r, <2 x i32> addrspace(1)* %out 9236 ret void 9237} 9238 9239define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 9240; CHECK-LABEL: @udiv_i64_oddk_denom( 9241; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], 1235195949943 9242; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 9243; CHECK-NEXT: ret void 9244; 9245; GFX6-LABEL: udiv_i64_oddk_denom: 9246; GFX6: ; %bb.0: 9247; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f176a73 9248; GFX6-NEXT: v_mov_b32_e32 v1, 0x4f800000 9249; GFX6-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 9250; GFX6-NEXT: v_rcp_f32_e32 v0, v0 9251; GFX6-NEXT: s_movk_i32 s4, 0xfee0 9252; GFX6-NEXT: s_mov_b32 s5, 0x68958c89 9253; GFX6-NEXT: v_mov_b32_e32 v7, 0 9254; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 9255; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 9256; GFX6-NEXT: v_trunc_f32_e32 v1, v1 9257; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 
9258; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 9259; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 9260; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 9261; GFX6-NEXT: s_movk_i32 s8, 0x11f 9262; GFX6-NEXT: v_mul_lo_u32 v2, v0, s4 9263; GFX6-NEXT: v_mul_hi_u32 v3, v0, s5 9264; GFX6-NEXT: v_mul_lo_u32 v4, v1, s5 9265; GFX6-NEXT: s_mov_b32 s9, 0x976a7377 9266; GFX6-NEXT: s_mov_b32 s7, 0xf000 9267; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9268; GFX6-NEXT: v_mul_lo_u32 v3, v0, s5 9269; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 9270; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 9271; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 9272; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 9273; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 9274; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 9275; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 9276; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 9277; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 9278; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 9279; GFX6-NEXT: s_mov_b32 s6, -1 9280; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 9281; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc 9282; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v7, vcc 9283; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9284; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 9285; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 9286; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 9287; GFX6-NEXT: v_mul_lo_u32 v2, v0, s4 9288; GFX6-NEXT: v_mul_hi_u32 v3, v0, s5 9289; GFX6-NEXT: v_mul_lo_u32 v4, v1, s5 9290; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9291; GFX6-NEXT: s_mov_b32 s4, s0 9292; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9293; GFX6-NEXT: v_mul_lo_u32 v3, v0, s5 9294; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 9295; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 9296; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 9297; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 9298; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 9299; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 9300; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 9301; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 9302; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 9303; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 
9304; GFX6-NEXT: s_mov_b32 s5, s1 9305; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 9306; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc 9307; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v7, vcc 9308; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9309; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 9310; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 9311; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 9312; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 9313; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 9314; GFX6-NEXT: v_mul_hi_u32 v4, s2, v1 9315; GFX6-NEXT: v_mul_hi_u32 v5, s3, v1 9316; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1 9317; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9318; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 9319; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 9320; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 9321; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 9322; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 9323; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc 9324; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 9325; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 9326; GFX6-NEXT: v_mul_lo_u32 v2, v0, s8 9327; GFX6-NEXT: v_mul_hi_u32 v3, v0, s9 9328; GFX6-NEXT: v_mul_lo_u32 v4, v1, s9 9329; GFX6-NEXT: v_mov_b32_e32 v5, s8 9330; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9331; GFX6-NEXT: v_mul_lo_u32 v3, v0, s9 9332; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 9333; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s3, v2 9334; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s2, v3 9335; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc 9336; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s9, v3 9337; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] 9338; GFX6-NEXT: s_movk_i32 s2, 0x11e 9339; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s2, v4 9340; GFX6-NEXT: s_mov_b32 s9, 0x976a7376 9341; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 9342; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s9, v5 9343; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 9344; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s8, v4 9345; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] 9346; GFX6-NEXT: 
v_add_i32_e64 v5, s[0:1], 2, v0 9347; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] 9348; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 1, v0 9349; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] 9350; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 9351; GFX6-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] 9352; GFX6-NEXT: v_mov_b32_e32 v6, s3 9353; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc 9354; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s2, v2 9355; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 9356; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s9, v3 9357; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 9358; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s8, v2 9359; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc 9360; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 9361; GFX6-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] 9362; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 9363; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 9364; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 9365; GFX6-NEXT: s_endpgm 9366; 9367; GFX9-LABEL: udiv_i64_oddk_denom: 9368; GFX9: ; %bb.0: 9369; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f176a73 9370; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000 9371; GFX9-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 9372; GFX9-NEXT: v_rcp_f32_e32 v0, v0 9373; GFX9-NEXT: s_movk_i32 s2, 0xfee0 9374; GFX9-NEXT: s_mov_b32 s3, 0x68958c89 9375; GFX9-NEXT: v_mov_b32_e32 v6, 0 9376; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 9377; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 9378; GFX9-NEXT: v_trunc_f32_e32 v1, v1 9379; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 9380; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 9381; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 9382; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 9383; GFX9-NEXT: v_mul_lo_u32 v2, v0, s2 9384; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 9385; GFX9-NEXT: v_mul_lo_u32 v5, v1, s3 9386; GFX9-NEXT: v_mul_lo_u32 v4, v0, s3 9387; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 9388; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 9389; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 9390; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 9391; 
GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 9392; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 9393; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 9394; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 9395; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc 9396; GFX9-NEXT: v_mul_lo_u32 v7, v1, v4 9397; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 9398; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 9399; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc 9400; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v6, vcc 9401; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 9402; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 9403; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 9404; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 9405; GFX9-NEXT: v_mul_lo_u32 v2, v0, s2 9406; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 9407; GFX9-NEXT: v_mul_lo_u32 v4, v1, s3 9408; GFX9-NEXT: v_mul_lo_u32 v5, v0, s3 9409; GFX9-NEXT: s_movk_i32 s2, 0x11f 9410; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 9411; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 9412; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 9413; GFX9-NEXT: v_mul_hi_u32 v4, v0, v5 9414; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 9415; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 9416; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 9417; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 9418; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc 9419; GFX9-NEXT: v_mul_lo_u32 v7, v1, v5 9420; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5 9421; GFX9-NEXT: s_mov_b32 s3, 0x976a7377 9422; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 9423; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v5, vcc 9424; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v6, vcc 9425; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 9426; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 9427; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 9428; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 9429; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9430; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 9431; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 9432; GFX9-NEXT: v_mul_hi_u32 v4, s6, v1 9433; GFX9-NEXT: v_mul_hi_u32 v5, s7, v1 9434; GFX9-NEXT: 
v_mul_lo_u32 v1, s7, v1 9435; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 9436; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 9437; GFX9-NEXT: v_mul_lo_u32 v4, s7, v0 9438; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 9439; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 9440; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 9441; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v6, vcc 9442; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 9443; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc 9444; GFX9-NEXT: v_mul_lo_u32 v2, v0, s2 9445; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 9446; GFX9-NEXT: v_mul_lo_u32 v4, v1, s3 9447; GFX9-NEXT: v_mov_b32_e32 v5, s2 9448; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 9449; GFX9-NEXT: v_mul_lo_u32 v3, v0, s3 9450; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 9451; GFX9-NEXT: v_sub_u32_e32 v4, s7, v2 9452; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s6, v3 9453; GFX9-NEXT: v_subb_co_u32_e64 v4, s[0:1], v4, v5, vcc 9454; GFX9-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s3, v3 9455; GFX9-NEXT: v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1] 9456; GFX9-NEXT: s_movk_i32 s3, 0x11e 9457; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s3, v4 9458; GFX9-NEXT: s_mov_b32 s6, 0x976a7376 9459; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 9460; GFX9-NEXT: v_cmp_lt_u32_e64 s[0:1], s6, v5 9461; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 9462; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v4 9463; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v5, s[0:1] 9464; GFX9-NEXT: v_mov_b32_e32 v7, s7 9465; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v7, v2, vcc 9466; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s3, v2 9467; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 9468; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 9469; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s6, v3 9470; GFX9-NEXT: v_cndmask_b32_e64 v4, 1, 2, s[0:1] 9471; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 9472; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s2, v2 9473; GFX9-NEXT: v_add_co_u32_e64 v4, s[0:1], v0, v4 9474; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc 9475; GFX9-NEXT: v_addc_co_u32_e64 v5, 
s[0:1], 0, v1, s[0:1] 9476; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 9477; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 9478; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 9479; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[4:5] 9480; GFX9-NEXT: s_endpgm 9481; 9482; GFX90A-LABEL: udiv_i64_oddk_denom: 9483; GFX90A: ; %bb.0: 9484; GFX90A-NEXT: v_mov_b32_e32 v0, 0x4f176a73 9485; GFX90A-NEXT: v_mov_b32_e32 v1, 0x4f800000 9486; GFX90A-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 9487; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 9488; GFX90A-NEXT: s_movk_i32 s2, 0xfee0 9489; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 9490; GFX90A-NEXT: s_mov_b32 s0, 0x68958c89 9491; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 9492; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 9493; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 9494; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 9495; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 9496; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 9497; GFX90A-NEXT: v_mov_b32_e32 v2, 0 9498; GFX90A-NEXT: s_mov_b32 s3, 0x976a7377 9499; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s2 9500; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s0 9501; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 9502; GFX90A-NEXT: v_mul_lo_u32 v4, v1, s0 9503; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 9504; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s0 9505; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 9506; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 9507; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 9508; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 9509; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc 9510; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 9511; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 9512; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 9513; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 9514; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v8, vcc 9515; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc 9516; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 9517; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 9518; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc 9519; GFX90A-NEXT: 
v_add_co_u32_e32 v0, vcc, v0, v3 9520; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc 9521; GFX90A-NEXT: v_mul_lo_u32 v4, v0, s2 9522; GFX90A-NEXT: v_mul_hi_u32 v5, v0, s0 9523; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s0 9524; GFX90A-NEXT: v_add_u32_e32 v4, v5, v4 9525; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 9526; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s0 9527; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 9528; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 9529; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 9530; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 9531; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc 9532; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 9533; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 9534; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 9535; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 9536; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v8, vcc 9537; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc 9538; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 9539; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 9540; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc 9541; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 9542; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc 9543; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 9544; GFX90A-NEXT: v_mul_lo_u32 v4, s6, v1 9545; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v0 9546; GFX90A-NEXT: v_mul_hi_u32 v3, s6, v1 9547; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 9548; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 9549; GFX90A-NEXT: v_mul_hi_u32 v6, s7, v0 9550; GFX90A-NEXT: v_mul_lo_u32 v0, s7, v0 9551; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 9552; GFX90A-NEXT: v_mul_hi_u32 v5, s7, v1 9553; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v6, vcc 9554; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v2, vcc 9555; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 9556; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 9557; GFX90A-NEXT: s_movk_i32 s2, 0x11f 9558; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc 9559; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s2 9560; GFX90A-NEXT: v_mul_hi_u32 v4, v0, 
s3 9561; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 9562; GFX90A-NEXT: v_mul_lo_u32 v4, v1, s3 9563; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 9564; GFX90A-NEXT: v_mul_lo_u32 v5, v0, s3 9565; GFX90A-NEXT: v_sub_u32_e32 v4, s7, v3 9566; GFX90A-NEXT: v_mov_b32_e32 v6, s2 9567; GFX90A-NEXT: v_sub_co_u32_e32 v5, vcc, s6, v5 9568; GFX90A-NEXT: v_subb_co_u32_e64 v4, s[0:1], v4, v6, vcc 9569; GFX90A-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s3, v5 9570; GFX90A-NEXT: v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1] 9571; GFX90A-NEXT: s_movk_i32 s3, 0x11e 9572; GFX90A-NEXT: v_cmp_lt_u32_e64 s[0:1], s3, v4 9573; GFX90A-NEXT: s_mov_b32 s6, 0x976a7376 9574; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 9575; GFX90A-NEXT: v_cmp_lt_u32_e64 s[0:1], s6, v6 9576; GFX90A-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 9577; GFX90A-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v4 9578; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v6, s[0:1] 9579; GFX90A-NEXT: v_mov_b32_e32 v7, s7 9580; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc 9581; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s3, v3 9582; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 9583; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 9584; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s6, v5 9585; GFX90A-NEXT: v_cndmask_b32_e64 v4, 1, 2, s[0:1] 9586; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 9587; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s2, v3 9588; GFX90A-NEXT: v_add_co_u32_e64 v4, s[0:1], v0, v4 9589; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc 9590; GFX90A-NEXT: v_addc_co_u32_e64 v6, s[0:1], 0, v1, s[0:1] 9591; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 9592; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 9593; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 9594; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 9595; GFX90A-NEXT: s_endpgm 9596 %r = udiv i64 %x, 1235195949943 9597 store i64 %r, i64 addrspace(1)* %out 9598 ret void 9599} 9600 9601define amdgpu_kernel void @udiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 9602; CHECK-LABEL: @udiv_i64_pow2k_denom( 9603; 
CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], 4096 9604; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 9605; CHECK-NEXT: ret void 9606; 9607; GFX6-LABEL: udiv_i64_pow2k_denom: 9608; GFX6: ; %bb.0: 9609; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 9610; GFX6-NEXT: s_mov_b32 s7, 0xf000 9611; GFX6-NEXT: s_mov_b32 s6, -1 9612; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9613; GFX6-NEXT: s_mov_b32 s4, s0 9614; GFX6-NEXT: s_mov_b32 s5, s1 9615; GFX6-NEXT: s_lshr_b64 s[0:1], s[2:3], 12 9616; GFX6-NEXT: v_mov_b32_e32 v0, s0 9617; GFX6-NEXT: v_mov_b32_e32 v1, s1 9618; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 9619; GFX6-NEXT: s_endpgm 9620; 9621; GFX9-LABEL: udiv_i64_pow2k_denom: 9622; GFX9: ; %bb.0: 9623; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 9624; GFX9-NEXT: v_mov_b32_e32 v2, 0 9625; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9626; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 12 9627; GFX9-NEXT: v_mov_b32_e32 v0, s2 9628; GFX9-NEXT: v_mov_b32_e32 v1, s3 9629; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 9630; GFX9-NEXT: s_endpgm 9631; 9632; GFX90A-LABEL: udiv_i64_pow2k_denom: 9633; GFX90A: ; %bb.0: 9634; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 9635; GFX90A-NEXT: v_mov_b32_e32 v2, 0 9636; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 9637; GFX90A-NEXT: s_lshr_b64 s[2:3], s[2:3], 12 9638; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] 9639; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 9640; GFX90A-NEXT: s_endpgm 9641 %r = udiv i64 %x, 4096 9642 store i64 %r, i64 addrspace(1)* %out 9643 ret void 9644} 9645 9646define amdgpu_kernel void @udiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 9647; CHECK-LABEL: @udiv_i64_pow2_shl_denom( 9648; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 9649; CHECK-NEXT: [[R:%.*]] = udiv i64 [[X:%.*]], [[SHL_Y]] 9650; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 9651; CHECK-NEXT: ret void 9652; 9653; GFX6-LABEL: udiv_i64_pow2_shl_denom: 9654; GFX6: ; %bb.0: 9655; 
GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 9656; GFX6-NEXT: s_load_dword s8, s[0:1], 0xd 9657; GFX6-NEXT: s_mov_b32 s3, 0xf000 9658; GFX6-NEXT: s_mov_b32 s2, -1 9659; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9660; GFX6-NEXT: s_mov_b32 s0, s4 9661; GFX6-NEXT: s_add_i32 s8, s8, 12 9662; GFX6-NEXT: s_mov_b32 s1, s5 9663; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], s8 9664; GFX6-NEXT: v_mov_b32_e32 v0, s4 9665; GFX6-NEXT: v_mov_b32_e32 v1, s5 9666; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 9667; GFX6-NEXT: s_endpgm 9668; 9669; GFX9-LABEL: udiv_i64_pow2_shl_denom: 9670; GFX9: ; %bb.0: 9671; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 9672; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 9673; GFX9-NEXT: v_mov_b32_e32 v2, 0 9674; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9675; GFX9-NEXT: s_add_i32 s2, s2, 12 9676; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], s2 9677; GFX9-NEXT: v_mov_b32_e32 v0, s0 9678; GFX9-NEXT: v_mov_b32_e32 v1, s1 9679; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 9680; GFX9-NEXT: s_endpgm 9681; 9682; GFX90A-LABEL: udiv_i64_pow2_shl_denom: 9683; GFX90A: ; %bb.0: 9684; GFX90A-NEXT: s_load_dword s2, s[0:1], 0x34 9685; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 9686; GFX90A-NEXT: v_mov_b32_e32 v2, 0 9687; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 9688; GFX90A-NEXT: s_add_i32 s2, s2, 12 9689; GFX90A-NEXT: s_lshr_b64 s[0:1], s[6:7], s2 9690; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 9691; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 9692; GFX90A-NEXT: s_endpgm 9693 %shl.y = shl i64 4096, %y 9694 %r = udiv i64 %x, %shl.y 9695 store i64 %r, i64 addrspace(1)* %out 9696 ret void 9697} 9698 9699define amdgpu_kernel void @udiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 9700; CHECK-LABEL: @udiv_v2i64_pow2k_denom( 9701; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 9702; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096 9703; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 
9704; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 9705; CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[TMP4]], 4096 9706; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 9707; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 9708; CHECK-NEXT: ret void 9709; 9710; GFX6-LABEL: udiv_v2i64_pow2k_denom: 9711; GFX6: ; %bb.0: 9712; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 9713; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 9714; GFX6-NEXT: s_mov_b32 s3, 0xf000 9715; GFX6-NEXT: s_mov_b32 s2, -1 9716; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9717; GFX6-NEXT: s_lshr_b64 s[4:5], s[4:5], 12 9718; GFX6-NEXT: s_lshr_b64 s[6:7], s[6:7], 12 9719; GFX6-NEXT: v_mov_b32_e32 v0, s4 9720; GFX6-NEXT: v_mov_b32_e32 v1, s5 9721; GFX6-NEXT: v_mov_b32_e32 v2, s6 9722; GFX6-NEXT: v_mov_b32_e32 v3, s7 9723; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 9724; GFX6-NEXT: s_endpgm 9725; 9726; GFX9-LABEL: udiv_v2i64_pow2k_denom: 9727; GFX9: ; %bb.0: 9728; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 9729; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 9730; GFX9-NEXT: v_mov_b32_e32 v4, 0 9731; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9732; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 12 9733; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], 12 9734; GFX9-NEXT: v_mov_b32_e32 v0, s0 9735; GFX9-NEXT: v_mov_b32_e32 v1, s1 9736; GFX9-NEXT: v_mov_b32_e32 v2, s4 9737; GFX9-NEXT: v_mov_b32_e32 v3, s5 9738; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 9739; GFX9-NEXT: s_endpgm 9740; 9741; GFX90A-LABEL: udiv_v2i64_pow2k_denom: 9742; GFX90A: ; %bb.0: 9743; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 9744; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 9745; GFX90A-NEXT: v_mov_b32_e32 v4, 0 9746; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 9747; GFX90A-NEXT: s_lshr_b64 s[0:1], s[4:5], 12 9748; GFX90A-NEXT: s_lshr_b64 s[4:5], s[6:7], 12 9749; GFX90A-NEXT: v_mov_b32_e32 v0, s0 9750; GFX90A-NEXT: v_mov_b32_e32 v1, s1 9751; GFX90A-NEXT: v_mov_b32_e32 
v2, s4 9752; GFX90A-NEXT: v_mov_b32_e32 v3, s5 9753; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 9754; GFX90A-NEXT: s_endpgm 9755 %r = udiv <2 x i64> %x, <i64 4096, i64 4096> 9756 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 9757 ret void 9758} 9759 9760define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 9761; CHECK-LABEL: @udiv_v2i64_mixed_pow2k_denom( 9762; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 9763; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 [[TMP1]], 4096 9764; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 9765; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 9766; CHECK-NEXT: [[TMP5:%.*]] = udiv i64 [[TMP4]], 4095 9767; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 9768; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 9769; CHECK-NEXT: ret void 9770; 9771; GFX6-LABEL: udiv_v2i64_mixed_pow2k_denom: 9772; GFX6: ; %bb.0: 9773; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f800000 9774; GFX6-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 9775; GFX6-NEXT: v_rcp_f32_e32 v0, v0 9776; GFX6-NEXT: s_movk_i32 s6, 0xf001 9777; GFX6-NEXT: v_mov_b32_e32 v7, 0 9778; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 9779; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 9780; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 9781; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 9782; GFX6-NEXT: v_trunc_f32_e32 v1, v1 9783; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 9784; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 9785; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 9786; GFX6-NEXT: s_waitcnt lgkmcnt(0) 9787; GFX6-NEXT: s_lshr_b64 s[8:9], s[0:1], 12 9788; GFX6-NEXT: s_movk_i32 s0, 0xfff 9789; GFX6-NEXT: v_mul_hi_u32 v2, v0, s6 9790; GFX6-NEXT: v_mul_lo_u32 v3, v1, s6 9791; GFX6-NEXT: v_mul_lo_u32 v4, v0, s6 9792; GFX6-NEXT: s_mov_b32 s7, 0xf000 9793; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 9794; GFX6-NEXT: v_add_i32_e32 v2, 
vcc, v3, v2 9795; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 9796; GFX6-NEXT: v_mul_hi_u32 v5, v0, v4 9797; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 9798; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 9799; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 9800; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 9801; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 9802; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 9803; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 9804; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 9805; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc 9806; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v7, vcc 9807; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9808; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 9809; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 9810; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 9811; GFX6-NEXT: v_mul_hi_u32 v2, v0, s6 9812; GFX6-NEXT: v_mul_lo_u32 v3, v1, s6 9813; GFX6-NEXT: v_mul_lo_u32 v4, v0, s6 9814; GFX6-NEXT: s_mov_b32 s6, -1 9815; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 9816; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 9817; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 9818; GFX6-NEXT: v_mul_hi_u32 v5, v0, v4 9819; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 9820; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 9821; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 9822; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 9823; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 9824; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 9825; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 9826; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 9827; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc 9828; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v7, vcc 9829; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9830; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 9831; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 9832; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 9833; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 9834; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 9835; GFX6-NEXT: v_mul_hi_u32 v4, s2, v1 9836; GFX6-NEXT: v_mul_hi_u32 v5, s3, v1 9837; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1 9838; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 9839; 
GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 9840; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 9841; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 9842; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 9843; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 9844; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc 9845; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 9846; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 9847; GFX6-NEXT: v_mul_lo_u32 v4, v1, s0 9848; GFX6-NEXT: v_mul_hi_u32 v5, v0, s0 9849; GFX6-NEXT: v_add_i32_e32 v2, vcc, 2, v0 9850; GFX6-NEXT: v_mul_lo_u32 v8, v0, s0 9851; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 9852; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v0 9853; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc 9854; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 9855; GFX6-NEXT: v_mov_b32_e32 v5, s3 9856; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s2, v8 9857; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc 9858; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s0, v8 9859; GFX6-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v4, vcc 9860; GFX6-NEXT: s_movk_i32 s0, 0xffe 9861; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s0, v5 9862; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 9863; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 9864; GFX6-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc 9865; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v8 9866; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 9867; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 9868; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 9869; GFX6-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[0:1] 9870; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc 9871; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 9872; GFX6-NEXT: v_cndmask_b32_e64 v3, v1, v3, s[0:1] 9873; GFX6-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc 9874; GFX6-NEXT: v_cndmask_b32_e64 v2, v0, v1, s[0:1] 9875; GFX6-NEXT: v_mov_b32_e32 v0, s8 9876; GFX6-NEXT: v_mov_b32_e32 v1, s9 9877; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 9878; GFX6-NEXT: s_endpgm 9879; 9880; GFX9-LABEL: udiv_v2i64_mixed_pow2k_denom: 9881; GFX9: ; %bb.0: 9882; GFX9-NEXT: v_mov_b32_e32 v0, 
0x4f800000 9883; GFX9-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 9884; GFX9-NEXT: v_rcp_f32_e32 v0, v0 9885; GFX9-NEXT: s_movk_i32 s2, 0xf001 9886; GFX9-NEXT: v_mov_b32_e32 v5, 0 9887; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 9888; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 9889; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 9890; GFX9-NEXT: v_trunc_f32_e32 v1, v1 9891; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 9892; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 9893; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 9894; GFX9-NEXT: s_movk_i32 s8, 0xfff 9895; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 9896; GFX9-NEXT: v_mul_hi_u32 v2, v0, s2 9897; GFX9-NEXT: v_mul_lo_u32 v4, v1, s2 9898; GFX9-NEXT: v_mul_lo_u32 v3, v0, s2 9899; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 9900; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 9901; GFX9-NEXT: v_mul_hi_u32 v6, v0, v3 9902; GFX9-NEXT: v_mul_lo_u32 v4, v0, v2 9903; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 9904; GFX9-NEXT: v_mul_lo_u32 v7, v1, v3 9905; GFX9-NEXT: v_mul_hi_u32 v3, v1, v3 9906; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v6, v4 9907; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc 9908; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 9909; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 9910; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 9911; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v3, vcc 9912; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v5, vcc 9913; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 9914; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 9915; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 9916; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 9917; GFX9-NEXT: v_mul_hi_u32 v2, v0, s2 9918; GFX9-NEXT: v_mul_lo_u32 v3, v1, s2 9919; GFX9-NEXT: v_mul_lo_u32 v4, v0, s2 9920; GFX9-NEXT: s_waitcnt lgkmcnt(0) 9921; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], 12 9922; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 9923; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 9924; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 9925; GFX9-NEXT: v_mul_hi_u32 v6, v0, v4 9926; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 9927; GFX9-NEXT: 
v_mul_hi_u32 v8, v1, v2 9928; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 9929; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v6, v3 9930; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc 9931; GFX9-NEXT: v_mul_lo_u32 v7, v1, v4 9932; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 9933; GFX9-NEXT: s_movk_i32 s4, 0xffe 9934; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 9935; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc 9936; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v5, vcc 9937; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 9938; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 9939; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 9940; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 9941; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 9942; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 9943; GFX9-NEXT: v_mul_hi_u32 v4, s6, v1 9944; GFX9-NEXT: v_mul_hi_u32 v6, s7, v1 9945; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 9946; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 9947; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 9948; GFX9-NEXT: v_mul_lo_u32 v4, s7, v0 9949; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 9950; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 9951; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 9952; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v5, vcc 9953; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 9954; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc 9955; GFX9-NEXT: v_mul_lo_u32 v2, v1, s8 9956; GFX9-NEXT: v_mul_hi_u32 v3, v0, s8 9957; GFX9-NEXT: v_mul_lo_u32 v4, v0, s8 9958; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 9959; GFX9-NEXT: v_mov_b32_e32 v3, s7 9960; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, s6, v4 9961; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v3, v2, vcc 9962; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s8, v4 9963; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc 9964; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s4, v3 9965; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 9966; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 9967; GFX9-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc 9968; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 9969; GFX9-NEXT: 
v_cndmask_b32_e64 v3, 1, 2, vcc 9970; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v0, v3 9971; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc 9972; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s4, v4 9973; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc 9974; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 9975; GFX9-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc 9976; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 9977; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v3, vcc 9978; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v6, vcc 9979; GFX9-NEXT: v_mov_b32_e32 v0, s2 9980; GFX9-NEXT: v_mov_b32_e32 v1, s3 9981; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] 9982; GFX9-NEXT: s_endpgm 9983; 9984; GFX90A-LABEL: udiv_v2i64_mixed_pow2k_denom: 9985; GFX90A: ; %bb.0: 9986; GFX90A-NEXT: v_mov_b32_e32 v0, 0x4f800000 9987; GFX90A-NEXT: v_madak_f32 v0, 0, v0, 0x457ff000 9988; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 9989; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 9990; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 9991; GFX90A-NEXT: v_mov_b32_e32 v4, 0 9992; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 9993; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 9994; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 9995; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 9996; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 9997; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 9998; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 9999; GFX90A-NEXT: s_lshr_b64 s[0:1], s[4:5], 12 10000; GFX90A-NEXT: s_movk_i32 s4, 0xf001 10001; GFX90A-NEXT: v_mul_hi_u32 v2, v0, s4 10002; GFX90A-NEXT: v_sub_u32_e32 v2, v2, v0 10003; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s4 10004; GFX90A-NEXT: v_add_u32_e32 v2, v2, v3 10005; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s4 10006; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v2 10007; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 10008; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v2 10009; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 10010; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 10011; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 10012; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 10013; GFX90A-NEXT: 
v_add_co_u32_e32 v5, vcc, v5, v6 10014; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v2 10015; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v8, vcc 10016; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v4, vcc 10017; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 10018; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 10019; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc 10020; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 10021; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 10022; GFX90A-NEXT: v_mul_hi_u32 v3, v0, s4 10023; GFX90A-NEXT: v_mul_lo_u32 v2, v1, s4 10024; GFX90A-NEXT: v_sub_u32_e32 v3, v3, v0 10025; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 10026; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s4 10027; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v2 10028; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 10029; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v2 10030; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 10031; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 10032; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 10033; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 10034; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 10035; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v2 10036; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v8, vcc 10037; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v4, vcc 10038; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 10039; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 10040; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc 10041; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 10042; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 10043; GFX90A-NEXT: v_mul_lo_u32 v3, s6, v1 10044; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v0 10045; GFX90A-NEXT: v_mul_hi_u32 v2, s6, v1 10046; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 10047; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc 10048; GFX90A-NEXT: v_mul_hi_u32 v6, s7, v0 10049; GFX90A-NEXT: v_mul_lo_u32 v0, s7, v0 10050; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 10051; GFX90A-NEXT: v_mul_hi_u32 v5, s7, v1 10052; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v2, v6, vcc 10053; 
GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v4, vcc 10054; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 10055; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 10056; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc 10057; GFX90A-NEXT: s_movk_i32 s4, 0xfff 10058; GFX90A-NEXT: v_mul_lo_u32 v2, v1, s4 10059; GFX90A-NEXT: v_mul_hi_u32 v3, v0, s4 10060; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 10061; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s4 10062; GFX90A-NEXT: v_mov_b32_e32 v5, s7 10063; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, s6, v3 10064; GFX90A-NEXT: v_subb_co_u32_e32 v2, vcc, v5, v2, vcc 10065; GFX90A-NEXT: v_subrev_co_u32_e32 v5, vcc, s4, v3 10066; GFX90A-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc 10067; GFX90A-NEXT: s_movk_i32 s4, 0xffe 10068; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s4, v5 10069; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 10070; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 10071; GFX90A-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc 10072; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 10073; GFX90A-NEXT: v_cndmask_b32_e64 v5, 1, 2, vcc 10074; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v0, v5 10075; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc 10076; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s4, v3 10077; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 10078; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 10079; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v3, vcc 10080; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 10081; GFX90A-NEXT: v_cndmask_b32_e32 v2, v0, v5, vcc 10082; GFX90A-NEXT: v_cndmask_b32_e32 v3, v1, v6, vcc 10083; GFX90A-NEXT: v_mov_b32_e32 v0, s0 10084; GFX90A-NEXT: v_mov_b32_e32 v1, s1 10085; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 10086; GFX90A-NEXT: s_endpgm 10087 %r = udiv <2 x i64> %x, <i64 4096, i64 4095> 10088 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 10089 ret void 10090} 10091 10092define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 10093; CHECK-LABEL: @udiv_v2i64_pow2_shl_denom( 10094; 
CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 10095; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 10096; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 10097; CHECK-NEXT: [[TMP3:%.*]] = udiv i64 [[TMP1]], [[TMP2]] 10098; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 10099; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 10100; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 10101; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP5]], [[TMP6]] 10102; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 10103; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 10104; CHECK-NEXT: ret void 10105; 10106; GFX6-LABEL: udiv_v2i64_pow2_shl_denom: 10107; GFX6: ; %bb.0: 10108; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11 10109; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 10110; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 10111; GFX6-NEXT: s_mov_b32 s3, 0xf000 10112; GFX6-NEXT: s_mov_b32 s2, -1 10113; GFX6-NEXT: s_waitcnt lgkmcnt(0) 10114; GFX6-NEXT: s_add_i32 s4, s4, 12 10115; GFX6-NEXT: s_add_i32 s6, s6, 12 10116; GFX6-NEXT: s_lshr_b64 s[4:5], s[8:9], s4 10117; GFX6-NEXT: s_lshr_b64 s[6:7], s[10:11], s6 10118; GFX6-NEXT: v_mov_b32_e32 v0, s4 10119; GFX6-NEXT: v_mov_b32_e32 v1, s5 10120; GFX6-NEXT: v_mov_b32_e32 v2, s6 10121; GFX6-NEXT: v_mov_b32_e32 v3, s7 10122; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 10123; GFX6-NEXT: s_endpgm 10124; 10125; GFX9-LABEL: udiv_v2i64_pow2_shl_denom: 10126; GFX9: ; %bb.0: 10127; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 10128; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 10129; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 10130; GFX9-NEXT: v_mov_b32_e32 v4, 0 10131; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10132; GFX9-NEXT: s_add_i32 s0, s4, 12 10133; GFX9-NEXT: s_add_i32 s4, s6, 12 10134; GFX9-NEXT: s_lshr_b64 s[0:1], s[8:9], 
s0 10135; GFX9-NEXT: s_lshr_b64 s[4:5], s[10:11], s4 10136; GFX9-NEXT: v_mov_b32_e32 v0, s0 10137; GFX9-NEXT: v_mov_b32_e32 v1, s1 10138; GFX9-NEXT: v_mov_b32_e32 v2, s4 10139; GFX9-NEXT: v_mov_b32_e32 v3, s5 10140; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 10141; GFX9-NEXT: s_endpgm 10142; 10143; GFX90A-LABEL: udiv_v2i64_pow2_shl_denom: 10144; GFX90A: ; %bb.0: 10145; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 10146; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 10147; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 10148; GFX90A-NEXT: v_mov_b32_e32 v4, 0 10149; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 10150; GFX90A-NEXT: s_add_i32 s0, s4, 12 10151; GFX90A-NEXT: s_add_i32 s4, s6, 12 10152; GFX90A-NEXT: s_lshr_b64 s[0:1], s[8:9], s0 10153; GFX90A-NEXT: s_lshr_b64 s[4:5], s[10:11], s4 10154; GFX90A-NEXT: v_mov_b32_e32 v0, s0 10155; GFX90A-NEXT: v_mov_b32_e32 v1, s1 10156; GFX90A-NEXT: v_mov_b32_e32 v2, s4 10157; GFX90A-NEXT: v_mov_b32_e32 v3, s5 10158; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 10159; GFX90A-NEXT: s_endpgm 10160 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 10161 %r = udiv <2 x i64> %x, %shl.y 10162 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 10163 ret void 10164} 10165 10166define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 10167; CHECK-LABEL: @urem_i64_oddk_denom( 10168; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], 1235195393993 10169; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 10170; CHECK-NEXT: ret void 10171; 10172; GFX6-LABEL: urem_i64_oddk_denom: 10173; GFX6: ; %bb.0: 10174; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f1761f8 10175; GFX6-NEXT: v_mov_b32_e32 v1, 0x4f800000 10176; GFX6-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 10177; GFX6-NEXT: v_rcp_f32_e32 v0, v0 10178; GFX6-NEXT: s_movk_i32 s2, 0xfee0 10179; GFX6-NEXT: s_mov_b32 s3, 0x689e0837 10180; GFX6-NEXT: v_mov_b32_e32 v7, 0 10181; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 10182; GFX6-NEXT: v_mul_f32_e32 v1, 
0x2f800000, v0 10183; GFX6-NEXT: v_trunc_f32_e32 v1, v1 10184; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 10185; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 10186; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 10187; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 10188; GFX6-NEXT: s_mov_b32 s12, 0x9761f7c9 10189; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 10190; GFX6-NEXT: v_mul_hi_u32 v3, v0, s3 10191; GFX6-NEXT: v_mul_lo_u32 v4, v1, s3 10192; GFX6-NEXT: s_waitcnt lgkmcnt(0) 10193; GFX6-NEXT: s_mov_b32 s8, s4 10194; GFX6-NEXT: s_movk_i32 s4, 0x11f 10195; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10196; GFX6-NEXT: v_mul_lo_u32 v3, v0, s3 10197; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 10198; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 10199; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 10200; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 10201; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 10202; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 10203; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 10204; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 10205; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 10206; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 10207; GFX6-NEXT: s_mov_b32 s9, s5 10208; GFX6-NEXT: s_movk_i32 s5, 0x11e 10209; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 10210; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc 10211; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v7, vcc 10212; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10213; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 10214; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 10215; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 10216; GFX6-NEXT: v_mul_lo_u32 v2, v0, s2 10217; GFX6-NEXT: v_mul_hi_u32 v3, v0, s3 10218; GFX6-NEXT: v_mul_lo_u32 v4, v1, s3 10219; GFX6-NEXT: s_mov_b32 s11, 0xf000 10220; GFX6-NEXT: s_mov_b32 s10, -1 10221; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10222; GFX6-NEXT: v_mul_lo_u32 v3, v0, s3 10223; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 10224; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 10225; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 10226; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 10227; GFX6-NEXT: v_mul_hi_u32 v8, v1, 
v2 10228; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 10229; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 10230; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 10231; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 10232; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 10233; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 10234; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc 10235; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v7, vcc 10236; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10237; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 10238; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 10239; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 10240; GFX6-NEXT: v_mul_lo_u32 v2, s6, v1 10241; GFX6-NEXT: v_mul_hi_u32 v3, s6, v0 10242; GFX6-NEXT: v_mul_hi_u32 v4, s6, v1 10243; GFX6-NEXT: v_mul_hi_u32 v5, s7, v1 10244; GFX6-NEXT: v_mul_lo_u32 v1, s7, v1 10245; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10246; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 10247; GFX6-NEXT: v_mul_lo_u32 v4, s7, v0 10248; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0 10249; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 10250; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 10251; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc 10252; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 10253; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 10254; GFX6-NEXT: v_mul_lo_u32 v2, v0, s4 10255; GFX6-NEXT: v_mul_hi_u32 v3, v0, s12 10256; GFX6-NEXT: v_mul_lo_u32 v1, v1, s12 10257; GFX6-NEXT: v_mul_lo_u32 v0, v0, s12 10258; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10259; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 10260; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s7, v1 10261; GFX6-NEXT: v_mov_b32_e32 v3, s4 10262; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s6, v0 10263; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc 10264; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s12, v0 10265; GFX6-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] 10266; GFX6-NEXT: v_cmp_lt_u32_e64 s[2:3], s5, v5 10267; GFX6-NEXT: s_mov_b32 s6, 0x9761f7c8 10268; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] 10269; GFX6-NEXT: v_cmp_lt_u32_e64 
s[2:3], s6, v4 10270; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] 10271; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 10272; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s4, v5 10273; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s12, v4 10274; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] 10275; GFX6-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] 10276; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 10277; GFX6-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] 10278; GFX6-NEXT: v_mov_b32_e32 v5, s7 10279; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc 10280; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s5, v1 10281; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 10282; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s6, v0 10283; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 10284; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s4, v1 10285; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 10286; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 10287; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 10288; GFX6-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] 10289; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 10290; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 10291; GFX6-NEXT: s_endpgm 10292; 10293; GFX9-LABEL: urem_i64_oddk_denom: 10294; GFX9: ; %bb.0: 10295; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f1761f8 10296; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000 10297; GFX9-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 10298; GFX9-NEXT: v_rcp_f32_e32 v0, v0 10299; GFX9-NEXT: s_movk_i32 s2, 0xfee0 10300; GFX9-NEXT: s_mov_b32 s3, 0x689e0837 10301; GFX9-NEXT: v_mov_b32_e32 v6, 0 10302; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 10303; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 10304; GFX9-NEXT: v_trunc_f32_e32 v1, v1 10305; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 10306; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 10307; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 10308; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 10309; GFX9-NEXT: s_movk_i32 s8, 0x11f 10310; GFX9-NEXT: v_mul_lo_u32 v2, v0, s2 10311; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 10312; GFX9-NEXT: 
v_mul_lo_u32 v5, v1, s3 10313; GFX9-NEXT: v_mul_lo_u32 v4, v0, s3 10314; GFX9-NEXT: s_mov_b32 s9, 0x9761f7c9 10315; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 10316; GFX9-NEXT: v_add_u32_e32 v2, v2, v5 10317; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 10318; GFX9-NEXT: v_mul_lo_u32 v5, v0, v2 10319; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 10320; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 10321; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 10322; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 10323; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc 10324; GFX9-NEXT: v_mul_lo_u32 v7, v1, v4 10325; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 10326; GFX9-NEXT: s_mov_b32 s10, 0x9761f7c8 10327; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 10328; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v4, vcc 10329; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v6, vcc 10330; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 10331; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 10332; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 10333; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 10334; GFX9-NEXT: v_mul_lo_u32 v2, v0, s2 10335; GFX9-NEXT: v_mul_hi_u32 v3, v0, s3 10336; GFX9-NEXT: v_mul_lo_u32 v4, v1, s3 10337; GFX9-NEXT: v_mul_lo_u32 v5, v0, s3 10338; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 10339; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 10340; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 10341; GFX9-NEXT: v_mul_hi_u32 v4, v0, v5 10342; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 10343; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 10344; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 10345; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 10346; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v7, vcc 10347; GFX9-NEXT: v_mul_lo_u32 v7, v1, v5 10348; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5 10349; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 10350; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v5, vcc 10351; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v6, vcc 10352; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 10353; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 10354; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 
10355; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 10356; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10357; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 10358; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 10359; GFX9-NEXT: v_mul_hi_u32 v4, s6, v1 10360; GFX9-NEXT: v_mul_hi_u32 v5, s7, v1 10361; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 10362; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 10363; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 10364; GFX9-NEXT: v_mul_lo_u32 v4, s7, v0 10365; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 10366; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 10367; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 10368; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v6, vcc 10369; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 10370; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc 10371; GFX9-NEXT: v_mul_lo_u32 v2, v0, s8 10372; GFX9-NEXT: v_mul_hi_u32 v3, v0, s9 10373; GFX9-NEXT: v_mul_lo_u32 v1, v1, s9 10374; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9 10375; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 10376; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 10377; GFX9-NEXT: v_sub_u32_e32 v2, s7, v1 10378; GFX9-NEXT: v_mov_b32_e32 v3, s8 10379; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 10380; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, vcc 10381; GFX9-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s9, v0 10382; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[2:3], 0, v2, s[0:1] 10383; GFX9-NEXT: s_movk_i32 s6, 0x11e 10384; GFX9-NEXT: v_cmp_lt_u32_e64 s[2:3], s6, v5 10385; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 10386; GFX9-NEXT: v_cmp_lt_u32_e64 s[2:3], s10, v4 10387; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1] 10388; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] 10389; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s8, v5 10390; GFX9-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s9, v4 10391; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] 10392; GFX9-NEXT: v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1] 10393; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 10394; GFX9-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] 10395; GFX9-NEXT: 
v_mov_b32_e32 v4, s7 10396; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v4, v1, vcc 10397; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s6, v1 10398; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc 10399; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s10, v0 10400; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] 10401; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 10402; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s8, v1 10403; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc 10404; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 10405; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 10406; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 10407; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[4:5] 10408; GFX9-NEXT: s_endpgm 10409; 10410; GFX90A-LABEL: urem_i64_oddk_denom: 10411; GFX90A: ; %bb.0: 10412; GFX90A-NEXT: v_mov_b32_e32 v0, 0x4f1761f8 10413; GFX90A-NEXT: v_mov_b32_e32 v1, 0x4f800000 10414; GFX90A-NEXT: v_madmk_f32 v0, v1, 0x438f8000, v0 10415; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 10416; GFX90A-NEXT: s_movk_i32 s2, 0xfee0 10417; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 10418; GFX90A-NEXT: s_mov_b32 s0, 0x689e0837 10419; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 10420; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 10421; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 10422; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 10423; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 10424; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 10425; GFX90A-NEXT: v_mov_b32_e32 v2, 0 10426; GFX90A-NEXT: s_movk_i32 s8, 0x11f 10427; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s2 10428; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s0 10429; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 10430; GFX90A-NEXT: v_mul_lo_u32 v4, v1, s0 10431; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 10432; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s0 10433; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 10434; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 10435; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 10436; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 10437; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc 10438; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 
10439; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 10440; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 10441; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 10442; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v8, vcc 10443; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc 10444; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 10445; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 10446; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc 10447; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 10448; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc 10449; GFX90A-NEXT: v_mul_lo_u32 v4, v0, s2 10450; GFX90A-NEXT: v_mul_hi_u32 v5, v0, s0 10451; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s0 10452; GFX90A-NEXT: v_add_u32_e32 v4, v5, v4 10453; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 10454; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s0 10455; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 10456; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 10457; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 10458; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 10459; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc 10460; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 10461; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 10462; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 10463; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 10464; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v8, vcc 10465; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc 10466; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 10467; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 10468; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc 10469; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 10470; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc 10471; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 10472; GFX90A-NEXT: v_mul_lo_u32 v4, s6, v1 10473; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v0 10474; GFX90A-NEXT: v_mul_hi_u32 v3, s6, v1 10475; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 10476; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 10477; GFX90A-NEXT: v_mul_hi_u32 v6, s7, v0 10478; GFX90A-NEXT: v_mul_lo_u32 v0, s7, v0 10479; GFX90A-NEXT: 
v_add_co_u32_e32 v0, vcc, v4, v0 10480; GFX90A-NEXT: v_mul_hi_u32 v5, s7, v1 10481; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v6, vcc 10482; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v2, vcc 10483; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 10484; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 10485; GFX90A-NEXT: s_mov_b32 s9, 0x9761f7c9 10486; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc 10487; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s8 10488; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s9 10489; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 10490; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s9 10491; GFX90A-NEXT: v_add_u32_e32 v1, v3, v1 10492; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s9 10493; GFX90A-NEXT: v_sub_u32_e32 v3, s7, v1 10494; GFX90A-NEXT: v_mov_b32_e32 v4, s8 10495; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 10496; GFX90A-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, vcc 10497; GFX90A-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s9, v0 10498; GFX90A-NEXT: v_subbrev_co_u32_e64 v6, s[2:3], 0, v3, s[0:1] 10499; GFX90A-NEXT: s_movk_i32 s6, 0x11e 10500; GFX90A-NEXT: v_cmp_lt_u32_e64 s[2:3], s6, v6 10501; GFX90A-NEXT: s_mov_b32 s10, 0x9761f7c8 10502; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 10503; GFX90A-NEXT: v_cmp_lt_u32_e64 s[2:3], s10, v5 10504; GFX90A-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1] 10505; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] 10506; GFX90A-NEXT: v_cmp_eq_u32_e64 s[2:3], s8, v6 10507; GFX90A-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s9, v5 10508; GFX90A-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] 10509; GFX90A-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] 10510; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 10511; GFX90A-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[0:1] 10512; GFX90A-NEXT: v_mov_b32_e32 v5, s7 10513; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v5, v1, vcc 10514; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s6, v1 10515; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 10516; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s10, v0 10517; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, 
v3, s[0:1] 10518; GFX90A-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 10519; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s8, v1 10520; GFX90A-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 10521; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 10522; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 10523; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 10524; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 10525; GFX90A-NEXT: s_endpgm 10526 %r = urem i64 %x, 1235195393993 10527 store i64 %r, i64 addrspace(1)* %out 10528 ret void 10529} 10530 10531define amdgpu_kernel void @urem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 10532; CHECK-LABEL: @urem_i64_pow2k_denom( 10533; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], 4096 10534; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 10535; CHECK-NEXT: ret void 10536; 10537; GFX6-LABEL: urem_i64_pow2k_denom: 10538; GFX6: ; %bb.0: 10539; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 10540; GFX6-NEXT: s_mov_b32 s7, 0xf000 10541; GFX6-NEXT: s_mov_b32 s6, -1 10542; GFX6-NEXT: v_mov_b32_e32 v1, 0 10543; GFX6-NEXT: s_waitcnt lgkmcnt(0) 10544; GFX6-NEXT: s_mov_b32 s4, s0 10545; GFX6-NEXT: s_and_b32 s0, s2, 0xfff 10546; GFX6-NEXT: s_mov_b32 s5, s1 10547; GFX6-NEXT: v_mov_b32_e32 v0, s0 10548; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 10549; GFX6-NEXT: s_endpgm 10550; 10551; GFX9-LABEL: urem_i64_pow2k_denom: 10552; GFX9: ; %bb.0: 10553; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 10554; GFX9-NEXT: v_mov_b32_e32 v1, 0 10555; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10556; GFX9-NEXT: s_and_b32 s2, s2, 0xfff 10557; GFX9-NEXT: v_mov_b32_e32 v0, s2 10558; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] 10559; GFX9-NEXT: s_endpgm 10560; 10561; GFX90A-LABEL: urem_i64_pow2k_denom: 10562; GFX90A: ; %bb.0: 10563; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 10564; GFX90A-NEXT: v_mov_b32_e32 v1, 0 10565; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 10566; GFX90A-NEXT: s_and_b32 s2, s2, 0xfff 10567; GFX90A-NEXT: v_mov_b32_e32 v0, s2 10568; 
GFX90A-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1] 10569; GFX90A-NEXT: s_endpgm 10570 %r = urem i64 %x, 4096 10571 store i64 %r, i64 addrspace(1)* %out 10572 ret void 10573} 10574 10575define amdgpu_kernel void @urem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 10576; CHECK-LABEL: @urem_i64_pow2_shl_denom( 10577; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 10578; CHECK-NEXT: [[R:%.*]] = urem i64 [[X:%.*]], [[SHL_Y]] 10579; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 10580; CHECK-NEXT: ret void 10581; 10582; GFX6-LABEL: urem_i64_pow2_shl_denom: 10583; GFX6: ; %bb.0: 10584; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 10585; GFX6-NEXT: s_load_dword s8, s[0:1], 0xd 10586; GFX6-NEXT: s_mov_b32 s3, 0xf000 10587; GFX6-NEXT: s_mov_b32 s2, -1 10588; GFX6-NEXT: s_waitcnt lgkmcnt(0) 10589; GFX6-NEXT: s_mov_b32 s0, s4 10590; GFX6-NEXT: s_mov_b32 s1, s5 10591; GFX6-NEXT: s_mov_b64 s[4:5], 0x1000 10592; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], s8 10593; GFX6-NEXT: s_add_u32 s4, s4, -1 10594; GFX6-NEXT: s_addc_u32 s5, s5, -1 10595; GFX6-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] 10596; GFX6-NEXT: v_mov_b32_e32 v0, s4 10597; GFX6-NEXT: v_mov_b32_e32 v1, s5 10598; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 10599; GFX6-NEXT: s_endpgm 10600; 10601; GFX9-LABEL: urem_i64_pow2_shl_denom: 10602; GFX9: ; %bb.0: 10603; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 10604; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 10605; GFX9-NEXT: s_mov_b64 s[0:1], 0x1000 10606; GFX9-NEXT: v_mov_b32_e32 v2, 0 10607; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10608; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 10609; GFX9-NEXT: s_add_u32 s0, s0, -1 10610; GFX9-NEXT: s_addc_u32 s1, s1, -1 10611; GFX9-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1] 10612; GFX9-NEXT: v_mov_b32_e32 v0, s0 10613; GFX9-NEXT: v_mov_b32_e32 v1, s1 10614; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 10615; GFX9-NEXT: s_endpgm 10616; 10617; GFX90A-LABEL: urem_i64_pow2_shl_denom: 10618; GFX90A: ; 
%bb.0: 10619; GFX90A-NEXT: s_load_dword s2, s[0:1], 0x34 10620; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 10621; GFX90A-NEXT: s_mov_b64 s[0:1], 0x1000 10622; GFX90A-NEXT: v_mov_b32_e32 v2, 0 10623; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 10624; GFX90A-NEXT: s_lshl_b64 s[0:1], s[0:1], s2 10625; GFX90A-NEXT: s_add_u32 s0, s0, -1 10626; GFX90A-NEXT: s_addc_u32 s1, s1, -1 10627; GFX90A-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1] 10628; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] 10629; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 10630; GFX90A-NEXT: s_endpgm 10631 %shl.y = shl i64 4096, %y 10632 %r = urem i64 %x, %shl.y 10633 store i64 %r, i64 addrspace(1)* %out 10634 ret void 10635} 10636 10637define amdgpu_kernel void @urem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 10638; CHECK-LABEL: @urem_v2i64_pow2k_denom( 10639; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 10640; CHECK-NEXT: [[TMP2:%.*]] = urem i64 [[TMP1]], 4096 10641; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 10642; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 10643; CHECK-NEXT: [[TMP5:%.*]] = urem i64 [[TMP4]], 4096 10644; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 10645; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 10646; CHECK-NEXT: ret void 10647; 10648; GFX6-LABEL: urem_v2i64_pow2k_denom: 10649; GFX6: ; %bb.0: 10650; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 10651; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 10652; GFX6-NEXT: s_waitcnt lgkmcnt(0) 10653; GFX6-NEXT: s_movk_i32 s5, 0xfff 10654; GFX6-NEXT: v_mov_b32_e32 v1, 0 10655; GFX6-NEXT: s_mov_b32 s3, 0xf000 10656; GFX6-NEXT: s_and_b32 s4, s4, s5 10657; GFX6-NEXT: s_and_b32 s5, s6, s5 10658; GFX6-NEXT: s_mov_b32 s2, -1 10659; GFX6-NEXT: v_mov_b32_e32 v0, s4 10660; GFX6-NEXT: v_mov_b32_e32 v2, s5 10661; GFX6-NEXT: v_mov_b32_e32 v3, v1 10662; GFX6-NEXT: 
buffer_store_dwordx4 v[0:3], off, s[0:3], 0 10663; GFX6-NEXT: s_endpgm 10664; 10665; GFX9-LABEL: urem_v2i64_pow2k_denom: 10666; GFX9: ; %bb.0: 10667; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 10668; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 10669; GFX9-NEXT: s_movk_i32 s0, 0xfff 10670; GFX9-NEXT: v_mov_b32_e32 v1, 0 10671; GFX9-NEXT: v_mov_b32_e32 v3, v1 10672; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10673; GFX9-NEXT: s_and_b32 s1, s4, s0 10674; GFX9-NEXT: s_and_b32 s0, s6, s0 10675; GFX9-NEXT: v_mov_b32_e32 v0, s1 10676; GFX9-NEXT: v_mov_b32_e32 v2, s0 10677; GFX9-NEXT: global_store_dwordx4 v1, v[0:3], s[2:3] 10678; GFX9-NEXT: s_endpgm 10679; 10680; GFX90A-LABEL: urem_v2i64_pow2k_denom: 10681; GFX90A: ; %bb.0: 10682; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 10683; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 10684; GFX90A-NEXT: s_movk_i32 s0, 0xfff 10685; GFX90A-NEXT: v_mov_b32_e32 v1, 0 10686; GFX90A-NEXT: v_mov_b32_e32 v3, v1 10687; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 10688; GFX90A-NEXT: s_and_b32 s1, s4, s0 10689; GFX90A-NEXT: s_and_b32 s0, s6, s0 10690; GFX90A-NEXT: v_mov_b32_e32 v0, s1 10691; GFX90A-NEXT: v_mov_b32_e32 v2, s0 10692; GFX90A-NEXT: global_store_dwordx4 v1, v[0:3], s[2:3] 10693; GFX90A-NEXT: s_endpgm 10694 %r = urem <2 x i64> %x, <i64 4096, i64 4096> 10695 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 10696 ret void 10697} 10698 10699define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 10700; CHECK-LABEL: @urem_v2i64_pow2_shl_denom( 10701; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 10702; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 10703; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 10704; CHECK-NEXT: [[TMP3:%.*]] = urem i64 [[TMP1]], [[TMP2]] 10705; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 10706; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], 
i64 1 10707; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 10708; CHECK-NEXT: [[TMP7:%.*]] = urem i64 [[TMP5]], [[TMP6]] 10709; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 10710; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 10711; CHECK-NEXT: ret void 10712; 10713; GFX6-LABEL: urem_v2i64_pow2_shl_denom: 10714; GFX6: ; %bb.0: 10715; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 10716; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 10717; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x11 10718; GFX6-NEXT: s_mov_b64 s[12:13], 0x1000 10719; GFX6-NEXT: s_mov_b32 s7, 0xf000 10720; GFX6-NEXT: s_mov_b32 s6, -1 10721; GFX6-NEXT: s_waitcnt lgkmcnt(0) 10722; GFX6-NEXT: s_lshl_b64 s[2:3], s[12:13], s2 10723; GFX6-NEXT: s_lshl_b64 s[0:1], s[12:13], s0 10724; GFX6-NEXT: s_add_u32 s0, s0, -1 10725; GFX6-NEXT: s_addc_u32 s1, s1, -1 10726; GFX6-NEXT: s_and_b64 s[0:1], s[8:9], s[0:1] 10727; GFX6-NEXT: s_add_u32 s2, s2, -1 10728; GFX6-NEXT: s_addc_u32 s3, s3, -1 10729; GFX6-NEXT: s_and_b64 s[2:3], s[10:11], s[2:3] 10730; GFX6-NEXT: v_mov_b32_e32 v0, s0 10731; GFX6-NEXT: v_mov_b32_e32 v1, s1 10732; GFX6-NEXT: v_mov_b32_e32 v2, s2 10733; GFX6-NEXT: v_mov_b32_e32 v3, s3 10734; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 10735; GFX6-NEXT: s_endpgm 10736; 10737; GFX9-LABEL: urem_v2i64_pow2_shl_denom: 10738; GFX9: ; %bb.0: 10739; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 10740; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 10741; GFX9-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 10742; GFX9-NEXT: s_mov_b64 s[0:1], 0x1000 10743; GFX9-NEXT: v_mov_b32_e32 v4, 0 10744; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10745; GFX9-NEXT: s_lshl_b64 s[6:7], s[0:1], s6 10746; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 10747; GFX9-NEXT: s_add_u32 s0, s0, -1 10748; GFX9-NEXT: s_addc_u32 s1, s1, -1 10749; GFX9-NEXT: s_and_b64 s[0:1], s[8:9], s[0:1] 10750; GFX9-NEXT: s_add_u32 s4, s6, -1 10751; GFX9-NEXT: 
s_addc_u32 s5, s7, -1 10752; GFX9-NEXT: s_and_b64 s[4:5], s[10:11], s[4:5] 10753; GFX9-NEXT: v_mov_b32_e32 v0, s0 10754; GFX9-NEXT: v_mov_b32_e32 v1, s1 10755; GFX9-NEXT: v_mov_b32_e32 v2, s4 10756; GFX9-NEXT: v_mov_b32_e32 v3, s5 10757; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 10758; GFX9-NEXT: s_endpgm 10759; 10760; GFX90A-LABEL: urem_v2i64_pow2_shl_denom: 10761; GFX90A: ; %bb.0: 10762; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 10763; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 10764; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34 10765; GFX90A-NEXT: s_mov_b64 s[0:1], 0x1000 10766; GFX90A-NEXT: v_mov_b32_e32 v4, 0 10767; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 10768; GFX90A-NEXT: s_lshl_b64 s[6:7], s[0:1], s6 10769; GFX90A-NEXT: s_lshl_b64 s[0:1], s[0:1], s4 10770; GFX90A-NEXT: s_add_u32 s0, s0, -1 10771; GFX90A-NEXT: s_addc_u32 s1, s1, -1 10772; GFX90A-NEXT: s_and_b64 s[0:1], s[8:9], s[0:1] 10773; GFX90A-NEXT: s_add_u32 s4, s6, -1 10774; GFX90A-NEXT: s_addc_u32 s5, s7, -1 10775; GFX90A-NEXT: s_and_b64 s[4:5], s[10:11], s[4:5] 10776; GFX90A-NEXT: v_mov_b32_e32 v0, s0 10777; GFX90A-NEXT: v_mov_b32_e32 v1, s1 10778; GFX90A-NEXT: v_mov_b32_e32 v2, s4 10779; GFX90A-NEXT: v_mov_b32_e32 v3, s5 10780; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 10781; GFX90A-NEXT: s_endpgm 10782 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 10783 %r = urem <2 x i64> %x, %shl.y 10784 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 10785 ret void 10786} 10787 10788define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 10789; CHECK-LABEL: @sdiv_i64_oddk_denom( 10790; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], 1235195 10791; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 10792; CHECK-NEXT: ret void 10793; 10794; GFX6-LABEL: sdiv_i64_oddk_denom: 10795; GFX6: ; %bb.0: 10796; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f800000 10797; GFX6-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 10798; GFX6-NEXT: v_rcp_f32_e32 v0, v0 10799; 
GFX6-NEXT: s_mov_b32 s5, 0xffed2705 10800; GFX6-NEXT: v_mov_b32_e32 v7, 0 10801; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 10802; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 10803; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 10804; GFX6-NEXT: v_trunc_f32_e32 v1, v1 10805; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 10806; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 10807; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 10808; GFX6-NEXT: s_waitcnt lgkmcnt(0) 10809; GFX6-NEXT: s_ashr_i32 s8, s3, 31 10810; GFX6-NEXT: s_add_u32 s2, s2, s8 10811; GFX6-NEXT: v_mul_lo_u32 v2, v1, s5 10812; GFX6-NEXT: v_mul_hi_u32 v3, v0, s5 10813; GFX6-NEXT: v_mul_lo_u32 v4, v0, s5 10814; GFX6-NEXT: s_mov_b32 s9, s8 10815; GFX6-NEXT: s_addc_u32 s3, s3, s8 10816; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10817; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 10818; GFX6-NEXT: v_mul_lo_u32 v3, v0, v2 10819; GFX6-NEXT: v_mul_hi_u32 v5, v0, v4 10820; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 10821; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 10822; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 10823; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 10824; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 10825; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 10826; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 10827; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[8:9] 10828; GFX6-NEXT: s_mov_b32 s4, s0 10829; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 10830; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc 10831; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v7, vcc 10832; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10833; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 10834; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 10835; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 10836; GFX6-NEXT: v_mul_lo_u32 v2, v1, s5 10837; GFX6-NEXT: v_mul_hi_u32 v3, v0, s5 10838; GFX6-NEXT: s_mov_b32 s0, 0x12d8fb 10839; GFX6-NEXT: s_mov_b32 s7, 0xf000 10840; GFX6-NEXT: s_mov_b32 s6, -1 10841; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10842; GFX6-NEXT: v_mul_lo_u32 v3, v0, s5 10843; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 
v0, v2 10844; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 10845; GFX6-NEXT: v_mul_hi_u32 v8, v0, v3 10846; GFX6-NEXT: v_mul_hi_u32 v9, v0, v2 10847; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 10848; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 10849; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 10850; GFX6-NEXT: v_add_i32_e32 v6, vcc, v8, v6 10851; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc 10852; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 10853; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 10854; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v5, vcc 10855; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v7, vcc 10856; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10857; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 10858; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 10859; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 10860; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 10861; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 10862; GFX6-NEXT: v_mul_hi_u32 v4, s2, v1 10863; GFX6-NEXT: v_mul_hi_u32 v5, s3, v1 10864; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1 10865; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 10866; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 10867; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 10868; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 10869; GFX6-NEXT: s_mov_b32 s5, s1 10870; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 10871; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 10872; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v5, v7, vcc 10873; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 10874; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 10875; GFX6-NEXT: v_mul_lo_u32 v4, v1, s0 10876; GFX6-NEXT: v_mul_hi_u32 v5, v0, s0 10877; GFX6-NEXT: v_add_i32_e32 v2, vcc, 2, v0 10878; GFX6-NEXT: v_mul_lo_u32 v8, v0, s0 10879; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 10880; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v0 10881; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc 10882; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 10883; GFX6-NEXT: v_mov_b32_e32 v5, s3 10884; GFX6-NEXT: v_sub_i32_e32 v8, vcc, s2, v8 10885; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc 10886; GFX6-NEXT: v_subrev_i32_e32 
v5, vcc, s0, v8 10887; GFX6-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v4, vcc 10888; GFX6-NEXT: s_mov_b32 s0, 0x12d8fa 10889; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s0, v5 10890; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 10891; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 10892; GFX6-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc 10893; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v8 10894; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 10895; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 10896; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 10897; GFX6-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[0:1] 10898; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 10899; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 10900; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc 10901; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 10902; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 10903; GFX6-NEXT: v_xor_b32_e32 v0, s8, v0 10904; GFX6-NEXT: v_xor_b32_e32 v1, s8, v1 10905; GFX6-NEXT: v_mov_b32_e32 v2, s8 10906; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 10907; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 10908; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 10909; GFX6-NEXT: s_endpgm 10910; 10911; GFX9-LABEL: sdiv_i64_oddk_denom: 10912; GFX9: ; %bb.0: 10913; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f800000 10914; GFX9-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 10915; GFX9-NEXT: v_rcp_f32_e32 v0, v0 10916; GFX9-NEXT: s_mov_b32 s4, 0xffed2705 10917; GFX9-NEXT: v_mov_b32_e32 v5, 0 10918; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 10919; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 10920; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 10921; GFX9-NEXT: v_trunc_f32_e32 v1, v1 10922; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 10923; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 10924; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 10925; GFX9-NEXT: v_mul_lo_u32 v2, v1, s4 10926; GFX9-NEXT: v_mul_hi_u32 v3, v0, s4 10927; GFX9-NEXT: v_mul_lo_u32 v4, v0, s4 10928; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 10929; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 10930; GFX9-NEXT: 
v_mul_hi_u32 v3, v0, v4 10931; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 10932; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 10933; GFX9-NEXT: v_mul_lo_u32 v7, v1, v4 10934; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 10935; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 10936; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc 10937; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 10938; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 10939; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 10940; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc 10941; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v5, vcc 10942; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 10943; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 10944; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 10945; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 10946; GFX9-NEXT: v_mul_lo_u32 v2, v1, s4 10947; GFX9-NEXT: v_mul_hi_u32 v3, v0, s4 10948; GFX9-NEXT: v_mul_lo_u32 v4, v0, s4 10949; GFX9-NEXT: s_waitcnt lgkmcnt(0) 10950; GFX9-NEXT: s_ashr_i32 s4, s3, 31 10951; GFX9-NEXT: s_add_u32 s2, s2, s4 10952; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 10953; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 10954; GFX9-NEXT: v_mul_lo_u32 v7, v0, v2 10955; GFX9-NEXT: v_mul_hi_u32 v8, v0, v4 10956; GFX9-NEXT: v_mul_hi_u32 v9, v0, v2 10957; GFX9-NEXT: v_mul_hi_u32 v6, v1, v4 10958; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 10959; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 10960; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 10961; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc 10962; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 10963; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v7, v4 10964; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v6, vcc 10965; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc 10966; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 10967; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 10968; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 10969; GFX9-NEXT: s_mov_b32 s5, s4 10970; GFX9-NEXT: s_addc_u32 s3, s3, s4 10971; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 10972; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] 
10973; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 10974; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 10975; GFX9-NEXT: v_mul_hi_u32 v4, s2, v1 10976; GFX9-NEXT: v_mul_hi_u32 v6, s3, v1 10977; GFX9-NEXT: v_mul_lo_u32 v1, s3, v1 10978; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 10979; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 10980; GFX9-NEXT: v_mul_lo_u32 v4, s3, v0 10981; GFX9-NEXT: v_mul_hi_u32 v0, s3, v0 10982; GFX9-NEXT: s_mov_b32 s5, 0x12d8fb 10983; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 10984; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 10985; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v5, vcc 10986; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 10987; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc 10988; GFX9-NEXT: v_mul_lo_u32 v2, v1, s5 10989; GFX9-NEXT: v_mul_hi_u32 v3, v0, s5 10990; GFX9-NEXT: v_mul_lo_u32 v4, v0, s5 10991; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 10992; GFX9-NEXT: v_mov_b32_e32 v3, s3 10993; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, s2, v4 10994; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v3, v2, vcc 10995; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s5, v4 10996; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc 10997; GFX9-NEXT: s_mov_b32 s2, 0x12d8fa 10998; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s2, v3 10999; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 11000; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 11001; GFX9-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc 11002; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 11003; GFX9-NEXT: v_cndmask_b32_e64 v3, 1, 2, vcc 11004; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v0, v3 11005; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc 11006; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s2, v4 11007; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc 11008; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 11009; GFX9-NEXT: v_cndmask_b32_e32 v2, -1, v4, vcc 11010; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 11011; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 11012; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 11013; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 11014; GFX9-NEXT: 
v_xor_b32_e32 v1, s4, v1 11015; GFX9-NEXT: v_mov_b32_e32 v2, s4 11016; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s4, v0 11017; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc 11018; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] 11019; GFX9-NEXT: s_endpgm 11020; 11021; GFX90A-LABEL: sdiv_i64_oddk_denom: 11022; GFX90A: ; %bb.0: 11023; GFX90A-NEXT: v_mov_b32_e32 v0, 0x4f800000 11024; GFX90A-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 11025; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 11026; GFX90A-NEXT: s_mov_b32 s4, 0xffed2705 11027; GFX90A-NEXT: v_mov_b32_e32 v2, 0 11028; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 11029; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 11030; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 11031; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 11032; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 11033; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 11034; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 11035; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s4 11036; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s4 11037; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 11038; GFX90A-NEXT: v_sub_u32_e32 v3, v3, v0 11039; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s4 11040; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 11041; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 11042; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 11043; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 11044; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc 11045; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 11046; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 11047; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 11048; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 11049; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v8, vcc 11050; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc 11051; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 11052; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 11053; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc 11054; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 11055; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc 11056; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s4 11057; 
GFX90A-NEXT: v_mul_hi_u32 v4, v0, s4 11058; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 11059; GFX90A-NEXT: v_sub_u32_e32 v3, v3, v0 11060; GFX90A-NEXT: v_mul_lo_u32 v5, v0, s4 11061; GFX90A-NEXT: v_mul_hi_u32 v6, v1, v5 11062; GFX90A-NEXT: v_mul_lo_u32 v7, v1, v5 11063; GFX90A-NEXT: v_mul_lo_u32 v9, v0, v3 11064; GFX90A-NEXT: v_mul_hi_u32 v5, v0, v5 11065; GFX90A-NEXT: v_mul_hi_u32 v8, v0, v3 11066; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9 11067; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v8, vcc 11068; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 11069; GFX90A-NEXT: v_mul_hi_u32 v4, v1, v3 11070; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v6, vcc 11071; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v2, vcc 11072; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 11073; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 11074; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 11075; GFX90A-NEXT: s_ashr_i32 s4, s3, 31 11076; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc 11077; GFX90A-NEXT: s_add_u32 s2, s2, s4 11078; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 11079; GFX90A-NEXT: s_mov_b32 s5, s4 11080; GFX90A-NEXT: s_addc_u32 s3, s3, s4 11081; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc 11082; GFX90A-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] 11083; GFX90A-NEXT: v_mul_lo_u32 v4, s2, v1 11084; GFX90A-NEXT: v_mul_hi_u32 v5, s2, v0 11085; GFX90A-NEXT: v_mul_hi_u32 v3, s2, v1 11086; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 11087; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 11088; GFX90A-NEXT: v_mul_hi_u32 v6, s3, v0 11089; GFX90A-NEXT: v_mul_lo_u32 v0, s3, v0 11090; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 11091; GFX90A-NEXT: v_mul_hi_u32 v5, s3, v1 11092; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v6, vcc 11093; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v2, vcc 11094; GFX90A-NEXT: v_mul_lo_u32 v1, s3, v1 11095; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 11096; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc 11097; GFX90A-NEXT: s_mov_b32 s5, 0x12d8fb 11098; 
GFX90A-NEXT: v_mul_lo_u32 v3, v1, s5 11099; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s5 11100; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 11101; GFX90A-NEXT: v_mul_lo_u32 v4, v0, s5 11102; GFX90A-NEXT: v_mov_b32_e32 v5, s3 11103; GFX90A-NEXT: v_sub_co_u32_e32 v4, vcc, s2, v4 11104; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v3, vcc 11105; GFX90A-NEXT: v_subrev_co_u32_e32 v5, vcc, s5, v4 11106; GFX90A-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v3, vcc 11107; GFX90A-NEXT: s_mov_b32 s2, 0x12d8fa 11108; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s2, v5 11109; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 11110; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 11111; GFX90A-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc 11112; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 11113; GFX90A-NEXT: v_cndmask_b32_e64 v5, 1, 2, vcc 11114; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v0, v5 11115; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc 11116; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s2, v4 11117; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc 11118; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 11119; GFX90A-NEXT: v_cndmask_b32_e32 v3, -1, v4, vcc 11120; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 11121; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc 11122; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 11123; GFX90A-NEXT: v_xor_b32_e32 v0, s4, v0 11124; GFX90A-NEXT: v_xor_b32_e32 v1, s4, v1 11125; GFX90A-NEXT: v_mov_b32_e32 v3, s4 11126; GFX90A-NEXT: v_subrev_co_u32_e32 v0, vcc, s4, v0 11127; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc 11128; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 11129; GFX90A-NEXT: s_endpgm 11130 %r = sdiv i64 %x, 1235195 11131 store i64 %r, i64 addrspace(1)* %out 11132 ret void 11133} 11134 11135define amdgpu_kernel void @sdiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 11136; CHECK-LABEL: @sdiv_i64_pow2k_denom( 11137; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], 4096 11138; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 11139; CHECK-NEXT: ret void 11140; 11141; 
GFX6-LABEL: sdiv_i64_pow2k_denom: 11142; GFX6: ; %bb.0: 11143; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 11144; GFX6-NEXT: s_mov_b32 s7, 0xf000 11145; GFX6-NEXT: s_mov_b32 s6, -1 11146; GFX6-NEXT: s_waitcnt lgkmcnt(0) 11147; GFX6-NEXT: s_mov_b32 s4, s0 11148; GFX6-NEXT: s_ashr_i32 s0, s3, 31 11149; GFX6-NEXT: s_lshr_b32 s0, s0, 20 11150; GFX6-NEXT: s_add_u32 s0, s2, s0 11151; GFX6-NEXT: s_mov_b32 s5, s1 11152; GFX6-NEXT: s_addc_u32 s1, s3, 0 11153; GFX6-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 11154; GFX6-NEXT: v_mov_b32_e32 v0, s0 11155; GFX6-NEXT: v_mov_b32_e32 v1, s1 11156; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 11157; GFX6-NEXT: s_endpgm 11158; 11159; GFX9-LABEL: sdiv_i64_pow2k_denom: 11160; GFX9: ; %bb.0: 11161; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 11162; GFX9-NEXT: v_mov_b32_e32 v2, 0 11163; GFX9-NEXT: s_waitcnt lgkmcnt(0) 11164; GFX9-NEXT: s_ashr_i32 s4, s3, 31 11165; GFX9-NEXT: s_lshr_b32 s4, s4, 20 11166; GFX9-NEXT: s_add_u32 s2, s2, s4 11167; GFX9-NEXT: s_addc_u32 s3, s3, 0 11168; GFX9-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 11169; GFX9-NEXT: v_mov_b32_e32 v0, s2 11170; GFX9-NEXT: v_mov_b32_e32 v1, s3 11171; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 11172; GFX9-NEXT: s_endpgm 11173; 11174; GFX90A-LABEL: sdiv_i64_pow2k_denom: 11175; GFX90A: ; %bb.0: 11176; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 11177; GFX90A-NEXT: v_mov_b32_e32 v2, 0 11178; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 11179; GFX90A-NEXT: s_ashr_i32 s4, s3, 31 11180; GFX90A-NEXT: s_lshr_b32 s4, s4, 20 11181; GFX90A-NEXT: s_add_u32 s2, s2, s4 11182; GFX90A-NEXT: s_addc_u32 s3, s3, 0 11183; GFX90A-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 11184; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] 11185; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 11186; GFX90A-NEXT: s_endpgm 11187 %r = sdiv i64 %x, 4096 11188 store i64 %r, i64 addrspace(1)* %out 11189 ret void 11190} 11191 11192define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, 
i64 %x, i64 %y) { 11193; CHECK-LABEL: @sdiv_i64_pow2_shl_denom( 11194; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 11195; CHECK-NEXT: [[R:%.*]] = sdiv i64 [[X:%.*]], [[SHL_Y]] 11196; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 11197; CHECK-NEXT: ret void 11198; 11199; GFX6-LABEL: sdiv_i64_pow2_shl_denom: 11200; GFX6: ; %bb.0: 11201; GFX6-NEXT: s_load_dword s4, s[0:1], 0xd 11202; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000 11203; GFX6-NEXT: s_mov_b32 s7, 0xf000 11204; GFX6-NEXT: s_mov_b32 s6, -1 11205; GFX6-NEXT: s_waitcnt lgkmcnt(0) 11206; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 11207; GFX6-NEXT: s_ashr_i32 s8, s3, 31 11208; GFX6-NEXT: s_add_u32 s2, s2, s8 11209; GFX6-NEXT: s_mov_b32 s9, s8 11210; GFX6-NEXT: s_addc_u32 s3, s3, s8 11211; GFX6-NEXT: s_xor_b64 s[10:11], s[2:3], s[8:9] 11212; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s10 11213; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s11 11214; GFX6-NEXT: s_sub_u32 s4, 0, s10 11215; GFX6-NEXT: s_subb_u32 s5, 0, s11 11216; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 11217; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 11218; GFX6-NEXT: v_rcp_f32_e32 v0, v0 11219; GFX6-NEXT: s_waitcnt lgkmcnt(0) 11220; GFX6-NEXT: s_ashr_i32 s12, s3, 31 11221; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 11222; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 11223; GFX6-NEXT: v_trunc_f32_e32 v1, v1 11224; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 11225; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 11226; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 11227; GFX6-NEXT: s_add_u32 s2, s2, s12 11228; GFX6-NEXT: s_mov_b32 s13, s12 11229; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 11230; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 11231; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0 11232; GFX6-NEXT: v_mul_lo_u32 v4, s4, v0 11233; GFX6-NEXT: s_addc_u32 s3, s3, s12 11234; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 11235; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 11236; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 11237; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 11238; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 
11239; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 11240; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 11241; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 11242; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 11243; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 11244; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 11245; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[12:13] 11246; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 11247; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc 11248; GFX6-NEXT: v_mov_b32_e32 v4, 0 11249; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc 11250; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 11251; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 11252; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 11253; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 11254; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 11255; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 11256; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0 11257; GFX6-NEXT: s_mov_b32 s5, s1 11258; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 11259; GFX6-NEXT: v_mul_lo_u32 v3, s4, v0 11260; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 11261; GFX6-NEXT: v_mul_lo_u32 v7, v0, v2 11262; GFX6-NEXT: v_mul_hi_u32 v8, v0, v3 11263; GFX6-NEXT: v_mul_hi_u32 v9, v0, v2 11264; GFX6-NEXT: v_mul_hi_u32 v6, v1, v3 11265; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 11266; GFX6-NEXT: v_mul_hi_u32 v5, v1, v2 11267; GFX6-NEXT: v_add_i32_e32 v7, vcc, v8, v7 11268; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc 11269; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 11270; GFX6-NEXT: v_add_i32_e32 v3, vcc, v7, v3 11271; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v6, vcc 11272; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v4, vcc 11273; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 11274; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 11275; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 11276; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 11277; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 11278; GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 11279; GFX6-NEXT: v_mul_hi_u32 v5, s2, v1 11280; GFX6-NEXT: v_mul_hi_u32 v6, s3, v1 11281; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1 11282; GFX6-NEXT: 
v_add_i32_e32 v2, vcc, v3, v2 11283; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 11284; GFX6-NEXT: v_mul_lo_u32 v5, s3, v0 11285; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 11286; GFX6-NEXT: s_mov_b32 s4, s0 11287; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 11288; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 11289; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v6, v4, vcc 11290; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 11291; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 11292; GFX6-NEXT: v_mul_lo_u32 v2, s10, v1 11293; GFX6-NEXT: v_mul_hi_u32 v3, s10, v0 11294; GFX6-NEXT: v_mul_lo_u32 v4, s11, v0 11295; GFX6-NEXT: v_mov_b32_e32 v5, s11 11296; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 11297; GFX6-NEXT: v_mul_lo_u32 v3, s10, v0 11298; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 11299; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s3, v2 11300; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s2, v3 11301; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc 11302; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s10, v3 11303; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] 11304; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v4 11305; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 11306; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v5 11307; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 11308; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v4 11309; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] 11310; GFX6-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 11311; GFX6-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] 11312; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 1, v0 11313; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] 11314; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 11315; GFX6-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] 11316; GFX6-NEXT: v_mov_b32_e32 v6, s3 11317; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc 11318; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v2 11319; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 11320; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 11321; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 11322; GFX6-NEXT: 
v_cmp_eq_u32_e32 vcc, s11, v2 11323; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc 11324; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 11325; GFX6-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] 11326; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 11327; GFX6-NEXT: s_xor_b64 s[0:1], s[12:13], s[8:9] 11328; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 11329; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 11330; GFX6-NEXT: v_xor_b32_e32 v1, s1, v1 11331; GFX6-NEXT: v_mov_b32_e32 v2, s1 11332; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 11333; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 11334; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 11335; GFX6-NEXT: s_endpgm 11336; 11337; GFX9-LABEL: sdiv_i64_pow2_shl_denom: 11338; GFX9: ; %bb.0: 11339; GFX9-NEXT: s_load_dword s4, s[0:1], 0x34 11340; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 11341; GFX9-NEXT: v_mov_b32_e32 v2, 0 11342; GFX9-NEXT: s_waitcnt lgkmcnt(0) 11343; GFX9-NEXT: s_lshl_b64 s[4:5], s[2:3], s4 11344; GFX9-NEXT: s_ashr_i32 s2, s5, 31 11345; GFX9-NEXT: s_add_u32 s4, s4, s2 11346; GFX9-NEXT: s_mov_b32 s3, s2 11347; GFX9-NEXT: s_addc_u32 s5, s5, s2 11348; GFX9-NEXT: s_xor_b64 s[8:9], s[4:5], s[2:3] 11349; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 11350; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 11351; GFX9-NEXT: s_sub_u32 s10, 0, s8 11352; GFX9-NEXT: s_subb_u32 s4, 0, s9 11353; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 11354; GFX9-NEXT: v_rcp_f32_e32 v0, v0 11355; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 11356; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 11357; GFX9-NEXT: v_trunc_f32_e32 v1, v1 11358; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 11359; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 11360; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 11361; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1 11362; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0 11363; GFX9-NEXT: v_mul_lo_u32 v6, s4, v0 11364; GFX9-NEXT: v_mul_lo_u32 v5, s10, v0 11365; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 11366; GFX9-NEXT: v_add_u32_e32 v3, v3, v6 11367; GFX9-NEXT: v_mul_hi_u32 v4, v0, v5 11368; 
GFX9-NEXT: v_mul_lo_u32 v6, v0, v3 11369; GFX9-NEXT: v_mul_hi_u32 v8, v0, v3 11370; GFX9-NEXT: v_mul_hi_u32 v7, v1, v5 11371; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 11372; GFX9-NEXT: v_mul_hi_u32 v9, v1, v3 11373; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 11374; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc 11375; GFX9-NEXT: v_mul_lo_u32 v3, v1, v3 11376; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 11377; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v7, vcc 11378; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v2, vcc 11379; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 11380; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc 11381; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 11382; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc 11383; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1 11384; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0 11385; GFX9-NEXT: v_mul_lo_u32 v5, s4, v0 11386; GFX9-NEXT: v_mul_lo_u32 v6, s10, v0 11387; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 11388; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 11389; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 11390; GFX9-NEXT: v_mul_lo_u32 v7, v0, v3 11391; GFX9-NEXT: v_mul_hi_u32 v8, v0, v6 11392; GFX9-NEXT: v_mul_hi_u32 v9, v0, v3 11393; GFX9-NEXT: v_mul_hi_u32 v5, v1, v6 11394; GFX9-NEXT: v_mul_lo_u32 v6, v1, v6 11395; GFX9-NEXT: v_mul_hi_u32 v4, v1, v3 11396; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 11397; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc 11398; GFX9-NEXT: v_mul_lo_u32 v3, v1, v3 11399; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 11400; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v5, vcc 11401; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v2, vcc 11402; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 11403; GFX9-NEXT: s_waitcnt lgkmcnt(0) 11404; GFX9-NEXT: s_ashr_i32 s10, s7, 31 11405; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc 11406; GFX9-NEXT: s_add_u32 s0, s6, s10 11407; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 11408; GFX9-NEXT: s_mov_b32 s11, s10 11409; GFX9-NEXT: s_addc_u32 s1, s7, s10 11410; GFX9-NEXT: 
v_addc_co_u32_e32 v1, vcc, v1, v4, vcc 11411; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] 11412; GFX9-NEXT: v_mul_lo_u32 v3, s6, v1 11413; GFX9-NEXT: v_mul_hi_u32 v4, s6, v0 11414; GFX9-NEXT: v_mul_hi_u32 v5, s6, v1 11415; GFX9-NEXT: v_mul_hi_u32 v6, s7, v1 11416; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 11417; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 11418; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc 11419; GFX9-NEXT: v_mul_lo_u32 v5, s7, v0 11420; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 11421; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 11422; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v4, v0, vcc 11423; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v2, vcc 11424; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 11425; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc 11426; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1 11427; GFX9-NEXT: v_mul_hi_u32 v4, s8, v0 11428; GFX9-NEXT: v_mul_lo_u32 v5, s9, v0 11429; GFX9-NEXT: v_mov_b32_e32 v6, s9 11430; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 11431; GFX9-NEXT: v_mul_lo_u32 v4, s8, v0 11432; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 11433; GFX9-NEXT: v_sub_u32_e32 v5, s7, v3 11434; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, s6, v4 11435; GFX9-NEXT: v_subb_co_u32_e64 v5, s[0:1], v5, v6, vcc 11436; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s8, v4 11437; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1] 11438; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v5 11439; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 11440; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v6 11441; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 11442; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v5 11443; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v6, s[0:1] 11444; GFX9-NEXT: v_mov_b32_e32 v7, s7 11445; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc 11446; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 11447; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 11448; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 11449; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v4 11450; GFX9-NEXT: v_cndmask_b32_e64 v5, 1, 2, s[0:1] 11451; 
GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc 11452; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s9, v3 11453; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v0, v5 11454; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v4, vcc 11455; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], 0, v1, s[0:1] 11456; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 11457; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc 11458; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], s[2:3] 11459; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 11460; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 11461; GFX9-NEXT: v_xor_b32_e32 v1, s1, v1 11462; GFX9-NEXT: v_mov_b32_e32 v3, s1 11463; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 11464; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc 11465; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 11466; GFX9-NEXT: s_endpgm 11467; 11468; GFX90A-LABEL: sdiv_i64_pow2_shl_denom: 11469; GFX90A: ; %bb.0: 11470; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x34 11471; GFX90A-NEXT: s_mov_b64 s[2:3], 0x1000 11472; GFX90A-NEXT: v_mov_b32_e32 v2, 0 11473; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 11474; GFX90A-NEXT: s_lshl_b64 s[4:5], s[2:3], s4 11475; GFX90A-NEXT: s_ashr_i32 s2, s5, 31 11476; GFX90A-NEXT: s_add_u32 s4, s4, s2 11477; GFX90A-NEXT: s_mov_b32 s3, s2 11478; GFX90A-NEXT: s_addc_u32 s5, s5, s2 11479; GFX90A-NEXT: s_xor_b64 s[8:9], s[4:5], s[2:3] 11480; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s8 11481; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s9 11482; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 11483; GFX90A-NEXT: s_sub_u32 s0, 0, s8 11484; GFX90A-NEXT: s_subb_u32 s1, 0, s9 11485; GFX90A-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 11486; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 11487; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 11488; GFX90A-NEXT: s_ashr_i32 s10, s7, 31 11489; GFX90A-NEXT: s_mov_b32 s11, s10 11490; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 11491; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 11492; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 11493; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 11494; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 11495; 
GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 11496; GFX90A-NEXT: v_mul_lo_u32 v3, s0, v1 11497; GFX90A-NEXT: v_mul_hi_u32 v5, s0, v0 11498; GFX90A-NEXT: v_mul_lo_u32 v4, s1, v0 11499; GFX90A-NEXT: v_add_u32_e32 v3, v5, v3 11500; GFX90A-NEXT: v_mul_lo_u32 v6, s0, v0 11501; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 11502; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 11503; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 11504; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 11505; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 11506; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc 11507; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 11508; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 11509; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 11510; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 11511; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v8, vcc 11512; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc 11513; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 11514; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 11515; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc 11516; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 11517; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc 11518; GFX90A-NEXT: v_mul_lo_u32 v3, s0, v1 11519; GFX90A-NEXT: v_mul_hi_u32 v4, s0, v0 11520; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 11521; GFX90A-NEXT: v_mul_lo_u32 v4, s1, v0 11522; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 11523; GFX90A-NEXT: v_mul_lo_u32 v5, s0, v0 11524; GFX90A-NEXT: v_mul_hi_u32 v6, v1, v5 11525; GFX90A-NEXT: v_mul_lo_u32 v7, v1, v5 11526; GFX90A-NEXT: v_mul_lo_u32 v9, v0, v3 11527; GFX90A-NEXT: v_mul_hi_u32 v5, v0, v5 11528; GFX90A-NEXT: v_mul_hi_u32 v8, v0, v3 11529; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9 11530; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v8, vcc 11531; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 11532; GFX90A-NEXT: v_mul_hi_u32 v4, v1, v3 11533; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v6, vcc 11534; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v2, vcc 11535; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 11536; GFX90A-NEXT: 
v_add_co_u32_e32 v3, vcc, v5, v3 11537; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc 11538; GFX90A-NEXT: s_add_u32 s0, s6, s10 11539; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 11540; GFX90A-NEXT: s_addc_u32 s1, s7, s10 11541; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc 11542; GFX90A-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] 11543; GFX90A-NEXT: v_mul_lo_u32 v4, s6, v1 11544; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v0 11545; GFX90A-NEXT: v_mul_hi_u32 v3, s6, v1 11546; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 11547; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 11548; GFX90A-NEXT: v_mul_hi_u32 v6, s7, v0 11549; GFX90A-NEXT: v_mul_lo_u32 v0, s7, v0 11550; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 11551; GFX90A-NEXT: v_mul_hi_u32 v5, s7, v1 11552; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v6, vcc 11553; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v2, vcc 11554; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 11555; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 11556; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc 11557; GFX90A-NEXT: v_mul_lo_u32 v3, s8, v1 11558; GFX90A-NEXT: v_mul_hi_u32 v4, s8, v0 11559; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 11560; GFX90A-NEXT: v_mul_lo_u32 v4, s9, v0 11561; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 11562; GFX90A-NEXT: v_mul_lo_u32 v5, s8, v0 11563; GFX90A-NEXT: v_sub_u32_e32 v4, s7, v3 11564; GFX90A-NEXT: v_mov_b32_e32 v6, s9 11565; GFX90A-NEXT: v_sub_co_u32_e32 v5, vcc, s6, v5 11566; GFX90A-NEXT: v_subb_co_u32_e64 v4, s[0:1], v4, v6, vcc 11567; GFX90A-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s8, v5 11568; GFX90A-NEXT: v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1] 11569; GFX90A-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v4 11570; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 11571; GFX90A-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v6 11572; GFX90A-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 11573; GFX90A-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v4 11574; GFX90A-NEXT: v_cndmask_b32_e64 v4, v7, v6, s[0:1] 11575; GFX90A-NEXT: v_mov_b32_e32 v7, 
s7 11576; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc 11577; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s9, v3 11578; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 11579; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 11580; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s8, v5 11581; GFX90A-NEXT: v_cndmask_b32_e64 v4, 1, 2, s[0:1] 11582; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 11583; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s9, v3 11584; GFX90A-NEXT: v_add_co_u32_e64 v4, s[0:1], v0, v4 11585; GFX90A-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc 11586; GFX90A-NEXT: v_addc_co_u32_e64 v6, s[0:1], 0, v1, s[0:1] 11587; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 11588; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 11589; GFX90A-NEXT: s_xor_b64 s[0:1], s[10:11], s[2:3] 11590; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 11591; GFX90A-NEXT: v_xor_b32_e32 v0, s0, v0 11592; GFX90A-NEXT: v_xor_b32_e32 v1, s1, v1 11593; GFX90A-NEXT: v_mov_b32_e32 v3, s1 11594; GFX90A-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 11595; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc 11596; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 11597; GFX90A-NEXT: s_endpgm 11598 %shl.y = shl i64 4096, %y 11599 %r = sdiv i64 %x, %shl.y 11600 store i64 %r, i64 addrspace(1)* %out 11601 ret void 11602} 11603 11604define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 11605; CHECK-LABEL: @sdiv_v2i64_pow2k_denom( 11606; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 11607; CHECK-NEXT: [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096 11608; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 11609; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 11610; CHECK-NEXT: [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4096 11611; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 11612; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 11613; CHECK-NEXT: ret void 11614; 11615; 
GFX6-LABEL: sdiv_v2i64_pow2k_denom: 11616; GFX6: ; %bb.0: 11617; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 11618; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 11619; GFX6-NEXT: s_mov_b32 s3, 0xf000 11620; GFX6-NEXT: s_mov_b32 s2, -1 11621; GFX6-NEXT: s_waitcnt lgkmcnt(0) 11622; GFX6-NEXT: s_ashr_i32 s8, s5, 31 11623; GFX6-NEXT: s_lshr_b32 s8, s8, 20 11624; GFX6-NEXT: s_add_u32 s4, s4, s8 11625; GFX6-NEXT: s_addc_u32 s5, s5, 0 11626; GFX6-NEXT: s_ashr_i32 s8, s7, 31 11627; GFX6-NEXT: s_ashr_i64 s[4:5], s[4:5], 12 11628; GFX6-NEXT: s_lshr_b32 s8, s8, 20 11629; GFX6-NEXT: s_add_u32 s6, s6, s8 11630; GFX6-NEXT: s_addc_u32 s7, s7, 0 11631; GFX6-NEXT: s_ashr_i64 s[6:7], s[6:7], 12 11632; GFX6-NEXT: v_mov_b32_e32 v0, s4 11633; GFX6-NEXT: v_mov_b32_e32 v1, s5 11634; GFX6-NEXT: v_mov_b32_e32 v2, s6 11635; GFX6-NEXT: v_mov_b32_e32 v3, s7 11636; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 11637; GFX6-NEXT: s_endpgm 11638; 11639; GFX9-LABEL: sdiv_v2i64_pow2k_denom: 11640; GFX9: ; %bb.0: 11641; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 11642; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 11643; GFX9-NEXT: v_mov_b32_e32 v4, 0 11644; GFX9-NEXT: s_waitcnt lgkmcnt(0) 11645; GFX9-NEXT: s_ashr_i32 s0, s5, 31 11646; GFX9-NEXT: s_lshr_b32 s0, s0, 20 11647; GFX9-NEXT: s_add_u32 s0, s4, s0 11648; GFX9-NEXT: s_addc_u32 s1, s5, 0 11649; GFX9-NEXT: s_ashr_i32 s4, s7, 31 11650; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 11651; GFX9-NEXT: s_lshr_b32 s4, s4, 20 11652; GFX9-NEXT: s_add_u32 s4, s6, s4 11653; GFX9-NEXT: s_addc_u32 s5, s7, 0 11654; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 12 11655; GFX9-NEXT: v_mov_b32_e32 v0, s0 11656; GFX9-NEXT: v_mov_b32_e32 v1, s1 11657; GFX9-NEXT: v_mov_b32_e32 v2, s4 11658; GFX9-NEXT: v_mov_b32_e32 v3, s5 11659; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 11660; GFX9-NEXT: s_endpgm 11661; 11662; GFX90A-LABEL: sdiv_v2i64_pow2k_denom: 11663; GFX90A: ; %bb.0: 11664; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 11665; GFX90A-NEXT: 
s_load_dwordx2 s[2:3], s[0:1], 0x24 11666; GFX90A-NEXT: v_mov_b32_e32 v4, 0 11667; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 11668; GFX90A-NEXT: s_ashr_i32 s0, s5, 31 11669; GFX90A-NEXT: s_lshr_b32 s0, s0, 20 11670; GFX90A-NEXT: s_add_u32 s0, s4, s0 11671; GFX90A-NEXT: s_addc_u32 s1, s5, 0 11672; GFX90A-NEXT: s_ashr_i32 s4, s7, 31 11673; GFX90A-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 11674; GFX90A-NEXT: s_lshr_b32 s4, s4, 20 11675; GFX90A-NEXT: s_add_u32 s4, s6, s4 11676; GFX90A-NEXT: s_addc_u32 s5, s7, 0 11677; GFX90A-NEXT: s_ashr_i64 s[4:5], s[4:5], 12 11678; GFX90A-NEXT: v_mov_b32_e32 v0, s0 11679; GFX90A-NEXT: v_mov_b32_e32 v1, s1 11680; GFX90A-NEXT: v_mov_b32_e32 v2, s4 11681; GFX90A-NEXT: v_mov_b32_e32 v3, s5 11682; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 11683; GFX90A-NEXT: s_endpgm 11684 %r = sdiv <2 x i64> %x, <i64 4096, i64 4096> 11685 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 11686 ret void 11687} 11688 11689define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 11690; CHECK-LABEL: @ssdiv_v2i64_mixed_pow2k_denom( 11691; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 11692; CHECK-NEXT: [[TMP2:%.*]] = sdiv i64 [[TMP1]], 4096 11693; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 11694; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 11695; CHECK-NEXT: [[TMP5:%.*]] = sdiv i64 [[TMP4]], 4095 11696; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 11697; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 11698; CHECK-NEXT: ret void 11699; 11700; GFX6-LABEL: ssdiv_v2i64_mixed_pow2k_denom: 11701; GFX6: ; %bb.0: 11702; GFX6-NEXT: v_mov_b32_e32 v0, 0x457ff000 11703; GFX6-NEXT: v_mov_b32_e32 v1, 0x4f800000 11704; GFX6-NEXT: v_mac_f32_e32 v0, 0, v1 11705; GFX6-NEXT: v_rcp_f32_e32 v0, v0 11706; GFX6-NEXT: s_movk_i32 s6, 0xf001 11707; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 
11708; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 11709; GFX6-NEXT: s_mov_b32 s7, 0xf000 11710; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 11711; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 11712; GFX6-NEXT: v_trunc_f32_e32 v1, v1 11713; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 11714; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 11715; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 11716; GFX6-NEXT: s_waitcnt lgkmcnt(0) 11717; GFX6-NEXT: s_ashr_i32 s8, s1, 31 11718; GFX6-NEXT: s_lshr_b32 s8, s8, 20 11719; GFX6-NEXT: v_mul_hi_u32 v2, v0, s6 11720; GFX6-NEXT: v_mul_lo_u32 v3, v1, s6 11721; GFX6-NEXT: s_add_u32 s0, s0, s8 11722; GFX6-NEXT: s_addc_u32 s1, s1, 0 11723; GFX6-NEXT: s_ashr_i64 s[8:9], s[0:1], 12 11724; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 11725; GFX6-NEXT: v_mul_lo_u32 v3, v0, s6 11726; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 11727; GFX6-NEXT: v_mul_lo_u32 v4, v0, v2 11728; GFX6-NEXT: v_mul_hi_u32 v5, v0, v3 11729; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 11730; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 11731; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 11732; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 11733; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 11734; GFX6-NEXT: v_mul_lo_u32 v6, v1, v3 11735; GFX6-NEXT: v_mul_hi_u32 v3, v1, v3 11736; GFX6-NEXT: s_ashr_i32 s10, s3, 31 11737; GFX6-NEXT: s_add_u32 s0, s2, s10 11738; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v6 11739; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc 11740; GFX6-NEXT: v_mov_b32_e32 v4, 0 11741; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, vcc 11742; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 11743; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 11744; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 11745; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 11746; GFX6-NEXT: v_mul_lo_u32 v2, v1, s6 11747; GFX6-NEXT: v_mul_hi_u32 v3, v0, s6 11748; GFX6-NEXT: s_mov_b32 s11, s10 11749; GFX6-NEXT: s_addc_u32 s1, s3, s10 11750; GFX6-NEXT: s_xor_b64 s[0:1], s[0:1], s[10:11] 11751; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 11752; GFX6-NEXT: 
v_mul_lo_u32 v3, v0, s6 11753; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 11754; GFX6-NEXT: v_mul_lo_u32 v7, v0, v2 11755; GFX6-NEXT: v_mul_hi_u32 v8, v0, v3 11756; GFX6-NEXT: v_mul_hi_u32 v9, v0, v2 11757; GFX6-NEXT: v_mul_hi_u32 v6, v1, v3 11758; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 11759; GFX6-NEXT: v_mul_hi_u32 v5, v1, v2 11760; GFX6-NEXT: v_add_i32_e32 v7, vcc, v8, v7 11761; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc 11762; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 11763; GFX6-NEXT: v_add_i32_e32 v3, vcc, v7, v3 11764; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v6, vcc 11765; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v4, vcc 11766; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 11767; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 11768; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 11769; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 11770; GFX6-NEXT: v_mul_lo_u32 v2, s0, v1 11771; GFX6-NEXT: v_mul_hi_u32 v3, s0, v0 11772; GFX6-NEXT: v_mul_hi_u32 v5, s0, v1 11773; GFX6-NEXT: v_mul_hi_u32 v6, s1, v1 11774; GFX6-NEXT: v_mul_lo_u32 v1, s1, v1 11775; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 11776; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 11777; GFX6-NEXT: v_mul_lo_u32 v5, s1, v0 11778; GFX6-NEXT: v_mul_hi_u32 v0, s1, v0 11779; GFX6-NEXT: s_movk_i32 s2, 0xfff 11780; GFX6-NEXT: s_mov_b32 s6, -1 11781; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 11782; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 11783; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v6, v4, vcc 11784; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 11785; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 11786; GFX6-NEXT: v_mul_lo_u32 v4, v1, s2 11787; GFX6-NEXT: v_mul_hi_u32 v5, v0, s2 11788; GFX6-NEXT: v_add_i32_e32 v2, vcc, 2, v0 11789; GFX6-NEXT: v_mul_lo_u32 v8, v0, s2 11790; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc 11791; GFX6-NEXT: v_add_i32_e32 v6, vcc, 1, v0 11792; GFX6-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc 11793; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 11794; GFX6-NEXT: v_mov_b32_e32 v5, s1 11795; GFX6-NEXT: v_sub_i32_e32 
v8, vcc, s0, v8 11796; GFX6-NEXT: v_subb_u32_e32 v4, vcc, v5, v4, vcc 11797; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, s2, v8 11798; GFX6-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v4, vcc 11799; GFX6-NEXT: s_movk_i32 s0, 0xffe 11800; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s0, v5 11801; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 11802; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9 11803; GFX6-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc 11804; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v8 11805; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 11806; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 11807; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v4 11808; GFX6-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[0:1] 11809; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 11810; GFX6-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc 11811; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc 11812; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 11813; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 11814; GFX6-NEXT: v_xor_b32_e32 v0, s10, v0 11815; GFX6-NEXT: v_xor_b32_e32 v1, s10, v1 11816; GFX6-NEXT: v_mov_b32_e32 v3, s10 11817; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s10, v0 11818; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc 11819; GFX6-NEXT: v_mov_b32_e32 v0, s8 11820; GFX6-NEXT: v_mov_b32_e32 v1, s9 11821; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 11822; GFX6-NEXT: s_endpgm 11823; 11824; GFX9-LABEL: ssdiv_v2i64_mixed_pow2k_denom: 11825; GFX9: ; %bb.0: 11826; GFX9-NEXT: v_mov_b32_e32 v0, 0x457ff000 11827; GFX9-NEXT: v_mov_b32_e32 v1, 0x4f800000 11828; GFX9-NEXT: v_mac_f32_e32 v0, 0, v1 11829; GFX9-NEXT: v_rcp_f32_e32 v0, v0 11830; GFX9-NEXT: s_movk_i32 s8, 0xf001 11831; GFX9-NEXT: v_mov_b32_e32 v4, 0 11832; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 11833; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 11834; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 11835; GFX9-NEXT: v_trunc_f32_e32 v1, v1 11836; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 11837; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 11838; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 
11839; GFX9-NEXT: s_waitcnt lgkmcnt(0) 11840; GFX9-NEXT: s_ashr_i32 s2, s5, 31 11841; GFX9-NEXT: s_lshr_b32 s2, s2, 20 11842; GFX9-NEXT: v_mul_hi_u32 v2, v0, s8 11843; GFX9-NEXT: v_mul_lo_u32 v3, v1, s8 11844; GFX9-NEXT: v_mul_lo_u32 v5, v0, s8 11845; GFX9-NEXT: s_add_u32 s2, s4, s2 11846; GFX9-NEXT: s_addc_u32 s3, s5, 0 11847; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 11848; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 11849; GFX9-NEXT: v_mul_lo_u32 v3, v0, v2 11850; GFX9-NEXT: v_mul_hi_u32 v6, v0, v5 11851; GFX9-NEXT: v_mul_hi_u32 v7, v0, v2 11852; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 11853; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 11854; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v6, v3 11855; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc 11856; GFX9-NEXT: v_mul_lo_u32 v7, v1, v5 11857; GFX9-NEXT: v_mul_hi_u32 v5, v1, v5 11858; GFX9-NEXT: s_ashr_i64 s[2:3], s[2:3], 12 11859; GFX9-NEXT: s_ashr_i32 s4, s7, 31 11860; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 11861; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v5, vcc 11862; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v4, vcc 11863; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 11864; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc 11865; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 11866; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 11867; GFX9-NEXT: v_mul_lo_u32 v2, v1, s8 11868; GFX9-NEXT: v_mul_hi_u32 v3, v0, s8 11869; GFX9-NEXT: v_mul_lo_u32 v5, v0, s8 11870; GFX9-NEXT: s_add_u32 s6, s6, s4 11871; GFX9-NEXT: s_mov_b32 s5, s4 11872; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 11873; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 11874; GFX9-NEXT: v_mul_lo_u32 v7, v0, v2 11875; GFX9-NEXT: v_mul_hi_u32 v8, v0, v5 11876; GFX9-NEXT: v_mul_hi_u32 v9, v0, v2 11877; GFX9-NEXT: v_mul_hi_u32 v6, v1, v5 11878; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 11879; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 11880; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 11881; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc 11882; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 11883; GFX9-NEXT: v_add_co_u32_e32 
v5, vcc, v7, v5 11884; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v6, vcc 11885; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc 11886; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v5, v2 11887; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 11888; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 11889; GFX9-NEXT: s_addc_u32 s7, s7, s4 11890; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 11891; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] 11892; GFX9-NEXT: v_mul_lo_u32 v2, s6, v1 11893; GFX9-NEXT: v_mul_hi_u32 v3, s6, v0 11894; GFX9-NEXT: v_mul_hi_u32 v5, s6, v1 11895; GFX9-NEXT: v_mul_hi_u32 v6, s7, v1 11896; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 11897; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 11898; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc 11899; GFX9-NEXT: v_mul_lo_u32 v5, s7, v0 11900; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 11901; GFX9-NEXT: s_movk_i32 s5, 0xfff 11902; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 11903; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 11904; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 11905; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v4, vcc 11906; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 11907; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc 11908; GFX9-NEXT: v_mul_lo_u32 v2, v1, s5 11909; GFX9-NEXT: v_mul_hi_u32 v3, v0, s5 11910; GFX9-NEXT: v_mul_lo_u32 v5, v0, s5 11911; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 11912; GFX9-NEXT: v_mov_b32_e32 v3, s7 11913; GFX9-NEXT: v_sub_co_u32_e32 v5, vcc, s6, v5 11914; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v3, v2, vcc 11915; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s5, v5 11916; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc 11917; GFX9-NEXT: s_movk_i32 s5, 0xffe 11918; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s5, v3 11919; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 11920; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 11921; GFX9-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc 11922; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 11923; GFX9-NEXT: v_cndmask_b32_e64 v3, 1, 2, vcc 11924; GFX9-NEXT: v_add_co_u32_e32 
v3, vcc, v0, v3 11925; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc 11926; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s5, v5 11927; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 11928; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 11929; GFX9-NEXT: v_cndmask_b32_e32 v2, -1, v5, vcc 11930; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 11931; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 11932; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 11933; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 11934; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 11935; GFX9-NEXT: v_mov_b32_e32 v3, s4 11936; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s4, v0 11937; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc 11938; GFX9-NEXT: v_mov_b32_e32 v0, s2 11939; GFX9-NEXT: v_mov_b32_e32 v1, s3 11940; GFX9-NEXT: s_waitcnt lgkmcnt(0) 11941; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] 11942; GFX9-NEXT: s_endpgm 11943; 11944; GFX90A-LABEL: ssdiv_v2i64_mixed_pow2k_denom: 11945; GFX90A: ; %bb.0: 11946; GFX90A-NEXT: v_mov_b32_e32 v0, 0x457ff000 11947; GFX90A-NEXT: v_mov_b32_e32 v1, 0x4f800000 11948; GFX90A-NEXT: v_mac_f32_e32 v0, 0, v1 11949; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 11950; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 11951; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 11952; GFX90A-NEXT: v_mov_b32_e32 v4, 0 11953; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 11954; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 11955; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 11956; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 11957; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 11958; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 11959; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 11960; GFX90A-NEXT: s_ashr_i32 s0, s5, 31 11961; GFX90A-NEXT: s_lshr_b32 s0, s0, 20 11962; GFX90A-NEXT: s_add_u32 s0, s4, s0 11963; GFX90A-NEXT: s_movk_i32 s4, 0xf001 11964; GFX90A-NEXT: v_mul_hi_u32 v2, v0, s4 11965; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s4 11966; GFX90A-NEXT: v_add_u32_e32 v2, v2, v3 11967; GFX90A-NEXT: v_sub_u32_e32 v2, v2, v0 11968; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s4 
11969; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v2 11970; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 11971; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v2 11972; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 11973; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 11974; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 11975; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 11976; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 11977; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v2 11978; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v8, vcc 11979; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v4, vcc 11980; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 11981; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 11982; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc 11983; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 11984; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 11985; GFX90A-NEXT: v_mul_lo_u32 v2, v1, s4 11986; GFX90A-NEXT: v_mul_hi_u32 v3, v0, s4 11987; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 11988; GFX90A-NEXT: v_sub_u32_e32 v2, v2, v0 11989; GFX90A-NEXT: v_mul_lo_u32 v5, v0, s4 11990; GFX90A-NEXT: v_mul_hi_u32 v6, v1, v5 11991; GFX90A-NEXT: v_mul_lo_u32 v7, v1, v5 11992; GFX90A-NEXT: v_mul_lo_u32 v9, v0, v2 11993; GFX90A-NEXT: v_mul_hi_u32 v5, v0, v5 11994; GFX90A-NEXT: v_mul_hi_u32 v8, v0, v2 11995; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9 11996; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v8, vcc 11997; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 11998; GFX90A-NEXT: v_mul_hi_u32 v3, v1, v2 11999; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v6, vcc 12000; GFX90A-NEXT: s_addc_u32 s1, s5, 0 12001; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc 12002; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 12003; GFX90A-NEXT: s_ashr_i64 s[0:1], s[0:1], 12 12004; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v5, v2 12005; GFX90A-NEXT: s_ashr_i32 s4, s7, 31 12006; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 12007; GFX90A-NEXT: s_add_u32 s6, s6, s4 12008; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 12009; GFX90A-NEXT: s_mov_b32 s5, s4 
12010; GFX90A-NEXT: s_addc_u32 s7, s7, s4 12011; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 12012; GFX90A-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] 12013; GFX90A-NEXT: v_mul_lo_u32 v3, s6, v1 12014; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v0 12015; GFX90A-NEXT: v_mul_hi_u32 v2, s6, v1 12016; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 12017; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc 12018; GFX90A-NEXT: v_mul_hi_u32 v6, s7, v0 12019; GFX90A-NEXT: v_mul_lo_u32 v0, s7, v0 12020; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 12021; GFX90A-NEXT: v_mul_hi_u32 v5, s7, v1 12022; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v2, v6, vcc 12023; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v4, vcc 12024; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 12025; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 12026; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc 12027; GFX90A-NEXT: s_movk_i32 s5, 0xfff 12028; GFX90A-NEXT: v_mul_lo_u32 v2, v1, s5 12029; GFX90A-NEXT: v_mul_hi_u32 v3, v0, s5 12030; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 12031; GFX90A-NEXT: v_mul_lo_u32 v3, v0, s5 12032; GFX90A-NEXT: v_mov_b32_e32 v5, s7 12033; GFX90A-NEXT: v_sub_co_u32_e32 v3, vcc, s6, v3 12034; GFX90A-NEXT: v_subb_co_u32_e32 v2, vcc, v5, v2, vcc 12035; GFX90A-NEXT: v_subrev_co_u32_e32 v5, vcc, s5, v3 12036; GFX90A-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc 12037; GFX90A-NEXT: s_movk_i32 s5, 0xffe 12038; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s5, v5 12039; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 12040; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 12041; GFX90A-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc 12042; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 12043; GFX90A-NEXT: v_cndmask_b32_e64 v5, 1, 2, vcc 12044; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v0, v5 12045; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc 12046; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s5, v3 12047; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc 12048; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 12049; GFX90A-NEXT: v_cndmask_b32_e32 v2, -1, v3, 
vcc 12050; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 12051; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc 12052; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 12053; GFX90A-NEXT: v_xor_b32_e32 v0, s4, v0 12054; GFX90A-NEXT: v_xor_b32_e32 v1, s4, v1 12055; GFX90A-NEXT: v_mov_b32_e32 v3, s4 12056; GFX90A-NEXT: v_subrev_co_u32_e32 v2, vcc, s4, v0 12057; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc 12058; GFX90A-NEXT: v_mov_b32_e32 v0, s0 12059; GFX90A-NEXT: v_mov_b32_e32 v1, s1 12060; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 12061; GFX90A-NEXT: s_endpgm 12062 %r = sdiv <2 x i64> %x, <i64 4096, i64 4095> 12063 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 12064 ret void 12065} 12066 12067define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 12068; CHECK-LABEL: @sdiv_v2i64_pow2_shl_denom( 12069; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 12070; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 12071; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 12072; CHECK-NEXT: [[TMP3:%.*]] = sdiv i64 [[TMP1]], [[TMP2]] 12073; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 12074; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 12075; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 12076; CHECK-NEXT: [[TMP7:%.*]] = sdiv i64 [[TMP5]], [[TMP6]] 12077; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 12078; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 12079; CHECK-NEXT: ret void 12080; 12081; GFX6-LABEL: sdiv_v2i64_pow2_shl_denom: 12082; GFX6: ; %bb.0: 12083; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11 12084; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000 12085; GFX6-NEXT: s_mov_b32 s18, 0x4f800000 12086; GFX6-NEXT: s_mov_b32 s19, 0x5f7ffffc 12087; GFX6-NEXT: s_mov_b32 s20, 0x2f800000 12088; GFX6-NEXT: 
s_waitcnt lgkmcnt(0) 12089; GFX6-NEXT: s_lshl_b64 s[8:9], s[2:3], s6 12090; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 12091; GFX6-NEXT: s_ashr_i32 s12, s3, 31 12092; GFX6-NEXT: s_add_u32 s2, s2, s12 12093; GFX6-NEXT: s_mov_b32 s13, s12 12094; GFX6-NEXT: s_addc_u32 s3, s3, s12 12095; GFX6-NEXT: s_xor_b64 s[10:11], s[2:3], s[12:13] 12096; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s10 12097; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s11 12098; GFX6-NEXT: s_mov_b32 s21, 0xcf800000 12099; GFX6-NEXT: s_sub_u32 s6, 0, s10 12100; GFX6-NEXT: s_subb_u32 s7, 0, s11 12101; GFX6-NEXT: v_mac_f32_e32 v0, s18, v1 12102; GFX6-NEXT: v_rcp_f32_e32 v0, v0 12103; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 12104; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xd 12105; GFX6-NEXT: v_mul_f32_e32 v0, s19, v0 12106; GFX6-NEXT: v_mul_f32_e32 v1, s20, v0 12107; GFX6-NEXT: v_trunc_f32_e32 v1, v1 12108; GFX6-NEXT: v_mac_f32_e32 v0, s21, v1 12109; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 12110; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v0 12111; GFX6-NEXT: s_waitcnt lgkmcnt(0) 12112; GFX6-NEXT: s_ashr_i32 s14, s1, 31 12113; GFX6-NEXT: s_add_u32 s0, s0, s14 12114; GFX6-NEXT: v_mul_lo_u32 v0, s6, v1 12115; GFX6-NEXT: v_mul_hi_u32 v3, s6, v2 12116; GFX6-NEXT: v_mul_lo_u32 v4, s7, v2 12117; GFX6-NEXT: v_mul_lo_u32 v5, s6, v2 12118; GFX6-NEXT: s_mov_b32 s15, s14 12119; GFX6-NEXT: v_add_i32_e32 v0, vcc, v3, v0 12120; GFX6-NEXT: v_add_i32_e32 v3, vcc, v0, v4 12121; GFX6-NEXT: v_mul_lo_u32 v0, v2, v3 12122; GFX6-NEXT: v_mul_hi_u32 v4, v2, v5 12123; GFX6-NEXT: v_mul_hi_u32 v6, v2, v3 12124; GFX6-NEXT: v_mul_hi_u32 v7, v1, v3 12125; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 12126; GFX6-NEXT: v_add_i32_e32 v0, vcc, v4, v0 12127; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc 12128; GFX6-NEXT: v_mul_lo_u32 v6, v1, v5 12129; GFX6-NEXT: v_mul_hi_u32 v5, v1, v5 12130; GFX6-NEXT: s_addc_u32 s1, s1, s14 12131; GFX6-NEXT: s_xor_b64 s[16:17], s[0:1], s[14:15] 12132; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v6 12133; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, 
vcc 12134; GFX6-NEXT: v_mov_b32_e32 v0, 0 12135; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v0, vcc 12136; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 12137; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc 12138; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 12139; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc 12140; GFX6-NEXT: v_mul_lo_u32 v3, s6, v1 12141; GFX6-NEXT: v_mul_hi_u32 v4, s6, v2 12142; GFX6-NEXT: v_mul_lo_u32 v5, s7, v2 12143; GFX6-NEXT: s_xor_b64 s[14:15], s[14:15], s[12:13] 12144; GFX6-NEXT: s_ashr_i32 s12, s9, 31 12145; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 12146; GFX6-NEXT: v_mul_lo_u32 v4, s6, v2 12147; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 12148; GFX6-NEXT: v_mul_lo_u32 v7, v2, v3 12149; GFX6-NEXT: v_mul_hi_u32 v8, v2, v4 12150; GFX6-NEXT: v_mul_hi_u32 v9, v2, v3 12151; GFX6-NEXT: v_mul_hi_u32 v6, v1, v4 12152; GFX6-NEXT: v_mul_lo_u32 v4, v1, v4 12153; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 12154; GFX6-NEXT: v_add_i32_e32 v7, vcc, v8, v7 12155; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc 12156; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 12157; GFX6-NEXT: v_add_i32_e32 v4, vcc, v7, v4 12158; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc 12159; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v0, vcc 12160; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 12161; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc 12162; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 12163; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc 12164; GFX6-NEXT: v_mul_lo_u32 v3, s16, v1 12165; GFX6-NEXT: v_mul_hi_u32 v4, s16, v2 12166; GFX6-NEXT: v_mul_hi_u32 v5, s16, v1 12167; GFX6-NEXT: v_mul_hi_u32 v6, s17, v1 12168; GFX6-NEXT: v_mul_lo_u32 v1, s17, v1 12169; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 12170; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc 12171; GFX6-NEXT: v_mul_lo_u32 v5, s17, v2 12172; GFX6-NEXT: v_mul_hi_u32 v2, s17, v2 12173; GFX6-NEXT: s_add_u32 s8, s8, s12 12174; GFX6-NEXT: s_mov_b32 s13, s12 12175; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 12176; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc 
12177; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v0, vcc 12178; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 12179; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc 12180; GFX6-NEXT: v_mul_lo_u32 v3, s10, v2 12181; GFX6-NEXT: v_mul_hi_u32 v4, s10, v1 12182; GFX6-NEXT: v_mul_lo_u32 v5, s11, v1 12183; GFX6-NEXT: v_mov_b32_e32 v6, s11 12184; GFX6-NEXT: s_addc_u32 s9, s9, s12 12185; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 12186; GFX6-NEXT: v_mul_lo_u32 v4, s10, v1 12187; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 12188; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s17, v3 12189; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s16, v4 12190; GFX6-NEXT: v_subb_u32_e64 v5, s[0:1], v5, v6, vcc 12191; GFX6-NEXT: v_subrev_i32_e64 v6, s[0:1], s10, v4 12192; GFX6-NEXT: v_subbrev_u32_e64 v5, s[0:1], 0, v5, s[0:1] 12193; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v5 12194; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 12195; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v6 12196; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 12197; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v5 12198; GFX6-NEXT: v_cndmask_b32_e64 v5, v7, v6, s[0:1] 12199; GFX6-NEXT: v_add_i32_e64 v6, s[0:1], 2, v1 12200; GFX6-NEXT: v_addc_u32_e64 v7, s[0:1], 0, v2, s[0:1] 12201; GFX6-NEXT: v_add_i32_e64 v8, s[0:1], 1, v1 12202; GFX6-NEXT: v_addc_u32_e64 v9, s[0:1], 0, v2, s[0:1] 12203; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 12204; GFX6-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13] 12205; GFX6-NEXT: v_cndmask_b32_e64 v5, v9, v7, s[0:1] 12206; GFX6-NEXT: v_mov_b32_e32 v7, s17 12207; GFX6-NEXT: v_cvt_f32_u32_e32 v9, s8 12208; GFX6-NEXT: v_cvt_f32_u32_e32 v10, s9 12209; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc 12210; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 12211; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 12212; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s10, v4 12213; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc 12214; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s11, v3 12215; GFX6-NEXT: v_mac_f32_e32 v9, s18, v10 12216; GFX6-NEXT: v_cndmask_b32_e32 v3, v7, v4, vcc 12217; 
GFX6-NEXT: v_rcp_f32_e32 v4, v9 12218; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 12219; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc 12220; GFX6-NEXT: v_cndmask_b32_e64 v3, v8, v6, s[0:1] 12221; GFX6-NEXT: v_mul_f32_e32 v4, s19, v4 12222; GFX6-NEXT: v_mul_f32_e32 v5, s20, v4 12223; GFX6-NEXT: v_trunc_f32_e32 v5, v5 12224; GFX6-NEXT: v_mac_f32_e32 v4, s21, v5 12225; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 12226; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v5 12227; GFX6-NEXT: s_sub_u32 s0, 0, s8 12228; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 12229; GFX6-NEXT: v_mul_hi_u32 v3, s0, v4 12230; GFX6-NEXT: v_mul_lo_u32 v6, s0, v5 12231; GFX6-NEXT: s_subb_u32 s1, 0, s9 12232; GFX6-NEXT: v_mul_lo_u32 v7, s1, v4 12233; GFX6-NEXT: s_ashr_i32 s10, s3, 31 12234; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 12235; GFX6-NEXT: v_mul_lo_u32 v6, s0, v4 12236; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v7 12237; GFX6-NEXT: v_mul_lo_u32 v7, v4, v3 12238; GFX6-NEXT: v_mul_hi_u32 v8, v4, v6 12239; GFX6-NEXT: v_mul_hi_u32 v9, v4, v3 12240; GFX6-NEXT: v_mul_hi_u32 v10, v5, v3 12241; GFX6-NEXT: v_mul_lo_u32 v3, v5, v3 12242; GFX6-NEXT: v_add_i32_e32 v7, vcc, v8, v7 12243; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc 12244; GFX6-NEXT: v_mul_lo_u32 v9, v5, v6 12245; GFX6-NEXT: v_mul_hi_u32 v6, v5, v6 12246; GFX6-NEXT: s_mov_b32 s11, s10 12247; GFX6-NEXT: v_xor_b32_e32 v1, s14, v1 12248; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v9 12249; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v8, v6, vcc 12250; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v10, v0, vcc 12251; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 12252; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc 12253; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 12254; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v5, v6, vcc 12255; GFX6-NEXT: v_mul_lo_u32 v5, s0, v4 12256; GFX6-NEXT: v_mul_hi_u32 v6, s0, v3 12257; GFX6-NEXT: v_mul_lo_u32 v7, s1, v3 12258; GFX6-NEXT: v_xor_b32_e32 v2, s15, v2 12259; GFX6-NEXT: s_mov_b32 s7, 0xf000 12260; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 12261; GFX6-NEXT: 
v_mul_lo_u32 v6, s0, v3 12262; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5 12263; GFX6-NEXT: v_mul_lo_u32 v9, v3, v5 12264; GFX6-NEXT: v_mul_hi_u32 v10, v3, v6 12265; GFX6-NEXT: v_mul_hi_u32 v11, v3, v5 12266; GFX6-NEXT: v_mul_hi_u32 v8, v4, v6 12267; GFX6-NEXT: v_mul_lo_u32 v6, v4, v6 12268; GFX6-NEXT: v_mul_hi_u32 v7, v4, v5 12269; GFX6-NEXT: v_add_i32_e32 v9, vcc, v10, v9 12270; GFX6-NEXT: v_addc_u32_e32 v10, vcc, 0, v11, vcc 12271; GFX6-NEXT: v_mul_lo_u32 v5, v4, v5 12272; GFX6-NEXT: v_add_i32_e32 v6, vcc, v9, v6 12273; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v10, v8, vcc 12274; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v7, v0, vcc 12275; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 12276; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc 12277; GFX6-NEXT: s_add_u32 s0, s2, s10 12278; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 12279; GFX6-NEXT: s_addc_u32 s1, s3, s10 12280; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc 12281; GFX6-NEXT: s_xor_b64 s[2:3], s[0:1], s[10:11] 12282; GFX6-NEXT: v_mul_lo_u32 v5, s2, v4 12283; GFX6-NEXT: v_mul_hi_u32 v6, s2, v3 12284; GFX6-NEXT: v_mul_hi_u32 v8, s2, v4 12285; GFX6-NEXT: v_mul_hi_u32 v9, s3, v4 12286; GFX6-NEXT: v_mul_lo_u32 v4, s3, v4 12287; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 12288; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc 12289; GFX6-NEXT: v_mul_lo_u32 v8, s3, v3 12290; GFX6-NEXT: v_mul_hi_u32 v3, s3, v3 12291; GFX6-NEXT: v_mov_b32_e32 v7, s15 12292; GFX6-NEXT: s_mov_b32 s6, -1 12293; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v8 12294; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v3, vcc 12295; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v9, v0, vcc 12296; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 12297; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc 12298; GFX6-NEXT: v_mul_lo_u32 v5, s8, v4 12299; GFX6-NEXT: v_mul_hi_u32 v6, s8, v3 12300; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s14, v1 12301; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v2, v7, vcc 12302; GFX6-NEXT: v_mul_lo_u32 v2, s9, v3 12303; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 12304; GFX6-NEXT: 
v_mov_b32_e32 v7, s9 12305; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 12306; GFX6-NEXT: v_mul_lo_u32 v5, s8, v3 12307; GFX6-NEXT: v_sub_i32_e32 v6, vcc, s3, v2 12308; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s2, v5 12309; GFX6-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v7, vcc 12310; GFX6-NEXT: v_subrev_i32_e64 v7, s[0:1], s8, v5 12311; GFX6-NEXT: v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1] 12312; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6 12313; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] 12314; GFX6-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 12315; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 12316; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6 12317; GFX6-NEXT: v_cndmask_b32_e64 v6, v8, v7, s[0:1] 12318; GFX6-NEXT: v_add_i32_e64 v7, s[0:1], 2, v3 12319; GFX6-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v4, s[0:1] 12320; GFX6-NEXT: v_add_i32_e64 v9, s[0:1], 1, v3 12321; GFX6-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v4, s[0:1] 12322; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 12323; GFX6-NEXT: v_cndmask_b32_e64 v6, v10, v8, s[0:1] 12324; GFX6-NEXT: v_mov_b32_e32 v8, s3 12325; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v8, v2, vcc 12326; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 12327; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 12328; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v5 12329; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 12330; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s9, v2 12331; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v5, vcc 12332; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 12333; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc 12334; GFX6-NEXT: v_cndmask_b32_e64 v4, v9, v7, s[0:1] 12335; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 12336; GFX6-NEXT: s_xor_b64 s[0:1], s[10:11], s[12:13] 12337; GFX6-NEXT: v_xor_b32_e32 v3, s0, v3 12338; GFX6-NEXT: v_xor_b32_e32 v4, s1, v2 12339; GFX6-NEXT: v_mov_b32_e32 v5, s1 12340; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s0, v3 12341; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v4, v5, vcc 12342; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 12343; GFX6-NEXT: s_endpgm 12344; 12345; 
GFX9-LABEL: sdiv_v2i64_pow2_shl_denom: 12346; GFX9: ; %bb.0: 12347; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 12348; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 12349; GFX9-NEXT: s_mov_b32 s16, 0x4f800000 12350; GFX9-NEXT: s_mov_b32 s17, 0x5f7ffffc 12351; GFX9-NEXT: s_mov_b32 s18, 0x2f800000 12352; GFX9-NEXT: s_waitcnt lgkmcnt(0) 12353; GFX9-NEXT: s_lshl_b64 s[8:9], s[2:3], s6 12354; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 12355; GFX9-NEXT: s_ashr_i32 s12, s3, 31 12356; GFX9-NEXT: s_add_u32 s2, s2, s12 12357; GFX9-NEXT: s_mov_b32 s13, s12 12358; GFX9-NEXT: s_addc_u32 s3, s3, s12 12359; GFX9-NEXT: s_xor_b64 s[10:11], s[2:3], s[12:13] 12360; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s10 12361; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s11 12362; GFX9-NEXT: s_mov_b32 s19, 0xcf800000 12363; GFX9-NEXT: s_sub_u32 s2, 0, s10 12364; GFX9-NEXT: s_subb_u32 s3, 0, s11 12365; GFX9-NEXT: v_mac_f32_e32 v0, s16, v1 12366; GFX9-NEXT: v_rcp_f32_e32 v0, v0 12367; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 12368; GFX9-NEXT: v_mul_f32_e32 v0, s17, v0 12369; GFX9-NEXT: v_mul_f32_e32 v1, s18, v0 12370; GFX9-NEXT: v_trunc_f32_e32 v1, v1 12371; GFX9-NEXT: v_mac_f32_e32 v0, s19, v1 12372; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 12373; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v0 12374; GFX9-NEXT: s_waitcnt lgkmcnt(0) 12375; GFX9-NEXT: s_ashr_i32 s14, s5, 31 12376; GFX9-NEXT: s_mov_b32 s15, s14 12377; GFX9-NEXT: v_mul_lo_u32 v0, s2, v1 12378; GFX9-NEXT: v_mul_hi_u32 v3, s2, v2 12379; GFX9-NEXT: v_mul_lo_u32 v5, s3, v2 12380; GFX9-NEXT: v_mul_lo_u32 v4, s2, v2 12381; GFX9-NEXT: v_add_u32_e32 v0, v3, v0 12382; GFX9-NEXT: v_add_u32_e32 v5, v0, v5 12383; GFX9-NEXT: v_mul_hi_u32 v3, v2, v4 12384; GFX9-NEXT: v_mul_lo_u32 v6, v2, v5 12385; GFX9-NEXT: v_mul_hi_u32 v7, v2, v5 12386; GFX9-NEXT: v_mul_hi_u32 v8, v1, v5 12387; GFX9-NEXT: v_mov_b32_e32 v0, 0 12388; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 12389; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc 12390; GFX9-NEXT: v_mul_lo_u32 v7, v1, v4 12391; GFX9-NEXT: 
v_mul_hi_u32 v4, v1, v4 12392; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 12393; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc 12394; GFX9-NEXT: v_mul_lo_u32 v4, v1, v5 12395; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v0, vcc 12396; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 12397; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc 12398; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 12399; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc 12400; GFX9-NEXT: v_mul_lo_u32 v3, s2, v1 12401; GFX9-NEXT: v_mul_hi_u32 v4, s2, v2 12402; GFX9-NEXT: v_mul_lo_u32 v5, s3, v2 12403; GFX9-NEXT: v_mul_lo_u32 v6, s2, v2 12404; GFX9-NEXT: s_add_u32 s2, s4, s14 12405; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 12406; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 12407; GFX9-NEXT: v_mul_lo_u32 v7, v2, v3 12408; GFX9-NEXT: v_mul_hi_u32 v8, v2, v6 12409; GFX9-NEXT: v_mul_hi_u32 v9, v2, v3 12410; GFX9-NEXT: v_mul_hi_u32 v5, v1, v6 12411; GFX9-NEXT: v_mul_lo_u32 v6, v1, v6 12412; GFX9-NEXT: v_mul_hi_u32 v4, v1, v3 12413; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 12414; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc 12415; GFX9-NEXT: v_mul_lo_u32 v3, v1, v3 12416; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 12417; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v5, vcc 12418; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v0, vcc 12419; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 12420; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc 12421; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 12422; GFX9-NEXT: s_addc_u32 s3, s5, s14 12423; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc 12424; GFX9-NEXT: s_xor_b64 s[4:5], s[2:3], s[14:15] 12425; GFX9-NEXT: v_mul_lo_u32 v3, s4, v1 12426; GFX9-NEXT: v_mul_hi_u32 v4, s4, v2 12427; GFX9-NEXT: v_mul_hi_u32 v5, s4, v1 12428; GFX9-NEXT: v_mul_hi_u32 v6, s5, v1 12429; GFX9-NEXT: v_mul_lo_u32 v1, s5, v1 12430; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 12431; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc 12432; GFX9-NEXT: v_mul_lo_u32 v5, s5, v2 12433; GFX9-NEXT: 
v_mul_hi_u32 v2, s5, v2 12434; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 12435; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 12436; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v4, v2, vcc 12437; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v0, vcc 12438; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v2, v1 12439; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc 12440; GFX9-NEXT: v_mul_lo_u32 v3, s10, v2 12441; GFX9-NEXT: v_mul_hi_u32 v4, s10, v1 12442; GFX9-NEXT: v_mul_lo_u32 v5, s11, v1 12443; GFX9-NEXT: v_mov_b32_e32 v6, s11 12444; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 12445; GFX9-NEXT: v_mul_lo_u32 v4, s10, v1 12446; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 12447; GFX9-NEXT: v_sub_u32_e32 v5, s5, v3 12448; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, s4, v4 12449; GFX9-NEXT: v_subb_co_u32_e64 v5, s[0:1], v5, v6, vcc 12450; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s10, v4 12451; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1] 12452; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v5 12453; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 12454; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v6 12455; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 12456; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v5 12457; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v6, s[0:1] 12458; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 12459; GFX9-NEXT: v_cndmask_b32_e64 v5, 1, 2, s[0:1] 12460; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], v1, v5 12461; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], 0, v2, s[0:1] 12462; GFX9-NEXT: s_xor_b64 s[0:1], s[14:15], s[12:13] 12463; GFX9-NEXT: s_ashr_i32 s4, s9, 31 12464; GFX9-NEXT: s_add_u32 s8, s8, s4 12465; GFX9-NEXT: v_mov_b32_e32 v7, s5 12466; GFX9-NEXT: s_mov_b32 s5, s4 12467; GFX9-NEXT: s_addc_u32 s9, s9, s4 12468; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[4:5] 12469; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc 12470; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s8 12471; GFX9-NEXT: v_cvt_f32_u32_e32 v8, s9 12472; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v3 12473; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc 
12474; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v4 12475; GFX9-NEXT: v_mac_f32_e32 v7, s16, v8 12476; GFX9-NEXT: v_rcp_f32_e32 v7, v7 12477; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc 12478; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s11, v3 12479; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc 12480; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 12481; GFX9-NEXT: v_mul_f32_e32 v3, s17, v7 12482; GFX9-NEXT: v_mul_f32_e32 v4, s18, v3 12483; GFX9-NEXT: v_trunc_f32_e32 v4, v4 12484; GFX9-NEXT: v_mac_f32_e32 v3, s19, v4 12485; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 12486; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 12487; GFX9-NEXT: s_sub_u32 s10, 0, s8 12488; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc 12489; GFX9-NEXT: s_subb_u32 s11, 0, s9 12490; GFX9-NEXT: v_mul_hi_u32 v5, s10, v3 12491; GFX9-NEXT: v_mul_lo_u32 v7, s10, v4 12492; GFX9-NEXT: v_mul_lo_u32 v8, s11, v3 12493; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 12494; GFX9-NEXT: v_mul_lo_u32 v6, s10, v3 12495; GFX9-NEXT: v_add_u32_e32 v5, v5, v7 12496; GFX9-NEXT: v_add_u32_e32 v5, v5, v8 12497; GFX9-NEXT: v_mul_lo_u32 v7, v3, v5 12498; GFX9-NEXT: v_mul_hi_u32 v8, v3, v6 12499; GFX9-NEXT: v_mul_hi_u32 v9, v3, v5 12500; GFX9-NEXT: v_mul_hi_u32 v10, v4, v5 12501; GFX9-NEXT: v_mul_lo_u32 v5, v4, v5 12502; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 12503; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc 12504; GFX9-NEXT: v_mul_lo_u32 v9, v4, v6 12505; GFX9-NEXT: v_mul_hi_u32 v6, v4, v6 12506; GFX9-NEXT: v_xor_b32_e32 v1, s0, v1 12507; GFX9-NEXT: v_xor_b32_e32 v2, s1, v2 12508; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 12509; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v8, v6, vcc 12510; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v10, v0, vcc 12511; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 12512; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc 12513; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 12514; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc 12515; GFX9-NEXT: v_mul_lo_u32 v5, s10, v4 12516; GFX9-NEXT: v_mul_hi_u32 v6, s10, v3 12517; 
GFX9-NEXT: v_mul_lo_u32 v7, s11, v3 12518; GFX9-NEXT: v_mul_lo_u32 v8, s10, v3 12519; GFX9-NEXT: s_ashr_i32 s10, s7, 31 12520; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 12521; GFX9-NEXT: v_add_u32_e32 v5, v5, v7 12522; GFX9-NEXT: v_mul_lo_u32 v9, v3, v5 12523; GFX9-NEXT: v_mul_hi_u32 v10, v3, v8 12524; GFX9-NEXT: v_mul_hi_u32 v11, v3, v5 12525; GFX9-NEXT: v_mul_hi_u32 v7, v4, v8 12526; GFX9-NEXT: v_mul_lo_u32 v8, v4, v8 12527; GFX9-NEXT: v_mul_hi_u32 v6, v4, v5 12528; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 12529; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, 0, v11, vcc 12530; GFX9-NEXT: v_mul_lo_u32 v5, v4, v5 12531; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 12532; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v10, v7, vcc 12533; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v0, vcc 12534; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 12535; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc 12536; GFX9-NEXT: s_add_u32 s6, s6, s10 12537; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 12538; GFX9-NEXT: s_mov_b32 s11, s10 12539; GFX9-NEXT: s_addc_u32 s7, s7, s10 12540; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc 12541; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[10:11] 12542; GFX9-NEXT: v_mul_lo_u32 v5, s6, v4 12543; GFX9-NEXT: v_mul_hi_u32 v6, s6, v3 12544; GFX9-NEXT: v_mul_hi_u32 v8, s6, v4 12545; GFX9-NEXT: v_mul_hi_u32 v9, s7, v4 12546; GFX9-NEXT: v_mul_lo_u32 v4, s7, v4 12547; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 12548; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc 12549; GFX9-NEXT: v_mul_lo_u32 v8, s7, v3 12550; GFX9-NEXT: v_mul_hi_u32 v3, s7, v3 12551; GFX9-NEXT: v_mov_b32_e32 v7, s1 12552; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 12553; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v3, vcc 12554; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v0, vcc 12555; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 12556; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc 12557; GFX9-NEXT: v_mul_lo_u32 v5, s8, v4 12558; GFX9-NEXT: v_mul_hi_u32 v6, s8, v3 12559; GFX9-NEXT: v_mul_lo_u32 v8, s9, 
v3 12560; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s0, v1 12561; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 12562; GFX9-NEXT: v_mul_lo_u32 v6, s8, v3 12563; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v7, vcc 12564; GFX9-NEXT: v_add_u32_e32 v5, v5, v8 12565; GFX9-NEXT: v_sub_u32_e32 v7, s7, v5 12566; GFX9-NEXT: v_mov_b32_e32 v8, s9 12567; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, s6, v6 12568; GFX9-NEXT: v_subb_co_u32_e64 v7, s[0:1], v7, v8, vcc 12569; GFX9-NEXT: v_subrev_co_u32_e64 v8, s[0:1], s8, v6 12570; GFX9-NEXT: v_subbrev_co_u32_e64 v7, s[0:1], 0, v7, s[0:1] 12571; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v7 12572; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] 12573; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8 12574; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] 12575; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v7 12576; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v8, s[0:1] 12577; GFX9-NEXT: v_mov_b32_e32 v9, s7 12578; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v9, v5, vcc 12579; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v5 12580; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 12581; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc 12582; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v6 12583; GFX9-NEXT: v_cndmask_b32_e64 v7, 1, 2, s[0:1] 12584; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 12585; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s9, v5 12586; GFX9-NEXT: v_add_co_u32_e64 v7, s[0:1], v3, v7 12587; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc 12588; GFX9-NEXT: v_addc_co_u32_e64 v8, s[0:1], 0, v4, s[0:1] 12589; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 12590; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc 12591; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], s[4:5] 12592; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc 12593; GFX9-NEXT: v_xor_b32_e32 v3, s0, v3 12594; GFX9-NEXT: v_xor_b32_e32 v4, s1, v4 12595; GFX9-NEXT: v_mov_b32_e32 v5, s1 12596; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s0, v3 12597; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v5, vcc 12598; GFX9-NEXT: s_waitcnt lgkmcnt(0) 12599; GFX9-NEXT: global_store_dwordx4 v0, 
v[1:4], s[2:3] 12600; GFX9-NEXT: s_endpgm 12601; 12602; GFX90A-LABEL: sdiv_v2i64_pow2_shl_denom: 12603; GFX90A: ; %bb.0: 12604; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 12605; GFX90A-NEXT: s_mov_b64 s[2:3], 0x1000 12606; GFX90A-NEXT: s_mov_b32 s16, 0x4f800000 12607; GFX90A-NEXT: s_mov_b32 s17, 0x5f7ffffc 12608; GFX90A-NEXT: s_mov_b32 s18, 0x2f800000 12609; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 12610; GFX90A-NEXT: s_lshl_b64 s[8:9], s[2:3], s6 12611; GFX90A-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 12612; GFX90A-NEXT: s_ashr_i32 s10, s3, 31 12613; GFX90A-NEXT: s_add_u32 s2, s2, s10 12614; GFX90A-NEXT: s_mov_b32 s11, s10 12615; GFX90A-NEXT: s_addc_u32 s3, s3, s10 12616; GFX90A-NEXT: s_xor_b64 s[12:13], s[2:3], s[10:11] 12617; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s12 12618; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s13 12619; GFX90A-NEXT: s_mov_b32 s19, 0xcf800000 12620; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 12621; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 12622; GFX90A-NEXT: s_sub_u32 s0, 0, s12 12623; GFX90A-NEXT: v_mac_f32_e32 v0, s16, v1 12624; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 12625; GFX90A-NEXT: s_subb_u32 s1, 0, s13 12626; GFX90A-NEXT: v_mov_b32_e32 v4, 0 12627; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 12628; GFX90A-NEXT: s_ashr_i32 s14, s5, 31 12629; GFX90A-NEXT: v_mul_f32_e32 v0, s17, v0 12630; GFX90A-NEXT: v_mul_f32_e32 v1, s18, v0 12631; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 12632; GFX90A-NEXT: v_mac_f32_e32 v0, s19, v1 12633; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 12634; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 12635; GFX90A-NEXT: s_mov_b32 s15, s14 12636; GFX90A-NEXT: v_mul_hi_u32 v3, s0, v0 12637; GFX90A-NEXT: v_mul_lo_u32 v5, s0, v1 12638; GFX90A-NEXT: v_mul_lo_u32 v2, s1, v0 12639; GFX90A-NEXT: v_add_u32_e32 v3, v3, v5 12640; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 12641; GFX90A-NEXT: v_mul_lo_u32 v6, s0, v0 12642; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v2 12643; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 12644; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v2 12645; GFX90A-NEXT: 
v_add_co_u32_e32 v5, vcc, v7, v5 12646; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 12647; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 12648; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 12649; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 12650; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v2 12651; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v8, vcc 12652; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v4, vcc 12653; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 12654; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 12655; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc 12656; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 12657; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 12658; GFX90A-NEXT: v_mul_lo_u32 v2, s0, v1 12659; GFX90A-NEXT: v_mul_hi_u32 v3, s0, v0 12660; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 12661; GFX90A-NEXT: v_mul_lo_u32 v3, s1, v0 12662; GFX90A-NEXT: v_add_u32_e32 v2, v2, v3 12663; GFX90A-NEXT: v_mul_lo_u32 v5, s0, v0 12664; GFX90A-NEXT: v_mul_hi_u32 v6, v1, v5 12665; GFX90A-NEXT: v_mul_lo_u32 v7, v1, v5 12666; GFX90A-NEXT: v_mul_lo_u32 v9, v0, v2 12667; GFX90A-NEXT: v_mul_hi_u32 v5, v0, v5 12668; GFX90A-NEXT: v_mul_hi_u32 v8, v0, v2 12669; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9 12670; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v8, vcc 12671; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 12672; GFX90A-NEXT: v_mul_hi_u32 v3, v1, v2 12673; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v6, vcc 12674; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc 12675; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 12676; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v5, v2 12677; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 12678; GFX90A-NEXT: s_add_u32 s0, s4, s14 12679; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 12680; GFX90A-NEXT: s_addc_u32 s1, s5, s14 12681; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 12682; GFX90A-NEXT: s_xor_b64 s[4:5], s[0:1], s[14:15] 12683; GFX90A-NEXT: v_mul_lo_u32 v3, s4, v1 12684; GFX90A-NEXT: v_mul_hi_u32 v5, s4, v0 12685; GFX90A-NEXT: 
v_mul_hi_u32 v2, s4, v1 12686; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 12687; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc 12688; GFX90A-NEXT: v_mul_hi_u32 v6, s5, v0 12689; GFX90A-NEXT: v_mul_lo_u32 v0, s5, v0 12690; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 12691; GFX90A-NEXT: v_mul_hi_u32 v5, s5, v1 12692; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v2, v6, vcc 12693; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v4, vcc 12694; GFX90A-NEXT: v_mul_lo_u32 v1, s5, v1 12695; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 12696; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc 12697; GFX90A-NEXT: v_mul_lo_u32 v2, s12, v1 12698; GFX90A-NEXT: v_mul_hi_u32 v3, s12, v0 12699; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 12700; GFX90A-NEXT: v_mul_lo_u32 v3, s13, v0 12701; GFX90A-NEXT: v_add_u32_e32 v2, v2, v3 12702; GFX90A-NEXT: v_mul_lo_u32 v5, s12, v0 12703; GFX90A-NEXT: v_sub_u32_e32 v3, s5, v2 12704; GFX90A-NEXT: v_mov_b32_e32 v6, s13 12705; GFX90A-NEXT: v_sub_co_u32_e32 v5, vcc, s4, v5 12706; GFX90A-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v6, vcc 12707; GFX90A-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s12, v5 12708; GFX90A-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] 12709; GFX90A-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v3 12710; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] 12711; GFX90A-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v6 12712; GFX90A-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] 12713; GFX90A-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v3 12714; GFX90A-NEXT: v_cndmask_b32_e64 v3, v7, v6, s[0:1] 12715; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3 12716; GFX90A-NEXT: v_cndmask_b32_e64 v3, 1, 2, s[0:1] 12717; GFX90A-NEXT: v_mov_b32_e32 v7, s5 12718; GFX90A-NEXT: v_add_co_u32_e64 v3, s[0:1], v0, v3 12719; GFX90A-NEXT: v_subb_co_u32_e32 v2, vcc, v7, v2, vcc 12720; GFX90A-NEXT: v_addc_co_u32_e64 v6, s[0:1], 0, v1, s[0:1] 12721; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 12722; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 12723; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s12, v5 
12724; GFX90A-NEXT: s_xor_b64 s[0:1], s[14:15], s[10:11] 12725; GFX90A-NEXT: s_ashr_i32 s4, s9, 31 12726; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 12727; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s13, v2 12728; GFX90A-NEXT: s_add_u32 s8, s8, s4 12729; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v5, vcc 12730; GFX90A-NEXT: s_mov_b32 s5, s4 12731; GFX90A-NEXT: s_addc_u32 s9, s9, s4 12732; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 12733; GFX90A-NEXT: s_xor_b64 s[8:9], s[8:9], s[4:5] 12734; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 12735; GFX90A-NEXT: v_cvt_f32_u32_e32 v2, s8 12736; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, s9 12737; GFX90A-NEXT: v_xor_b32_e32 v0, s0, v0 12738; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc 12739; GFX90A-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 12740; GFX90A-NEXT: v_mac_f32_e32 v2, s16, v3 12741; GFX90A-NEXT: v_rcp_f32_e32 v2, v2 12742; GFX90A-NEXT: s_sub_u32 s0, 0, s8 12743; GFX90A-NEXT: v_xor_b32_e32 v1, s1, v1 12744; GFX90A-NEXT: v_mov_b32_e32 v5, s1 12745; GFX90A-NEXT: v_mul_f32_e32 v2, s17, v2 12746; GFX90A-NEXT: v_mul_f32_e32 v3, s18, v2 12747; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 12748; GFX90A-NEXT: v_mac_f32_e32 v2, s19, v3 12749; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 12750; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 12751; GFX90A-NEXT: s_subb_u32 s1, 0, s9 12752; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc 12753; GFX90A-NEXT: v_mul_hi_u32 v6, s0, v2 12754; GFX90A-NEXT: v_mul_lo_u32 v7, s0, v3 12755; GFX90A-NEXT: v_mul_lo_u32 v5, s1, v2 12756; GFX90A-NEXT: v_add_u32_e32 v6, v6, v7 12757; GFX90A-NEXT: v_add_u32_e32 v5, v6, v5 12758; GFX90A-NEXT: v_mul_lo_u32 v8, s0, v2 12759; GFX90A-NEXT: v_mul_lo_u32 v7, v2, v5 12760; GFX90A-NEXT: v_mul_hi_u32 v9, v2, v8 12761; GFX90A-NEXT: v_mul_hi_u32 v6, v2, v5 12762; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v9, v7 12763; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc 12764; GFX90A-NEXT: v_mul_hi_u32 v10, v3, v8 12765; GFX90A-NEXT: v_mul_lo_u32 v8, v3, v8 12766; GFX90A-NEXT: v_add_co_u32_e32 v7, 
vcc, v7, v8 12767; GFX90A-NEXT: v_mul_hi_u32 v9, v3, v5 12768; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v10, vcc 12769; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v9, v4, vcc 12770; GFX90A-NEXT: v_mul_lo_u32 v5, v3, v5 12771; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 12772; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc 12773; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 12774; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc 12775; GFX90A-NEXT: v_mul_lo_u32 v5, s0, v3 12776; GFX90A-NEXT: v_mul_hi_u32 v6, s0, v2 12777; GFX90A-NEXT: v_add_u32_e32 v5, v6, v5 12778; GFX90A-NEXT: v_mul_lo_u32 v6, s1, v2 12779; GFX90A-NEXT: v_add_u32_e32 v5, v5, v6 12780; GFX90A-NEXT: v_mul_lo_u32 v7, s0, v2 12781; GFX90A-NEXT: v_mul_hi_u32 v8, v3, v7 12782; GFX90A-NEXT: v_mul_lo_u32 v9, v3, v7 12783; GFX90A-NEXT: v_mul_lo_u32 v11, v2, v5 12784; GFX90A-NEXT: v_mul_hi_u32 v7, v2, v7 12785; GFX90A-NEXT: v_mul_hi_u32 v10, v2, v5 12786; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v7, v11 12787; GFX90A-NEXT: v_addc_co_u32_e32 v10, vcc, 0, v10, vcc 12788; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 12789; GFX90A-NEXT: v_mul_hi_u32 v6, v3, v5 12790; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v10, v8, vcc 12791; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v4, vcc 12792; GFX90A-NEXT: v_mul_lo_u32 v5, v3, v5 12793; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 12794; GFX90A-NEXT: s_ashr_i32 s10, s7, 31 12795; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc 12796; GFX90A-NEXT: s_add_u32 s0, s6, s10 12797; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 12798; GFX90A-NEXT: s_mov_b32 s11, s10 12799; GFX90A-NEXT: s_addc_u32 s1, s7, s10 12800; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc 12801; GFX90A-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] 12802; GFX90A-NEXT: v_mul_lo_u32 v6, s6, v3 12803; GFX90A-NEXT: v_mul_hi_u32 v7, s6, v2 12804; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v3 12805; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 12806; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc 12807; 
GFX90A-NEXT: v_mul_hi_u32 v8, s7, v2 12808; GFX90A-NEXT: v_mul_lo_u32 v2, s7, v2 12809; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v6, v2 12810; GFX90A-NEXT: v_mul_hi_u32 v7, s7, v3 12811; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v8, vcc 12812; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v4, vcc 12813; GFX90A-NEXT: v_mul_lo_u32 v3, s7, v3 12814; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 12815; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc 12816; GFX90A-NEXT: v_mul_lo_u32 v5, s8, v3 12817; GFX90A-NEXT: v_mul_hi_u32 v6, s8, v2 12818; GFX90A-NEXT: v_add_u32_e32 v5, v6, v5 12819; GFX90A-NEXT: v_mul_lo_u32 v6, s9, v2 12820; GFX90A-NEXT: v_add_u32_e32 v5, v5, v6 12821; GFX90A-NEXT: v_mul_lo_u32 v7, s8, v2 12822; GFX90A-NEXT: v_sub_u32_e32 v6, s7, v5 12823; GFX90A-NEXT: v_mov_b32_e32 v8, s9 12824; GFX90A-NEXT: v_sub_co_u32_e32 v7, vcc, s6, v7 12825; GFX90A-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v8, vcc 12826; GFX90A-NEXT: v_subrev_co_u32_e64 v8, s[0:1], s8, v7 12827; GFX90A-NEXT: v_subbrev_co_u32_e64 v6, s[0:1], 0, v6, s[0:1] 12828; GFX90A-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6 12829; GFX90A-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] 12830; GFX90A-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8 12831; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] 12832; GFX90A-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6 12833; GFX90A-NEXT: v_cndmask_b32_e64 v6, v9, v8, s[0:1] 12834; GFX90A-NEXT: v_mov_b32_e32 v9, s7 12835; GFX90A-NEXT: v_subb_co_u32_e32 v5, vcc, v9, v5, vcc 12836; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s9, v5 12837; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 12838; GFX90A-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc 12839; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s8, v7 12840; GFX90A-NEXT: v_cndmask_b32_e64 v6, 1, 2, s[0:1] 12841; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 12842; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s9, v5 12843; GFX90A-NEXT: v_add_co_u32_e64 v6, s[0:1], v2, v6 12844; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v7, vcc 12845; GFX90A-NEXT: v_addc_co_u32_e64 v8, s[0:1], 0, v3, 
s[0:1] 12846; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 12847; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 12848; GFX90A-NEXT: s_xor_b64 s[0:1], s[10:11], s[4:5] 12849; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc 12850; GFX90A-NEXT: v_xor_b32_e32 v2, s0, v2 12851; GFX90A-NEXT: v_xor_b32_e32 v3, s1, v3 12852; GFX90A-NEXT: v_mov_b32_e32 v5, s1 12853; GFX90A-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v2 12854; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v5, vcc 12855; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 12856; GFX90A-NEXT: s_endpgm 12857 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 12858 %r = sdiv <2 x i64> %x, %shl.y 12859 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 12860 ret void 12861} 12862 12863define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) { 12864; CHECK-LABEL: @srem_i64_oddk_denom( 12865; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], 1235195 12866; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 12867; CHECK-NEXT: ret void 12868; 12869; GFX6-LABEL: srem_i64_oddk_denom: 12870; GFX6: ; %bb.0: 12871; GFX6-NEXT: v_mov_b32_e32 v0, 0x4f800000 12872; GFX6-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 12873; GFX6-NEXT: v_rcp_f32_e32 v0, v0 12874; GFX6-NEXT: s_mov_b32 s4, 0xffed2705 12875; GFX6-NEXT: v_mov_b32_e32 v5, 0 12876; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 12877; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 12878; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 12879; GFX6-NEXT: v_trunc_f32_e32 v1, v1 12880; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 12881; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 12882; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 12883; GFX6-NEXT: s_waitcnt lgkmcnt(0) 12884; GFX6-NEXT: s_ashr_i32 s8, s3, 31 12885; GFX6-NEXT: s_add_u32 s2, s2, s8 12886; GFX6-NEXT: v_mul_lo_u32 v2, v1, s4 12887; GFX6-NEXT: v_mul_hi_u32 v3, v0, s4 12888; GFX6-NEXT: v_mul_lo_u32 v4, v0, s4 12889; GFX6-NEXT: s_mov_b32 s9, s8 12890; GFX6-NEXT: s_addc_u32 s3, s3, s8 12891; GFX6-NEXT: v_add_i32_e32 v2, vcc, 
v3, v2 12892; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 12893; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 12894; GFX6-NEXT: v_mul_lo_u32 v6, v0, v2 12895; GFX6-NEXT: v_mul_hi_u32 v7, v0, v2 12896; GFX6-NEXT: v_mul_hi_u32 v8, v1, v2 12897; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 12898; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 12899; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc 12900; GFX6-NEXT: v_mul_lo_u32 v7, v1, v4 12901; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 12902; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[8:9] 12903; GFX6-NEXT: s_mov_b32 s5, s1 12904; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v7 12905; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v4, vcc 12906; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v5, vcc 12907; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 12908; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 12909; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 12910; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 12911; GFX6-NEXT: v_mul_lo_u32 v2, v1, s4 12912; GFX6-NEXT: v_mul_hi_u32 v3, v0, s4 12913; GFX6-NEXT: s_mov_b32 s7, 0xf000 12914; GFX6-NEXT: s_mov_b32 s6, -1 12915; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 12916; GFX6-NEXT: v_mul_lo_u32 v3, v0, s4 12917; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v0, v2 12918; GFX6-NEXT: v_mul_lo_u32 v7, v0, v2 12919; GFX6-NEXT: v_mul_hi_u32 v8, v0, v3 12920; GFX6-NEXT: v_mul_hi_u32 v9, v0, v2 12921; GFX6-NEXT: v_mul_hi_u32 v6, v1, v3 12922; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 12923; GFX6-NEXT: v_mul_hi_u32 v4, v1, v2 12924; GFX6-NEXT: v_add_i32_e32 v7, vcc, v8, v7 12925; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc 12926; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 12927; GFX6-NEXT: v_add_i32_e32 v3, vcc, v7, v3 12928; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v6, vcc 12929; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc 12930; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 12931; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 12932; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 12933; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 12934; GFX6-NEXT: v_mul_lo_u32 v2, s2, v1 12935; 
GFX6-NEXT: v_mul_hi_u32 v3, s2, v0 12936; GFX6-NEXT: v_mul_hi_u32 v4, s2, v1 12937; GFX6-NEXT: v_mul_hi_u32 v6, s3, v1 12938; GFX6-NEXT: v_mul_lo_u32 v1, s3, v1 12939; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 12940; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc 12941; GFX6-NEXT: v_mul_lo_u32 v4, s3, v0 12942; GFX6-NEXT: v_mul_hi_u32 v0, s3, v0 12943; GFX6-NEXT: s_mov_b32 s4, s0 12944; GFX6-NEXT: s_mov_b32 s0, 0x12d8fb 12945; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 12946; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 12947; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v6, v5, vcc 12948; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 12949; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 12950; GFX6-NEXT: v_mul_lo_u32 v1, v1, s0 12951; GFX6-NEXT: v_mul_hi_u32 v2, v0, s0 12952; GFX6-NEXT: v_mul_lo_u32 v0, v0, s0 12953; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 12954; GFX6-NEXT: v_mov_b32_e32 v2, s3 12955; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 12956; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 12957; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s0, v0 12958; GFX6-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v1, vcc 12959; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, s0, v2 12960; GFX6-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v3, vcc 12961; GFX6-NEXT: s_mov_b32 s0, 0x12d8fa 12962; GFX6-NEXT: v_cmp_lt_u32_e32 vcc, s0, v2 12963; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 12964; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 12965; GFX6-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc 12966; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 12967; GFX6-NEXT: v_cmp_lt_u32_e64 s[0:1], s0, v0 12968; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 12969; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] 12970; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v1 12971; GFX6-NEXT: v_cndmask_b32_e64 v5, -1, v5, s[0:1] 12972; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 12973; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 12974; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] 12975; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] 12976; GFX6-NEXT: v_xor_b32_e32 v0, 
s8, v0 12977; GFX6-NEXT: v_xor_b32_e32 v1, s8, v1 12978; GFX6-NEXT: v_mov_b32_e32 v2, s8 12979; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 12980; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 12981; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 12982; GFX6-NEXT: s_endpgm 12983; 12984; GFX9-LABEL: srem_i64_oddk_denom: 12985; GFX9: ; %bb.0: 12986; GFX9-NEXT: v_mov_b32_e32 v0, 0x4f800000 12987; GFX9-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 12988; GFX9-NEXT: v_rcp_f32_e32 v0, v0 12989; GFX9-NEXT: s_mov_b32 s4, 0xffed2705 12990; GFX9-NEXT: v_mov_b32_e32 v5, 0 12991; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 12992; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 12993; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 12994; GFX9-NEXT: v_trunc_f32_e32 v1, v1 12995; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 12996; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 12997; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 12998; GFX9-NEXT: v_mul_lo_u32 v2, v1, s4 12999; GFX9-NEXT: v_mul_hi_u32 v3, v0, s4 13000; GFX9-NEXT: v_mul_lo_u32 v4, v0, s4 13001; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 13002; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 13003; GFX9-NEXT: v_mul_hi_u32 v3, v0, v4 13004; GFX9-NEXT: v_mul_lo_u32 v6, v0, v2 13005; GFX9-NEXT: v_mul_hi_u32 v8, v0, v2 13006; GFX9-NEXT: v_mul_lo_u32 v7, v1, v4 13007; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 13008; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 13009; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc 13010; GFX9-NEXT: v_mul_hi_u32 v8, v1, v2 13011; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 13012; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 13013; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc 13014; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v5, vcc 13015; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 13016; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 13017; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 13018; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 13019; GFX9-NEXT: v_mul_lo_u32 v2, v1, s4 13020; GFX9-NEXT: v_mul_hi_u32 v3, v0, s4 13021; GFX9-NEXT: 
v_mul_lo_u32 v4, v0, s4 13022; GFX9-NEXT: s_waitcnt lgkmcnt(0) 13023; GFX9-NEXT: s_ashr_i32 s4, s3, 31 13024; GFX9-NEXT: s_add_u32 s2, s2, s4 13025; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 13026; GFX9-NEXT: v_sub_u32_e32 v2, v2, v0 13027; GFX9-NEXT: v_mul_lo_u32 v7, v0, v2 13028; GFX9-NEXT: v_mul_hi_u32 v8, v0, v4 13029; GFX9-NEXT: v_mul_hi_u32 v9, v0, v2 13030; GFX9-NEXT: v_mul_hi_u32 v6, v1, v4 13031; GFX9-NEXT: v_mul_lo_u32 v4, v1, v4 13032; GFX9-NEXT: v_mul_hi_u32 v3, v1, v2 13033; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 13034; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc 13035; GFX9-NEXT: v_mul_lo_u32 v2, v1, v2 13036; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v7, v4 13037; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v8, v6, vcc 13038; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc 13039; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 13040; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 13041; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 13042; GFX9-NEXT: s_mov_b32 s5, s4 13043; GFX9-NEXT: s_addc_u32 s3, s3, s4 13044; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 13045; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] 13046; GFX9-NEXT: v_mul_lo_u32 v2, s2, v1 13047; GFX9-NEXT: v_mul_hi_u32 v3, s2, v0 13048; GFX9-NEXT: v_mul_hi_u32 v4, s2, v1 13049; GFX9-NEXT: v_mul_hi_u32 v6, s3, v1 13050; GFX9-NEXT: v_mul_lo_u32 v1, s3, v1 13051; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 13052; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc 13053; GFX9-NEXT: v_mul_lo_u32 v4, s3, v0 13054; GFX9-NEXT: v_mul_hi_u32 v0, s3, v0 13055; GFX9-NEXT: s_mov_b32 s5, 0x12d8fb 13056; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 13057; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v0, vcc 13058; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6, v5, vcc 13059; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 13060; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc 13061; GFX9-NEXT: v_mul_lo_u32 v1, v1, s5 13062; GFX9-NEXT: v_mul_hi_u32 v2, v0, s5 13063; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5 13064; GFX9-NEXT: 
v_add_u32_e32 v1, v2, v1 13065; GFX9-NEXT: v_mov_b32_e32 v2, s3 13066; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 13067; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 13068; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s5, v0 13069; GFX9-NEXT: v_subbrev_co_u32_e32 v3, vcc, 0, v1, vcc 13070; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s5, v2 13071; GFX9-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v3, vcc 13072; GFX9-NEXT: s_mov_b32 s2, 0x12d8fa 13073; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s2, v2 13074; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 13075; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 13076; GFX9-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc 13077; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 13078; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 13079; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 13080; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s2, v0 13081; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc 13082; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 13083; GFX9-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc 13084; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 13085; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 13086; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 13087; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0 13088; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1 13089; GFX9-NEXT: v_mov_b32_e32 v2, s4 13090; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s4, v0 13091; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc 13092; GFX9-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] 13093; GFX9-NEXT: s_endpgm 13094; 13095; GFX90A-LABEL: srem_i64_oddk_denom: 13096; GFX90A: ; %bb.0: 13097; GFX90A-NEXT: v_mov_b32_e32 v0, 0x4f800000 13098; GFX90A-NEXT: v_madak_f32 v0, 0, v0, 0x4996c7d8 13099; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 13100; GFX90A-NEXT: s_mov_b32 s4, 0xffed2705 13101; GFX90A-NEXT: v_mov_b32_e32 v2, 0 13102; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 13103; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 13104; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 13105; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 13106; GFX90A-NEXT: v_mac_f32_e32 v0, 
0xcf800000, v1 13107; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 13108; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 13109; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s4 13110; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s4 13111; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 13112; GFX90A-NEXT: v_sub_u32_e32 v3, v3, v0 13113; GFX90A-NEXT: v_mul_lo_u32 v6, v0, s4 13114; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 13115; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 13116; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 13117; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 13118; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc 13119; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 13120; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 13121; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 13122; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 13123; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v8, vcc 13124; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc 13125; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 13126; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 13127; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc 13128; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 13129; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc 13130; GFX90A-NEXT: v_mul_lo_u32 v3, v1, s4 13131; GFX90A-NEXT: v_mul_hi_u32 v4, v0, s4 13132; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 13133; GFX90A-NEXT: v_sub_u32_e32 v3, v3, v0 13134; GFX90A-NEXT: v_mul_lo_u32 v5, v0, s4 13135; GFX90A-NEXT: v_mul_hi_u32 v6, v1, v5 13136; GFX90A-NEXT: v_mul_lo_u32 v7, v1, v5 13137; GFX90A-NEXT: v_mul_lo_u32 v9, v0, v3 13138; GFX90A-NEXT: v_mul_hi_u32 v5, v0, v5 13139; GFX90A-NEXT: v_mul_hi_u32 v8, v0, v3 13140; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9 13141; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v8, vcc 13142; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 13143; GFX90A-NEXT: v_mul_hi_u32 v4, v1, v3 13144; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v6, vcc 13145; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v2, vcc 13146; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 13147; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 
v5, v3 13148; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 13149; GFX90A-NEXT: s_ashr_i32 s4, s3, 31 13150; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc 13151; GFX90A-NEXT: s_add_u32 s2, s2, s4 13152; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 13153; GFX90A-NEXT: s_mov_b32 s5, s4 13154; GFX90A-NEXT: s_addc_u32 s3, s3, s4 13155; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc 13156; GFX90A-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] 13157; GFX90A-NEXT: v_mul_lo_u32 v4, s2, v1 13158; GFX90A-NEXT: v_mul_hi_u32 v5, s2, v0 13159; GFX90A-NEXT: v_mul_hi_u32 v3, s2, v1 13160; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 13161; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 13162; GFX90A-NEXT: v_mul_hi_u32 v6, s3, v0 13163; GFX90A-NEXT: v_mul_lo_u32 v0, s3, v0 13164; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 13165; GFX90A-NEXT: v_mul_hi_u32 v5, s3, v1 13166; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v6, vcc 13167; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v2, vcc 13168; GFX90A-NEXT: v_mul_lo_u32 v1, s3, v1 13169; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 13170; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc 13171; GFX90A-NEXT: s_mov_b32 s5, 0x12d8fb 13172; GFX90A-NEXT: v_mul_lo_u32 v1, v1, s5 13173; GFX90A-NEXT: v_mul_hi_u32 v3, v0, s5 13174; GFX90A-NEXT: v_mul_lo_u32 v0, v0, s5 13175; GFX90A-NEXT: v_add_u32_e32 v1, v3, v1 13176; GFX90A-NEXT: v_mov_b32_e32 v3, s3 13177; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 13178; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v1, vcc 13179; GFX90A-NEXT: v_subrev_co_u32_e32 v3, vcc, s5, v0 13180; GFX90A-NEXT: v_subbrev_co_u32_e32 v4, vcc, 0, v1, vcc 13181; GFX90A-NEXT: v_subrev_co_u32_e32 v5, vcc, s5, v3 13182; GFX90A-NEXT: v_subbrev_co_u32_e32 v6, vcc, 0, v4, vcc 13183; GFX90A-NEXT: s_mov_b32 s2, 0x12d8fa 13184; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s2, v3 13185; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 13186; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 13187; GFX90A-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc 13188; 
GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 13189; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 13190; GFX90A-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc 13191; GFX90A-NEXT: v_cmp_lt_u32_e32 vcc, s2, v0 13192; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 13193; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 13194; GFX90A-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc 13195; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 13196; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 13197; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 13198; GFX90A-NEXT: v_xor_b32_e32 v0, s4, v0 13199; GFX90A-NEXT: v_xor_b32_e32 v1, s4, v1 13200; GFX90A-NEXT: v_mov_b32_e32 v3, s4 13201; GFX90A-NEXT: v_subrev_co_u32_e32 v0, vcc, s4, v0 13202; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc 13203; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 13204; GFX90A-NEXT: s_endpgm 13205 %r = srem i64 %x, 1235195 13206 store i64 %r, i64 addrspace(1)* %out 13207 ret void 13208} 13209 13210define amdgpu_kernel void @srem_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x) { 13211; CHECK-LABEL: @srem_i64_pow2k_denom( 13212; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], 4096 13213; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 13214; CHECK-NEXT: ret void 13215; 13216; GFX6-LABEL: srem_i64_pow2k_denom: 13217; GFX6: ; %bb.0: 13218; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 13219; GFX6-NEXT: s_mov_b32 s7, 0xf000 13220; GFX6-NEXT: s_mov_b32 s6, -1 13221; GFX6-NEXT: s_waitcnt lgkmcnt(0) 13222; GFX6-NEXT: s_mov_b32 s4, s0 13223; GFX6-NEXT: s_ashr_i32 s0, s3, 31 13224; GFX6-NEXT: s_lshr_b32 s0, s0, 20 13225; GFX6-NEXT: s_add_u32 s0, s2, s0 13226; GFX6-NEXT: s_mov_b32 s5, s1 13227; GFX6-NEXT: s_addc_u32 s1, s3, 0 13228; GFX6-NEXT: s_and_b32 s0, s0, 0xfffff000 13229; GFX6-NEXT: s_sub_u32 s0, s2, s0 13230; GFX6-NEXT: s_subb_u32 s1, s3, s1 13231; GFX6-NEXT: v_mov_b32_e32 v0, s0 13232; GFX6-NEXT: v_mov_b32_e32 v1, s1 13233; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 13234; GFX6-NEXT: s_endpgm 13235; 
13236; GFX9-LABEL: srem_i64_pow2k_denom: 13237; GFX9: ; %bb.0: 13238; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 13239; GFX9-NEXT: v_mov_b32_e32 v2, 0 13240; GFX9-NEXT: s_waitcnt lgkmcnt(0) 13241; GFX9-NEXT: s_ashr_i32 s4, s3, 31 13242; GFX9-NEXT: s_lshr_b32 s4, s4, 20 13243; GFX9-NEXT: s_add_u32 s4, s2, s4 13244; GFX9-NEXT: s_addc_u32 s5, s3, 0 13245; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000 13246; GFX9-NEXT: s_sub_u32 s2, s2, s4 13247; GFX9-NEXT: s_subb_u32 s3, s3, s5 13248; GFX9-NEXT: v_mov_b32_e32 v0, s2 13249; GFX9-NEXT: v_mov_b32_e32 v1, s3 13250; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 13251; GFX9-NEXT: s_endpgm 13252; 13253; GFX90A-LABEL: srem_i64_pow2k_denom: 13254; GFX90A: ; %bb.0: 13255; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 13256; GFX90A-NEXT: v_mov_b32_e32 v2, 0 13257; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 13258; GFX90A-NEXT: s_ashr_i32 s4, s3, 31 13259; GFX90A-NEXT: s_lshr_b32 s4, s4, 20 13260; GFX90A-NEXT: s_add_u32 s4, s2, s4 13261; GFX90A-NEXT: s_addc_u32 s5, s3, 0 13262; GFX90A-NEXT: s_and_b32 s4, s4, 0xfffff000 13263; GFX90A-NEXT: s_sub_u32 s2, s2, s4 13264; GFX90A-NEXT: s_subb_u32 s3, s3, s5 13265; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1] 13266; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] 13267; GFX90A-NEXT: s_endpgm 13268 %r = srem i64 %x, 4096 13269 store i64 %r, i64 addrspace(1)* %out 13270 ret void 13271} 13272 13273define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %x, i64 %y) { 13274; CHECK-LABEL: @srem_i64_pow2_shl_denom( 13275; CHECK-NEXT: [[SHL_Y:%.*]] = shl i64 4096, [[Y:%.*]] 13276; CHECK-NEXT: [[R:%.*]] = srem i64 [[X:%.*]], [[SHL_Y]] 13277; CHECK-NEXT: store i64 [[R]], i64 addrspace(1)* [[OUT:%.*]], align 4 13278; CHECK-NEXT: ret void 13279; 13280; GFX6-LABEL: srem_i64_pow2_shl_denom: 13281; GFX6: ; %bb.0: 13282; GFX6-NEXT: s_load_dword s4, s[0:1], 0xd 13283; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000 13284; GFX6-NEXT: s_mov_b32 s7, 0xf000 13285; GFX6-NEXT: 
s_mov_b32 s6, -1 13286; GFX6-NEXT: s_waitcnt lgkmcnt(0) 13287; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 13288; GFX6-NEXT: s_ashr_i32 s4, s3, 31 13289; GFX6-NEXT: s_add_u32 s2, s2, s4 13290; GFX6-NEXT: s_mov_b32 s5, s4 13291; GFX6-NEXT: s_addc_u32 s3, s3, s4 13292; GFX6-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5] 13293; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 13294; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9 13295; GFX6-NEXT: s_sub_u32 s4, 0, s8 13296; GFX6-NEXT: s_subb_u32 s5, 0, s9 13297; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 13298; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 13299; GFX6-NEXT: v_rcp_f32_e32 v0, v0 13300; GFX6-NEXT: s_waitcnt lgkmcnt(0) 13301; GFX6-NEXT: s_ashr_i32 s10, s3, 31 13302; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 13303; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 13304; GFX6-NEXT: v_trunc_f32_e32 v1, v1 13305; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 13306; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 13307; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 13308; GFX6-NEXT: s_add_u32 s2, s2, s10 13309; GFX6-NEXT: s_mov_b32 s11, s10 13310; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 13311; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 13312; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0 13313; GFX6-NEXT: v_mul_lo_u32 v4, s4, v0 13314; GFX6-NEXT: s_addc_u32 s3, s3, s10 13315; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 13316; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 13317; GFX6-NEXT: v_mul_hi_u32 v3, v0, v4 13318; GFX6-NEXT: v_mul_lo_u32 v5, v0, v2 13319; GFX6-NEXT: v_mul_hi_u32 v6, v0, v2 13320; GFX6-NEXT: v_mul_hi_u32 v7, v1, v2 13321; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 13322; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 13323; GFX6-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc 13324; GFX6-NEXT: v_mul_lo_u32 v6, v1, v4 13325; GFX6-NEXT: v_mul_hi_u32 v4, v1, v4 13326; GFX6-NEXT: s_xor_b64 s[12:13], s[2:3], s[10:11] 13327; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 13328; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc 13329; GFX6-NEXT: v_mov_b32_e32 v4, 0 13330; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v4, 
vcc 13331; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 13332; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 13333; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 13334; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 13335; GFX6-NEXT: v_mul_lo_u32 v2, s4, v1 13336; GFX6-NEXT: v_mul_hi_u32 v3, s4, v0 13337; GFX6-NEXT: v_mul_lo_u32 v5, s5, v0 13338; GFX6-NEXT: s_mov_b32 s5, s1 13339; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 13340; GFX6-NEXT: v_mul_lo_u32 v3, s4, v0 13341; GFX6-NEXT: v_add_i32_e32 v2, vcc, v5, v2 13342; GFX6-NEXT: v_mul_lo_u32 v7, v0, v2 13343; GFX6-NEXT: v_mul_hi_u32 v8, v0, v3 13344; GFX6-NEXT: v_mul_hi_u32 v9, v0, v2 13345; GFX6-NEXT: v_mul_hi_u32 v6, v1, v3 13346; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 13347; GFX6-NEXT: v_mul_hi_u32 v5, v1, v2 13348; GFX6-NEXT: v_add_i32_e32 v7, vcc, v8, v7 13349; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc 13350; GFX6-NEXT: v_mul_lo_u32 v2, v1, v2 13351; GFX6-NEXT: v_add_i32_e32 v3, vcc, v7, v3 13352; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v8, v6, vcc 13353; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v4, vcc 13354; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 13355; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 13356; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 13357; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc 13358; GFX6-NEXT: v_mul_lo_u32 v2, s12, v1 13359; GFX6-NEXT: v_mul_hi_u32 v3, s12, v0 13360; GFX6-NEXT: v_mul_hi_u32 v5, s12, v1 13361; GFX6-NEXT: v_mul_hi_u32 v6, s13, v1 13362; GFX6-NEXT: v_mul_lo_u32 v1, s13, v1 13363; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 13364; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc 13365; GFX6-NEXT: v_mul_lo_u32 v5, s13, v0 13366; GFX6-NEXT: v_mul_hi_u32 v0, s13, v0 13367; GFX6-NEXT: s_mov_b32 s4, s0 13368; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v5 13369; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc 13370; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v6, v4, vcc 13371; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 13372; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc 13373; GFX6-NEXT: v_mul_lo_u32 v1, s8, v1 
13374; GFX6-NEXT: v_mul_hi_u32 v2, s8, v0 13375; GFX6-NEXT: v_mul_lo_u32 v3, s9, v0 13376; GFX6-NEXT: v_mul_lo_u32 v0, s8, v0 13377; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 13378; GFX6-NEXT: v_add_i32_e32 v1, vcc, v1, v3 13379; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s13, v1 13380; GFX6-NEXT: v_mov_b32_e32 v3, s9 13381; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s12, v0 13382; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc 13383; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s8, v0 13384; GFX6-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] 13385; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v5 13386; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[2:3] 13387; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v4 13388; GFX6-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1] 13389; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 13390; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v5 13391; GFX6-NEXT: v_subrev_i32_e64 v3, s[0:1], s8, v4 13392; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[2:3] 13393; GFX6-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] 13394; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 13395; GFX6-NEXT: v_cndmask_b32_e64 v2, v5, v2, s[0:1] 13396; GFX6-NEXT: v_mov_b32_e32 v5, s13 13397; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v5, v1, vcc 13398; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 13399; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 13400; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 13401; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 13402; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s9, v1 13403; GFX6-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 13404; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 13405; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 13406; GFX6-NEXT: v_cndmask_b32_e64 v2, v4, v3, s[0:1] 13407; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc 13408; GFX6-NEXT: v_xor_b32_e32 v0, s10, v0 13409; GFX6-NEXT: v_xor_b32_e32 v1, s10, v1 13410; GFX6-NEXT: v_mov_b32_e32 v2, s10 13411; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s10, v0 13412; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 13413; GFX6-NEXT: buffer_store_dwordx2 v[0:1], 
off, s[4:7], 0 13414; GFX6-NEXT: s_endpgm 13415; 13416; GFX9-LABEL: srem_i64_pow2_shl_denom: 13417; GFX9: ; %bb.0: 13418; GFX9-NEXT: s_load_dword s4, s[0:1], 0x34 13419; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 13420; GFX9-NEXT: v_mov_b32_e32 v2, 0 13421; GFX9-NEXT: s_waitcnt lgkmcnt(0) 13422; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 13423; GFX9-NEXT: s_ashr_i32 s4, s3, 31 13424; GFX9-NEXT: s_add_u32 s2, s2, s4 13425; GFX9-NEXT: s_mov_b32 s5, s4 13426; GFX9-NEXT: s_addc_u32 s3, s3, s4 13427; GFX9-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5] 13428; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8 13429; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9 13430; GFX9-NEXT: s_sub_u32 s2, 0, s8 13431; GFX9-NEXT: s_subb_u32 s3, 0, s9 13432; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 13433; GFX9-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 13434; GFX9-NEXT: v_rcp_f32_e32 v0, v0 13435; GFX9-NEXT: s_waitcnt lgkmcnt(0) 13436; GFX9-NEXT: s_ashr_i32 s10, s7, 31 13437; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 13438; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 13439; GFX9-NEXT: v_trunc_f32_e32 v1, v1 13440; GFX9-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 13441; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 13442; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 13443; GFX9-NEXT: s_add_u32 s0, s6, s10 13444; GFX9-NEXT: s_mov_b32 s11, s10 13445; GFX9-NEXT: v_mul_lo_u32 v3, s2, v1 13446; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 13447; GFX9-NEXT: v_mul_lo_u32 v6, s3, v0 13448; GFX9-NEXT: v_mul_lo_u32 v5, s2, v0 13449; GFX9-NEXT: s_addc_u32 s1, s7, s10 13450; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 13451; GFX9-NEXT: v_add_u32_e32 v3, v3, v6 13452; GFX9-NEXT: v_mul_hi_u32 v4, v0, v5 13453; GFX9-NEXT: v_mul_lo_u32 v6, v0, v3 13454; GFX9-NEXT: v_mul_hi_u32 v8, v0, v3 13455; GFX9-NEXT: v_mul_hi_u32 v7, v1, v5 13456; GFX9-NEXT: v_mul_lo_u32 v5, v1, v5 13457; GFX9-NEXT: v_mul_hi_u32 v9, v1, v3 13458; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 13459; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc 13460; GFX9-NEXT: v_mul_lo_u32 v3, v1, v3 13461; GFX9-NEXT: 
v_add_co_u32_e32 v4, vcc, v4, v5 13462; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v6, v7, vcc 13463; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v2, vcc 13464; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 13465; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc 13466; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 13467; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc 13468; GFX9-NEXT: v_mul_lo_u32 v3, s2, v1 13469; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 13470; GFX9-NEXT: v_mul_lo_u32 v5, s3, v0 13471; GFX9-NEXT: v_mul_lo_u32 v6, s2, v0 13472; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] 13473; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 13474; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 13475; GFX9-NEXT: v_mul_lo_u32 v7, v0, v3 13476; GFX9-NEXT: v_mul_hi_u32 v8, v0, v6 13477; GFX9-NEXT: v_mul_hi_u32 v9, v0, v3 13478; GFX9-NEXT: v_mul_hi_u32 v5, v1, v6 13479; GFX9-NEXT: v_mul_lo_u32 v6, v1, v6 13480; GFX9-NEXT: v_mul_hi_u32 v4, v1, v3 13481; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 13482; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc 13483; GFX9-NEXT: v_mul_lo_u32 v3, v1, v3 13484; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 13485; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v5, vcc 13486; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v2, vcc 13487; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 13488; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc 13489; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 13490; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc 13491; GFX9-NEXT: v_mul_lo_u32 v3, s6, v1 13492; GFX9-NEXT: v_mul_hi_u32 v4, s6, v0 13493; GFX9-NEXT: v_mul_hi_u32 v5, s6, v1 13494; GFX9-NEXT: v_mul_hi_u32 v6, s7, v1 13495; GFX9-NEXT: v_mul_lo_u32 v1, s7, v1 13496; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 13497; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc 13498; GFX9-NEXT: v_mul_lo_u32 v5, s7, v0 13499; GFX9-NEXT: v_mul_hi_u32 v0, s7, v0 13500; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 13501; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v4, v0, vcc 13502; GFX9-NEXT: v_addc_co_u32_e32 v3, 
vcc, v6, v2, vcc 13503; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 13504; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc 13505; GFX9-NEXT: v_mul_lo_u32 v1, s8, v1 13506; GFX9-NEXT: v_mul_hi_u32 v3, s8, v0 13507; GFX9-NEXT: v_mul_lo_u32 v4, s9, v0 13508; GFX9-NEXT: v_mul_lo_u32 v0, s8, v0 13509; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 13510; GFX9-NEXT: v_add_u32_e32 v1, v1, v4 13511; GFX9-NEXT: v_sub_u32_e32 v3, s7, v1 13512; GFX9-NEXT: v_mov_b32_e32 v4, s9 13513; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 13514; GFX9-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, vcc 13515; GFX9-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s8, v0 13516; GFX9-NEXT: v_subbrev_co_u32_e64 v6, s[2:3], 0, v3, s[0:1] 13517; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v6 13518; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 13519; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v5 13520; GFX9-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1] 13521; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] 13522; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v6 13523; GFX9-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s8, v5 13524; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] 13525; GFX9-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] 13526; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 13527; GFX9-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[0:1] 13528; GFX9-NEXT: v_mov_b32_e32 v5, s7 13529; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v5, v1, vcc 13530; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 13531; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 13532; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 13533; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[0:1] 13534; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 13535; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s9, v1 13536; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 13537; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 13538; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 13539; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 13540; GFX9-NEXT: v_xor_b32_e32 v0, s10, v0 13541; GFX9-NEXT: v_xor_b32_e32 v1, s10, v1 13542; GFX9-NEXT: 
v_mov_b32_e32 v3, s10 13543; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s10, v0 13544; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc 13545; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 13546; GFX9-NEXT: s_endpgm 13547; 13548; GFX90A-LABEL: srem_i64_pow2_shl_denom: 13549; GFX90A: ; %bb.0: 13550; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x34 13551; GFX90A-NEXT: s_mov_b64 s[2:3], 0x1000 13552; GFX90A-NEXT: v_mov_b32_e32 v2, 0 13553; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 13554; GFX90A-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 13555; GFX90A-NEXT: s_ashr_i32 s4, s3, 31 13556; GFX90A-NEXT: s_add_u32 s2, s2, s4 13557; GFX90A-NEXT: s_mov_b32 s5, s4 13558; GFX90A-NEXT: s_addc_u32 s3, s3, s4 13559; GFX90A-NEXT: s_xor_b64 s[8:9], s[2:3], s[4:5] 13560; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s8 13561; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s9 13562; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 13563; GFX90A-NEXT: s_sub_u32 s0, 0, s8 13564; GFX90A-NEXT: s_subb_u32 s1, 0, s9 13565; GFX90A-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1 13566; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 13567; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 13568; GFX90A-NEXT: s_ashr_i32 s10, s7, 31 13569; GFX90A-NEXT: s_mov_b32 s11, s10 13570; GFX90A-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 13571; GFX90A-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 13572; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 13573; GFX90A-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1 13574; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 13575; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 13576; GFX90A-NEXT: v_mul_lo_u32 v3, s0, v1 13577; GFX90A-NEXT: v_mul_hi_u32 v5, s0, v0 13578; GFX90A-NEXT: v_mul_lo_u32 v4, s1, v0 13579; GFX90A-NEXT: v_add_u32_e32 v3, v5, v3 13580; GFX90A-NEXT: v_mul_lo_u32 v6, s0, v0 13581; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 13582; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v3 13583; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 13584; GFX90A-NEXT: v_mul_hi_u32 v4, v0, v3 13585; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 13586; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc 13587; 
GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 13588; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 13589; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 13590; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v3 13591; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v8, vcc 13592; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v2, vcc 13593; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 13594; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 13595; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc 13596; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 13597; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc 13598; GFX90A-NEXT: v_mul_lo_u32 v3, s0, v1 13599; GFX90A-NEXT: v_mul_hi_u32 v4, s0, v0 13600; GFX90A-NEXT: v_add_u32_e32 v3, v4, v3 13601; GFX90A-NEXT: v_mul_lo_u32 v4, s1, v0 13602; GFX90A-NEXT: v_add_u32_e32 v3, v3, v4 13603; GFX90A-NEXT: v_mul_lo_u32 v5, s0, v0 13604; GFX90A-NEXT: v_mul_hi_u32 v6, v1, v5 13605; GFX90A-NEXT: v_mul_lo_u32 v7, v1, v5 13606; GFX90A-NEXT: v_mul_lo_u32 v9, v0, v3 13607; GFX90A-NEXT: v_mul_hi_u32 v5, v0, v5 13608; GFX90A-NEXT: v_mul_hi_u32 v8, v0, v3 13609; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9 13610; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v8, vcc 13611; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 13612; GFX90A-NEXT: v_mul_hi_u32 v4, v1, v3 13613; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v6, vcc 13614; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v2, vcc 13615; GFX90A-NEXT: v_mul_lo_u32 v3, v1, v3 13616; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 13617; GFX90A-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc 13618; GFX90A-NEXT: s_add_u32 s0, s6, s10 13619; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 13620; GFX90A-NEXT: s_addc_u32 s1, s7, s10 13621; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc 13622; GFX90A-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] 13623; GFX90A-NEXT: v_mul_lo_u32 v4, s6, v1 13624; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v0 13625; GFX90A-NEXT: v_mul_hi_u32 v3, s6, v1 13626; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v5, v4 13627; GFX90A-NEXT: 
v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 13628; GFX90A-NEXT: v_mul_hi_u32 v6, s7, v0 13629; GFX90A-NEXT: v_mul_lo_u32 v0, s7, v0 13630; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 13631; GFX90A-NEXT: v_mul_hi_u32 v5, s7, v1 13632; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v3, v6, vcc 13633; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v2, vcc 13634; GFX90A-NEXT: v_mul_lo_u32 v1, s7, v1 13635; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 13636; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc 13637; GFX90A-NEXT: v_mul_lo_u32 v1, s8, v1 13638; GFX90A-NEXT: v_mul_hi_u32 v3, s8, v0 13639; GFX90A-NEXT: v_add_u32_e32 v1, v3, v1 13640; GFX90A-NEXT: v_mul_lo_u32 v3, s9, v0 13641; GFX90A-NEXT: v_add_u32_e32 v1, v1, v3 13642; GFX90A-NEXT: v_mul_lo_u32 v0, s8, v0 13643; GFX90A-NEXT: v_sub_u32_e32 v3, s7, v1 13644; GFX90A-NEXT: v_mov_b32_e32 v4, s9 13645; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0 13646; GFX90A-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, vcc 13647; GFX90A-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s8, v0 13648; GFX90A-NEXT: v_subbrev_co_u32_e64 v6, s[2:3], 0, v3, s[0:1] 13649; GFX90A-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v6 13650; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 13651; GFX90A-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v5 13652; GFX90A-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1] 13653; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] 13654; GFX90A-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v6 13655; GFX90A-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s8, v5 13656; GFX90A-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] 13657; GFX90A-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] 13658; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 13659; GFX90A-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[0:1] 13660; GFX90A-NEXT: v_mov_b32_e32 v5, s7 13661; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v5, v1, vcc 13662; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 13663; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 13664; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s8, v0 13665; GFX90A-NEXT: 
v_cndmask_b32_e64 v3, v6, v3, s[0:1] 13666; GFX90A-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 13667; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s9, v1 13668; GFX90A-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 13669; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 13670; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 13671; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 13672; GFX90A-NEXT: v_xor_b32_e32 v0, s10, v0 13673; GFX90A-NEXT: v_xor_b32_e32 v1, s10, v1 13674; GFX90A-NEXT: v_mov_b32_e32 v3, s10 13675; GFX90A-NEXT: v_subrev_co_u32_e32 v0, vcc, s10, v0 13676; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc 13677; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] 13678; GFX90A-NEXT: s_endpgm 13679 %shl.y = shl i64 4096, %y 13680 %r = srem i64 %x, %shl.y 13681 store i64 %r, i64 addrspace(1)* %out 13682 ret void 13683} 13684
; Test @srem_v2i64_pow2k_denom: <2 x i64> srem by the constant splat
; <i64 4096, i64 4096>, with the result stored through %out.
; - The opt run (-amdgpu-codegenprepare, CHECK prefix) expects the vector srem
;   to be split into one `srem i64 ..., 4096` per element and rebuilt with
;   insertelement; the scalar srem by 4096 itself stays in the IR.
; - The llc runs (GFX6, GFX9 and GFX90A prefixes) expect straight-line scalar
;   code per element: s_ashr_i32 by 31, s_lshr_b32 by 20, a 64-bit add
;   (s_add_u32/s_addc_u32), s_and_b32 with the 0xf000 s_movk_i32 constant, and
;   a 64-bit subtract (s_sub_u32/s_subb_u32) from the original value. None of
;   the v_rcp_f32/v_mul_hi_u32 sequence expected by the neighbouring
;   shifted-denominator srem tests appears in these expectations.
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
13685define amdgpu_kernel void @srem_v2i64_pow2k_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x) { 13686; CHECK-LABEL: @srem_v2i64_pow2k_denom( 13687; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 13688; CHECK-NEXT: [[TMP2:%.*]] = srem i64 [[TMP1]], 4096 13689; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i64 0 13690; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[X]], i64 1 13691; CHECK-NEXT: [[TMP5:%.*]] = srem i64 [[TMP4]], 4096 13692; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP5]], i64 1 13693; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 13694; CHECK-NEXT: ret void 13695; 13696; GFX6-LABEL: srem_v2i64_pow2k_denom: 13697; GFX6: ; %bb.0: 13698; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd 13699; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 13700; GFX6-NEXT: s_movk_i32 s8, 0xf000 13701; GFX6-NEXT: s_mov_b32 s3, 0xf000 13702; GFX6-NEXT: s_mov_b32 s2, -1 13703; GFX6-NEXT: s_waitcnt lgkmcnt(0) 13704; GFX6-NEXT: s_ashr_i32 s9, s5, 31 13705; GFX6-NEXT: s_lshr_b32 s9, s9, 20 13706; GFX6-NEXT: s_add_u32 s9, s4, s9 13707; GFX6-NEXT: 
s_addc_u32 s10, s5, 0 13708; GFX6-NEXT: s_and_b32 s9, s9, s8 13709; GFX6-NEXT: s_sub_u32 s4, s4, s9 13710; GFX6-NEXT: s_subb_u32 s5, s5, s10 13711; GFX6-NEXT: s_ashr_i32 s9, s7, 31 13712; GFX6-NEXT: s_lshr_b32 s9, s9, 20 13713; GFX6-NEXT: s_add_u32 s9, s6, s9 13714; GFX6-NEXT: s_addc_u32 s10, s7, 0 13715; GFX6-NEXT: s_and_b32 s8, s9, s8 13716; GFX6-NEXT: s_sub_u32 s6, s6, s8 13717; GFX6-NEXT: s_subb_u32 s7, s7, s10 13718; GFX6-NEXT: v_mov_b32_e32 v0, s4 13719; GFX6-NEXT: v_mov_b32_e32 v1, s5 13720; GFX6-NEXT: v_mov_b32_e32 v2, s6 13721; GFX6-NEXT: v_mov_b32_e32 v3, s7 13722; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 13723; GFX6-NEXT: s_endpgm 13724; 13725; GFX9-LABEL: srem_v2i64_pow2k_denom: 13726; GFX9: ; %bb.0: 13727; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 13728; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 13729; GFX9-NEXT: s_movk_i32 s0, 0xf000 13730; GFX9-NEXT: v_mov_b32_e32 v4, 0 13731; GFX9-NEXT: s_waitcnt lgkmcnt(0) 13732; GFX9-NEXT: s_ashr_i32 s1, s5, 31 13733; GFX9-NEXT: s_lshr_b32 s1, s1, 20 13734; GFX9-NEXT: s_add_u32 s1, s4, s1 13735; GFX9-NEXT: s_addc_u32 s8, s5, 0 13736; GFX9-NEXT: s_and_b32 s1, s1, s0 13737; GFX9-NEXT: s_sub_u32 s1, s4, s1 13738; GFX9-NEXT: s_subb_u32 s4, s5, s8 13739; GFX9-NEXT: s_ashr_i32 s5, s7, 31 13740; GFX9-NEXT: s_lshr_b32 s5, s5, 20 13741; GFX9-NEXT: s_add_u32 s5, s6, s5 13742; GFX9-NEXT: s_addc_u32 s8, s7, 0 13743; GFX9-NEXT: s_and_b32 s0, s5, s0 13744; GFX9-NEXT: s_sub_u32 s0, s6, s0 13745; GFX9-NEXT: s_subb_u32 s5, s7, s8 13746; GFX9-NEXT: v_mov_b32_e32 v0, s1 13747; GFX9-NEXT: v_mov_b32_e32 v1, s4 13748; GFX9-NEXT: v_mov_b32_e32 v2, s0 13749; GFX9-NEXT: v_mov_b32_e32 v3, s5 13750; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 13751; GFX9-NEXT: s_endpgm 13752; 13753; GFX90A-LABEL: srem_v2i64_pow2k_denom: 13754; GFX90A: ; %bb.0: 13755; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 13756; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 13757; GFX90A-NEXT: s_movk_i32 s0, 0xf000 13758; 
GFX90A-NEXT: v_mov_b32_e32 v4, 0 13759; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 13760; GFX90A-NEXT: s_ashr_i32 s1, s5, 31 13761; GFX90A-NEXT: s_lshr_b32 s1, s1, 20 13762; GFX90A-NEXT: s_add_u32 s1, s4, s1 13763; GFX90A-NEXT: s_addc_u32 s8, s5, 0 13764; GFX90A-NEXT: s_and_b32 s1, s1, s0 13765; GFX90A-NEXT: s_sub_u32 s1, s4, s1 13766; GFX90A-NEXT: s_subb_u32 s4, s5, s8 13767; GFX90A-NEXT: s_ashr_i32 s5, s7, 31 13768; GFX90A-NEXT: s_lshr_b32 s5, s5, 20 13769; GFX90A-NEXT: s_add_u32 s5, s6, s5 13770; GFX90A-NEXT: s_addc_u32 s8, s7, 0 13771; GFX90A-NEXT: s_and_b32 s0, s5, s0 13772; GFX90A-NEXT: s_sub_u32 s0, s6, s0 13773; GFX90A-NEXT: s_subb_u32 s5, s7, s8 13774; GFX90A-NEXT: v_mov_b32_e32 v0, s1 13775; GFX90A-NEXT: v_mov_b32_e32 v1, s4 13776; GFX90A-NEXT: v_mov_b32_e32 v2, s0 13777; GFX90A-NEXT: v_mov_b32_e32 v3, s5 13778; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] 13779; GFX90A-NEXT: s_endpgm 13780 %r = srem <2 x i64> %x, <i64 4096, i64 4096> 13781 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 13782 ret void 13783} 13784 13785define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %out, <2 x i64> %x, <2 x i64> %y) { 13786; CHECK-LABEL: @srem_v2i64_pow2_shl_denom( 13787; CHECK-NEXT: [[SHL_Y:%.*]] = shl <2 x i64> <i64 4096, i64 4096>, [[Y:%.*]] 13788; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[X:%.*]], i64 0 13789; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 0 13790; CHECK-NEXT: [[TMP3:%.*]] = srem i64 [[TMP1]], [[TMP2]] 13791; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> undef, i64 [[TMP3]], i64 0 13792; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[X]], i64 1 13793; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[SHL_Y]], i64 1 13794; CHECK-NEXT: [[TMP7:%.*]] = srem i64 [[TMP5]], [[TMP6]] 13795; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[TMP7]], i64 1 13796; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64> addrspace(1)* [[OUT:%.*]], align 16 13797; CHECK-NEXT: ret 
void 13798; 13799; GFX6-LABEL: srem_v2i64_pow2_shl_denom: 13800; GFX6: ; %bb.0: 13801; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x11 13802; GFX6-NEXT: s_mov_b64 s[2:3], 0x1000 13803; GFX6-NEXT: s_mov_b32 s18, 0x4f800000 13804; GFX6-NEXT: s_mov_b32 s19, 0x5f7ffffc 13805; GFX6-NEXT: s_mov_b32 s20, 0x2f800000 13806; GFX6-NEXT: s_waitcnt lgkmcnt(0) 13807; GFX6-NEXT: s_lshl_b64 s[14:15], s[2:3], s6 13808; GFX6-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 13809; GFX6-NEXT: s_ashr_i32 s4, s3, 31 13810; GFX6-NEXT: s_add_u32 s2, s2, s4 13811; GFX6-NEXT: s_mov_b32 s5, s4 13812; GFX6-NEXT: s_addc_u32 s3, s3, s4 13813; GFX6-NEXT: s_xor_b64 s[16:17], s[2:3], s[4:5] 13814; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s16 13815; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s17 13816; GFX6-NEXT: s_mov_b32 s21, 0xcf800000 13817; GFX6-NEXT: s_sub_u32 s2, 0, s16 13818; GFX6-NEXT: s_subb_u32 s3, 0, s17 13819; GFX6-NEXT: v_mac_f32_e32 v0, s18, v1 13820; GFX6-NEXT: v_rcp_f32_e32 v0, v0 13821; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 13822; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd 13823; GFX6-NEXT: s_mov_b32 s7, 0xf000 13824; GFX6-NEXT: s_mov_b32 s6, -1 13825; GFX6-NEXT: v_mul_f32_e32 v0, s19, v0 13826; GFX6-NEXT: v_mul_f32_e32 v1, s20, v0 13827; GFX6-NEXT: v_trunc_f32_e32 v1, v1 13828; GFX6-NEXT: v_mac_f32_e32 v0, s21, v1 13829; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 13830; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v0 13831; GFX6-NEXT: s_waitcnt lgkmcnt(0) 13832; GFX6-NEXT: s_ashr_i32 s12, s9, 31 13833; GFX6-NEXT: s_add_u32 s0, s8, s12 13834; GFX6-NEXT: v_mul_lo_u32 v0, s2, v1 13835; GFX6-NEXT: v_mul_hi_u32 v3, s2, v2 13836; GFX6-NEXT: v_mul_lo_u32 v4, s3, v2 13837; GFX6-NEXT: v_mul_lo_u32 v5, s2, v2 13838; GFX6-NEXT: s_mov_b32 s13, s12 13839; GFX6-NEXT: v_add_i32_e32 v0, vcc, v3, v0 13840; GFX6-NEXT: v_add_i32_e32 v3, vcc, v0, v4 13841; GFX6-NEXT: v_mul_lo_u32 v0, v2, v3 13842; GFX6-NEXT: v_mul_hi_u32 v4, v2, v5 13843; GFX6-NEXT: v_mul_hi_u32 v6, v2, v3 13844; GFX6-NEXT: v_mul_hi_u32 v7, v1, v3 13845; GFX6-NEXT: 
v_mul_lo_u32 v3, v1, v3 13846; GFX6-NEXT: v_add_i32_e32 v0, vcc, v4, v0 13847; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v6, vcc 13848; GFX6-NEXT: v_mul_lo_u32 v6, v1, v5 13849; GFX6-NEXT: v_mul_hi_u32 v5, v1, v5 13850; GFX6-NEXT: s_addc_u32 s1, s9, s12 13851; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[12:13] 13852; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v6 13853; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v5, vcc 13854; GFX6-NEXT: v_mov_b32_e32 v0, 0 13855; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v7, v0, vcc 13856; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 13857; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc 13858; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 13859; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc 13860; GFX6-NEXT: v_mul_lo_u32 v3, s2, v1 13861; GFX6-NEXT: v_mul_hi_u32 v4, s2, v2 13862; GFX6-NEXT: v_mul_lo_u32 v5, s3, v2 13863; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 13864; GFX6-NEXT: v_mul_lo_u32 v4, s2, v2 13865; GFX6-NEXT: v_add_i32_e32 v3, vcc, v5, v3 13866; GFX6-NEXT: v_mul_lo_u32 v7, v2, v3 13867; GFX6-NEXT: v_mul_hi_u32 v8, v2, v4 13868; GFX6-NEXT: v_mul_hi_u32 v9, v2, v3 13869; GFX6-NEXT: v_mul_hi_u32 v6, v1, v4 13870; GFX6-NEXT: v_mul_lo_u32 v4, v1, v4 13871; GFX6-NEXT: v_mul_hi_u32 v5, v1, v3 13872; GFX6-NEXT: v_add_i32_e32 v7, vcc, v8, v7 13873; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc 13874; GFX6-NEXT: v_mul_lo_u32 v3, v1, v3 13875; GFX6-NEXT: v_add_i32_e32 v4, vcc, v7, v4 13876; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v8, v6, vcc 13877; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v5, v0, vcc 13878; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 13879; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 0, v5, vcc 13880; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 13881; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc 13882; GFX6-NEXT: v_mul_lo_u32 v3, s8, v1 13883; GFX6-NEXT: v_mul_hi_u32 v4, s8, v2 13884; GFX6-NEXT: v_mul_hi_u32 v5, s8, v1 13885; GFX6-NEXT: v_mul_hi_u32 v6, s9, v1 13886; GFX6-NEXT: v_mul_lo_u32 v1, s9, v1 13887; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 13888; GFX6-NEXT: 
v_addc_u32_e32 v4, vcc, 0, v5, vcc 13889; GFX6-NEXT: v_mul_lo_u32 v5, s9, v2 13890; GFX6-NEXT: v_mul_hi_u32 v2, s9, v2 13891; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 13892; GFX6-NEXT: v_addc_u32_e32 v2, vcc, v4, v2, vcc 13893; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v0, vcc 13894; GFX6-NEXT: v_add_i32_e32 v1, vcc, v2, v1 13895; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc 13896; GFX6-NEXT: v_mul_lo_u32 v2, s16, v2 13897; GFX6-NEXT: v_mul_hi_u32 v3, s16, v1 13898; GFX6-NEXT: v_mul_lo_u32 v4, s17, v1 13899; GFX6-NEXT: v_mul_lo_u32 v1, s16, v1 13900; GFX6-NEXT: v_add_i32_e32 v2, vcc, v3, v2 13901; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 13902; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s9, v2 13903; GFX6-NEXT: v_mov_b32_e32 v4, s17 13904; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s8, v1 13905; GFX6-NEXT: v_subb_u32_e64 v3, s[0:1], v3, v4, vcc 13906; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s16, v1 13907; GFX6-NEXT: v_subbrev_u32_e64 v6, s[2:3], 0, v3, s[0:1] 13908; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s17, v6 13909; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 13910; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s16, v5 13911; GFX6-NEXT: v_subb_u32_e64 v3, s[0:1], v3, v4, s[0:1] 13912; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] 13913; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s17, v6 13914; GFX6-NEXT: v_subrev_i32_e64 v4, s[0:1], s16, v5 13915; GFX6-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] 13916; GFX6-NEXT: v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1] 13917; GFX6-NEXT: s_ashr_i32 s2, s15, 31 13918; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 13919; GFX6-NEXT: s_add_u32 s8, s14, s2 13920; GFX6-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[0:1] 13921; GFX6-NEXT: v_mov_b32_e32 v6, s9 13922; GFX6-NEXT: s_mov_b32 s3, s2 13923; GFX6-NEXT: s_addc_u32 s9, s15, s2 13924; GFX6-NEXT: s_xor_b64 s[8:9], s[8:9], s[2:3] 13925; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s8 13926; GFX6-NEXT: v_cvt_f32_u32_e32 v8, s9 13927; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v6, v2, vcc 13928; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s17, v2 13929; 
GFX6-NEXT: v_mac_f32_e32 v7, s18, v8 13930; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 13931; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s16, v1 13932; GFX6-NEXT: v_rcp_f32_e32 v7, v7 13933; GFX6-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc 13934; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s17, v2 13935; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc 13936; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 13937; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc 13938; GFX6-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[0:1] 13939; GFX6-NEXT: v_mul_f32_e32 v4, s19, v7 13940; GFX6-NEXT: v_mul_f32_e32 v5, s20, v4 13941; GFX6-NEXT: v_trunc_f32_e32 v5, v5 13942; GFX6-NEXT: v_mac_f32_e32 v4, s21, v5 13943; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 13944; GFX6-NEXT: v_cvt_u32_f32_e32 v5, v5 13945; GFX6-NEXT: s_sub_u32 s0, 0, s8 13946; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc 13947; GFX6-NEXT: v_mul_hi_u32 v3, s0, v4 13948; GFX6-NEXT: v_mul_lo_u32 v6, s0, v5 13949; GFX6-NEXT: s_subb_u32 s1, 0, s9 13950; GFX6-NEXT: v_mul_lo_u32 v7, s1, v4 13951; GFX6-NEXT: s_ashr_i32 s14, s11, 31 13952; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v6 13953; GFX6-NEXT: v_mul_lo_u32 v6, s0, v4 13954; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v7 13955; GFX6-NEXT: v_mul_lo_u32 v7, v4, v3 13956; GFX6-NEXT: v_mul_hi_u32 v8, v4, v6 13957; GFX6-NEXT: v_mul_hi_u32 v9, v4, v3 13958; GFX6-NEXT: v_mul_hi_u32 v10, v5, v3 13959; GFX6-NEXT: v_mul_lo_u32 v3, v5, v3 13960; GFX6-NEXT: v_add_i32_e32 v7, vcc, v8, v7 13961; GFX6-NEXT: v_addc_u32_e32 v8, vcc, 0, v9, vcc 13962; GFX6-NEXT: v_mul_lo_u32 v9, v5, v6 13963; GFX6-NEXT: v_mul_hi_u32 v6, v5, v6 13964; GFX6-NEXT: s_mov_b32 s15, s14 13965; GFX6-NEXT: v_xor_b32_e32 v1, s12, v1 13966; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v9 13967; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v8, v6, vcc 13968; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v10, v0, vcc 13969; GFX6-NEXT: v_add_i32_e32 v3, vcc, v6, v3 13970; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc 13971; GFX6-NEXT: v_add_i32_e32 v3, vcc, v4, v3 13972; GFX6-NEXT: v_addc_u32_e32 v4, vcc, 
v5, v6, vcc 13973; GFX6-NEXT: v_mul_lo_u32 v5, s0, v4 13974; GFX6-NEXT: v_mul_hi_u32 v6, s0, v3 13975; GFX6-NEXT: v_mul_lo_u32 v7, s1, v3 13976; GFX6-NEXT: v_xor_b32_e32 v2, s12, v2 13977; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 13978; GFX6-NEXT: v_mul_lo_u32 v6, s0, v3 13979; GFX6-NEXT: v_add_i32_e32 v5, vcc, v7, v5 13980; GFX6-NEXT: v_mul_lo_u32 v9, v3, v5 13981; GFX6-NEXT: v_mul_hi_u32 v10, v3, v6 13982; GFX6-NEXT: v_mul_hi_u32 v11, v3, v5 13983; GFX6-NEXT: v_mul_hi_u32 v8, v4, v6 13984; GFX6-NEXT: v_mul_lo_u32 v6, v4, v6 13985; GFX6-NEXT: v_mul_hi_u32 v7, v4, v5 13986; GFX6-NEXT: v_add_i32_e32 v9, vcc, v10, v9 13987; GFX6-NEXT: v_addc_u32_e32 v10, vcc, 0, v11, vcc 13988; GFX6-NEXT: v_mul_lo_u32 v5, v4, v5 13989; GFX6-NEXT: v_add_i32_e32 v6, vcc, v9, v6 13990; GFX6-NEXT: v_addc_u32_e32 v6, vcc, v10, v8, vcc 13991; GFX6-NEXT: v_addc_u32_e32 v7, vcc, v7, v0, vcc 13992; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 13993; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc 13994; GFX6-NEXT: s_add_u32 s0, s10, s14 13995; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 13996; GFX6-NEXT: s_addc_u32 s1, s11, s14 13997; GFX6-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc 13998; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[14:15] 13999; GFX6-NEXT: v_mul_lo_u32 v5, s10, v4 14000; GFX6-NEXT: v_mul_hi_u32 v6, s10, v3 14001; GFX6-NEXT: v_mul_hi_u32 v8, s10, v4 14002; GFX6-NEXT: v_mul_hi_u32 v9, s11, v4 14003; GFX6-NEXT: v_mul_lo_u32 v4, s11, v4 14004; GFX6-NEXT: v_add_i32_e32 v5, vcc, v6, v5 14005; GFX6-NEXT: v_addc_u32_e32 v6, vcc, 0, v8, vcc 14006; GFX6-NEXT: v_mul_lo_u32 v8, s11, v3 14007; GFX6-NEXT: v_mul_hi_u32 v3, s11, v3 14008; GFX6-NEXT: v_mov_b32_e32 v7, s12 14009; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v8 14010; GFX6-NEXT: v_addc_u32_e32 v3, vcc, v6, v3, vcc 14011; GFX6-NEXT: v_addc_u32_e32 v0, vcc, v9, v0, vcc 14012; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 14013; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v0, vcc 14014; GFX6-NEXT: v_mul_lo_u32 v4, s8, v0 14015; GFX6-NEXT: v_mul_hi_u32 v5, s8, 
v3 14016; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s12, v1 14017; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v2, v7, vcc 14018; GFX6-NEXT: v_mul_lo_u32 v2, s9, v3 14019; GFX6-NEXT: v_mul_lo_u32 v3, s8, v3 14020; GFX6-NEXT: v_add_i32_e32 v4, vcc, v5, v4 14021; GFX6-NEXT: v_add_i32_e32 v2, vcc, v4, v2 14022; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s11, v2 14023; GFX6-NEXT: v_mov_b32_e32 v5, s9 14024; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s10, v3 14025; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc 14026; GFX6-NEXT: v_subrev_i32_e64 v6, s[0:1], s8, v3 14027; GFX6-NEXT: v_subbrev_u32_e64 v7, s[2:3], 0, v4, s[0:1] 14028; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s9, v7 14029; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] 14030; GFX6-NEXT: v_cmp_le_u32_e64 s[2:3], s8, v6 14031; GFX6-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, s[0:1] 14032; GFX6-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] 14033; GFX6-NEXT: v_cmp_eq_u32_e64 s[2:3], s9, v7 14034; GFX6-NEXT: v_subrev_i32_e64 v5, s[0:1], s8, v6 14035; GFX6-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[2:3] 14036; GFX6-NEXT: v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1] 14037; GFX6-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 14038; GFX6-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[0:1] 14039; GFX6-NEXT: v_mov_b32_e32 v7, s11 14040; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v7, v2, vcc 14041; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s9, v2 14042; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 14043; GFX6-NEXT: v_cmp_le_u32_e32 vcc, s8, v3 14044; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 14045; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s9, v2 14046; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc 14047; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 14048; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc 14049; GFX6-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] 14050; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc 14051; GFX6-NEXT: v_xor_b32_e32 v3, s14, v3 14052; GFX6-NEXT: v_xor_b32_e32 v4, s14, v2 14053; GFX6-NEXT: v_mov_b32_e32 v5, s14 14054; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, s14, v3 14055; GFX6-NEXT: v_subb_u32_e32 
v3, vcc, v4, v5, vcc 14056; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 14057; GFX6-NEXT: s_endpgm 14058; 14059; GFX9-LABEL: srem_v2i64_pow2_shl_denom: 14060; GFX9: ; %bb.0: 14061; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 14062; GFX9-NEXT: s_mov_b64 s[2:3], 0x1000 14063; GFX9-NEXT: s_mov_b32 s16, 0x4f800000 14064; GFX9-NEXT: s_mov_b32 s17, 0x5f7ffffc 14065; GFX9-NEXT: s_mov_b32 s18, 0x2f800000 14066; GFX9-NEXT: s_waitcnt lgkmcnt(0) 14067; GFX9-NEXT: s_lshl_b64 s[10:11], s[2:3], s6 14068; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 14069; GFX9-NEXT: s_ashr_i32 s4, s3, 31 14070; GFX9-NEXT: s_add_u32 s2, s2, s4 14071; GFX9-NEXT: s_mov_b32 s5, s4 14072; GFX9-NEXT: s_addc_u32 s3, s3, s4 14073; GFX9-NEXT: s_xor_b64 s[12:13], s[2:3], s[4:5] 14074; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s12 14075; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 14076; GFX9-NEXT: s_mov_b32 s19, 0xcf800000 14077; GFX9-NEXT: s_sub_u32 s2, 0, s12 14078; GFX9-NEXT: s_subb_u32 s3, 0, s13 14079; GFX9-NEXT: v_mac_f32_e32 v0, s16, v1 14080; GFX9-NEXT: v_rcp_f32_e32 v0, v0 14081; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 14082; GFX9-NEXT: v_mul_f32_e32 v0, s17, v0 14083; GFX9-NEXT: v_mul_f32_e32 v1, s18, v0 14084; GFX9-NEXT: v_trunc_f32_e32 v1, v1 14085; GFX9-NEXT: v_mac_f32_e32 v0, s19, v1 14086; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 14087; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v0 14088; GFX9-NEXT: s_waitcnt lgkmcnt(0) 14089; GFX9-NEXT: s_ashr_i32 s8, s5, 31 14090; GFX9-NEXT: s_mov_b32 s9, s8 14091; GFX9-NEXT: v_mul_lo_u32 v0, s2, v1 14092; GFX9-NEXT: v_mul_hi_u32 v3, s2, v2 14093; GFX9-NEXT: v_mul_lo_u32 v5, s3, v2 14094; GFX9-NEXT: v_mul_lo_u32 v4, s2, v2 14095; GFX9-NEXT: v_add_u32_e32 v0, v3, v0 14096; GFX9-NEXT: v_add_u32_e32 v5, v0, v5 14097; GFX9-NEXT: v_mul_hi_u32 v3, v2, v4 14098; GFX9-NEXT: v_mul_lo_u32 v6, v2, v5 14099; GFX9-NEXT: v_mul_hi_u32 v7, v2, v5 14100; GFX9-NEXT: v_mul_hi_u32 v8, v1, v5 14101; GFX9-NEXT: v_mov_b32_e32 v0, 0 14102; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 14103; 
GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc 14104; GFX9-NEXT: v_mul_lo_u32 v7, v1, v4 14105; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 14106; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v7 14107; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v4, vcc 14108; GFX9-NEXT: v_mul_lo_u32 v4, v1, v5 14109; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v0, vcc 14110; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 14111; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc 14112; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 14113; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc 14114; GFX9-NEXT: v_mul_lo_u32 v3, s2, v1 14115; GFX9-NEXT: v_mul_hi_u32 v4, s2, v2 14116; GFX9-NEXT: v_mul_lo_u32 v5, s3, v2 14117; GFX9-NEXT: v_mul_lo_u32 v6, s2, v2 14118; GFX9-NEXT: s_add_u32 s2, s4, s8 14119; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 14120; GFX9-NEXT: v_add_u32_e32 v3, v3, v5 14121; GFX9-NEXT: v_mul_lo_u32 v7, v2, v3 14122; GFX9-NEXT: v_mul_hi_u32 v8, v2, v6 14123; GFX9-NEXT: v_mul_hi_u32 v9, v2, v3 14124; GFX9-NEXT: v_mul_hi_u32 v5, v1, v6 14125; GFX9-NEXT: v_mul_lo_u32 v6, v1, v6 14126; GFX9-NEXT: v_mul_hi_u32 v4, v1, v3 14127; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 14128; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc 14129; GFX9-NEXT: v_mul_lo_u32 v3, v1, v3 14130; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 14131; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v5, vcc 14132; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v0, vcc 14133; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 14134; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc 14135; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 14136; GFX9-NEXT: s_addc_u32 s3, s5, s8 14137; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc 14138; GFX9-NEXT: s_xor_b64 s[14:15], s[2:3], s[8:9] 14139; GFX9-NEXT: v_mul_lo_u32 v3, s14, v1 14140; GFX9-NEXT: v_mul_hi_u32 v4, s14, v2 14141; GFX9-NEXT: v_mul_hi_u32 v5, s14, v1 14142; GFX9-NEXT: v_mul_hi_u32 v6, s15, v1 14143; GFX9-NEXT: v_mul_lo_u32 v1, s15, v1 14144; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v4, v3 14145; 
GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v5, vcc 14146; GFX9-NEXT: v_mul_lo_u32 v5, s15, v2 14147; GFX9-NEXT: v_mul_hi_u32 v2, s15, v2 14148; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 14149; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 14150; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v4, v2, vcc 14151; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v0, vcc 14152; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v2, v1 14153; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc 14154; GFX9-NEXT: v_mul_lo_u32 v2, s12, v2 14155; GFX9-NEXT: v_mul_hi_u32 v3, s12, v1 14156; GFX9-NEXT: v_mul_lo_u32 v4, s13, v1 14157; GFX9-NEXT: v_mul_lo_u32 v1, s12, v1 14158; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 14159; GFX9-NEXT: v_add_u32_e32 v2, v2, v4 14160; GFX9-NEXT: v_sub_u32_e32 v3, s15, v2 14161; GFX9-NEXT: v_mov_b32_e32 v4, s13 14162; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s14, v1 14163; GFX9-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, vcc 14164; GFX9-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s12, v1 14165; GFX9-NEXT: v_subbrev_co_u32_e64 v6, s[2:3], 0, v3, s[0:1] 14166; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s13, v6 14167; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 14168; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v5 14169; GFX9-NEXT: v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1] 14170; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] 14171; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s13, v6 14172; GFX9-NEXT: v_subrev_co_u32_e64 v4, s[0:1], s12, v5 14173; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] 14174; GFX9-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] 14175; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 14176; GFX9-NEXT: v_cndmask_b32_e64 v4, v5, v4, s[0:1] 14177; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[0:1] 14178; GFX9-NEXT: s_ashr_i32 s0, s11, 31 14179; GFX9-NEXT: s_add_u32 s2, s10, s0 14180; GFX9-NEXT: s_mov_b32 s1, s0 14181; GFX9-NEXT: s_addc_u32 s3, s11, s0 14182; GFX9-NEXT: v_mov_b32_e32 v5, s15 14183; GFX9-NEXT: s_xor_b64 s[10:11], s[2:3], s[0:1] 14184; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v5, 
v2, vcc 14185; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s10 14186; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s11 14187; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s13, v2 14188; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 14189; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s12, v1 14190; GFX9-NEXT: v_mac_f32_e32 v5, s16, v6 14191; GFX9-NEXT: v_rcp_f32_e32 v5, v5 14192; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 14193; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s13, v2 14194; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc 14195; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 14196; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc 14197; GFX9-NEXT: v_mul_f32_e32 v4, s17, v5 14198; GFX9-NEXT: v_mul_f32_e32 v5, s18, v4 14199; GFX9-NEXT: v_trunc_f32_e32 v5, v5 14200; GFX9-NEXT: v_mac_f32_e32 v4, s19, v5 14201; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v4 14202; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5 14203; GFX9-NEXT: s_sub_u32 s0, 0, s10 14204; GFX9-NEXT: s_subb_u32 s1, 0, s11 14205; GFX9-NEXT: v_mul_hi_u32 v6, s0, v4 14206; GFX9-NEXT: v_mul_lo_u32 v7, s0, v5 14207; GFX9-NEXT: v_mul_lo_u32 v8, s1, v4 14208; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc 14209; GFX9-NEXT: v_mul_lo_u32 v3, s0, v4 14210; GFX9-NEXT: v_add_u32_e32 v6, v6, v7 14211; GFX9-NEXT: v_add_u32_e32 v6, v6, v8 14212; GFX9-NEXT: v_mul_lo_u32 v7, v4, v6 14213; GFX9-NEXT: v_mul_hi_u32 v8, v4, v3 14214; GFX9-NEXT: v_mul_hi_u32 v9, v4, v6 14215; GFX9-NEXT: v_mul_hi_u32 v10, v5, v6 14216; GFX9-NEXT: v_mul_lo_u32 v6, v5, v6 14217; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v7 14218; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v9, vcc 14219; GFX9-NEXT: v_mul_lo_u32 v9, v5, v3 14220; GFX9-NEXT: v_mul_hi_u32 v3, v5, v3 14221; GFX9-NEXT: s_ashr_i32 s12, s7, 31 14222; GFX9-NEXT: s_mov_b32 s13, s12 14223; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 14224; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8, v3, vcc 14225; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v10, v0, vcc 14226; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 14227; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc 14228; GFX9-NEXT: 
v_add_co_u32_e32 v3, vcc, v4, v3 14229; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v5, v6, vcc 14230; GFX9-NEXT: v_mul_lo_u32 v5, s0, v4 14231; GFX9-NEXT: v_mul_hi_u32 v6, s0, v3 14232; GFX9-NEXT: v_mul_lo_u32 v7, s1, v3 14233; GFX9-NEXT: v_mul_lo_u32 v8, s0, v3 14234; GFX9-NEXT: s_add_u32 s0, s6, s12 14235; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 14236; GFX9-NEXT: v_add_u32_e32 v5, v5, v7 14237; GFX9-NEXT: v_mul_lo_u32 v9, v3, v5 14238; GFX9-NEXT: v_mul_hi_u32 v10, v3, v8 14239; GFX9-NEXT: v_mul_hi_u32 v11, v3, v5 14240; GFX9-NEXT: v_mul_hi_u32 v7, v4, v8 14241; GFX9-NEXT: v_mul_lo_u32 v8, v4, v8 14242; GFX9-NEXT: v_mul_hi_u32 v6, v4, v5 14243; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 14244; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, 0, v11, vcc 14245; GFX9-NEXT: v_mul_lo_u32 v5, v4, v5 14246; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 14247; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v10, v7, vcc 14248; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v0, vcc 14249; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 14250; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc 14251; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5 14252; GFX9-NEXT: s_addc_u32 s1, s7, s12 14253; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v6, vcc 14254; GFX9-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13] 14255; GFX9-NEXT: v_mul_lo_u32 v5, s6, v4 14256; GFX9-NEXT: v_mul_hi_u32 v6, s6, v3 14257; GFX9-NEXT: v_mul_hi_u32 v8, s6, v4 14258; GFX9-NEXT: v_mul_hi_u32 v9, s7, v4 14259; GFX9-NEXT: v_mul_lo_u32 v4, s7, v4 14260; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 14261; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v8, vcc 14262; GFX9-NEXT: v_mul_lo_u32 v8, s7, v3 14263; GFX9-NEXT: v_mul_hi_u32 v3, s7, v3 14264; GFX9-NEXT: v_xor_b32_e32 v1, s8, v1 14265; GFX9-NEXT: v_xor_b32_e32 v2, s8, v2 14266; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 14267; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v3, vcc 14268; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v0, vcc 14269; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v4 14270; GFX9-NEXT: 
v_addc_co_u32_e32 v4, vcc, 0, v5, vcc 14271; GFX9-NEXT: v_mul_lo_u32 v4, s10, v4 14272; GFX9-NEXT: v_mul_hi_u32 v5, s10, v3 14273; GFX9-NEXT: v_mul_lo_u32 v6, s11, v3 14274; GFX9-NEXT: v_mul_lo_u32 v3, s10, v3 14275; GFX9-NEXT: v_mov_b32_e32 v7, s8 14276; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s8, v1 14277; GFX9-NEXT: v_add_u32_e32 v4, v5, v4 14278; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v7, vcc 14279; GFX9-NEXT: v_add_u32_e32 v4, v4, v6 14280; GFX9-NEXT: v_sub_u32_e32 v5, s7, v4 14281; GFX9-NEXT: v_mov_b32_e32 v6, s11 14282; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s6, v3 14283; GFX9-NEXT: v_subb_co_u32_e64 v5, s[0:1], v5, v6, vcc 14284; GFX9-NEXT: v_subrev_co_u32_e64 v7, s[0:1], s10, v3 14285; GFX9-NEXT: v_subbrev_co_u32_e64 v8, s[2:3], 0, v5, s[0:1] 14286; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s11, v8 14287; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] 14288; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s10, v7 14289; GFX9-NEXT: v_subb_co_u32_e64 v5, s[0:1], v5, v6, s[0:1] 14290; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[2:3] 14291; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], s11, v8 14292; GFX9-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s10, v7 14293; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[2:3] 14294; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1] 14295; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 14296; GFX9-NEXT: v_cndmask_b32_e64 v6, v7, v6, s[0:1] 14297; GFX9-NEXT: v_mov_b32_e32 v7, s7 14298; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v7, v4, vcc 14299; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s11, v4 14300; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 14301; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s10, v3 14302; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[0:1] 14303; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 14304; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s11, v4 14305; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc 14306; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 14307; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc 14308; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc 14309; GFX9-NEXT: 
v_xor_b32_e32 v3, s12, v3 14310; GFX9-NEXT: v_xor_b32_e32 v4, s12, v4 14311; GFX9-NEXT: v_mov_b32_e32 v5, s12 14312; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s12, v3 14313; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v5, vcc 14314; GFX9-NEXT: s_waitcnt lgkmcnt(0) 14315; GFX9-NEXT: global_store_dwordx4 v0, v[1:4], s[4:5] 14316; GFX9-NEXT: s_endpgm 14317; 14318; GFX90A-LABEL: srem_v2i64_pow2_shl_denom: 14319; GFX90A: ; %bb.0: 14320; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44 14321; GFX90A-NEXT: s_mov_b64 s[2:3], 0x1000 14322; GFX90A-NEXT: s_mov_b32 s16, 0x4f800000 14323; GFX90A-NEXT: s_mov_b32 s17, 0x5f7ffffc 14324; GFX90A-NEXT: s_mov_b32 s18, 0x2f800000 14325; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 14326; GFX90A-NEXT: s_lshl_b64 s[10:11], s[2:3], s6 14327; GFX90A-NEXT: s_lshl_b64 s[2:3], s[2:3], s4 14328; GFX90A-NEXT: s_ashr_i32 s4, s3, 31 14329; GFX90A-NEXT: s_add_u32 s2, s2, s4 14330; GFX90A-NEXT: s_mov_b32 s5, s4 14331; GFX90A-NEXT: s_addc_u32 s3, s3, s4 14332; GFX90A-NEXT: s_xor_b64 s[12:13], s[2:3], s[4:5] 14333; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s12 14334; GFX90A-NEXT: v_cvt_f32_u32_e32 v1, s13 14335; GFX90A-NEXT: s_mov_b32 s19, 0xcf800000 14336; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24 14337; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 14338; GFX90A-NEXT: s_sub_u32 s0, 0, s12 14339; GFX90A-NEXT: v_mac_f32_e32 v0, s16, v1 14340; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 14341; GFX90A-NEXT: s_subb_u32 s1, 0, s13 14342; GFX90A-NEXT: v_mov_b32_e32 v4, 0 14343; GFX90A-NEXT: s_waitcnt lgkmcnt(0) 14344; GFX90A-NEXT: s_ashr_i32 s14, s5, 31 14345; GFX90A-NEXT: v_mul_f32_e32 v0, s17, v0 14346; GFX90A-NEXT: v_mul_f32_e32 v1, s18, v0 14347; GFX90A-NEXT: v_trunc_f32_e32 v1, v1 14348; GFX90A-NEXT: v_mac_f32_e32 v0, s19, v1 14349; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 14350; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v1 14351; GFX90A-NEXT: s_mov_b32 s15, s14 14352; GFX90A-NEXT: v_mul_hi_u32 v3, s0, v0 14353; GFX90A-NEXT: v_mul_lo_u32 v5, s0, v1 14354; GFX90A-NEXT: v_mul_lo_u32 
v2, s1, v0 14355; GFX90A-NEXT: v_add_u32_e32 v3, v3, v5 14356; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 14357; GFX90A-NEXT: v_mul_lo_u32 v6, s0, v0 14358; GFX90A-NEXT: v_mul_lo_u32 v5, v0, v2 14359; GFX90A-NEXT: v_mul_hi_u32 v7, v0, v6 14360; GFX90A-NEXT: v_mul_hi_u32 v3, v0, v2 14361; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 14362; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 14363; GFX90A-NEXT: v_mul_hi_u32 v8, v1, v6 14364; GFX90A-NEXT: v_mul_lo_u32 v6, v1, v6 14365; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 14366; GFX90A-NEXT: v_mul_hi_u32 v7, v1, v2 14367; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v8, vcc 14368; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v4, vcc 14369; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 14370; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v3, v2 14371; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc 14372; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 14373; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 14374; GFX90A-NEXT: v_mul_lo_u32 v2, s0, v1 14375; GFX90A-NEXT: v_mul_hi_u32 v3, s0, v0 14376; GFX90A-NEXT: v_add_u32_e32 v2, v3, v2 14377; GFX90A-NEXT: v_mul_lo_u32 v3, s1, v0 14378; GFX90A-NEXT: v_add_u32_e32 v2, v2, v3 14379; GFX90A-NEXT: v_mul_lo_u32 v5, s0, v0 14380; GFX90A-NEXT: v_mul_hi_u32 v6, v1, v5 14381; GFX90A-NEXT: v_mul_lo_u32 v7, v1, v5 14382; GFX90A-NEXT: v_mul_lo_u32 v9, v0, v2 14383; GFX90A-NEXT: v_mul_hi_u32 v5, v0, v5 14384; GFX90A-NEXT: v_mul_hi_u32 v8, v0, v2 14385; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9 14386; GFX90A-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v8, vcc 14387; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 14388; GFX90A-NEXT: v_mul_hi_u32 v3, v1, v2 14389; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v8, v6, vcc 14390; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc 14391; GFX90A-NEXT: v_mul_lo_u32 v2, v1, v2 14392; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v5, v2 14393; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc 14394; GFX90A-NEXT: s_add_u32 s0, s4, s14 14395; GFX90A-NEXT: 
v_add_co_u32_e32 v0, vcc, v0, v2 14396; GFX90A-NEXT: s_addc_u32 s1, s5, s14 14397; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc 14398; GFX90A-NEXT: s_xor_b64 s[4:5], s[0:1], s[14:15] 14399; GFX90A-NEXT: v_mul_lo_u32 v3, s4, v1 14400; GFX90A-NEXT: v_mul_hi_u32 v5, s4, v0 14401; GFX90A-NEXT: v_mul_hi_u32 v2, s4, v1 14402; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, v5, v3 14403; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc 14404; GFX90A-NEXT: v_mul_hi_u32 v6, s5, v0 14405; GFX90A-NEXT: v_mul_lo_u32 v0, s5, v0 14406; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 14407; GFX90A-NEXT: v_mul_hi_u32 v5, s5, v1 14408; GFX90A-NEXT: v_addc_co_u32_e32 v0, vcc, v2, v6, vcc 14409; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v4, vcc 14410; GFX90A-NEXT: v_mul_lo_u32 v1, s5, v1 14411; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 14412; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc 14413; GFX90A-NEXT: v_mul_lo_u32 v1, s12, v1 14414; GFX90A-NEXT: v_mul_hi_u32 v2, s12, v0 14415; GFX90A-NEXT: v_add_u32_e32 v1, v2, v1 14416; GFX90A-NEXT: v_mul_lo_u32 v2, s13, v0 14417; GFX90A-NEXT: v_add_u32_e32 v1, v1, v2 14418; GFX90A-NEXT: v_mul_lo_u32 v0, s12, v0 14419; GFX90A-NEXT: v_sub_u32_e32 v2, s5, v1 14420; GFX90A-NEXT: v_mov_b32_e32 v3, s13 14421; GFX90A-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0 14422; GFX90A-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, vcc 14423; GFX90A-NEXT: v_subrev_co_u32_e64 v5, s[0:1], s12, v0 14424; GFX90A-NEXT: v_subbrev_co_u32_e64 v6, s[2:3], 0, v2, s[0:1] 14425; GFX90A-NEXT: v_cmp_le_u32_e64 s[2:3], s13, v6 14426; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[2:3] 14427; GFX90A-NEXT: v_cmp_le_u32_e64 s[2:3], s12, v5 14428; GFX90A-NEXT: v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1] 14429; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[2:3] 14430; GFX90A-NEXT: v_cmp_eq_u32_e64 s[2:3], s13, v6 14431; GFX90A-NEXT: v_subrev_co_u32_e64 v3, s[0:1], s12, v5 14432; GFX90A-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[2:3] 14433; GFX90A-NEXT: v_subbrev_co_u32_e64 v2, s[0:1], 0, 
v2, s[0:1] 14434; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v7 14435; GFX90A-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[0:1] 14436; GFX90A-NEXT: v_mov_b32_e32 v5, s5 14437; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v5, v1, vcc 14438; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s13, v1 14439; GFX90A-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[0:1] 14440; GFX90A-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc 14441; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s12, v0 14442; GFX90A-NEXT: s_ashr_i32 s0, s11, 31 14443; GFX90A-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc 14444; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s13, v1 14445; GFX90A-NEXT: s_add_u32 s2, s10, s0 14446; GFX90A-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc 14447; GFX90A-NEXT: s_mov_b32 s1, s0 14448; GFX90A-NEXT: s_addc_u32 s3, s11, s0 14449; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 14450; GFX90A-NEXT: s_xor_b64 s[4:5], s[2:3], s[0:1] 14451; GFX90A-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc 14452; GFX90A-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc 14453; GFX90A-NEXT: v_cvt_f32_u32_e32 v2, s4 14454; GFX90A-NEXT: v_cvt_f32_u32_e32 v3, s5 14455; GFX90A-NEXT: v_xor_b32_e32 v0, s14, v0 14456; GFX90A-NEXT: s_sub_u32 s0, 0, s4 14457; GFX90A-NEXT: v_xor_b32_e32 v1, s14, v1 14458; GFX90A-NEXT: v_mac_f32_e32 v2, s16, v3 14459; GFX90A-NEXT: v_rcp_f32_e32 v2, v2 14460; GFX90A-NEXT: v_mov_b32_e32 v5, s14 14461; GFX90A-NEXT: v_subrev_co_u32_e32 v0, vcc, s14, v0 14462; GFX90A-NEXT: v_mul_f32_e32 v2, s17, v2 14463; GFX90A-NEXT: v_mul_f32_e32 v3, s18, v2 14464; GFX90A-NEXT: v_trunc_f32_e32 v3, v3 14465; GFX90A-NEXT: v_mac_f32_e32 v2, s19, v3 14466; GFX90A-NEXT: v_cvt_u32_f32_e32 v2, v2 14467; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v3 14468; GFX90A-NEXT: s_subb_u32 s1, 0, s5 14469; GFX90A-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc 14470; GFX90A-NEXT: v_mul_hi_u32 v6, s0, v2 14471; GFX90A-NEXT: v_mul_lo_u32 v7, s0, v3 14472; GFX90A-NEXT: v_mul_lo_u32 v5, s1, v2 14473; GFX90A-NEXT: v_add_u32_e32 v6, v6, v7 14474; GFX90A-NEXT: v_add_u32_e32 v5, v6, v5 14475; GFX90A-NEXT: v_mul_lo_u32 v8, 
s0, v2 14476; GFX90A-NEXT: v_mul_lo_u32 v7, v2, v5 14477; GFX90A-NEXT: v_mul_hi_u32 v9, v2, v8 14478; GFX90A-NEXT: v_mul_hi_u32 v6, v2, v5 14479; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v9, v7 14480; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc 14481; GFX90A-NEXT: v_mul_hi_u32 v10, v3, v8 14482; GFX90A-NEXT: v_mul_lo_u32 v8, v3, v8 14483; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 14484; GFX90A-NEXT: v_mul_hi_u32 v9, v3, v5 14485; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v10, vcc 14486; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v9, v4, vcc 14487; GFX90A-NEXT: v_mul_lo_u32 v5, v3, v5 14488; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v6, v5 14489; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v7, vcc 14490; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 14491; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc 14492; GFX90A-NEXT: v_mul_lo_u32 v5, s0, v3 14493; GFX90A-NEXT: v_mul_hi_u32 v6, s0, v2 14494; GFX90A-NEXT: v_add_u32_e32 v5, v6, v5 14495; GFX90A-NEXT: v_mul_lo_u32 v6, s1, v2 14496; GFX90A-NEXT: v_add_u32_e32 v5, v5, v6 14497; GFX90A-NEXT: v_mul_lo_u32 v7, s0, v2 14498; GFX90A-NEXT: v_mul_hi_u32 v8, v3, v7 14499; GFX90A-NEXT: v_mul_lo_u32 v9, v3, v7 14500; GFX90A-NEXT: v_mul_lo_u32 v11, v2, v5 14501; GFX90A-NEXT: v_mul_hi_u32 v7, v2, v7 14502; GFX90A-NEXT: v_mul_hi_u32 v10, v2, v5 14503; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v7, v11 14504; GFX90A-NEXT: v_addc_co_u32_e32 v10, vcc, 0, v10, vcc 14505; GFX90A-NEXT: v_add_co_u32_e32 v7, vcc, v7, v9 14506; GFX90A-NEXT: v_mul_hi_u32 v6, v3, v5 14507; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, v10, v8, vcc 14508; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, v6, v4, vcc 14509; GFX90A-NEXT: v_mul_lo_u32 v5, v3, v5 14510; GFX90A-NEXT: v_add_co_u32_e32 v5, vcc, v7, v5 14511; GFX90A-NEXT: s_ashr_i32 s10, s7, 31 14512; GFX90A-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc 14513; GFX90A-NEXT: s_add_u32 s0, s6, s10 14514; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 14515; GFX90A-NEXT: s_mov_b32 s11, s10 14516; GFX90A-NEXT: 
s_addc_u32 s1, s7, s10 14517; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc 14518; GFX90A-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11] 14519; GFX90A-NEXT: v_mul_lo_u32 v6, s6, v3 14520; GFX90A-NEXT: v_mul_hi_u32 v7, s6, v2 14521; GFX90A-NEXT: v_mul_hi_u32 v5, s6, v3 14522; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, v7, v6 14523; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc 14524; GFX90A-NEXT: v_mul_hi_u32 v8, s7, v2 14525; GFX90A-NEXT: v_mul_lo_u32 v2, s7, v2 14526; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v6, v2 14527; GFX90A-NEXT: v_mul_hi_u32 v7, s7, v3 14528; GFX90A-NEXT: v_addc_co_u32_e32 v2, vcc, v5, v8, vcc 14529; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v7, v4, vcc 14530; GFX90A-NEXT: v_mul_lo_u32 v3, s7, v3 14531; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 14532; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc 14533; GFX90A-NEXT: v_mul_lo_u32 v3, s4, v3 14534; GFX90A-NEXT: v_mul_hi_u32 v5, s4, v2 14535; GFX90A-NEXT: v_add_u32_e32 v3, v5, v3 14536; GFX90A-NEXT: v_mul_lo_u32 v5, s5, v2 14537; GFX90A-NEXT: v_add_u32_e32 v3, v3, v5 14538; GFX90A-NEXT: v_mul_lo_u32 v2, s4, v2 14539; GFX90A-NEXT: v_sub_u32_e32 v5, s7, v3 14540; GFX90A-NEXT: v_mov_b32_e32 v6, s5 14541; GFX90A-NEXT: v_sub_co_u32_e32 v2, vcc, s6, v2 14542; GFX90A-NEXT: v_subb_co_u32_e64 v5, s[0:1], v5, v6, vcc 14543; GFX90A-NEXT: v_subrev_co_u32_e64 v7, s[0:1], s4, v2 14544; GFX90A-NEXT: v_subbrev_co_u32_e64 v8, s[2:3], 0, v5, s[0:1] 14545; GFX90A-NEXT: v_cmp_le_u32_e64 s[2:3], s5, v8 14546; GFX90A-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[2:3] 14547; GFX90A-NEXT: v_cmp_le_u32_e64 s[2:3], s4, v7 14548; GFX90A-NEXT: v_subb_co_u32_e64 v5, s[0:1], v5, v6, s[0:1] 14549; GFX90A-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[2:3] 14550; GFX90A-NEXT: v_cmp_eq_u32_e64 s[2:3], s5, v8 14551; GFX90A-NEXT: v_subrev_co_u32_e64 v6, s[0:1], s4, v7 14552; GFX90A-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[2:3] 14553; GFX90A-NEXT: v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1] 14554; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 
14555; GFX90A-NEXT: v_cndmask_b32_e64 v6, v7, v6, s[0:1] 14556; GFX90A-NEXT: v_mov_b32_e32 v7, s7 14557; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc 14558; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s5, v3 14559; GFX90A-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc 14560; GFX90A-NEXT: v_cmp_le_u32_e32 vcc, s4, v2 14561; GFX90A-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[0:1] 14562; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc 14563; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, s5, v3 14564; GFX90A-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc 14565; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 14566; GFX90A-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc 14567; GFX90A-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 14568; GFX90A-NEXT: v_xor_b32_e32 v2, s10, v2 14569; GFX90A-NEXT: v_xor_b32_e32 v3, s10, v3 14570; GFX90A-NEXT: v_mov_b32_e32 v5, s10 14571; GFX90A-NEXT: v_subrev_co_u32_e32 v2, vcc, s10, v2 14572; GFX90A-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v5, vcc 14573; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] 14574; GFX90A-NEXT: s_endpgm 14575 %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y 14576 %r = srem <2 x i64> %x, %shl.y 14577 store <2 x i64> %r, <2 x i64> addrspace(1)* %out 14578 ret void 14579} 14580